-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Native BFloat16 support not working on AMD EPYC 9554 #54025
Comments
For what it's worth, I couldn't reproduce the issue on Nvidia Grace (ARM Neoverse V2, which has
I know it's a different architecture, but just to say this isn't completely broken everywhere 🙂 |
It might be worth trying this out with assertions on, and on the latest master. Be aware, though, that native BFloat16 support requires LLVM 17, IIRC. |
I just tried an assert build of the current master and got a segfault:
This is only for ARM. For x86 it should be LLVM 15: |
Given the link in my previous comment, the configuration you tried, @giordano, had Using the assert build on AMD EPYC 9554 I still don't see
|
Yeah, I mentioned yesterday on Slack that on aarch64 I don't get
; Function Signature: +(BFloat16s.BFloat16, BFloat16s.BFloat16)
; Emulated addition: both operands arrive as raw 16-bit patterns (i16), are
; widened to float, added with a float fadd, and the sum is narrowed back to
; 16 bits by hand. No `bfloat`-typed instruction appears in this function.
define i16 @"julia_+_6962"(i16 zeroext %"x::BFloat16", i16 zeroext %"y::BFloat16") #0 {
top:
; Widen x: move its 16 bits into the upper half of an i32 and reinterpret as float.
%0 = zext i16 %"x::BFloat16" to i32
%1 = shl nuw i32 %0, 16
%bitcast_coercion = bitcast i32 %1 to float
; Widen y the same way.
%2 = zext i16 %"y::BFloat16" to i32
%3 = shl nuw i32 %2, 16
%bitcast_coercion7 = bitcast i32 %3 to float
; The addition itself is performed in float.
%4 = fadd float %bitcast_coercion, %bitcast_coercion7
; `fcmp ord` against 0.0 is true only when %4 is not NaN; a NaN sum skips the
; rounding block and goes straight to L30.
%5 = fcmp ord float %4, 0.000000e+00
br i1 %5, label %L13, label %L30
L13: ; preds = %top
; Narrow the float sum to its upper 16 bits with rounding:
; %7 is bit 16 of the sum (the lowest bit that survives the final shift), so the
; bias added is 32767 + that bit, i.e. round-to-nearest with ties going to even.
%bitcast_coercion9 = bitcast float %4 to i32
%6 = lshr i32 %bitcast_coercion9, 16
%7 = and i32 %6, 1
%narrow = add nuw nsw i32 %7, 32767
; The bias is added in i64 so a carry out of bit 31 is not lost before the shift.
%8 = zext i32 %narrow to i64
%9 = zext i32 %bitcast_coercion9 to i64
%10 = add nuw nsw i64 %8, %9
%11 = lshr i64 %10, 16
%12 = trunc i64 %11 to i16
br label %L30
L30: ; preds = %L13, %top
; Result: the rounded bits from L13, or the constant 32704 (0x7FC0) when the sum
; was NaN. NOTE(review): 0x7FC0 is presumably the package's canonical NaN bit
; pattern; confirm against BFloat16s.jl.
%value_phi = phi i16 [ %12, %L13 ], [ 32704, %top ]
ret i16 %value_phi
} |
I think this only gets enabled on LLVM 17. We have some annoying code because of a couple of ABI breaks that have happened |
There is an infinite recursion within LLVM in between
which leads to a I briefly checked above LLVM code, but couldn't make much sense of it (unless the How should we proceed with this bug? Can I hand this off to you / one of the core developers? I'm not an LLVM expert and don't have much time to dig into this at the moment, unfortunately. GDB Backtrace (truncated)
|
The problem persists on the current nightly, Version 1.12.0-DEV.629 (2024-05-30). |
The next step would be for someone to find a standalone LLVM IR reproducer; then we can file this upstream. |
But how if it's |
You would use |
Seems to work for me on AMD EPYC 9654, which has
LLVM IR
; Function Signature: +(Core.BFloat16, Core.BFloat16)
; Here the arguments and the return value use LLVM's `bfloat` type directly, and
; the whole body is a single `fadd bfloat`: the IR contains no widening to float
; and no manual rounding sequence.
define bfloat @"julia_+_2817"(bfloat %"x::BFloat16", bfloat %"y::BFloat16") #0 {
top:
%0 = fadd bfloat %"x::BFloat16", %"y::BFloat16"
ret bfloat %0
}
; Function Signature: +(Array{Core.BFloat16, 1}, Array{Core.BFloat16, 1})
define nonnull ptr @"julia_+_3200"(ptr noundef nonnull align 8 dereferenceable(24) %"A::Array", ptr noundef nonnull align 8 dereferenceable(24) %"Bs[1]::Array") #0 {
top:
%gcframe1 = alloca [5 x ptr], align 16
call void @llvm.memset.p0.i64(ptr align 16 %gcframe1, i8 0, i64 40, i1 true)
%"new::Tuple" = alloca [1 x [1 x i64]], align 8
%"new::Tuple6" = alloca [1 x [1 x i64]], align 8
%thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #16
%tls_ppgcstack = getelementptr i8, ptr %thread_ptr, i64 -8
%tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8
store i64 12, ptr %gcframe1, align 8
%frame.prev = getelementptr inbounds ptr, ptr %gcframe1, i64 1
%task.gcstack = load ptr, ptr %tls_pgcstack, align 8
store ptr %task.gcstack, ptr %frame.prev, align 8
store ptr %gcframe1, ptr %tls_pgcstack, align 8
%0 = getelementptr inbounds i8, ptr %"A::Array", i64 16
%"A::Array.size.sroa.0.0.copyload" = load i64, ptr %0, align 8
store i64 %"A::Array.size.sroa.0.0.copyload", ptr %"new::Tuple", align 8
%1 = getelementptr inbounds i8, ptr %"Bs[1]::Array", i64 16
%"Bs[1].size.sroa.0.0.copyload" = load i64, ptr %1, align 8
store i64 %"Bs[1].size.sroa.0.0.copyload", ptr %"new::Tuple6", align 8
%.not = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %"Bs[1].size.sroa.0.0.copyload"
br i1 %.not, label %L55, label %L20
L20: ; preds = %top
call void @j_throw_promote_shape_mismatch_3207(ptr nocapture nonnull readonly %"new::Tuple", ptr nocapture nonnull readonly %"new::Tuple6", i64 signext 1) #7
unreachable
L55: ; preds = %top
%.not426 = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", 0
br i1 %.not426, label %L68, label %L70
L68: ; preds = %L55
%.instance = load atomic ptr, ptr getelementptr inbounds (ptr, ptr @"+Core.GenericMemory#3209.jit", i64 4) unordered, align 8
br label %L93
L70: ; preds = %L55
%"Memory{BFloat16}[]" = call ptr @jl_alloc_genericmemory(ptr nonnull @"+Core.GenericMemory#3209.jit", i64 %"A::Array.size.sroa.0.0.copyload")
br label %L93
L93: ; preds = %L70, %L68
%2 = phi ptr [ %.instance, %L68 ], [ %"Memory{BFloat16}[]", %L70 ]
%.data_ptr = getelementptr inbounds { i64, ptr }, ptr %2, i64 0, i32 1
%3 = load ptr, ptr %.data_ptr, align 8
%gc_slot_addr_0 = getelementptr inbounds ptr, ptr %gcframe1, i64 2
store ptr %2, ptr %gc_slot_addr_0, align 8
%ptls_field = getelementptr inbounds ptr, ptr %tls_pgcstack, i64 2
%ptls_load = load ptr, ptr %ptls_field, align 8
%"new::Array" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load, i32 552, i32 32, i64 23456089731920) #14
%"new::Array.tag_addr" = getelementptr inbounds i64, ptr %"new::Array", i64 -1
store atomic i64 23456089731920, ptr %"new::Array.tag_addr" unordered, align 8
%4 = getelementptr inbounds ptr, ptr %"new::Array", i64 1
store ptr %3, ptr %"new::Array", align 8
store ptr %2, ptr %4, align 8
%"new::Array.size_ptr" = getelementptr inbounds i8, ptr %"new::Array", i64 16
store i64 %"A::Array.size.sroa.0.0.copyload", ptr %"new::Array.size_ptr", align 8
%.not430 = icmp eq ptr %"new::Array", %"A::Array"
%brmerge = or i1 %.not430, %.not426
%"A::Array.size.sroa.0.0.copyload.mux" = select i1 %.not430, i64 %"A::Array.size.sroa.0.0.copyload", i64 0
br i1 %brmerge, label %L158, label %L108
L108: ; preds = %L93
%5 = getelementptr inbounds ptr, ptr %2, i64 2
%.not433 = icmp eq ptr %5, %3
br i1 %.not433, label %guard_exit222, label %guard_pass221
L144: ; preds = %guard_exit246
%6 = load ptr, ptr %"A::Array", align 8
%gc_slot_addr_2 = getelementptr inbounds ptr, ptr %gcframe1, i64 4
store ptr %"new::Array", ptr %gc_slot_addr_2, align 8
store ptr %323, ptr %gc_slot_addr_0, align 8
%7 = call nonnull ptr @jlplt_jl_genericmemory_copy_slice_3238_got.jit(ptr %323, ptr %6, i64 %"A::Array.size.sroa.0.0.copyload")
%.data_ptr280 = getelementptr inbounds { i64, ptr }, ptr %7, i64 0, i32 1
%8 = load ptr, ptr %.data_ptr280, align 8
%"A::Array.size282.sroa.0.0.copyload" = load i64, ptr %0, align 8
store ptr %7, ptr %gc_slot_addr_0, align 8
%ptls_load664 = load ptr, ptr %ptls_field, align 8
%"new::Array286" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load664, i32 552, i32 32, i64 23456089731920) #14
%"new::Array286.tag_addr" = getelementptr inbounds i64, ptr %"new::Array286", i64 -1
store atomic i64 23456089731920, ptr %"new::Array286.tag_addr" unordered, align 8
%9 = getelementptr inbounds ptr, ptr %"new::Array286", i64 1
store ptr %8, ptr %"new::Array286", align 8
store ptr %7, ptr %9, align 8
%"new::Array286.size_ptr" = getelementptr inbounds i8, ptr %"new::Array286", i64 16
store i64 %"A::Array.size282.sroa.0.0.copyload", ptr %"new::Array286.size_ptr", align 8
%"new::Array.size138.sroa.0.0.copyload.pre" = load i64, ptr %"new::Array.size_ptr", align 8
%value_phi44.size.sroa.0.0.copyload.pre.pre = load i64, ptr %1, align 8
br label %L158
L158: ; preds = %guard_exit246, %L144, %L93
%value_phi44.size.sroa.0.0.copyload.pre = phi i64 [ %"A::Array.size.sroa.0.0.copyload", %L93 ], [ %value_phi44.size.sroa.0.0.copyload.pre.pre, %L144 ], [ %"A::Array.size.sroa.0.0.copyload", %guard_exit246 ]
%ptls_load672687 = phi ptr [ %ptls_load, %L93 ], [ %ptls_load664, %L144 ], [ %ptls_load, %guard_exit246 ]
%"new::Array.size138.sroa.0.0.copyload" = phi i64 [ %"A::Array.size.sroa.0.0.copyload", %L93 ], [ %"new::Array.size138.sroa.0.0.copyload.pre", %L144 ], [ %"A::Array.size.sroa.0.0.copyload", %guard_exit246 ]
%value_phi36.size.sroa.0.0.copyload = phi i64 [ %"A::Array.size.sroa.0.0.copyload.mux", %L93 ], [ %"A::Array.size282.sroa.0.0.copyload", %L144 ], [ %"A::Array.size.sroa.0.0.copyload", %guard_exit246 ]
%value_phi36 = phi ptr [ %"A::Array", %L93 ], [ %"new::Array286", %L144 ], [ %"A::Array", %guard_exit246 ]
%value_phi36.size.sroa.0.0.copyload.fr = freeze i64 %value_phi36.size.sroa.0.0.copyload
%.not440 = icmp eq i64 %value_phi36.size.sroa.0.0.copyload.fr, 1
%.not441 = icmp eq ptr %"new::Array", %"Bs[1]::Array"
%.not442 = icmp eq i64 %"new::Array.size138.sroa.0.0.copyload", 0
%or.cond = select i1 %.not441, i1 true, i1 %.not442
br i1 %or.cond, label %L232, label %L176
L176: ; preds = %L158
%.not443 = icmp eq i64 %value_phi44.size.sroa.0.0.copyload.pre, 0
br i1 %.not443, label %L232, label %L182
L182: ; preds = %L176
%10 = load ptr, ptr %4, align 8
%11 = getelementptr inbounds { i64, ptr }, ptr %10, i64 0, i32 1
%12 = load ptr, ptr %11, align 8
%13 = getelementptr inbounds ptr, ptr %10, i64 2
%.not444 = icmp eq ptr %13, %12
br i1 %.not444, label %guard_exit, label %guard_pass
L218: ; preds = %guard_exit167
%14 = load ptr, ptr %"Bs[1]::Array", align 8
%gc_slot_addr_2652 = getelementptr inbounds ptr, ptr %gcframe1, i64 4
store ptr %"new::Array", ptr %gc_slot_addr_2652, align 8
%gc_slot_addr_1 = getelementptr inbounds ptr, ptr %gcframe1, i64 3
store ptr %value_phi36, ptr %gc_slot_addr_1, align 8
store ptr %306, ptr %gc_slot_addr_0, align 8
%15 = call nonnull ptr @jlplt_jl_genericmemory_copy_slice_3238_got.jit(ptr %306, ptr %14, i64 %value_phi44.size.sroa.0.0.copyload.pre)
%.data_ptr200 = getelementptr inbounds { i64, ptr }, ptr %15, i64 0, i32 1
%16 = load ptr, ptr %.data_ptr200, align 8
%"Bs[1]10.size202.sroa.0.0.copyload" = load i64, ptr %1, align 8
store ptr %15, ptr %gc_slot_addr_0, align 8
%ptls_load670 = load ptr, ptr %ptls_field, align 8
%"new::Array206" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load670, i32 552, i32 32, i64 23456089731920) #14
%"new::Array206.tag_addr" = getelementptr inbounds i64, ptr %"new::Array206", i64 -1
store atomic i64 23456089731920, ptr %"new::Array206.tag_addr" unordered, align 8
%17 = getelementptr inbounds ptr, ptr %"new::Array206", i64 1
store ptr %16, ptr %"new::Array206", align 8
store ptr %15, ptr %17, align 8
%"new::Array206.size_ptr" = getelementptr inbounds i8, ptr %"new::Array206", i64 16
store i64 %"Bs[1]10.size202.sroa.0.0.copyload", ptr %"new::Array206.size_ptr", align 8
br label %L232
L232: ; preds = %guard_exit167, %L218, %L176, %L158
%ptls_load672 = phi ptr [ %ptls_load672687, %L176 ], [ %ptls_load670, %L218 ], [ %ptls_load672687, %guard_exit167 ], [ %ptls_load672687, %L158 ]
%"Bs[1]10.size119.sroa.0.0.copyload" = phi i64 [ 0, %L176 ], [ %"Bs[1]10.size202.sroa.0.0.copyload", %L218 ], [ %value_phi44.size.sroa.0.0.copyload.pre, %guard_exit167 ], [ %value_phi44.size.sroa.0.0.copyload.pre, %L158 ]
%.pre-phi = phi ptr [ %"Bs[1]::Array", %L176 ], [ %"new::Array206", %L218 ], [ %"Bs[1]::Array", %guard_exit167 ], [ %"Bs[1]::Array", %L158 ]
%value_phi44.size.sroa.0.0.copyload.fr = freeze i64 %"Bs[1]10.size119.sroa.0.0.copyload"
%.not451 = icmp eq i64 %value_phi44.size.sroa.0.0.copyload.fr, 1
%18 = icmp slt i64 %"A::Array.size.sroa.0.0.copyload", 1
br i1 %18, label %L360, label %L249.preheader
L249.preheader: ; preds = %L232
%19 = load ptr, ptr %value_phi36, align 8
%20 = load ptr, ptr %.pre-phi, align 8
%21 = load ptr, ptr %"new::Array", align 8
%min.iters.check601 = icmp ult i64 %"A::Array.size.sroa.0.0.copyload", 16
br i1 %.not440, label %L249.preheader.split.us, label %L249.preheader.split
L249.preheader.split.us: ; preds = %L249.preheader
br i1 %.not451, label %iter.check603, label %iter.check559
iter.check559: ; preds = %L249.preheader.split.us
br i1 %min.iters.check601, label %vec.epilog.scalar.ph573, label %vector.memcheck545
vector.memcheck545: ; preds = %iter.check559
%22 = shl nuw i64 %"A::Array.size.sroa.0.0.copyload", 1
%scevgep546 = getelementptr i8, ptr %21, i64 %22
%scevgep547 = getelementptr i8, ptr %19, i64 2
%scevgep548 = getelementptr i8, ptr %20, i64 %22
%bound0549 = icmp ult ptr %21, %scevgep547
%bound1550 = icmp ult ptr %19, %scevgep546
%found.conflict551 = and i1 %bound0549, %bound1550
%bound0552 = icmp ult ptr %21, %scevgep548
%bound1553 = icmp ult ptr %20, %scevgep546
%found.conflict554 = and i1 %bound0552, %bound1553
%conflict.rdx555 = or i1 %found.conflict551, %found.conflict554
br i1 %conflict.rdx555, label %vec.epilog.scalar.ph573, label %vector.main.loop.iter.check561
vector.main.loop.iter.check561: ; preds = %vector.memcheck545
%min.iters.check560 = icmp ult i64 %"A::Array.size.sroa.0.0.copyload", 32
br i1 %min.iters.check560, label %vec.epilog.ph575, label %vector.ph562
vector.ph562: ; preds = %vector.main.loop.iter.check561
%n.vec564 = and i64 %"A::Array.size.sroa.0.0.copyload", 9223372036854775776
%.pre684 = load bfloat, ptr %19, align 2
br label %vector.body566
vector.body566: ; preds = %vector.body566, %vector.ph562
%index567 = phi i64 [ 0, %vector.ph562 ], [ %index.next571, %vector.body566 ]
%23 = shl nuw i64 %index567, 1
%broadcast.splatinsert569 = insertelement <32 x bfloat> poison, bfloat %.pre684, i64 0
%broadcast.splat570 = shufflevector <32 x bfloat> %broadcast.splatinsert569, <32 x bfloat> poison, <32 x i32> zeroinitializer
%24 = getelementptr inbounds i8, ptr %20, i64 %23
%wide.load568 = load <32 x bfloat>, ptr %24, align 2
%25 = fadd <32 x bfloat> %broadcast.splat570, %wide.load568
%26 = getelementptr inbounds i8, ptr %21, i64 %23
store <32 x bfloat> %25, ptr %26, align 2
%index.next571 = add nuw i64 %index567, 32
%27 = icmp eq i64 %index.next571, %n.vec564
br i1 %27, label %middle.block556, label %vector.body566
middle.block556: ; preds = %vector.body566
%cmp.n565 = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %n.vec564
br i1 %cmp.n565, label %L360, label %vec.epilog.iter.check574
vec.epilog.iter.check574: ; preds = %middle.block556
%n.vec.remaining576 = and i64 %"A::Array.size.sroa.0.0.copyload", 16
%min.epilog.iters.check577.not.not = icmp eq i64 %n.vec.remaining576, 0
br i1 %min.epilog.iters.check577.not.not, label %vec.epilog.scalar.ph573, label %vec.epilog.ph575
vec.epilog.ph575: ; preds = %vec.epilog.iter.check574, %vector.main.loop.iter.check561
%vec.epilog.resume.val578 = phi i64 [ %n.vec564, %vec.epilog.iter.check574 ], [ 0, %vector.main.loop.iter.check561 ]
%n.vec580 = and i64 %"A::Array.size.sroa.0.0.copyload", 9223372036854775792
%.pre685 = load bfloat, ptr %19, align 2
br label %vec.epilog.vector.body583
vec.epilog.vector.body583: ; preds = %vec.epilog.vector.body583, %vec.epilog.ph575
%index584 = phi i64 [ %vec.epilog.resume.val578, %vec.epilog.ph575 ], [ %index.next588, %vec.epilog.vector.body583 ]
%28 = shl nuw i64 %index584, 1
%broadcast.splatinsert586 = insertelement <16 x bfloat> poison, bfloat %.pre685, i64 0
%broadcast.splat587 = shufflevector <16 x bfloat> %broadcast.splatinsert586, <16 x bfloat> poison, <16 x i32> zeroinitializer
%29 = getelementptr inbounds i8, ptr %20, i64 %28
%wide.load585 = load <16 x bfloat>, ptr %29, align 2
%30 = fadd <16 x bfloat> %broadcast.splat587, %wide.load585
%31 = getelementptr inbounds i8, ptr %21, i64 %28
store <16 x bfloat> %30, ptr %31, align 2
%index.next588 = add nuw i64 %index584, 16
%32 = icmp eq i64 %index.next588, %n.vec580
br i1 %32, label %vec.epilog.middle.block572, label %vec.epilog.vector.body583
vec.epilog.middle.block572: ; preds = %vec.epilog.vector.body583
%cmp.n582 = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %n.vec580
br i1 %cmp.n582, label %L360, label %vec.epilog.scalar.ph573
vec.epilog.scalar.ph573: ; preds = %vec.epilog.middle.block572, %vec.epilog.iter.check574, %vector.memcheck545, %iter.check559
%bc.resume.val581 = phi i64 [ %n.vec580, %vec.epilog.middle.block572 ], [ %n.vec564, %vec.epilog.iter.check574 ], [ 0, %vector.memcheck545 ], [ 0, %iter.check559 ]
%xtraiter644 = and i64 %"A::Array.size.sroa.0.0.copyload", 7
%lcmp.mod645.not = icmp eq i64 %xtraiter644, 0
br i1 %lcmp.mod645.not, label %L252.us.prol.loopexit, label %L252.us.prol
L252.us.prol: ; preds = %L252.us.prol, %vec.epilog.scalar.ph573
%value_phi52470.us.prol = phi i64 [ %39, %L252.us.prol ], [ %bc.resume.val581, %vec.epilog.scalar.ph573 ]
%prol.iter646 = phi i64 [ %prol.iter646.next, %L252.us.prol ], [ 0, %vec.epilog.scalar.ph573 ]
%33 = shl nuw i64 %value_phi52470.us.prol, 1
%34 = load bfloat, ptr %19, align 2
%35 = getelementptr inbounds i8, ptr %20, i64 %33
%36 = load bfloat, ptr %35, align 2
%37 = fadd bfloat %34, %36
%38 = getelementptr inbounds i8, ptr %21, i64 %33
store bfloat %37, ptr %38, align 2
%39 = add nuw nsw i64 %value_phi52470.us.prol, 1
%prol.iter646.next = add i64 %prol.iter646, 1
%prol.iter646.cmp.not = icmp eq i64 %prol.iter646.next, %xtraiter644
br i1 %prol.iter646.cmp.not, label %L252.us.prol.loopexit, label %L252.us.prol
L252.us.prol.loopexit: ; preds = %L252.us.prol, %vec.epilog.scalar.ph573
%value_phi52470.us.unr = phi i64 [ %bc.resume.val581, %vec.epilog.scalar.ph573 ], [ %39, %L252.us.prol ]
%40 = sub nsw i64 %bc.resume.val581, %"A::Array.size.sroa.0.0.copyload"
%41 = icmp ugt i64 %40, -8
br i1 %41, label %L360, label %L252.us
iter.check603: ; preds = %L249.preheader.split.us
br i1 %min.iters.check601, label %vec.epilog.scalar.ph618, label %vector.memcheck589
vector.memcheck589: ; preds = %iter.check603
%42 = shl nuw i64 %"A::Array.size.sroa.0.0.copyload", 1
%scevgep590 = getelementptr i8, ptr %21, i64 %42
%scevgep591 = getelementptr i8, ptr %19, i64 2
%scevgep592 = getelementptr i8, ptr %20, i64 2
%bound0593 = icmp ult ptr %21, %scevgep591
%bound1594 = icmp ult ptr %19, %scevgep590
%found.conflict595 = and i1 %bound0593, %bound1594
%bound0596 = icmp ult ptr %21, %scevgep592
%bound1597 = icmp ult ptr %20, %scevgep590
%found.conflict598 = and i1 %bound0596, %bound1597
%conflict.rdx599 = or i1 %found.conflict595, %found.conflict598
br i1 %conflict.rdx599, label %vec.epilog.scalar.ph618, label %vector.main.loop.iter.check605
vector.main.loop.iter.check605: ; preds = %vector.memcheck589
%min.iters.check604 = icmp ult i64 %"A::Array.size.sroa.0.0.copyload", 32
br i1 %min.iters.check604, label %vec.epilog.ph620, label %vector.ph606
vector.ph606: ; preds = %vector.main.loop.iter.check605
%n.vec608 = and i64 %"A::Array.size.sroa.0.0.copyload", 9223372036854775776
br label %vector.body610
vector.body610: ; preds = %vector.body610, %vector.ph606
%index611 = phi i64 [ 0, %vector.ph606 ], [ %index.next616, %vector.body610 ]
%43 = shl nuw i64 %index611, 1
%44 = load bfloat, ptr %19, align 2
%45 = load bfloat, ptr %20, align 2
%.scalar = fadd bfloat %44, %45
%46 = insertelement <32 x bfloat> poison, bfloat %.scalar, i64 0
%47 = shufflevector <32 x bfloat> %46, <32 x bfloat> poison, <32 x i32> zeroinitializer
%48 = getelementptr inbounds i8, ptr %21, i64 %43
store <32 x bfloat> %47, ptr %48, align 2
%index.next616 = add nuw i64 %index611, 32
%49 = icmp eq i64 %index.next616, %n.vec608
br i1 %49, label %middle.block600, label %vector.body610
middle.block600: ; preds = %vector.body610
%cmp.n609 = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %n.vec608
br i1 %cmp.n609, label %L360, label %vec.epilog.iter.check619
vec.epilog.iter.check619: ; preds = %middle.block600
%n.vec.remaining621 = and i64 %"A::Array.size.sroa.0.0.copyload", 16
%min.epilog.iters.check622.not.not = icmp eq i64 %n.vec.remaining621, 0
br i1 %min.epilog.iters.check622.not.not, label %vec.epilog.scalar.ph618, label %vec.epilog.ph620
vec.epilog.ph620: ; preds = %vec.epilog.iter.check619, %vector.main.loop.iter.check605
%vec.epilog.resume.val623 = phi i64 [ %n.vec608, %vec.epilog.iter.check619 ], [ 0, %vector.main.loop.iter.check605 ]
%n.vec625 = and i64 %"A::Array.size.sroa.0.0.copyload", 9223372036854775792
br label %vec.epilog.vector.body628
vec.epilog.vector.body628: ; preds = %vec.epilog.vector.body628, %vec.epilog.ph620
%index629 = phi i64 [ %vec.epilog.resume.val623, %vec.epilog.ph620 ], [ %index.next634, %vec.epilog.vector.body628 ]
%50 = shl nuw i64 %index629, 1
%51 = load bfloat, ptr %19, align 2
%52 = load bfloat, ptr %20, align 2
%.scalar637 = fadd bfloat %51, %52
%53 = insertelement <16 x bfloat> poison, bfloat %.scalar637, i64 0
%54 = shufflevector <16 x bfloat> %53, <16 x bfloat> poison, <16 x i32> zeroinitializer
%55 = getelementptr inbounds i8, ptr %21, i64 %50
store <16 x bfloat> %54, ptr %55, align 2
%index.next634 = add nuw i64 %index629, 16
%56 = icmp eq i64 %index.next634, %n.vec625
br i1 %56, label %vec.epilog.middle.block617, label %vec.epilog.vector.body628
vec.epilog.middle.block617: ; preds = %vec.epilog.vector.body628
%cmp.n627 = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %n.vec625
br i1 %cmp.n627, label %L360, label %vec.epilog.scalar.ph618
vec.epilog.scalar.ph618: ; preds = %vec.epilog.middle.block617, %vec.epilog.iter.check619, %vector.memcheck589, %iter.check603
%bc.resume.val626 = phi i64 [ %n.vec625, %vec.epilog.middle.block617 ], [ %n.vec608, %vec.epilog.iter.check619 ], [ 0, %vector.memcheck589 ], [ 0, %iter.check603 ]
%xtraiter647 = and i64 %"A::Array.size.sroa.0.0.copyload", 7
%lcmp.mod648.not = icmp eq i64 %xtraiter647, 0
br i1 %lcmp.mod648.not, label %L252.us.us.prol.loopexit, label %L252.us.us.prol
L252.us.us.prol: ; preds = %L252.us.us.prol, %vec.epilog.scalar.ph618
%value_phi52470.us.us.prol = phi i64 [ %62, %L252.us.us.prol ], [ %bc.resume.val626, %vec.epilog.scalar.ph618 ]
%prol.iter649 = phi i64 [ %prol.iter649.next, %L252.us.us.prol ], [ 0, %vec.epilog.scalar.ph618 ]
%57 = shl nuw i64 %value_phi52470.us.us.prol, 1
%58 = load bfloat, ptr %19, align 2
%59 = load bfloat, ptr %20, align 2
%60 = fadd bfloat %58, %59
%61 = getelementptr inbounds i8, ptr %21, i64 %57
store bfloat %60, ptr %61, align 2
%62 = add nuw nsw i64 %value_phi52470.us.us.prol, 1
%prol.iter649.next = add i64 %prol.iter649, 1
%prol.iter649.cmp.not = icmp eq i64 %prol.iter649.next, %xtraiter647
br i1 %prol.iter649.cmp.not, label %L252.us.us.prol.loopexit, label %L252.us.us.prol
L252.us.us.prol.loopexit: ; preds = %L252.us.us.prol, %vec.epilog.scalar.ph618
%value_phi52470.us.us.unr = phi i64 [ %bc.resume.val626, %vec.epilog.scalar.ph618 ], [ %62, %L252.us.us.prol ]
%63 = sub nsw i64 %bc.resume.val626, %"A::Array.size.sroa.0.0.copyload"
%64 = icmp ugt i64 %63, -8
br i1 %64, label %L360, label %L252.us.us
L252.us.us: ; preds = %L252.us.us, %L252.us.us.prol.loopexit
%value_phi52470.us.us = phi i64 [ %98, %L252.us.us ], [ %value_phi52470.us.us.unr, %L252.us.us.prol.loopexit ]
%65 = shl i64 %value_phi52470.us.us, 1
%66 = load bfloat, ptr %19, align 2
%67 = load bfloat, ptr %20, align 2
%68 = fadd bfloat %66, %67
%69 = getelementptr i8, ptr %21, i64 %65
store bfloat %68, ptr %69, align 2
%70 = load bfloat, ptr %19, align 2
%71 = load bfloat, ptr %20, align 2
%72 = fadd bfloat %70, %71
%73 = getelementptr i8, ptr %69, i64 2
store bfloat %72, ptr %73, align 2
%74 = load bfloat, ptr %19, align 2
%75 = load bfloat, ptr %20, align 2
%76 = fadd bfloat %74, %75
%77 = getelementptr i8, ptr %69, i64 4
store bfloat %76, ptr %77, align 2
%78 = load bfloat, ptr %19, align 2
%79 = load bfloat, ptr %20, align 2
%80 = fadd bfloat %78, %79
%81 = getelementptr i8, ptr %69, i64 6
store bfloat %80, ptr %81, align 2
%82 = load bfloat, ptr %19, align 2
%83 = load bfloat, ptr %20, align 2
%84 = fadd bfloat %82, %83
%85 = getelementptr i8, ptr %69, i64 8
store bfloat %84, ptr %85, align 2
%86 = load bfloat, ptr %19, align 2
%87 = load bfloat, ptr %20, align 2
%88 = fadd bfloat %86, %87
%89 = getelementptr i8, ptr %69, i64 10
store bfloat %88, ptr %89, align 2
%90 = load bfloat, ptr %19, align 2
%91 = load bfloat, ptr %20, align 2
%92 = fadd bfloat %90, %91
%93 = getelementptr i8, ptr %69, i64 12
store bfloat %92, ptr %93, align 2
%94 = load bfloat, ptr %19, align 2
%95 = load bfloat, ptr %20, align 2
%96 = fadd bfloat %94, %95
%97 = getelementptr i8, ptr %69, i64 14
store bfloat %96, ptr %97, align 2
%98 = add nuw nsw i64 %value_phi52470.us.us, 8
%exitcond.not.7 = icmp eq i64 %98, %"A::Array.size.sroa.0.0.copyload"
br i1 %exitcond.not.7, label %L360, label %L252.us.us
L252.us: ; preds = %L252.us, %L252.us.prol.loopexit
%value_phi52470.us = phi i64 [ %147, %L252.us ], [ %value_phi52470.us.unr, %L252.us.prol.loopexit ]
%99 = shl i64 %value_phi52470.us, 1
%100 = load bfloat, ptr %19, align 2
%101 = getelementptr inbounds i8, ptr %20, i64 %99
%102 = load bfloat, ptr %101, align 2
%103 = fadd bfloat %100, %102
%104 = getelementptr inbounds i8, ptr %21, i64 %99
store bfloat %103, ptr %104, align 2
%105 = add i64 %99, 2
%106 = load bfloat, ptr %19, align 2
%107 = getelementptr inbounds i8, ptr %20, i64 %105
%108 = load bfloat, ptr %107, align 2
%109 = fadd bfloat %106, %108
%110 = getelementptr inbounds i8, ptr %21, i64 %105
store bfloat %109, ptr %110, align 2
%111 = add i64 %99, 4
%112 = load bfloat, ptr %19, align 2
%113 = getelementptr inbounds i8, ptr %20, i64 %111
%114 = load bfloat, ptr %113, align 2
%115 = fadd bfloat %112, %114
%116 = getelementptr inbounds i8, ptr %21, i64 %111
store bfloat %115, ptr %116, align 2
%117 = add i64 %99, 6
%118 = load bfloat, ptr %19, align 2
%119 = getelementptr inbounds i8, ptr %20, i64 %117
%120 = load bfloat, ptr %119, align 2
%121 = fadd bfloat %118, %120
%122 = getelementptr inbounds i8, ptr %21, i64 %117
store bfloat %121, ptr %122, align 2
%123 = add i64 %99, 8
%124 = load bfloat, ptr %19, align 2
%125 = getelementptr inbounds i8, ptr %20, i64 %123
%126 = load bfloat, ptr %125, align 2
%127 = fadd bfloat %124, %126
%128 = getelementptr inbounds i8, ptr %21, i64 %123
store bfloat %127, ptr %128, align 2
%129 = add i64 %99, 10
%130 = load bfloat, ptr %19, align 2
%131 = getelementptr inbounds i8, ptr %20, i64 %129
%132 = load bfloat, ptr %131, align 2
%133 = fadd bfloat %130, %132
%134 = getelementptr inbounds i8, ptr %21, i64 %129
store bfloat %133, ptr %134, align 2
%135 = add i64 %99, 12
%136 = load bfloat, ptr %19, align 2
%137 = getelementptr inbounds i8, ptr %20, i64 %135
%138 = load bfloat, ptr %137, align 2
%139 = fadd bfloat %136, %138
%140 = getelementptr inbounds i8, ptr %21, i64 %135
store bfloat %139, ptr %140, align 2
%141 = add i64 %99, 14
%142 = load bfloat, ptr %19, align 2
%143 = getelementptr inbounds i8, ptr %20, i64 %141
%144 = load bfloat, ptr %143, align 2
%145 = fadd bfloat %142, %144
%146 = getelementptr inbounds i8, ptr %21, i64 %141
store bfloat %145, ptr %146, align 2
%147 = add nuw nsw i64 %value_phi52470.us, 8
%exitcond478.not.7 = icmp eq i64 %147, %"A::Array.size.sroa.0.0.copyload"
br i1 %exitcond478.not.7, label %L360, label %L252.us
L249.preheader.split: ; preds = %L249.preheader
br i1 %.not451, label %iter.check517, label %iter.check
iter.check: ; preds = %L249.preheader.split
br i1 %min.iters.check601, label %vec.epilog.scalar.ph, label %vector.memcheck
vector.memcheck: ; preds = %iter.check
%148 = shl nuw i64 %"A::Array.size.sroa.0.0.copyload", 1
%scevgep = getelementptr i8, ptr %21, i64 %148
%scevgep489 = getelementptr i8, ptr %19, i64 %148
%scevgep490 = getelementptr i8, ptr %20, i64 %148
%bound0 = icmp ult ptr %21, %scevgep489
%bound1 = icmp ult ptr %19, %scevgep
%found.conflict = and i1 %bound0, %bound1
%bound0491 = icmp ult ptr %21, %scevgep490
%bound1492 = icmp ult ptr %20, %scevgep
%found.conflict493 = and i1 %bound0491, %bound1492
%conflict.rdx = or i1 %found.conflict, %found.conflict493
br i1 %conflict.rdx, label %vec.epilog.scalar.ph, label %vector.main.loop.iter.check
vector.main.loop.iter.check: ; preds = %vector.memcheck
%min.iters.check494 = icmp ult i64 %"A::Array.size.sroa.0.0.copyload", 32
br i1 %min.iters.check494, label %vec.epilog.ph, label %vector.ph
vector.ph: ; preds = %vector.main.loop.iter.check
%n.vec = and i64 %"A::Array.size.sroa.0.0.copyload", 9223372036854775776
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%149 = shl nuw i64 %index, 1
%150 = getelementptr inbounds i8, ptr %19, i64 %149
%wide.load = load <32 x bfloat>, ptr %150, align 2
%151 = getelementptr inbounds i8, ptr %20, i64 %149
%wide.load495 = load <32 x bfloat>, ptr %151, align 2
%152 = fadd <32 x bfloat> %wide.load, %wide.load495
%153 = getelementptr inbounds i8, ptr %21, i64 %149
store <32 x bfloat> %152, ptr %153, align 2
%index.next = add nuw i64 %index, 32
%154 = icmp eq i64 %index.next, %n.vec
br i1 %154, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %n.vec
br i1 %cmp.n, label %L360, label %vec.epilog.iter.check
vec.epilog.iter.check: ; preds = %middle.block
%n.vec.remaining = and i64 %"A::Array.size.sroa.0.0.copyload", 16
%min.epilog.iters.check.not.not = icmp eq i64 %n.vec.remaining, 0
br i1 %min.epilog.iters.check.not.not, label %vec.epilog.scalar.ph, label %vec.epilog.ph
vec.epilog.ph: ; preds = %vec.epilog.iter.check, %vector.main.loop.iter.check
%vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
%n.vec497 = and i64 %"A::Array.size.sroa.0.0.copyload", 9223372036854775792
br label %vec.epilog.vector.body
vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph
%index499 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next502, %vec.epilog.vector.body ]
%155 = shl nuw i64 %index499, 1
%156 = getelementptr inbounds i8, ptr %19, i64 %155
%wide.load500 = load <16 x bfloat>, ptr %156, align 2
%157 = getelementptr inbounds i8, ptr %20, i64 %155
%wide.load501 = load <16 x bfloat>, ptr %157, align 2
%158 = fadd <16 x bfloat> %wide.load500, %wide.load501
%159 = getelementptr inbounds i8, ptr %21, i64 %155
store <16 x bfloat> %158, ptr %159, align 2
%index.next502 = add nuw i64 %index499, 16
%160 = icmp eq i64 %index.next502, %n.vec497
br i1 %160, label %vec.epilog.middle.block, label %vec.epilog.vector.body
vec.epilog.middle.block: ; preds = %vec.epilog.vector.body
%cmp.n498 = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %n.vec497
br i1 %cmp.n498, label %L360, label %vec.epilog.scalar.ph
vec.epilog.scalar.ph: ; preds = %vec.epilog.middle.block, %vec.epilog.iter.check, %vector.memcheck, %iter.check
%bc.resume.val = phi i64 [ %n.vec497, %vec.epilog.middle.block ], [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.memcheck ], [ 0, %iter.check ]
%xtraiter = and i64 %"A::Array.size.sroa.0.0.copyload", 7
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %L252.prol.loopexit, label %L252.prol
L252.prol: ; preds = %L252.prol, %vec.epilog.scalar.ph
%value_phi52470.prol = phi i64 [ %168, %L252.prol ], [ %bc.resume.val, %vec.epilog.scalar.ph ]
%prol.iter = phi i64 [ %prol.iter.next, %L252.prol ], [ 0, %vec.epilog.scalar.ph ]
%161 = shl nuw i64 %value_phi52470.prol, 1
%162 = getelementptr inbounds i8, ptr %19, i64 %161
%163 = load bfloat, ptr %162, align 2
%164 = getelementptr inbounds i8, ptr %20, i64 %161
%165 = load bfloat, ptr %164, align 2
%166 = fadd bfloat %163, %165
%167 = getelementptr inbounds i8, ptr %21, i64 %161
store bfloat %166, ptr %167, align 2
%168 = add nuw nsw i64 %value_phi52470.prol, 1
%prol.iter.next = add i64 %prol.iter, 1
%prol.iter.cmp.not = icmp eq i64 %prol.iter.next, %xtraiter
br i1 %prol.iter.cmp.not, label %L252.prol.loopexit, label %L252.prol
L252.prol.loopexit: ; preds = %L252.prol, %vec.epilog.scalar.ph
%value_phi52470.unr = phi i64 [ %bc.resume.val, %vec.epilog.scalar.ph ], [ %168, %L252.prol ]
%169 = sub nsw i64 %bc.resume.val, %"A::Array.size.sroa.0.0.copyload"
%170 = icmp ugt i64 %169, -8
br i1 %170, label %L360, label %L252
iter.check517: ; preds = %L249.preheader.split
br i1 %min.iters.check601, label %vec.epilog.scalar.ph529, label %vector.memcheck503
vector.memcheck503: ; preds = %iter.check517
%171 = shl nuw i64 %"A::Array.size.sroa.0.0.copyload", 1
%scevgep504 = getelementptr i8, ptr %21, i64 %171
%scevgep505 = getelementptr i8, ptr %19, i64 %171
%scevgep506 = getelementptr i8, ptr %20, i64 2
%bound0507 = icmp ult ptr %21, %scevgep505
%bound1508 = icmp ult ptr %19, %scevgep504
%found.conflict509 = and i1 %bound0507, %bound1508
%bound0510 = icmp ult ptr %21, %scevgep506
%bound1511 = icmp ult ptr %20, %scevgep504
%found.conflict512 = and i1 %bound0510, %bound1511
%conflict.rdx513 = or i1 %found.conflict509, %found.conflict512
br i1 %conflict.rdx513, label %vec.epilog.scalar.ph529, label %vector.main.loop.iter.check519
vector.main.loop.iter.check519: ; preds = %vector.memcheck503
%min.iters.check518 = icmp ult i64 %"A::Array.size.sroa.0.0.copyload", 32
br i1 %min.iters.check518, label %vec.epilog.ph531, label %vector.ph520
vector.ph520: ; preds = %vector.main.loop.iter.check519
%n.vec522 = and i64 %"A::Array.size.sroa.0.0.copyload", 9223372036854775776
%.pre = load bfloat, ptr %20, align 2
br label %vector.body524
vector.body524: ; preds = %vector.body524, %vector.ph520
%index525 = phi i64 [ 0, %vector.ph520 ], [ %index.next527, %vector.body524 ]
%172 = shl nuw i64 %index525, 1
%173 = getelementptr inbounds i8, ptr %19, i64 %172
%wide.load526 = load <32 x bfloat>, ptr %173, align 2
%broadcast.splatinsert = insertelement <32 x bfloat> poison, bfloat %.pre, i64 0
%broadcast.splat = shufflevector <32 x bfloat> %broadcast.splatinsert, <32 x bfloat> poison, <32 x i32> zeroinitializer
%174 = fadd <32 x bfloat> %wide.load526, %broadcast.splat
%175 = getelementptr inbounds i8, ptr %21, i64 %172
store <32 x bfloat> %174, ptr %175, align 2
%index.next527 = add nuw i64 %index525, 32
%176 = icmp eq i64 %index.next527, %n.vec522
br i1 %176, label %middle.block514, label %vector.body524
middle.block514: ; preds = %vector.body524
%cmp.n523 = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %n.vec522
br i1 %cmp.n523, label %L360, label %vec.epilog.iter.check530
vec.epilog.iter.check530: ; preds = %middle.block514
%n.vec.remaining532 = and i64 %"A::Array.size.sroa.0.0.copyload", 16
%min.epilog.iters.check533.not.not = icmp eq i64 %n.vec.remaining532, 0
br i1 %min.epilog.iters.check533.not.not, label %vec.epilog.scalar.ph529, label %vec.epilog.ph531
vec.epilog.ph531: ; preds = %vec.epilog.iter.check530, %vector.main.loop.iter.check519
%vec.epilog.resume.val534 = phi i64 [ %n.vec522, %vec.epilog.iter.check530 ], [ 0, %vector.main.loop.iter.check519 ]
%n.vec536 = and i64 %"A::Array.size.sroa.0.0.copyload", 9223372036854775792
%.pre683 = load bfloat, ptr %20, align 2
br label %vec.epilog.vector.body539
vec.epilog.vector.body539: ; preds = %vec.epilog.vector.body539, %vec.epilog.ph531
%index540 = phi i64 [ %vec.epilog.resume.val534, %vec.epilog.ph531 ], [ %index.next544, %vec.epilog.vector.body539 ]
%177 = shl nuw i64 %index540, 1
%178 = getelementptr inbounds i8, ptr %19, i64 %177
%wide.load541 = load <16 x bfloat>, ptr %178, align 2
%broadcast.splatinsert542 = insertelement <16 x bfloat> poison, bfloat %.pre683, i64 0
%broadcast.splat543 = shufflevector <16 x bfloat> %broadcast.splatinsert542, <16 x bfloat> poison, <16 x i32> zeroinitializer
%179 = fadd <16 x bfloat> %wide.load541, %broadcast.splat543
%180 = getelementptr inbounds i8, ptr %21, i64 %177
store <16 x bfloat> %179, ptr %180, align 2
%index.next544 = add nuw i64 %index540, 16
%181 = icmp eq i64 %index.next544, %n.vec536
br i1 %181, label %vec.epilog.middle.block528, label %vec.epilog.vector.body539
vec.epilog.middle.block528: ; preds = %vec.epilog.vector.body539
%cmp.n538 = icmp eq i64 %"A::Array.size.sroa.0.0.copyload", %n.vec536
br i1 %cmp.n538, label %L360, label %vec.epilog.scalar.ph529
vec.epilog.scalar.ph529: ; preds = %vec.epilog.middle.block528, %vec.epilog.iter.check530, %vector.memcheck503, %iter.check517
%bc.resume.val537 = phi i64 [ %n.vec536, %vec.epilog.middle.block528 ], [ %n.vec522, %vec.epilog.iter.check530 ], [ 0, %vector.memcheck503 ], [ 0, %iter.check517 ]
%xtraiter641 = and i64 %"A::Array.size.sroa.0.0.copyload", 7
%lcmp.mod642.not = icmp eq i64 %xtraiter641, 0
br i1 %lcmp.mod642.not, label %L252.us471.prol.loopexit, label %L252.us471.prol
L252.us471.prol: ; preds = %L252.us471.prol, %vec.epilog.scalar.ph529
%value_phi52470.us472.prol = phi i64 [ %188, %L252.us471.prol ], [ %bc.resume.val537, %vec.epilog.scalar.ph529 ]
%prol.iter643 = phi i64 [ %prol.iter643.next, %L252.us471.prol ], [ 0, %vec.epilog.scalar.ph529 ]
%182 = shl nuw i64 %value_phi52470.us472.prol, 1
%183 = getelementptr inbounds i8, ptr %19, i64 %182
%184 = load bfloat, ptr %183, align 2
%185 = load bfloat, ptr %20, align 2
%186 = fadd bfloat %184, %185
%187 = getelementptr inbounds i8, ptr %21, i64 %182
store bfloat %186, ptr %187, align 2
%188 = add nuw nsw i64 %value_phi52470.us472.prol, 1
%prol.iter643.next = add i64 %prol.iter643, 1
%prol.iter643.cmp.not = icmp eq i64 %prol.iter643.next, %xtraiter641
br i1 %prol.iter643.cmp.not, label %L252.us471.prol.loopexit, label %L252.us471.prol
L252.us471.prol.loopexit: ; preds = %L252.us471.prol, %vec.epilog.scalar.ph529
%value_phi52470.us472.unr = phi i64 [ %bc.resume.val537, %vec.epilog.scalar.ph529 ], [ %188, %L252.us471.prol ]
%189 = sub nsw i64 %bc.resume.val537, %"A::Array.size.sroa.0.0.copyload"
%190 = icmp ugt i64 %189, -8
br i1 %190, label %L360, label %L252.us471
L252.us471: ; preds = %L252.us471, %L252.us471.prol.loopexit
%value_phi52470.us472 = phi i64 [ %239, %L252.us471 ], [ %value_phi52470.us472.unr, %L252.us471.prol.loopexit ]
%191 = shl i64 %value_phi52470.us472, 1
%192 = getelementptr inbounds i8, ptr %19, i64 %191
%193 = load bfloat, ptr %192, align 2
%194 = load bfloat, ptr %20, align 2
%195 = fadd bfloat %193, %194
%196 = getelementptr inbounds i8, ptr %21, i64 %191
store bfloat %195, ptr %196, align 2
%197 = add i64 %191, 2
%198 = getelementptr inbounds i8, ptr %19, i64 %197
%199 = load bfloat, ptr %198, align 2
%200 = load bfloat, ptr %20, align 2
%201 = fadd bfloat %199, %200
%202 = getelementptr inbounds i8, ptr %21, i64 %197
store bfloat %201, ptr %202, align 2
%203 = add i64 %191, 4
%204 = getelementptr inbounds i8, ptr %19, i64 %203
%205 = load bfloat, ptr %204, align 2
%206 = load bfloat, ptr %20, align 2
%207 = fadd bfloat %205, %206
%208 = getelementptr inbounds i8, ptr %21, i64 %203
store bfloat %207, ptr %208, align 2
%209 = add i64 %191, 6
%210 = getelementptr inbounds i8, ptr %19, i64 %209
%211 = load bfloat, ptr %210, align 2
%212 = load bfloat, ptr %20, align 2
%213 = fadd bfloat %211, %212
%214 = getelementptr inbounds i8, ptr %21, i64 %209
store bfloat %213, ptr %214, align 2
%215 = add i64 %191, 8
%216 = getelementptr inbounds i8, ptr %19, i64 %215
%217 = load bfloat, ptr %216, align 2
%218 = load bfloat, ptr %20, align 2
%219 = fadd bfloat %217, %218
%220 = getelementptr inbounds i8, ptr %21, i64 %215
store bfloat %219, ptr %220, align 2
%221 = add i64 %191, 10
%222 = getelementptr inbounds i8, ptr %19, i64 %221
%223 = load bfloat, ptr %222, align 2
%224 = load bfloat, ptr %20, align 2
%225 = fadd bfloat %223, %224
%226 = getelementptr inbounds i8, ptr %21, i64 %221
store bfloat %225, ptr %226, align 2
%227 = add i64 %191, 12
%228 = getelementptr inbounds i8, ptr %19, i64 %227
%229 = load bfloat, ptr %228, align 2
%230 = load bfloat, ptr %20, align 2
%231 = fadd bfloat %229, %230
%232 = getelementptr inbounds i8, ptr %21, i64 %227
store bfloat %231, ptr %232, align 2
%233 = add i64 %191, 14
%234 = getelementptr inbounds i8, ptr %19, i64 %233
%235 = load bfloat, ptr %234, align 2
%236 = load bfloat, ptr %20, align 2
%237 = fadd bfloat %235, %236
%238 = getelementptr inbounds i8, ptr %21, i64 %233
store bfloat %237, ptr %238, align 2
%239 = add nuw nsw i64 %value_phi52470.us472, 8
%exitcond477.not.7 = icmp eq i64 %239, %"A::Array.size.sroa.0.0.copyload"
br i1 %exitcond477.not.7, label %L360, label %L252.us471
L252: ; preds = %L252, %L252.prol.loopexit
%value_phi52470 = phi i64 [ %296, %L252 ], [ %value_phi52470.unr, %L252.prol.loopexit ]
%240 = shl i64 %value_phi52470, 1
%241 = getelementptr inbounds i8, ptr %19, i64 %240
%242 = load bfloat, ptr %241, align 2
%243 = getelementptr inbounds i8, ptr %20, i64 %240
%244 = load bfloat, ptr %243, align 2
%245 = fadd bfloat %242, %244
%246 = getelementptr inbounds i8, ptr %21, i64 %240
store bfloat %245, ptr %246, align 2
%247 = add i64 %240, 2
%248 = getelementptr inbounds i8, ptr %19, i64 %247
%249 = load bfloat, ptr %248, align 2
%250 = getelementptr inbounds i8, ptr %20, i64 %247
%251 = load bfloat, ptr %250, align 2
%252 = fadd bfloat %249, %251
%253 = getelementptr inbounds i8, ptr %21, i64 %247
store bfloat %252, ptr %253, align 2
%254 = add i64 %240, 4
%255 = getelementptr inbounds i8, ptr %19, i64 %254
%256 = load bfloat, ptr %255, align 2
%257 = getelementptr inbounds i8, ptr %20, i64 %254
%258 = load bfloat, ptr %257, align 2
%259 = fadd bfloat %256, %258
%260 = getelementptr inbounds i8, ptr %21, i64 %254
store bfloat %259, ptr %260, align 2
%261 = add i64 %240, 6
%262 = getelementptr inbounds i8, ptr %19, i64 %261
%263 = load bfloat, ptr %262, align 2
%264 = getelementptr inbounds i8, ptr %20, i64 %261
%265 = load bfloat, ptr %264, align 2
%266 = fadd bfloat %263, %265
%267 = getelementptr inbounds i8, ptr %21, i64 %261
store bfloat %266, ptr %267, align 2
%268 = add i64 %240, 8
%269 = getelementptr inbounds i8, ptr %19, i64 %268
%270 = load bfloat, ptr %269, align 2
%271 = getelementptr inbounds i8, ptr %20, i64 %268
%272 = load bfloat, ptr %271, align 2
%273 = fadd bfloat %270, %272
%274 = getelementptr inbounds i8, ptr %21, i64 %268
store bfloat %273, ptr %274, align 2
%275 = add i64 %240, 10
%276 = getelementptr inbounds i8, ptr %19, i64 %275
%277 = load bfloat, ptr %276, align 2
%278 = getelementptr inbounds i8, ptr %20, i64 %275
%279 = load bfloat, ptr %278, align 2
%280 = fadd bfloat %277, %279
%281 = getelementptr inbounds i8, ptr %21, i64 %275
store bfloat %280, ptr %281, align 2
%282 = add i64 %240, 12
%283 = getelementptr inbounds i8, ptr %19, i64 %282
%284 = load bfloat, ptr %283, align 2
%285 = getelementptr inbounds i8, ptr %20, i64 %282
%286 = load bfloat, ptr %285, align 2
%287 = fadd bfloat %284, %286
%288 = getelementptr inbounds i8, ptr %21, i64 %282
store bfloat %287, ptr %288, align 2
%289 = add i64 %240, 14
%290 = getelementptr inbounds i8, ptr %19, i64 %289
%291 = load bfloat, ptr %290, align 2
%292 = getelementptr inbounds i8, ptr %20, i64 %289
%293 = load bfloat, ptr %292, align 2
%294 = fadd bfloat %291, %293
%295 = getelementptr inbounds i8, ptr %21, i64 %289
store bfloat %294, ptr %295, align 2
%296 = add nuw nsw i64 %value_phi52470, 8
%exitcond479.not.7 = icmp eq i64 %296, %"A::Array.size.sroa.0.0.copyload"
br i1 %exitcond479.not.7, label %L360, label %L252
L360: ; preds = %L252, %L252.us471, %L252.us471.prol.loopexit, %vec.epilog.middle.block528, %middle.block514, %L252.prol.loopexit, %vec.epilog.middle.block, %middle.block, %L252.us, %L252.us.us, %L252.us.us.prol.loopexit, %vec.epilog.middle.block617, %middle.block600, %L252.us.prol.loopexit, %vec.epilog.middle.block572, %middle.block556, %L232
%"A::Array.size115.sroa.0.0.copyload" = load i64, ptr %0, align 8
%297 = icmp eq i64 %value_phi44.size.sroa.0.0.copyload.fr, %"A::Array.size115.sroa.0.0.copyload"
%value_phi123.v453 = icmp eq i64 %"A::Array.size115.sroa.0.0.copyload", 1
%value_phi123.v.not.not636 = or i1 %value_phi123.v453, %297
%or.cond485 = or i1 %value_phi123.v.not.not636, %.not451
br i1 %or.cond485, label %L395, label %L384
L384: ; preds = %L360
%"new::LazyString" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load672, i32 552, i32 32, i64 23455907064416) #14
%"new::LazyString.tag_addr" = getelementptr inbounds i64, ptr %"new::LazyString", i64 -1
store atomic i64 23455907064416, ptr %"new::LazyString.tag_addr" unordered, align 8
store ptr null, ptr %"new::LazyString", align 8
%298 = getelementptr inbounds ptr, ptr %"new::LazyString", i64 1
store ptr null, ptr %298, align 8
store ptr %"new::LazyString", ptr %gc_slot_addr_0, align 8
%ptls_load675 = load ptr, ptr %ptls_field, align 8
%"box::Tuple" = call noalias nonnull align 8 dereferenceable(48) ptr @ijl_gc_small_alloc(ptr %ptls_load675, i32 600, i32 48, i64 23455912939920) #14
%"box::Tuple.tag_addr" = getelementptr inbounds i64, ptr %"box::Tuple", i64 -1
store atomic i64 23455912939920, ptr %"box::Tuple.tag_addr" unordered, align 8
store ptr @"jl_global#3227.jit", ptr %"box::Tuple", align 8
%"box::Tuple.repack456" = getelementptr inbounds { ptr, [1 x i64], ptr, [1 x i64] }, ptr %"box::Tuple", i64 0, i32 1
store i64 %"A::Array.size115.sroa.0.0.copyload", ptr %"box::Tuple.repack456", align 8
%"box::Tuple.repack458" = getelementptr inbounds { ptr, [1 x i64], ptr, [1 x i64] }, ptr %"box::Tuple", i64 0, i32 2
store ptr @"jl_global#3228.jit", ptr %"box::Tuple.repack458", align 8
%"box::Tuple.repack460" = getelementptr inbounds { ptr, [1 x i64], ptr, [1 x i64] }, ptr %"box::Tuple", i64 0, i32 3
store i64 %value_phi44.size.sroa.0.0.copyload.fr, ptr %"box::Tuple.repack460", align 8
store atomic ptr %"box::Tuple", ptr %"new::LazyString" release, align 8
%jl_nothing = load ptr, ptr @jl_nothing, align 8
store atomic ptr %jl_nothing, ptr %298 release, align 8
%"box::DimensionMismatch" = call noalias nonnull align 8 dereferenceable(16) ptr @ijl_gc_small_alloc(ptr %ptls_load675, i32 504, i32 16, i64 23455953203136) #14
%"box::DimensionMismatch.tag_addr" = getelementptr inbounds i64, ptr %"box::DimensionMismatch", i64 -1
store atomic i64 23455953203136, ptr %"box::DimensionMismatch.tag_addr" unordered, align 8
store ptr %"new::LazyString", ptr %"box::DimensionMismatch", align 8
call void @ijl_throw(ptr nonnull %"box::DimensionMismatch")
unreachable
L395: ; preds = %L360
%frame.prev680 = load ptr, ptr %frame.prev, align 8
store ptr %frame.prev680, ptr %tls_pgcstack, align 8
ret ptr %"new::Array"
guard_pass: ; preds = %L182
%299 = load ptr, ptr %13, align 8
%300 = icmp eq ptr %299, null
%301 = select i1 %300, ptr %10, ptr %299
br label %guard_exit
guard_exit: ; preds = %guard_pass, %L182
%guard_res = phi ptr [ %10, %L182 ], [ %301, %guard_pass ]
%guard_res.tag_addr = getelementptr inbounds i64, ptr %guard_res, i64 -1
%guard_res.tag = load atomic i64, ptr %guard_res.tag_addr unordered, align 8
%302 = and i64 %guard_res.tag, -16
%303 = inttoptr i64 %302 to ptr
%exactly_isa.not = icmp eq ptr %303, @"+Core.GenericMemory#3209.jit"
%value_phi158 = select i1 %exactly_isa.not, ptr %guard_res, ptr %10
%304 = getelementptr inbounds i8, ptr %value_phi158, i64 8
%bitcast161 = load ptr, ptr %304, align 8
%305 = getelementptr inbounds { ptr, ptr }, ptr %"Bs[1]::Array", i64 0, i32 1
%306 = load ptr, ptr %305, align 8
%307 = getelementptr inbounds { i64, ptr }, ptr %306, i64 0, i32 1
%308 = load ptr, ptr %307, align 8
%309 = getelementptr inbounds ptr, ptr %306, i64 2
%.not446 = icmp eq ptr %309, %308
br i1 %.not446, label %guard_exit167, label %guard_pass166
guard_pass166: ; preds = %guard_exit
%310 = load ptr, ptr %309, align 8
%311 = icmp eq ptr %310, null
%312 = select i1 %311, ptr %306, ptr %310
br label %guard_exit167
guard_exit167: ; preds = %guard_pass166, %guard_exit
%guard_res168 = phi ptr [ %306, %guard_exit ], [ %312, %guard_pass166 ]
%guard_res168.tag_addr = getelementptr inbounds i64, ptr %guard_res168, i64 -1
%guard_res168.tag = load atomic i64, ptr %guard_res168.tag_addr unordered, align 8
%313 = and i64 %guard_res168.tag, -16
%314 = inttoptr i64 %313 to ptr
%exactly_isa173.not = icmp eq ptr %314, @"+Core.GenericMemory#3209.jit"
%value_phi181 = select i1 %exactly_isa173.not, ptr %guard_res168, ptr %306
%315 = getelementptr inbounds i8, ptr %value_phi181, i64 8
%bitcast184 = load ptr, ptr %315, align 8
%.not448.not = icmp eq ptr %bitcast161, %bitcast184
br i1 %.not448.not, label %L218, label %L232
guard_pass221: ; preds = %L108
%316 = load ptr, ptr %5, align 8
%317 = icmp eq ptr %316, null
%318 = select i1 %317, ptr %2, ptr %316
br label %guard_exit222
guard_exit222: ; preds = %guard_pass221, %L108
%guard_res223 = phi ptr [ %2, %L108 ], [ %318, %guard_pass221 ]
%guard_res223.tag_addr = getelementptr inbounds i64, ptr %guard_res223, i64 -1
%guard_res223.tag = load atomic i64, ptr %guard_res223.tag_addr unordered, align 8
%319 = and i64 %guard_res223.tag, -16
%320 = inttoptr i64 %319 to ptr
%exactly_isa228.not = icmp eq ptr %320, @"+Core.GenericMemory#3209.jit"
%value_phi236 = select i1 %exactly_isa228.not, ptr %guard_res223, ptr %2
%321 = getelementptr inbounds i8, ptr %value_phi236, i64 8
%bitcast239 = load ptr, ptr %321, align 8
%322 = getelementptr inbounds { ptr, ptr }, ptr %"A::Array", i64 0, i32 1
%323 = load ptr, ptr %322, align 8
%324 = getelementptr inbounds { i64, ptr }, ptr %323, i64 0, i32 1
%325 = load ptr, ptr %324, align 8
%326 = getelementptr inbounds ptr, ptr %323, i64 2
%.not435 = icmp eq ptr %326, %325
br i1 %.not435, label %guard_exit246, label %guard_pass245
guard_pass245: ; preds = %guard_exit222
%327 = load ptr, ptr %326, align 8
%328 = icmp eq ptr %327, null
%329 = select i1 %328, ptr %323, ptr %327
br label %guard_exit246
guard_exit246: ; preds = %guard_pass245, %guard_exit222
%guard_res247 = phi ptr [ %323, %guard_exit222 ], [ %329, %guard_pass245 ]
%guard_res247.tag_addr = getelementptr inbounds i64, ptr %guard_res247, i64 -1
%guard_res247.tag = load atomic i64, ptr %guard_res247.tag_addr unordered, align 8
%330 = and i64 %guard_res247.tag, -16
%331 = inttoptr i64 %330 to ptr
%exactly_isa252.not = icmp eq ptr %331, @"+Core.GenericMemory#3209.jit"
%value_phi260 = select i1 %exactly_isa252.not, ptr %guard_res247, ptr %323
%332 = getelementptr inbounds i8, ptr %value_phi260, i64 8
%bitcast263 = load ptr, ptr %332, align 8
%.not437.not = icmp eq ptr %bitcast239, %bitcast263
br i1 %.not437.not, label %L144, label %L158
} |
As of #51470 and JuliaMath/BFloat16s.jl#51, I was hoping that Julia might natively support BF16 on the CPU. I did some smoke testing on a CPU with `avx512_bf16` support (according to `lscpu`) but observed some strange failure modes:
- (see `@code_llvm A+A` below)
- (see `@code_llvm A*A` below)

In JuliaMath/BFloat16s.jl#68 I tried executing some code on `v1.11.0-alpha2`, while below I merely try to generate the LLVM IR, but on the current `nightly` (available from `juliaup`).

The text was updated successfully, but these errors were encountered: