Hang calling llc with aie2p target, after opt with -On n>0. #315

newling · 2025-01-29T00:49:00Z

I'm starting to integrate support for aie2p into the IREE compiler. My first attempt for a small matmul is hitting a hang during object file generation with llc, but only when opt is run with -On for n>0.

I am using peano wheel from today, 28 January: llvm_aie-19.0.0.2025012801+24e4e160.dist-info from https://github.com/Xilinx/llvm-aie/releases

input.ll is attached, and below.

input.opt.ll with -O0 is generated with the following command:

llvm-aie/bin/opt -vectorize-loops=false -vectorize-slp=false --two-entry-phi-node-folding-threshold=10 -mandatory-inlining-before-opt=false -basic-aa-full-phi-analysis=true -basic-aa-max-lookup-search-depth=10 -O0 --inline-threshold=10 --disable-builtin=memset -S input.ll -o input.opt.ll

input.opt.ll with -O1 is generated with the following command (identical to the above, but with -O1):

llvm-aie/bin/opt -vectorize-loops=false -vectorize-slp=false --two-entry-phi-node-folding-threshold=10 -mandatory-inlining-before-opt=false -basic-aa-full-phi-analysis=true -basic-aa-max-lookup-search-depth=10 -O1 --inline-threshold=10 --disable-builtin=memset -S input.ll -o input.opt.ll

Object file generation works fine with the following command when run on the output of opt with '-O0':

llvm-aie/bin/llc input.opt.ll -O2 --march=aie2p --function-sections --filetype=obj -o input.o

but the same llc command hangs when run on the output of opt with '-O1':

repro_files.zip

What is causing the hang?

input.ll:


; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target triple = "aie2p"

@buff_11 = external global [1024 x i8]
@buff_10 = external global [1024 x i8]
@buff_9 = external global [1024 x i8]
@buff_8 = external global [1024 x i8]
@buff_7 = external global [1024 x i32]
@buff_6 = external global [1024 x i32]
@buff_5 = external global [1024 x i8]
@buff_4 = external global [1024 x i8]
@buff_3 = external global [1024 x i8]
@buff_2 = external global [1024 x i8]
@buff_1 = external global [1024 x i32]
@buff_0 = external global [1024 x i32]
@shim_2 = external global [64 x [64 x i32]]
@shim_1 = external global [64 x [64 x i8]]
@shim_0 = external global [64 x [64 x i8]]

; Outlined matmul kernel: three nested counted loops (each running 0..3 in
; steps of 1) that multiply-accumulate 64-byte tiles read from %0 and %1 into
; 64 x i32 tiles addressed through %2.
;
;   %0, %1 - read-only here: <64 x i8> tiles are loaded from them (align 64).
;   %2     - read and written: <64 x i32> tiles are loaded, passed through the
;            mac.conf intrinsic as the accumulator operand, and stored back.
;
; NOTE(review): the <64 x i32> load and store through %2 are marked "align 4".
; Per the discussion in this issue, aie2p expects at least 64-byte alignment
; for vector accesses of 512 bits or more, and these under-aligned accesses are
; what end up scalarized after opt -O1.
define void @generic_matmul_0_outlined(ptr %0, ptr %1, ptr %2) {
  br label %4

4:                                                ; preds = %36, %3
  ; Outermost loop header: %5 is the induction variable, exit when %5 >= 4.
  %5 = phi i64 [ %37, %36 ], [ 0, %3 ]
  %6 = icmp slt i64 %5, 4
  br i1 %6, label %7, label %38

7:                                                ; preds = %34, %4
  ; Middle loop header: %8 is the induction variable, exit when %8 >= 4.
  %8 = phi i64 [ %35, %34 ], [ 0, %4 ]
  %9 = icmp slt i64 %8, 4
  br i1 %9, label %10, label %36

10:                                               ; preds = %7
  ; Values that are invariant in the innermost loop.
  ; %13 = %8 * 256 + %5 * 64 is the i32-element index of the tile in %2; it
  ; does not depend on the innermost induction variable %15.
  %11 = mul nsw i64 %8, 256
  %12 = mul nsw i64 %5, 64
  %13 = add i64 %11, %12
  br label %14

14:                                               ; preds = %17, %10
  ; Innermost loop header: %15 is the induction variable, exit when %15 >= 4.
  %15 = phi i64 [ %33, %17 ], [ 0, %10 ]
  %16 = icmp slt i64 %15, 4
  br i1 %16, label %17, label %34

17:                                               ; preds = %14
  ; Load a <64 x i8> tile from %0 at byte offset %15 * 256 + %5 * 64.
  %18 = mul nsw i64 %15, 256
  %19 = add i64 %18, %12
  %20 = getelementptr i8, ptr %0, i64 %19
  %21 = load <64 x i8>, ptr %20, align 64
  ; Load a <64 x i8> tile from %1 at byte offset %8 * 256 + %15 * 64.
  %22 = mul nsw i64 %15, 64
  %23 = add i64 %11, %22
  %24 = getelementptr i8, ptr %1, i64 %23
  %25 = load <64 x i8>, ptr %24, align 64
  ; Load the <64 x i32> tile from %2 at element index %13.
  ; Only 4-byte alignment is declared on this 2048-bit load.
  %26 = getelementptr i32, ptr %2, i64 %13
  %27 = load <64 x i32>, ptr %26, align 4
  ; Reinterpret the three tiles as the operand types the intrinsic is
  ; declared with; bitcasts do not change the bits.
  %28 = bitcast <64 x i8> %21 to <16 x i32>
  %29 = bitcast <64 x i8> %25 to <32 x i16>
  %30 = bitcast <64 x i32> %27 to <32 x i64>
  ; The i32 constant 776 is the last operand of the intrinsic; its meaning is
  ; not visible in this file (presumably a mode/configuration word - TODO confirm).
  %31 = call <32 x i64> @llvm.aie2p.I512.I512.ACC2048.mac.conf(<16 x i32> %28, <32 x i16> %29, <32 x i64> %30, i32 776)
  ; Store the result back to the same address it was loaded from, again with
  ; only 4-byte alignment declared.
  %32 = bitcast <32 x i64> %31 to <64 x i32>
  store <64 x i32> %32, ptr %26, align 4
  %33 = add i64 %15, 1
  br label %14

34:                                               ; preds = %14
  ; Middle loop latch.
  %35 = add i64 %8, 1
  br label %7

36:                                               ; preds = %7
  ; Outermost loop latch.
  %37 = add i64 %5, 1
  br label %4

38:                                               ; preds = %4
  ret void
}

; Core entry function. An outer loop (%2 = 0, 2; i.e. two trips) repeats the
; following sequence once for @buff_6 and once for @buff_7 per trip:
;   1. call the acquire intrinsic with (52, -1),
;   2. zero all 1024 i32 elements of the buffer with a six-deep loop nest,
;   3. run @generic_matmul_0_outlined twice into that buffer, first with
;      (@buff_10, @buff_8) and then with (@buff_11, @buff_9), bracketed by
;      acquire/release intrinsic calls,
;   4. call the release intrinsic with (53, 1).
; NOTE(review): the first operand of acquire/release is presumably a lock ID
; and the second a lock value/delta - this file does not establish that.
define void @core_0_2() {
  ; Alignment assumptions: each listed buffer is asserted to be 32-byte
  ; aligned. NOTE(review): this is below the 64-byte alignment that the
  ; discussion in this issue says aie2p needs for vector accesses of 512 bits
  ; or more.
  call void @llvm.assume(i1 true) [ "align"(ptr @buff_6, i64 32) ]
  call void @llvm.assume(i1 true) [ "align"(ptr @buff_10, i64 32) ]
  call void @llvm.assume(i1 true) [ "align"(ptr @buff_8, i64 32) ]
  call void @llvm.assume(i1 true) [ "align"(ptr @buff_11, i64 32) ]
  call void @llvm.assume(i1 true) [ "align"(ptr @buff_9, i64 32) ]
  call void @llvm.assume(i1 true) [ "align"(ptr @buff_7, i64 32) ]
  br label %1

1:                                                ; preds = %88, %0
  ; Outer loop header: %2 starts at 0, is advanced by 2 in block %88, and the
  ; loop exits when %2 >= 4.
  %2 = phi i64 [ %89, %88 ], [ 0, %0 ]
  %3 = icmp slt i64 %2, 4
  br i1 %3, label %4, label %90

4:                                                ; preds = %1
  call void @llvm.aie2p.acquire(i32 52, i32 -1)
  br label %5

  ; ---- First half: zero-fill @buff_6 ----
  ; Six nested loops with trip counts 1, 1, 4, 4, 8, 8 (induction variables
  ; %6, %9, %12, %15, %18, %21).

5:                                                ; preds = %44, %4
  %6 = phi i64 [ %45, %44 ], [ 0, %4 ]
  %7 = icmp slt i64 %6, 1
  br i1 %7, label %8, label %46

8:                                                ; preds = %42, %5
  %9 = phi i64 [ %43, %42 ], [ 0, %5 ]
  %10 = icmp slt i64 %9, 1
  br i1 %10, label %11, label %44

11:                                               ; preds = %40, %8
  %12 = phi i64 [ %41, %40 ], [ 0, %8 ]
  %13 = icmp slt i64 %12, 4
  br i1 %13, label %14, label %42

14:                                               ; preds = %38, %11
  %15 = phi i64 [ %39, %38 ], [ 0, %11 ]
  %16 = icmp slt i64 %15, 4
  br i1 %16, label %17, label %40

17:                                               ; preds = %36, %14
  %18 = phi i64 [ %37, %36 ], [ 0, %14 ]
  %19 = icmp slt i64 %18, 8
  br i1 %19, label %20, label %38

20:                                               ; preds = %23, %17
  %21 = phi i64 [ %35, %23 ], [ 0, %17 ]
  %22 = icmp slt i64 %21, 8
  br i1 %22, label %23, label %36

23:                                               ; preds = %20
  ; Linearized i32-element index into @buff_6:
  ;   %6*1024 + %9*1024 + %12*256 + %15*64 + %18*8 + %21
  %24 = mul i64 %6, 1024
  %25 = mul i64 %9, 1024
  %26 = add i64 %24, %25
  %27 = mul i64 %12, 256
  %28 = add i64 %26, %27
  %29 = mul i64 %15, 64
  %30 = add i64 %28, %29
  %31 = mul i64 %18, 8
  %32 = add i64 %30, %31
  %33 = add i64 %32, %21
  %34 = getelementptr i32, ptr @buff_6, i64 %33
  ; Scalar zero store, one i32 element per innermost iteration.
  store i32 0, ptr %34, align 4
  %35 = add i64 %21, 1
  br label %20

  ; Loop latches for the @buff_6 zero-fill nest, innermost-but-one first.

36:                                               ; preds = %20
  %37 = add i64 %18, 1
  br label %17

38:                                               ; preds = %17
  %39 = add i64 %15, 1
  br label %14

40:                                               ; preds = %14
  %41 = add i64 %12, 1
  br label %11

42:                                               ; preds = %11
  %43 = add i64 %9, 1
  br label %8

44:                                               ; preds = %8
  %45 = add i64 %6, 1
  br label %5

46:                                               ; preds = %5
  ; Two matmul calls accumulating into @buff_6, with acquire/release calls
  ; around them; the order of these calls is kept exactly as generated.
  call void @llvm.aie2p.acquire(i32 49, i32 -1)
  call void @llvm.aie2p.acquire(i32 51, i32 -1)
  call void @generic_matmul_0_outlined(ptr @buff_10, ptr @buff_8, ptr @buff_6)
  call void @llvm.aie2p.release(i32 48, i32 1)
  call void @llvm.aie2p.acquire(i32 49, i32 -1)
  call void @llvm.aie2p.release(i32 50, i32 1)
  call void @llvm.aie2p.acquire(i32 51, i32 -1)
  call void @generic_matmul_0_outlined(ptr @buff_11, ptr @buff_9, ptr @buff_6)
  call void @llvm.aie2p.release(i32 48, i32 1)
  call void @llvm.aie2p.release(i32 50, i32 1)
  call void @llvm.aie2p.release(i32 53, i32 1)
  ; Start of the second half: same acquire as in block %4, now ahead of the
  ; @buff_7 work.
  call void @llvm.aie2p.acquire(i32 52, i32 -1)
  br label %47

  ; ---- Second half: zero-fill @buff_7 ----
  ; Same six-deep nest as above (trip counts 1, 1, 4, 4, 8, 8) with induction
  ; variables %48, %51, %54, %57, %60, %63.

47:                                               ; preds = %86, %46
  %48 = phi i64 [ %87, %86 ], [ 0, %46 ]
  %49 = icmp slt i64 %48, 1
  br i1 %49, label %50, label %88

50:                                               ; preds = %84, %47
  %51 = phi i64 [ %85, %84 ], [ 0, %47 ]
  %52 = icmp slt i64 %51, 1
  br i1 %52, label %53, label %86

53:                                               ; preds = %82, %50
  %54 = phi i64 [ %83, %82 ], [ 0, %50 ]
  %55 = icmp slt i64 %54, 4
  br i1 %55, label %56, label %84

56:                                               ; preds = %80, %53
  %57 = phi i64 [ %81, %80 ], [ 0, %53 ]
  %58 = icmp slt i64 %57, 4
  br i1 %58, label %59, label %82

59:                                               ; preds = %78, %56
  %60 = phi i64 [ %79, %78 ], [ 0, %56 ]
  %61 = icmp slt i64 %60, 8
  br i1 %61, label %62, label %80

62:                                               ; preds = %65, %59
  %63 = phi i64 [ %77, %65 ], [ 0, %59 ]
  %64 = icmp slt i64 %63, 8
  br i1 %64, label %65, label %78

65:                                               ; preds = %62
  ; Linearized i32-element index into @buff_7:
  ;   %48*1024 + %51*1024 + %54*256 + %57*64 + %60*8 + %63
  %66 = mul i64 %48, 1024
  %67 = mul i64 %51, 1024
  %68 = add i64 %66, %67
  %69 = mul i64 %54, 256
  %70 = add i64 %68, %69
  %71 = mul i64 %57, 64
  %72 = add i64 %70, %71
  %73 = mul i64 %60, 8
  %74 = add i64 %72, %73
  %75 = add i64 %74, %63
  %76 = getelementptr i32, ptr @buff_7, i64 %75
  ; Scalar zero store, one i32 element per innermost iteration.
  store i32 0, ptr %76, align 4
  %77 = add i64 %63, 1
  br label %62

  ; Loop latches for the @buff_7 zero-fill nest, innermost-but-one first.

78:                                               ; preds = %62
  %79 = add i64 %60, 1
  br label %59

80:                                               ; preds = %59
  %81 = add i64 %57, 1
  br label %56

82:                                               ; preds = %56
  %83 = add i64 %54, 1
  br label %53

84:                                               ; preds = %53
  %85 = add i64 %51, 1
  br label %50

86:                                               ; preds = %50
  %87 = add i64 %48, 1
  br label %47

88:                                               ; preds = %47
  ; Two matmul calls accumulating into @buff_7, with the same acquire/release
  ; sequence as used for @buff_6 in block %46.
  call void @llvm.aie2p.acquire(i32 49, i32 -1)
  call void @llvm.aie2p.acquire(i32 51, i32 -1)
  call void @generic_matmul_0_outlined(ptr @buff_10, ptr @buff_8, ptr @buff_7)
  call void @llvm.aie2p.release(i32 48, i32 1)
  call void @llvm.aie2p.acquire(i32 49, i32 -1)
  call void @llvm.aie2p.release(i32 50, i32 1)
  call void @llvm.aie2p.acquire(i32 51, i32 -1)
  call void @generic_matmul_0_outlined(ptr @buff_11, ptr @buff_9, ptr @buff_7)
  call void @llvm.aie2p.release(i32 48, i32 1)
  call void @llvm.aie2p.release(i32 50, i32 1)
  call void @llvm.aie2p.release(i32 53, i32 1)
  ; Outer loop latch: advance by 2 because each trip handled two buffers.
  %89 = add i64 %2, 2
  br label %1

90:                                               ; preds = %1
  ret void
}

declare void @llvm.aie2p.acquire(i32, i32)

declare void @llvm.aie2p.release(i32, i32)

declare <32 x i64> @llvm.aie2p.I512.I512.ACC2048.mac.conf(<16 x i32>, <32 x i16>, <32 x i64>, i32)

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #0

attributes #0 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}

The text was updated successfully, but these errors were encountered:

martien-de-jong · 2025-01-29T16:41:19Z

First, an a priori clarification: llc doesn't link; it transforms the LLVM code to an object file. Anyway, I'm looking at this.

martien-de-jong · 2025-01-30T09:27:31Z

BTW, thanks for the reproducer, it hangs beautifully. FYI, it usually suffices to have the input to llc, i.e. the output of opt.

martien-de-jong · 2025-01-30T09:41:59Z

legalizerinput.txt

We seem to be looping in the legalizer around G_UNMERGE_VALUES.

rename .txt to .mir
llc --march=aie2p --start-after=irtranslator legalizerinput.mir

The process memory size doesn't grow.

We are stuck here:

MachineDominatorTree::dominates
CombinerHelper::dominates
findLastRegUseInBB
findEarliestInsertPoint
findPostIncMatch
matchLdStInc

but we sample different G_PTR_ADDs with the same G_LOAD

martien-de-jong · 2025-01-30T10:58:13Z

The G_LOAD is a scalar load that doesn't occur in the input program. I guess it derives from scalarizing a vector load.

martien-de-jong · 2025-01-30T11:32:17Z

OMG. We have scalarized every vector load and end up with one basic block of ~30000 instructions. We're stuck on the very first scalar load in that block. Perhaps it isn't an infinite loop, just a bit quadratic in block size.
Anyway, I think the code wasn't intended to be scalarized like this.

Well, not every vector load.. We are missing alignment on some of them, but they dominate the code size.

define void @generic_matmul_0_outlined(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr nocapture %2) local_unnamed_addr #0 {
  .preheader:
    %.promoted = load <32 x i64>, ptr %2, align 4
    %3 = load <16 x i32>, ptr %0, align 64
    %4 = load <32 x i16>, ptr %1, align 64
    %5 = tail call <32 x i64> @llvm.aie2p.I512.I512.ACC2048.mac.conf(<16 x i32> %3, <32 x i16> %4, <32 x i64> %.promoted, i32 776)

%3 and %4 are properly aligned, and result in a proper vector load.
%.promoted is not aligned and gets composed from 32 scalar loads. Oh, make that 64.

@newling, we currently need proper alignment on all loads and stores. Perhaps we can compose them from fewer aligned vector loads around the unaligned address, but I assume that will still not be the intended code.

newling · 2025-01-30T15:31:43Z

@martien-de-jong thanks for investigating this.

If this is a load/store alignment problem, it's not the first time for us; it's usually what trips me up when lowering through peano/AIE. I've tried changing

(1) all loads in input.ll to align 64
(2) all loads and stores in input.ll to align 64
(3) all loads and stores and '@llvm.assume' calls in input.ll to have align 64

and then running opt at -O1 followed by llc. In all cases I get an invalid instruction error in llc, for example: Cannot get Load Store opcode for: G_STORE %364:accregbank(<32 x s64>), %344:ptrregbank(p0) :: (store (<32 x s64>) into %ir.301, align 64)

Can you please remind me what the alignment expectation is (or point me to docs to shed some light)? If I can get any input.ll to get through opt with -O1 and llc, then I will know what the MLIR passes we run should be generating. I'm 90% sure the code is fully vectorizable, and so the infinite/quadratic loop we're seeing is down a path we shouldn't be on anyway.

martien-de-jong · 2025-01-30T16:18:39Z

I'll dig a bit further. I'm mostly looking at the llvm input to llc, which is translated to machine IR by the standard irtranslator. That initial machine IR has the 4 byte aligned load. I hope opt is cooperating in propagating alignments from input to output loads, and I hope irtranslator is doing the right thing, because that is a bit of a black box for us.
As for missing coverage of G_STORE on 32 x s64, I've been informed that this recent pr #307 might have fixed that.

martien-de-jong · 2025-01-30T16:32:05Z

On aie2p, any vector load/store of size 512 bits or larger should have at least 64-byte alignment. In your input.ll there are several instances of 4-byte-aligned loads and stores, like
%.promoted = load <32 x i64>, ptr %2, align 4

newling · 2025-01-30T20:09:31Z

> As for missing coverage of G_STORE on 32 x s64, I've been informed that this recent pr #307 might have fixed that.

Indeed. You folks are moving fast, it's not often that bumping a component forward by 2 days fixes anything :) I have a path forward now: fix up the alignments that our compiler is generating. And for me the 'hang' isn't high priority anymore (although of course an error for unsupported scalar code would be nice).

The majority of this PR is refactoring to make it possible to support 2+ target ISAs.

- aievec.matmul now verifies its operand shapes based on the target device in the module.
- I've removed a bunch of intrinsic matmul shapes for phoenix that we're not using, to simplify the code.
- The XLLVM dialect ops now include the target device in their names, i.e. the name includes either `AIE2` or `AIE2P` now.
- Only matmul can lower to XLLVM from aievec for AIE2P; all other aievec ops (UPS, etc.) have an assert on them that the device is AIE2.
- A few XLLVM ops are removed: we don't use broadcast or a few others, so I've trimmed the set of ops we support down.
- This PR adds utils `isAie2` and `isAie2P` to AMDAIEUtils.h.

I have confirmed that a linalg matmul can compile all the way through peano for AIE2P, but only with -O0. Next step after this is to fix alignment issues in iree-amd-aie to get this to work for -On n>0: Xilinx/llvm-aie#315

newling mentioned this issue Feb 3, 2025

Initial support for using peano with AIE2P (strix). nod-ai/iree-amd-aie#1071

Merged

newling mentioned this issue Feb 4, 2025

Numerical test for strix compiled with peano nod-ai/iree-amd-aie#1077

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Hang calling llc with aie2p target, after opt with -On n>0. #315

Hang calling llc with aie2p target, after opt with -On n>0. #315

newling commented Jan 29, 2025 •

edited

Loading

martien-de-jong commented Jan 29, 2025

martien-de-jong commented Jan 30, 2025

martien-de-jong commented Jan 30, 2025 •

edited

Loading

martien-de-jong commented Jan 30, 2025

martien-de-jong commented Jan 30, 2025 •

edited

Loading

newling commented Jan 30, 2025

martien-de-jong commented Jan 30, 2025

martien-de-jong commented Jan 30, 2025

newling commented Jan 30, 2025 •

edited

Loading

Hang calling llc with aie2p target, after opt with -On n>0. #315

Hang calling llc with aie2p target, after opt with -On n>0. #315

Comments

newling commented Jan 29, 2025 • edited Loading

martien-de-jong commented Jan 29, 2025

martien-de-jong commented Jan 30, 2025

martien-de-jong commented Jan 30, 2025 • edited Loading

martien-de-jong commented Jan 30, 2025

martien-de-jong commented Jan 30, 2025 • edited Loading

newling commented Jan 30, 2025

martien-de-jong commented Jan 30, 2025

martien-de-jong commented Jan 30, 2025

newling commented Jan 30, 2025 • edited Loading

newling commented Jan 29, 2025 •

edited

Loading

martien-de-jong commented Jan 30, 2025 •

edited

Loading

martien-de-jong commented Jan 30, 2025 •

edited

Loading

newling commented Jan 30, 2025 •

edited

Loading