Skip to content

Commit

Permalink
[ARM] Lower i1 concat via MVETRUNC
Browse files Browse the repository at this point in the history
The MVETRUNC operation can perform the same truncate of two vectors, without
requiring lane inserts/extracts from every vector lane. This moves the concat
i1 lowering to use it for v8i1 and v16i1 result types, trading a bit of extra
stack space for fewer instructions.
  • Loading branch information
davemgreen committed Oct 18, 2023
1 parent e494a96 commit 8a70102
Show file tree
Hide file tree
Showing 8 changed files with 460 additions and 732 deletions.
13 changes: 11 additions & 2 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9095,13 +9095,21 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
unsigned NumElts = 2 * Op1VT.getVectorNumElements();

EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
// Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
// ConcatVT.
SDValue ConVec =
DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

// Extract the vector elements from Op1 and Op2 one by one and truncate them
// to be the right size for the destination. For example, if Op1 is v4i1
// then the promoted vector is v4i32. The result of concatenation gives a
// v8i1, which when promoted is v8i16. That means each i32 element from Op1
// needs truncating to i16 and inserting in the result.
EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
EVT NewVT = NewV.getValueType();
EVT ConcatVT = ConVec.getValueType();
Expand All @@ -9119,6 +9127,7 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
return ConVec;
};
unsigned j = 0;
SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
ConVec = ExtractInto(NewV1, ConVec, j);
ConVec = ExtractInto(NewV2, ConVec, j);

Expand Down
37 changes: 15 additions & 22 deletions llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -284,40 +284,33 @@ define half @fadd_select_fneg_posk_f16(i32 %arg0, half %x, half %y) {
define <8 x half> @fadd_vselect_fneg_posk_v8f16(<8 x i32> %arg0, <8 x half> %x, <8 x half> %y) {
; CHECK-LABEL: fadd_vselect_fneg_posk_v8f16:
; CHECK: @ %bb.0:
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vpsel q2, q1, q0
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vmov r2, r1, d4
; CHECK-NEXT: add r12, sp, #32
; CHECK-NEXT: vmov r4, r5, d5
; CHECK-NEXT: vmov.16 q2[0], r2
; CHECK-NEXT: vmov.16 q2[1], r1
; CHECK-NEXT: vcmp.i32 eq, q3, zr
; CHECK-NEXT: vpsel q1, q1, q0
; CHECK-NEXT: vmov.16 q2[2], r4
; CHECK-NEXT: vmov r3, r0, d2
; CHECK-NEXT: vmov.16 q2[3], r5
; CHECK-NEXT: vmov.16 q2[4], r3
; CHECK-NEXT: vmov r6, lr, d3
; CHECK-NEXT: vmov.16 q2[5], r0
; CHECK-NEXT: vldrw.u32 q1, [r12]
; CHECK-NEXT: vmov.16 q2[6], r6
; CHECK-NEXT: vmov.i16 q0, #0xc400
; CHECK-NEXT: vmov.16 q2[7], lr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vstrh.32 q2, [r0]
; CHECK-NEXT: vstrh.32 q0, [r0, #8]
; CHECK-NEXT: add r1, sp, #32
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vmov.i16 q1, #0xc400
; CHECK-NEXT: add r0, sp, #48
; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vsub.f16 q0, q1, q0
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: bx lr
%cmp = icmp eq <8 x i32> %arg0, zeroinitializer
%neg.x = fneg <8 x half> %x
%select = select <8 x i1> %cmp, <8 x half> %neg.x, <8 x half> <half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0>
Expand Down
202 changes: 76 additions & 126 deletions llvm/test/CodeGen/Thumb2/active_lane_mask.ll
Original file line number Diff line number Diff line change
Expand Up @@ -146,54 +146,47 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
; CHECK-LABEL: v8i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: adr.w r12, .LCPI3_0
; CHECK-NEXT: vdup.32 q1, r1
; CHECK-NEXT: vldrw.u32 q0, [r12]
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: mov r4, sp
; CHECK-NEXT: adr r1, .LCPI3_1
; CHECK-NEXT: vqadd.u32 q0, q0, r0
; CHECK-NEXT: vcmp.u32 hi, q1, q0
; CHECK-NEXT: vpsel q4, q3, q2
; CHECK-NEXT: vmov r1, r12, d8
; CHECK-NEXT: vmov.16 q0[0], r1
; CHECK-NEXT: vmov.16 q0[1], r12
; CHECK-NEXT: vmov r1, r12, d9
; CHECK-NEXT: vmov.16 q0[2], r1
; CHECK-NEXT: adr r1, .LCPI3_1
; CHECK-NEXT: vldrw.u32 q4, [r1]
; CHECK-NEXT: vmov.16 q0[3], r12
; CHECK-NEXT: vqadd.u32 q4, q4, r0
; CHECK-NEXT: vcmp.u32 hi, q1, q4
; CHECK-NEXT: vpsel q1, q3, q2
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmov.16 q0[4], r0
; CHECK-NEXT: vmov.16 q0[5], r1
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmov.16 q0[6], r0
; CHECK-NEXT: add r0, sp, #24
; CHECK-NEXT: vmov.16 q0[7], r1
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vpsel q3, q2, q0
; CHECK-NEXT: vstrh.32 q3, [r4, #8]
; CHECK-NEXT: vldrw.u32 q3, [r1]
; CHECK-NEXT: vqadd.u32 q3, q3, r0
; CHECK-NEXT: add r0, sp, #32
; CHECK-NEXT: vcmp.u32 hi, q1, q3
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vldr d1, [sp, #16]
; CHECK-NEXT: vpsel q0, q2, q0
; CHECK-NEXT: vstrh.32 q0, [r4]
; CHECK-NEXT: vldr d1, [sp, #24]
; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: vmov d0, r2, r3
; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI3_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .LCPI3_1:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .LCPI3_1:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
%select = select <8 x i1> %active.lane.mask, <8 x i16> %V1, <8 x i16> %V2
ret <8 x i16> %select
Expand All @@ -202,122 +195,79 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
; CHECK-LABEL: v16i8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: sub sp, #48
; CHECK-NEXT: adr.w r12, .LCPI4_0
; CHECK-NEXT: vdup.32 q3, r1
; CHECK-NEXT: vdup.32 q2, r1
; CHECK-NEXT: vldrw.u32 q0, [r12]
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: add r5, sp, #16
; CHECK-NEXT: adr r1, .LCPI4_1
; CHECK-NEXT: vqadd.u32 q0, q0, r0
; CHECK-NEXT: vcmp.u32 hi, q3, q0
; CHECK-NEXT: adr r4, .LCPI4_3
; CHECK-NEXT: vcmp.u32 hi, q2, q0
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vpsel q4, q1, q0
; CHECK-NEXT: vmov r1, r12, d8
; CHECK-NEXT: vmov.16 q2[0], r1
; CHECK-NEXT: vmov.16 q2[1], r12
; CHECK-NEXT: vmov r1, r12, d9
; CHECK-NEXT: vmov.16 q2[2], r1
; CHECK-NEXT: adr r1, .LCPI4_1
; CHECK-NEXT: vldrw.u32 q4, [r1]
; CHECK-NEXT: vmov.16 q2[3], r12
; CHECK-NEXT: vqadd.u32 q4, q4, r0
; CHECK-NEXT: vcmp.u32 hi, q3, q4
; CHECK-NEXT: vpsel q4, q1, q0
; CHECK-NEXT: vmov r1, r12, d8
; CHECK-NEXT: vmov.16 q2[4], r1
; CHECK-NEXT: vmov.16 q2[5], r12
; CHECK-NEXT: vmov r1, r12, d9
; CHECK-NEXT: vmov.16 q2[6], r1
; CHECK-NEXT: vmov.16 q2[7], r12
; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q4, q1, q0
; CHECK-NEXT: vmov.u16 r1, q4[0]
; CHECK-NEXT: vmov.8 q2[0], r1
; CHECK-NEXT: vmov.u16 r1, q4[1]
; CHECK-NEXT: vmov.8 q2[1], r1
; CHECK-NEXT: vmov.u16 r1, q4[2]
; CHECK-NEXT: vmov.8 q2[2], r1
; CHECK-NEXT: vmov.u16 r1, q4[3]
; CHECK-NEXT: vmov.8 q2[3], r1
; CHECK-NEXT: vmov.u16 r1, q4[4]
; CHECK-NEXT: vmov.8 q2[4], r1
; CHECK-NEXT: vmov.u16 r1, q4[5]
; CHECK-NEXT: vmov.8 q2[5], r1
; CHECK-NEXT: vmov.u16 r1, q4[6]
; CHECK-NEXT: vmov.8 q2[6], r1
; CHECK-NEXT: vmov.u16 r1, q4[7]
; CHECK-NEXT: vmov.8 q2[7], r1
; CHECK-NEXT: vpsel q3, q1, q0
; CHECK-NEXT: vstrh.32 q3, [r5, #8]
; CHECK-NEXT: vldrw.u32 q3, [r1]
; CHECK-NEXT: adr r1, .LCPI4_2
; CHECK-NEXT: vldrw.u32 q4, [r1]
; CHECK-NEXT: vqadd.u32 q4, q4, r0
; CHECK-NEXT: vcmp.u32 hi, q3, q4
; CHECK-NEXT: vpsel q5, q1, q0
; CHECK-NEXT: vmov r1, r12, d10
; CHECK-NEXT: vmov.16 q4[0], r1
; CHECK-NEXT: vmov.16 q4[1], r12
; CHECK-NEXT: vmov r1, r12, d11
; CHECK-NEXT: vmov.16 q4[2], r1
; CHECK-NEXT: adr r1, .LCPI4_3
; CHECK-NEXT: vldrw.u32 q5, [r1]
; CHECK-NEXT: vmov.16 q4[3], r12
; CHECK-NEXT: vqadd.u32 q5, q5, r0
; CHECK-NEXT: vcmp.u32 hi, q3, q5
; CHECK-NEXT: vqadd.u32 q3, q3, r0
; CHECK-NEXT: vcmp.u32 hi, q2, q3
; CHECK-NEXT: vpsel q3, q1, q0
; CHECK-NEXT: vstrh.32 q3, [r5]
; CHECK-NEXT: vldrw.u32 q3, [r1]
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vqadd.u32 q3, q3, r0
; CHECK-NEXT: vcmp.u32 hi, q2, q3
; CHECK-NEXT: vpsel q3, q1, q0
; CHECK-NEXT: vmov r0, r1, d6
; CHECK-NEXT: vmov.16 q4[4], r0
; CHECK-NEXT: vmov.16 q4[5], r1
; CHECK-NEXT: vmov r0, r1, d7
; CHECK-NEXT: vmov.16 q4[6], r0
; CHECK-NEXT: vmov.16 q4[7], r1
; CHECK-NEXT: vcmp.i16 ne, q4, zr
; CHECK-NEXT: vstrh.32 q3, [r1, #8]
; CHECK-NEXT: vldrw.u32 q3, [r4]
; CHECK-NEXT: vqadd.u32 q3, q3, r0
; CHECK-NEXT: add r0, sp, #32
; CHECK-NEXT: vcmp.u32 hi, q2, q3
; CHECK-NEXT: vpsel q2, q1, q0
; CHECK-NEXT: vstrh.32 q2, [r1]
; CHECK-NEXT: vldrw.u32 q2, [r5]
; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q2, q1, q0
; CHECK-NEXT: vstrb.16 q2, [r0, #8]
; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: add r1, sp, #72
; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vmov.8 q2[8], r0
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov.8 q2[9], r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmov.8 q2[10], r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov.8 q2[11], r0
; CHECK-NEXT: vmov.u16 r0, q0[4]
; CHECK-NEXT: vmov.8 q2[12], r0
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: vmov.8 q2[13], r0
; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.8 q2[14], r0
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: vmov.8 q2[15], r0
; CHECK-NEXT: add r0, sp, #40
; CHECK-NEXT: vldr d1, [sp, #32]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vcmp.i8 ne, q2, zr
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vstrb.16 q0, [r0]
; CHECK-NEXT: vldr d1, [sp, #64]
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmov d0, r2, r3
; CHECK-NEXT: vcmp.i8 ne, q2, zr
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
; CHECK-NEXT: add sp, #48
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI4_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 13 @ 0xd
; CHECK-NEXT: .long 14 @ 0xe
; CHECK-NEXT: .long 15 @ 0xf
; CHECK-NEXT: .LCPI4_1:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .LCPI4_2:
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .long 11 @ 0xb
; CHECK-NEXT: .LCPI4_2:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .LCPI4_3:
; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 13 @ 0xd
; CHECK-NEXT: .long 14 @ 0xe
; CHECK-NEXT: .long 15 @ 0xf
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
%select = select <16 x i1> %active.lane.mask, <16 x i8> %V1, <16 x i8> %V2
ret <16 x i8> %select
Expand Down
Loading

0 comments on commit 8a70102

Please sign in to comment.