Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64] Improve non-SVE popcount for 32-bit and 64-bit using udot #95881

Merged
merged 9 commits into from
Jul 2, 2024

Conversation

tgymnich
Copy link
Member

fixes #95860

Use udot instead of a sequence of uaddlp instructions when summing up lanes for popcount.

@llvmbot
Copy link
Collaborator

llvmbot commented Jun 18, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Tim Gymnich (tgymnich)

Changes

fixes #95860

Use udot instead of a sequence of uaddlp instructions when summing up lanes for popcount.


Full diff: https://github.com/llvm/llvm-project/pull/95881.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+20)
  • (modified) llvm/test/CodeGen/AArch64/popcount.ll (+295)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 35871cc5ade7f..839b43b983d12 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9740,6 +9740,26 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
   Val = DAG.getBitcast(VT8Bit, Val);
   Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
 
+  if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16) {
+    EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
+    SDValue Zeros = DAG.getSplatBuildVector(
+        DT, DL, DAG.getConstant(0, DL, DT.getScalarType()));
+    SDValue Ones =
+        DAG.getSplatBuildVector(VT8Bit, DL, DAG.getConstant(1, DL, MVT::i8));
+
+    if (VT == MVT::v2i64) {
+      Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
+      Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
+    } else if (VT == MVT::v2i32) {
+      Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
+    } else if (VT == MVT::v4i32) {
+      Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
+    } else {
+      llvm_unreachable("Unexpected type for custom ctpop lowering");
+    }
+
+    return Val;
+  }
   // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
   unsigned EltSize = 8;
   unsigned NumElts = VT.is64BitVector() ? 8 : 16;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index b1231eeac1ea4..c041620fcc104 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O0 -mtriple=aarch64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefix=NEON
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon,+dotprod | FileCheck %s --check-prefix=DOT
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+sve | FileCheck %s --check-prefix=SVE
 
 ; Function Attrs: nobuiltin nounwind readonly
 define i8 @popcount128(ptr nocapture nonnull readonly %0) {
@@ -12,6 +15,36 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NEON-LABEL: popcount128:
+; NEON:       // %bb.0: // %Entry
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    add x8, x0, #8
+; NEON-NEXT:    ld1 { v0.d }[1], [x8]
+; NEON-NEXT:    cnt v0.16b, v0.16b
+; NEON-NEXT:    uaddlv h0, v0.16b
+; NEON-NEXT:    fmov w0, s0
+; NEON-NEXT:    ret
+;
+; DOT-LABEL: popcount128:
+; DOT:       // %bb.0: // %Entry
+; DOT-NEXT:    ldr d0, [x0]
+; DOT-NEXT:    add x8, x0, #8
+; DOT-NEXT:    ld1 { v0.d }[1], [x8]
+; DOT-NEXT:    cnt v0.16b, v0.16b
+; DOT-NEXT:    uaddlv h0, v0.16b
+; DOT-NEXT:    fmov w0, s0
+; DOT-NEXT:    ret
+;
+; SVE-LABEL: popcount128:
+; SVE:       // %bb.0: // %Entry
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    add x8, x0, #8
+; SVE-NEXT:    ld1 { v0.d }[1], [x8]
+; SVE-NEXT:    cnt v0.16b, v0.16b
+; SVE-NEXT:    uaddlv h0, v0.16b
+; SVE-NEXT:    fmov w0, s0
+; SVE-NEXT:    ret
 Entry:
   %1 = load i128, ptr %0, align 16
   %2 = tail call i128 @llvm.ctpop.i128(i128 %1)
@@ -56,6 +89,57 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ; CHECK-NEXT:    adds x8, x8, x9
 ; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
+;
+; NEON-LABEL: popcount256:
+; NEON:       // %bb.0: // %Entry
+; NEON-NEXT:    ldr d0, [x0, #16]
+; NEON-NEXT:    ldr d1, [x0]
+; NEON-NEXT:    add x8, x0, #8
+; NEON-NEXT:    add x9, x0, #24
+; NEON-NEXT:    ld1 { v0.d }[1], [x9]
+; NEON-NEXT:    ld1 { v1.d }[1], [x8]
+; NEON-NEXT:    cnt v0.16b, v0.16b
+; NEON-NEXT:    cnt v1.16b, v1.16b
+; NEON-NEXT:    uaddlv h0, v0.16b
+; NEON-NEXT:    uaddlv h1, v1.16b
+; NEON-NEXT:    fmov w8, s0
+; NEON-NEXT:    fmov w9, s1
+; NEON-NEXT:    add w0, w9, w8
+; NEON-NEXT:    ret
+;
+; DOT-LABEL: popcount256:
+; DOT:       // %bb.0: // %Entry
+; DOT-NEXT:    ldr d0, [x0, #16]
+; DOT-NEXT:    ldr d1, [x0]
+; DOT-NEXT:    add x8, x0, #8
+; DOT-NEXT:    add x9, x0, #24
+; DOT-NEXT:    ld1 { v0.d }[1], [x9]
+; DOT-NEXT:    ld1 { v1.d }[1], [x8]
+; DOT-NEXT:    cnt v0.16b, v0.16b
+; DOT-NEXT:    cnt v1.16b, v1.16b
+; DOT-NEXT:    uaddlv h0, v0.16b
+; DOT-NEXT:    uaddlv h1, v1.16b
+; DOT-NEXT:    fmov w8, s0
+; DOT-NEXT:    fmov w9, s1
+; DOT-NEXT:    add w0, w9, w8
+; DOT-NEXT:    ret
+;
+; SVE-LABEL: popcount256:
+; SVE:       // %bb.0: // %Entry
+; SVE-NEXT:    ldr d0, [x0, #16]
+; SVE-NEXT:    ldr d1, [x0]
+; SVE-NEXT:    add x8, x0, #8
+; SVE-NEXT:    add x9, x0, #24
+; SVE-NEXT:    ld1 { v0.d }[1], [x9]
+; SVE-NEXT:    ld1 { v1.d }[1], [x8]
+; SVE-NEXT:    cnt v0.16b, v0.16b
+; SVE-NEXT:    cnt v1.16b, v1.16b
+; SVE-NEXT:    uaddlv h0, v0.16b
+; SVE-NEXT:    uaddlv h1, v1.16b
+; SVE-NEXT:    fmov w8, s0
+; SVE-NEXT:    fmov w9, s1
+; SVE-NEXT:    add w0, w9, w8
+; SVE-NEXT:    ret
 Entry:
   %1 = load i256, ptr %0, align 16
   %2 = tail call i256 @llvm.ctpop.i256(i256 %1)
@@ -83,9 +167,220 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK-NEXT:    // kill: def $x8 killed $w8
 ; CHECK-NEXT:    bfi x0, x8, #32, #32
 ; CHECK-NEXT:    ret
+;
+; NEON-LABEL: popcount1x128:
+; NEON:       // %bb.0: // %Entry
+; NEON-NEXT:    fmov d1, x0
+; NEON-NEXT:    movi v0.2d, #0000000000000000
+; NEON-NEXT:    mov v1.d[1], x1
+; NEON-NEXT:    cnt v1.16b, v1.16b
+; NEON-NEXT:    uaddlv h1, v1.16b
+; NEON-NEXT:    mov v0.s[0], v1.s[0]
+; NEON-NEXT:    mov x1, v0.d[1]
+; NEON-NEXT:    fmov x0, d0
+; NEON-NEXT:    ret
+;
+; DOT-LABEL: popcount1x128:
+; DOT:       // %bb.0: // %Entry
+; DOT-NEXT:    fmov d1, x0
+; DOT-NEXT:    movi v0.2d, #0000000000000000
+; DOT-NEXT:    mov v1.d[1], x1
+; DOT-NEXT:    cnt v1.16b, v1.16b
+; DOT-NEXT:    uaddlv h1, v1.16b
+; DOT-NEXT:    mov v0.s[0], v1.s[0]
+; DOT-NEXT:    mov x1, v0.d[1]
+; DOT-NEXT:    fmov x0, d0
+; DOT-NEXT:    ret
+;
+; SVE-LABEL: popcount1x128:
+; SVE:       // %bb.0: // %Entry
+; SVE-NEXT:    fmov d1, x0
+; SVE-NEXT:    movi v0.2d, #0000000000000000
+; SVE-NEXT:    mov v1.d[1], x1
+; SVE-NEXT:    cnt v1.16b, v1.16b
+; SVE-NEXT:    uaddlv h1, v1.16b
+; SVE-NEXT:    mov v0.s[0], v1.s[0]
+; SVE-NEXT:    mov x1, v0.d[1]
+; SVE-NEXT:    fmov x0, d0
+; SVE-NEXT:    ret
 Entry:
   %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)
   ret <1 x i128> %1
 }
 
 declare <1 x i128> @llvm.ctpop.v1.i128(<1 x i128>)
+
+define <2 x i64> @popcount2x64(<2 x i64> %0) {
+; CHECK-LABEL: popcount2x64:
+; CHECK:       // %bb.0: // %Entry
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-NEXT:    ret
+;
+; NEON-LABEL: popcount2x64:
+; NEON:       // %bb.0: // %Entry
+; NEON-NEXT:    cnt v0.16b, v0.16b
+; NEON-NEXT:    uaddlp v0.8h, v0.16b
+; NEON-NEXT:    uaddlp v0.4s, v0.8h
+; NEON-NEXT:    uaddlp v0.2d, v0.4s
+; NEON-NEXT:    ret
+;
+; DOT-LABEL: popcount2x64:
+; DOT:       // %bb.0: // %Entry
+; DOT-NEXT:    movi v1.16b, #1
+; DOT-NEXT:    cnt v0.16b, v0.16b
+; DOT-NEXT:    movi v2.2d, #0000000000000000
+; DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
+; DOT-NEXT:    uaddlp v0.2d, v2.4s
+; DOT-NEXT:    ret
+;
+; SVE-LABEL: popcount2x64:
+; SVE:       // %bb.0: // %Entry
+; SVE-NEXT:    cnt v0.16b, v0.16b
+; SVE-NEXT:    uaddlp v0.8h, v0.16b
+; SVE-NEXT:    uaddlp v0.4s, v0.8h
+; SVE-NEXT:    uaddlp v0.2d, v0.4s
+; SVE-NEXT:    ret
+Entry:
+  %1 = tail call <2 x i64> @llvm.ctpop.v2.i64(<2 x i64> %0)
+  ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.ctpop.v2.i64(<2 x i64>)
+
+define <4 x i32> @popcount4x32(<4 x i32> %0) {
+; CHECK-LABEL: popcount4x32:
+; CHECK:       // %bb.0: // %Entry
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-NEXT:    ret
+;
+; NEON-LABEL: popcount4x32:
+; NEON:       // %bb.0: // %Entry
+; NEON-NEXT:    cnt v0.16b, v0.16b
+; NEON-NEXT:    uaddlp v0.8h, v0.16b
+; NEON-NEXT:    uaddlp v0.4s, v0.8h
+; NEON-NEXT:    ret
+;
+; DOT-LABEL: popcount4x32:
+; DOT:       // %bb.0: // %Entry
+; DOT-NEXT:    movi v1.16b, #1
+; DOT-NEXT:    cnt v2.16b, v0.16b
+; DOT-NEXT:    movi v0.2d, #0000000000000000
+; DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
+; DOT-NEXT:    ret
+;
+; SVE-LABEL: popcount4x32:
+; SVE:       // %bb.0: // %Entry
+; SVE-NEXT:    cnt v0.16b, v0.16b
+; SVE-NEXT:    uaddlp v0.8h, v0.16b
+; SVE-NEXT:    uaddlp v0.4s, v0.8h
+; SVE-NEXT:    ret
+Entry:
+  %1 = tail call <4 x i32> @llvm.ctpop.v4.i32(<4 x i32> %0)
+  ret <4 x i32> %1
+}
+
+declare <4 x i32> @llvm.ctpop.v4.i32(<4 x i32>)
+
+define <2 x i32> @popcount2x32(<2 x i32> %0) {
+; CHECK-LABEL: popcount2x32:
+; CHECK:       // %bb.0: // %Entry
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-NEXT:    ret
+;
+; NEON-LABEL: popcount2x32:
+; NEON:       // %bb.0: // %Entry
+; NEON-NEXT:    cnt v0.8b, v0.8b
+; NEON-NEXT:    uaddlp v0.4h, v0.8b
+; NEON-NEXT:    uaddlp v0.2s, v0.4h
+; NEON-NEXT:    ret
+;
+; DOT-LABEL: popcount2x32:
+; DOT:       // %bb.0: // %Entry
+; DOT-NEXT:    movi v1.2d, #0000000000000000
+; DOT-NEXT:    cnt v0.8b, v0.8b
+; DOT-NEXT:    movi v2.8b, #1
+; DOT-NEXT:    udot v1.2s, v2.8b, v0.8b
+; DOT-NEXT:    fmov d0, d1
+; DOT-NEXT:    ret
+;
+; SVE-LABEL: popcount2x32:
+; SVE:       // %bb.0: // %Entry
+; SVE-NEXT:    cnt v0.8b, v0.8b
+; SVE-NEXT:    uaddlp v0.4h, v0.8b
+; SVE-NEXT:    uaddlp v0.2s, v0.4h
+; SVE-NEXT:    ret
+Entry:
+  %1 = tail call <2 x i32> @llvm.ctpop.v2.i32(<2 x i32> %0)
+  ret <2 x i32> %1
+}
+
+declare <2 x i32> @llvm.ctpop.v2.i32(<2 x i32>)
+
+define <8 x i16> @popcount8x16(<8 x i16> %0) {
+; CHECK-LABEL: popcount8x16:
+; CHECK:       // %bb.0: // %Entry
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    ret
+;
+; NEON-LABEL: popcount8x16:
+; NEON:       // %bb.0: // %Entry
+; NEON-NEXT:    cnt v0.16b, v0.16b
+; NEON-NEXT:    uaddlp v0.8h, v0.16b
+; NEON-NEXT:    ret
+;
+; DOT-LABEL: popcount8x16:
+; DOT:       // %bb.0: // %Entry
+; DOT-NEXT:    cnt v0.16b, v0.16b
+; DOT-NEXT:    uaddlp v0.8h, v0.16b
+; DOT-NEXT:    ret
+;
+; SVE-LABEL: popcount8x16:
+; SVE:       // %bb.0: // %Entry
+; SVE-NEXT:    cnt v0.16b, v0.16b
+; SVE-NEXT:    uaddlp v0.8h, v0.16b
+; SVE-NEXT:    ret
+Entry:
+  %1 = tail call <8 x i16> @llvm.ctpop.v8.i16(<8 x i16> %0)
+  ret <8 x i16> %1
+}
+
+declare <8 x i16> @llvm.ctpop.v8.i16(<8 x i16>)
+
+define <4 x i16> @popcount4x16(<4 x i16> %0) {
+; CHECK-LABEL: popcount4x16:
+; CHECK:       // %bb.0: // %Entry
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    ret
+;
+; NEON-LABEL: popcount4x16:
+; NEON:       // %bb.0: // %Entry
+; NEON-NEXT:    cnt v0.8b, v0.8b
+; NEON-NEXT:    uaddlp v0.4h, v0.8b
+; NEON-NEXT:    ret
+;
+; DOT-LABEL: popcount4x16:
+; DOT:       // %bb.0: // %Entry
+; DOT-NEXT:    cnt v0.8b, v0.8b
+; DOT-NEXT:    uaddlp v0.4h, v0.8b
+; DOT-NEXT:    ret
+;
+; SVE-LABEL: popcount4x16:
+; SVE:       // %bb.0: // %Entry
+; SVE-NEXT:    cnt v0.8b, v0.8b
+; SVE-NEXT:    uaddlp v0.4h, v0.8b
+; SVE-NEXT:    ret
+Entry:
+  %1 = tail call <4 x i16> @llvm.ctpop.v4.i16(<4 x i16> %0)
+  ret <4 x i16> %1
+}
+
+declare <4 x i16> @llvm.ctpop.v4.i16(<4 x i16>)

llvm/test/CodeGen/AArch64/popcount.ll Outdated Show resolved Hide resolved
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp Outdated Show resolved Hide resolved
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp Outdated Show resolved Hide resolved
@tgymnich tgymnich force-pushed the arm-ctpop-udot branch 2 times, most recently from 21df20b to 9c81cff Compare June 18, 2024 13:49
Copy link
Collaborator

@davemgreen davemgreen left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I have a couple of test suggestions but I think this looks OK.

llvm/test/CodeGen/AArch64/popcount.ll Outdated Show resolved Hide resolved
llvm/test/CodeGen/AArch64/popcount.ll Outdated Show resolved Hide resolved
Copy link
Collaborator

@davemgreen davemgreen left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. LGTM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp Outdated Show resolved Hide resolved
@aengelke
Copy link
Contributor

Any chance we could get the pattern also for GlobalISel? (Not blocking etc., but if we keep adding patterns to SDAG and not GlobalISel, we'll probably never switch the default...)

@tgymnich
Copy link
Member Author

@aengelke I created a follow-up PR: #96409

@tgymnich
Copy link
Member Author

tgymnich commented Jul 2, 2024

@davemgreen @efriedma-quic are we ready to merge?

@davemgreen
Copy link
Collaborator

Yeah still LGTM. (You can merge, right? I sometimes don't know who can and can't).

@tgymnich
Copy link
Member Author

tgymnich commented Jul 2, 2024

@davemgreen I can't merge.

@davemgreen
Copy link
Collaborator

Oh I thought you could because you are a "member". Apparently that wasn't the right place to look - sorry about that.

@davemgreen davemgreen merged commit 696805d into llvm:main Jul 2, 2024
7 checks passed
@tgymnich tgymnich deleted the arm-ctpop-udot branch July 2, 2024 14:23
davemgreen pushed a commit that referenced this pull request Jul 2, 2024
…sing udot (#96409)

Follow up for #95881

Use udot instead of a sequence of uaddlp instructions when summing up
lanes for popcount.
lravenclaw pushed a commit to lravenclaw/llvm-project that referenced this pull request Jul 3, 2024
…lvm#95881)

fixes llvm#95860

Use `udot` instead of a sequence of `uaddlp` instructions when summing
up lanes for `popcount`.
lravenclaw pushed a commit to lravenclaw/llvm-project that referenced this pull request Jul 3, 2024
…sing udot (llvm#96409)

Follow up for llvm#95881

Use udot instead of a sequence of uaddlp instructions when summing up
lanes for popcount.
kbluck pushed a commit to kbluck/llvm-project that referenced this pull request Jul 6, 2024
…lvm#95881)

fixes llvm#95860

Use `udot` instead of a sequence of `uaddlp` instructions when summing
up lanes for `popcount`.
kbluck pushed a commit to kbluck/llvm-project that referenced this pull request Jul 6, 2024
…sing udot (llvm#96409)

Follow up for llvm#95881

Use udot instead of a sequence of uaddlp instructions when summing up
lanes for popcount.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

AArch64: Non-SVE popcount autovectorization for 32-bit and 64-bit could be improved using Armv8.4-A's udot instruction
4 participants