[ARM] Simplify address calculation for NEON load/store
The patch attempts to optimize a sequence of SIMD loads from the same
base pointer:

    %0 = getelementptr float, float* base, i32 4
    %1 = bitcast float* %0 to <4 x float>*
    %2 = load <4 x float>, <4 x float>* %1
    ...
    %n1 = getelementptr float, float* base, i32 N
    %n2 = bitcast float* %n1 to <4 x float>*
    %n3 = load <4 x float>, <4 x float>* %n2

For AArch64, the compiler generates a sequence of LDR Qt, [Xn, #imm]
instructions, since LDR supports an immediate-offset addressing mode.
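
For instance, four consecutive <4 x float> loads come out roughly as
follows (an illustrative sketch; the register numbers are invented):

    ldr q0, [x0]
    ldr q1, [x0, #16]
    ldr q2, [x0, #32]
    ldr q3, [x0, #48]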
However, the 32-bit NEON VLD1/VST1 instructions lack the [Rn, #imm]
addressing mode, so the address has to be computed before every
load/store:

    add r2, r0, #32
    add r0, r0, #16
    vld1.32 {d18, d19}, [r2]
    vld1.32 {d22, d23}, [r0]

This can be improved by computing the address only for the first load,
and then using the post-indexed form of VLD1/VST1 to access the rest:

    add r0, r0, #16
    vld1.32 {d18, d19}, [r0]!
    vld1.32 {d22, d23}, [r0]

In order to do that, the patch adds more patterns to DAGCombine:

  - (load (add ptr inc1)) and (add ptr inc2) are now folded if inc1
    and inc2 are constants.

  - (or ptr inc) is now recognized as a pointer increment if ptr is
    sufficiently aligned (see the sketch after this list).
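
A quick illustration of the second pattern, with invented values: if
ptr is known to be 32-byte aligned, its low five bits are zero, so
OR-ing in an offset smaller than 32 produces the same result as adding
it, and the node can be treated as a base update:

    ; assume %ptr is 32-byte aligned, i.e. its low 5 bits are zero
    %inc = or i32 %ptr, 16    ; identical to: add i32 %ptr, 16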

In addition to that, we now search for all possible base updates and
then pick the best one.
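
For example (offsets invented for illustration): with loads at base+16
and base+32, the first load has two candidate base updates, +16 and
+32. Only +16 matches the size of the access and yields the address the
second load needs, so it is the better pick:

    add r0, r0, #16            @ address of the first load
    vld1.32 {d18, d19}, [r0]!  @ post-index by 16: r0 now points at base+32
    vld1.32 {d22, d23}, [r0]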

Differential Revision: https://reviews.llvm.org/D108988
asavonic committed Oct 14, 2021
1 parent 8848766 commit dc8a41d
Showing 12 changed files with 1,113 additions and 614 deletions.
654 changes: 445 additions & 209 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp

Large diffs are not rendered by default.

105 changes: 41 additions & 64 deletions llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -3,80 +3,57 @@
 ; rdar://12713765
 ; When realign-stack is set to false, make sure we are not creating stack
 ; objects that are assumed to be 64-byte aligned.
-@T3_retval = common global <16 x float> zeroinitializer, align 16
 
 define void @test1(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp "no-realign-stack" {
-entry:
 ; CHECK-LABEL: test1:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R3:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R3:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: mov r[[R2:[0-9]+]], sp
-; CHECK: add r[[R3:[0-9]+]], r[[R2]], #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R4:[0-9]+]], r[[R2]], #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
-; CHECK: mov r[[R5:[0-9]+]], r[[R2]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
+; CHECK: mov r[[NOTALIGNED:[0-9]+]], sp
+; CHECK: add r[[NOTALIGNED]], r[[NOTALIGNED]], #32
+; CHECK: add r[[PTR]], r[[PTR]], #32
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
+entry:
 %retval = alloca <16 x float>, align 64
-%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
-store <16 x float> %0, <16 x float>* %retval
-%1 = load <16 x float>, <16 x float>* %retval
-store <16 x float> %1, <16 x float>* %agg.result, align 16
+%a1 = bitcast <16 x float>* %retval to float*
+%a2 = getelementptr inbounds float, float* %a1, i64 8
+%a3 = bitcast float* %a2 to <4 x float>*
+
+%b1 = bitcast <16 x float>* %agg.result to float*
+%b2 = getelementptr inbounds float, float* %b1, i64 8
+%b3 = bitcast float* %b2 to <4 x float>*
+
+%0 = load <4 x float>, <4 x float>* %a3, align 16
+%1 = load <4 x float>, <4 x float>* %b3, align 16
+store <4 x float> %0, <4 x float>* %b3, align 16
+store <4 x float> %1, <4 x float>* %a3, align 16
 ret void
 }

 define void @test2(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp {
-entry:
 ; CHECK-LABEL: test2:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], sp
-; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #16
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: mov r[[R3:[0-9]+]], #32
-; CHECK: mov r[[R9:[0-9]+]], r[[R1]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128], r[[R3]]
-; CHECK: mov r[[R3:[0-9]+]], r[[R9]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
+; CHECK: mov r[[ALIGNED:[0-9]+]], sp
+; CHECK: orr r[[ALIGNED]], r[[ALIGNED]], #32
+; CHECK: add r[[PTR]], r[[PTR]], #32
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
+entry:
+%retval = alloca <16 x float>, align 64
+%a1 = bitcast <16 x float>* %retval to float*
+%a2 = getelementptr inbounds float, float* %a1, i64 8
+%a3 = bitcast float* %a2 to <4 x float>*
+
+%b1 = bitcast <16 x float>* %agg.result to float*
+%b2 = getelementptr inbounds float, float* %b1, i64 8
+%b3 = bitcast float* %b2 to <4 x float>*
+
-%retval = alloca <16 x float>, align 64
-%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
-store <16 x float> %0, <16 x float>* %retval
-%1 = load <16 x float>, <16 x float>* %retval
-store <16 x float> %1, <16 x float>* %agg.result, align 16
+%0 = load <4 x float>, <4 x float>* %a3, align 16
+%1 = load <4 x float>, <4 x float>* %b3, align 16
+store <4 x float> %0, <4 x float>* %b3, align 16
+store <4 x float> %1, <4 x float>* %a3, align 16
 ret void
 }