Add LLVM patch D14260 to improve SIMD code
Arch Robison proposed the patch <http://reviews.llvm.org/D14260> "Optimize store of "bitcast" from vector to aggregate" for LLVM. This patch applies cleanly to LLVM 3.7.1. It seems to be the last missing puzzle piece on the LLVM side for generating efficient SIMD instructions via `llvmcall` in Julia. For an example package, see e.g. <https://github.com/eschnett/SIMD.jl>.

Some discussion relevant to this PR can be found in JuliaLang#5355. @ArchRobison, please comment.

Julia stores tuples as LLVM arrays, whereas LLVM SIMD instructions require LLVM vectors. The respective conversions are unfortunately not always optimized out unless the patch above is applied, leading to a cumbersome sequence of instructions to disassemble and reassemble a SIMD vector. An example is given here <eschnett/SIMD.jl#1 (comment)>.
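Concretely, the unoptimized IR contains an extractelement/insertvalue chain that disassembles the vector into the isomorphic array before the store (a minimal sketch in LLVM IR, mirroring the test cases added below; value names are illustrative):

```llvm
; Store a <2 x double> SIMD value through a [2 x double] aggregate:
; the vector is taken apart element by element and reassembled.
%e0 = extractelement <2 x double> %v, i32 0
%a0 = insertvalue [2 x double] undef, double %e0, 0
%e1 = extractelement <2 x double> %v, i32 1
%a1 = insertvalue [2 x double] %a0, double %e1, 1
store [2 x double] %a1, [2 x double]* %p, align 4
; With the patch, InstCombine recognizes %a1 as a "bitcast" of %v and
; rewrites this into a single vector store through a bitcast pointer.
```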

Without this patch, the loop kernel looks like (x86-64, AVX2 instructions):

```
vunpcklpd %xmm4, %xmm3, %xmm3 # xmm3 = xmm3[0],xmm4[0]
vunpcklpd %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm2[0]
vinsertf128 $1, %xmm3, %ymm1, %ymm1
vmovupd 8(%rcx), %xmm2
vinsertf128 $1, 24(%rcx), %ymm2, %ymm2
vaddpd %ymm2, %ymm1, %ymm1
vpermilpd $1, %xmm1, %xmm2 # xmm2 = xmm1[1,0]
vextractf128 $1, %ymm1, %xmm3
vpermilpd $1, %xmm3, %xmm4 # xmm4 = xmm3[1,0]
Source line: 62
vaddsd (%rcx), %xmm0, %xmm0
```

Note that the SIMD vector is kept in register `%ymm1`, but is unnecessarily scalarized into registers `%xmm{0,1,2,3}` at the end of the kernel and reassembled at the beginning.

With this patch, the loop kernel looks like:

```
L192:
vaddpd	(%rdx), %ymm1, %ymm1
Source line: 62
addq	%rsi, %rdx
addq	%rcx, %rdi
jne	L192
```

which is perfect.
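In IR terms, the patched kernel reduces to a plain vector load-and-add in the loop body (an illustrative sketch of the expected shape, not actual `@code_llvm` output; the function and value names are made up):

```llvm
; Reduction loop whose body matches the vaddpd kernel above: each
; iteration adds one <4 x double> loaded from memory into the
; accumulator, which stays in a single ymm register throughout.
define <4 x double> @kernel(double* %base, i64 %n) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi <4 x double> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  %p0 = getelementptr double, double* %base, i64 %i
  %p = bitcast double* %p0 to <4 x double>*
  %w = load <4 x double>, <4 x double>* %p, align 8
  %acc.next = fadd <4 x double> %acc, %w   ; -> vaddpd (%rdx), %ymm1, %ymm1
  %i.next = add i64 %i, 4
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop
exit:
  ret <4 x double> %acc.next
}
```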
eschnett committed Feb 8, 2016
1 parent f6315f5 commit 49d1019
Showing 2 changed files with 137 additions and 0 deletions.
1 change: 1 addition & 0 deletions — deps/Makefile

```diff
@@ -671,6 +671,7 @@ else ifeq ($(LLVM_VER),3.7.1)
 $(eval $(call LLVM_PATCH,llvm-3.7.1))
 $(eval $(call LLVM_PATCH,llvm-3.7.1_2))
 $(eval $(call LLVM_PATCH,llvm-3.7.1_3))
+$(eval $(call LLVM_PATCH,llvm-D14260))
 $(LLVM_SRC_DIR)/llvm-3.7.1_2.patch-applied: $(LLVM_SRC_DIR)/llvm-3.7.1.patch-applied
 endif # LLVM_VER
```
136 changes: 136 additions & 0 deletions — deps/llvm-D14260.patch (new file)

```diff
Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -879,6 +879,61 @@
   return nullptr;
 }
 
+/// \brief Look for extractelement/insertvalue sequence that acts like a
+/// bitcast.
+///
+/// \returns underlying value that was "cast", or nullptr otherwise.
+///
+/// For example, if we have:
+///
+///   %E0 = extractelement <2 x double> %U, i32 0
+///   %V0 = insertvalue [2 x double] undef, double %E0, 0
+///   %E1 = extractelement <2 x double> %U, i32 1
+///   %V1 = insertvalue [2 x double] %V0, double %E1, 1
+///
+/// and the layout of a <2 x double> is isomorphic to a [2 x double],
+/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
+/// Note that %U may contain non-undef values where %V1 has undef.
+static Value* likeBitCastFromVector(InstCombiner &IC, Value* V) {
+  Value *U = nullptr;
+  while (auto *IV = dyn_cast<InsertValueInst>(V)) {
+    auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
+    if (!E)
+      return nullptr;
+    auto *W = E->getVectorOperand();
+    if (!U)
+      U = W;
+    else if (U != W)
+      return nullptr;
+    auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
+    if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
+      return nullptr;
+    V = IV->getAggregateOperand();
+  }
+  if (!isa<UndefValue>(V) || !U)
+    return nullptr;
+
+  VectorType *UT = cast<VectorType>(U->getType());
+  Type *VT = V->getType();
+  // Check that types UT and VT are bitwise isomorphic.
+  const DataLayout &DL = IC.getDataLayout();
+  if (DL.getTypeSizeInBits(UT) != DL.getTypeSizeInBits(VT)) {
+    return nullptr;
+  }
+  if (ArrayType *AT = dyn_cast<ArrayType>(VT)) {
+    if (AT->getNumElements() != UT->getNumElements())
+      return nullptr;
+  } else {
+    StructType *ST = cast<StructType>(VT);
+    if (ST->getNumElements() != UT->getNumElements())
+      return nullptr;
+    for (const Type *EltT : ST->elements()) {
+      if (EltT != UT->getElementType())
+        return nullptr;
+    }
+  }
+  return U;
+}
+
 /// \brief Combine stores to match the type of value being stored.
 ///
 /// The core idea here is that the memory does not have any intrinsic type and
@@ -914,6 +969,11 @@
     return true;
   }
 
+  if (Value *U = likeBitCastFromVector(IC, V)) {
+    combineStoreToNewValue(IC, SI, U);
+    return true;
+  }
+
   // FIXME: We should also canonicalize loads of vectors when their elements are
   // cast to other types.
   return false;
Index: test/Transforms/InstCombine/insert-val-extract-elem.ll
===================================================================
--- a/test/Transforms/InstCombine/insert-val-extract-elem.ll
+++ b/test/Transforms/InstCombine/insert-val-extract-elem.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <2 x double>
+define void @julia_2xdouble([2 x double]* sret, <2 x double>*) {
+top:
+  %x = load <2 x double>, <2 x double>* %1
+  %x0 = extractelement <2 x double> %x, i32 0
+  %i0 = insertvalue [2 x double] undef, double %x0, 0
+  %x1 = extractelement <2 x double> %x, i32 1
+  %i1 = insertvalue [2 x double] %i0, double %x1, 1
+  store [2 x double] %i1, [2 x double]* %0, align 4
+  ret void
+}
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <4 x float>
+define void @julia_4xfloat([4 x float]* sret, <4 x float>*) {
+top:
+  %x = load <4 x float>, <4 x float>* %1
+  %x0 = extractelement <4 x float> %x, i32 0
+  %i0 = insertvalue [4 x float] undef, float %x0, 0
+  %x1 = extractelement <4 x float> %x, i32 1
+  %i1 = insertvalue [4 x float] %i0, float %x1, 1
+  %x2 = extractelement <4 x float> %x, i32 2
+  %i2 = insertvalue [4 x float] %i1, float %x2, 2
+  %x3 = extractelement <4 x float> %x, i32 3
+  %i3 = insertvalue [4 x float] %i2, float %x3, 3
+  store [4 x float] %i3, [4 x float]* %0, align 4
+  ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <4 x float>
+define void @julia_pseudovec(%pseudovec* sret, <4 x float>*) {
+top:
+  %x = load <4 x float>, <4 x float>* %1
+  %x0 = extractelement <4 x float> %x, i32 0
+  %i0 = insertvalue %pseudovec undef, float %x0, 0
+  %x1 = extractelement <4 x float> %x, i32 1
+  %i1 = insertvalue %pseudovec %i0, float %x1, 1
+  %x2 = extractelement <4 x float> %x, i32 2
+  %i2 = insertvalue %pseudovec %i1, float %x2, 2
+  %x3 = extractelement <4 x float> %x, i32 3
+  %i3 = insertvalue %pseudovec %i2, float %x3, 3
+  store %pseudovec %i3, %pseudovec* %0, align 4
+  ret void
+}
```

3 comments on commit 49d1019

@ArchRobison

What kind of commentary are you interested in? The patch essentially covers up LLVM's lack of support for bitcasting between vectors and their isomorphic array equivalents.

@eschnett (owner, author)

@ArchRobison I wanted your opinion on whether this commit should be applied, or whether you considered it just a work in progress.

@ArchRobison

I consider it done. I'm just waiting semi-indefinitely for an LGTM review from the LLVM community before committing it to LLVM trunk.
