From 50c4c1df6259f369b76fd930de3d1c08e6a509f2 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 16 Jul 2020 11:56:36 -0700 Subject: [PATCH 1/2] parent c7393adc336d8040a566f6afadefd154b9863daf author Steven Johnson 1594836928 -0700 committer Steven Johnson 1594925723 -0700 Drop support for LLVM9 Since LLVM trunk just got bumped to v12, we can consider dropping LLVM9 support. This PR revises the checks for minimum versions to require v10, and removes all conditional code that is no longer required. --- .github/workflows/llvm_builder.yml | 4 +- .github/workflows/test.yml | 4 +- Makefile | 2 +- dependencies/llvm/CMakeLists.txt | 4 +- src/CodeGen_ARM.cpp | 33 +--- src/CodeGen_LLVM.cpp | 11 +- src/CodeGen_PTX_Dev.cpp | 37 ---- src/Introspection.cpp | 9 - src/LLVM_Headers.h | 22 +-- src/LLVM_Output.cpp | 15 +- src/LLVM_Runtime_Linker.cpp | 40 ---- src/WasmExecutor.cpp | 9 +- test/correctness/simd_op_check.cpp | 299 ++++++++++++++--------------- 13 files changed, 166 insertions(+), 323 deletions(-) diff --git a/.github/workflows/llvm_builder.yml b/.github/workflows/llvm_builder.yml index bd7972248475..182a4c82eda7 100644 --- a/.github/workflows/llvm_builder.yml +++ b/.github/workflows/llvm_builder.yml @@ -34,10 +34,8 @@ jobs: target_arch: [x86, arm] target_bits: [32, 64] target_os: [windows, linux, osx] - llvm_version: [9, 10, 11, 12] + llvm_version: [10, 11, 12] include: - - llvm_version: 9 - llvm_branch: release/9.x - llvm_version: 10 llvm_branch: release/10.x - llvm_version: 11 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e15c33ec3a3b..9e288dd687d9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -58,15 +58,13 @@ jobs: target_os: [linux, osx, windows] llvm_version: [12] build_tool: [cmake_shared] - # llvm_version: [9, 10, 11, 12] # TODO + # llvm_version: [10, 11, 12] # TODO # build_tool: [cmake_shared, make] # TODO # This section basically allows us to define additional values for # each matrix entry, e.g. 
to map an llvm version number to the specific # git branch that is needed. include: - # - llvm_version: 9 - # llvm_branch: release/9.x # - llvm_version: 10 # llvm_branch: release/10.x # - llvm_version: 11 diff --git a/Makefile b/Makefile index 8ccd716faba3..227522cc4a07 100644 --- a/Makefile +++ b/Makefile @@ -2164,7 +2164,7 @@ $(BUILD_DIR)/clang_ok: @exit 1 endif -ifneq (,$(findstring $(LLVM_VERSION_TIMES_10), 90 100 110 120)) +ifneq (,$(findstring $(LLVM_VERSION_TIMES_10), 100 110 120)) LLVM_OK=yes endif diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index d568187a5d03..5d450d7a54b0 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -6,8 +6,8 @@ find_package(LLVM ${HALIDE_REQUIRE_LLVM_VERSION} REQUIRED) message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") -if ("${LLVM_PACKAGE_VERSION}" VERSION_LESS 9.0) - message(FATAL_ERROR "LLVM version must be 9.0 or newer") +if ("${LLVM_PACKAGE_VERSION}" VERSION_LESS 10.0) + message(FATAL_ERROR "LLVM version must be 10.0 or newer") endif () if ("${LLVM_PACKAGE_VERSION}" VERSION_GREATER 12.0) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 64397a5e9bd0..7c299b4487dc 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -114,7 +114,6 @@ CodeGen_ARM::CodeGen_ARM(Target target) casts.push_back(p); // Saturating add -#if LLVM_VERSION >= 100 if (t.is_int()) { p.intrin32 = "llvm.sadd.sat" + t_str; p.intrin64 = "llvm.sadd.sat" + t_str; @@ -122,15 +121,6 @@ CodeGen_ARM::CodeGen_ARM(Target target) p.intrin32 = "llvm.uadd.sat" + t_str; p.intrin64 = "llvm.uadd.sat" + t_str; } -#else - if (t.is_int()) { - p.intrin32 = "llvm.arm.neon.vqadds" + t_str; - p.intrin64 = "llvm.aarch64.neon.sqadd" + t_str; - } else { - p.intrin32 = "llvm.arm.neon.vqaddu" + t_str; - p.intrin64 = "llvm.aarch64.neon.uqadd" + t_str; - } -#endif p.pattern = cast(t, clamp(w_vector + w_vector, tmin, tmax)); 
casts.push_back(p); @@ -142,7 +132,6 @@ CodeGen_ARM::CodeGen_ARM(Target target) // Saturating subtract // N.B. Saturating subtracts always widen to a signed type -#if LLVM_VERSION >= 100 if (t.is_int()) { p.intrin32 = "llvm.ssub.sat" + t_str; p.intrin64 = "llvm.ssub.sat" + t_str; @@ -150,15 +139,6 @@ CodeGen_ARM::CodeGen_ARM(Target target) p.intrin32 = "llvm.usub.sat" + t_str; p.intrin64 = "llvm.usub.sat" + t_str; } -#else - if (t.is_int()) { - p.intrin32 = "llvm.arm.neon.vqsubs" + t_str; - p.intrin64 = "llvm.aarch64.neon.sqsub" + t_str; - } else { - p.intrin32 = "llvm.arm.neon.vqsubu" + t_str; - p.intrin64 = "llvm.aarch64.neon.uqsub" + t_str; - } -#endif p.pattern = cast(t, clamp(ws_vector - ws_vector, tsmin, tsmax)); casts.push_back(p); @@ -947,10 +927,8 @@ void CodeGen_ARM::visit(const Load *op) { LoadInst *loadI = cast(builder->CreateLoad(bitcastI)); #if LLVM_VERSION >= 110 loadI->setAlignment(Align(alignment)); -#elif LLVM_VERSION >= 100 - loadI->setAlignment(MaybeAlign(alignment)); #else - loadI->setAlignment(alignment); + loadI->setAlignment(MaybeAlign(alignment)); #endif add_tbaa_metadata(loadI, op->name, slice_ramp); Value *shuffleInstr = builder->CreateShuffleVector(loadI, undef, constantsV); @@ -1025,7 +1003,6 @@ void CodeGen_ARM::visit(const Call *op) { } void CodeGen_ARM::visit(const LT *op) { -#if LLVM_VERSION >= 100 if (op->a.type().is_float() && op->type.is_vector()) { // Fast-math flags confuse LLVM's aarch64 backend, so // temporarily clear them for this instruction. @@ -1035,13 +1012,11 @@ void CodeGen_ARM::visit(const LT *op) { CodeGen_Posix::visit(op); return; } -#endif CodeGen_Posix::visit(op); } void CodeGen_ARM::visit(const LE *op) { -#if LLVM_VERSION >= 100 if (op->a.type().is_float() && op->type.is_vector()) { // Fast-math flags confuse LLVM's aarch64 backend, so // temporarily clear them for this instruction. 
@@ -1051,7 +1026,6 @@ void CodeGen_ARM::visit(const LE *op) { CodeGen_Posix::visit(op); return; } -#endif CodeGen_Posix::visit(op); } @@ -1060,10 +1034,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init if (neon_intrinsics_disabled() || op->op == VectorReduce::Or || op->op == VectorReduce::And || - op->op == VectorReduce::Mul || - // LLVM 9 has bugs in the arm backend for vector reduce - // ops. See https://github.com/halide/Halide/issues/5081 - !(LLVM_VERSION >= 100)) { + op->op == VectorReduce::Mul) { CodeGen_Posix::codegen_vector_reduce(op, init); return; } diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index c7c53495e431..e12eaf77fa6f 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -170,15 +170,9 @@ llvm::GlobalValue::LinkageTypes llvm_linkage(LinkageType t) { // A local helper to make an llvm value type representing // alignment. Can't be declared in a header without introducing a // dependence on the LLVM headers. -#if LLVM_VERSION >= 100 llvm::Align make_alignment(int a) { return llvm::Align(a); } -#else -int make_alignment(int a) { - return a; -} -#endif } // namespace @@ -4347,9 +4341,7 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini return; } -#if LLVM_VERSION >= 90 - if (output_lanes == 1 && - (target.arch != Target::ARM || LLVM_VERSION >= 100)) { + if (output_lanes == 1) { const int input_lanes = val.type().lanes(); const int input_bytes = input_lanes * val.type().bytes(); const bool llvm_has_intrinsic = @@ -4449,7 +4441,6 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini return; } } -#endif if (output_lanes == 1 && factor > native_lanes && diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 74ee3a25ac51..abf6534bc32b 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -277,39 +277,6 @@ void CodeGen_PTX_Dev::visit(const Store *op) { if (emit_atomic_stores) { 
user_assert(is_one(op->predicate)) << "Atomic update does not support predicated store.\n"; user_assert(op->value.type().bits() >= 32) << "CUDA: 8-bit or 16-bit atomics are not supported.\n"; -#if LLVM_VERSION < 90 - user_assert(op->value.type().is_scalar()) - << "CUDA atomic update does not support vectorization with LLVM version < 9.\n"; - // Generate nvvm intrinsics for the atomics if this is a float atomicAdd. - // Otherwise defer to the llvm codegen. For llvm version >= 90, atomicrmw support floats so we - // can also refer to llvm. - // Half atomics are supported by compute capability 7.x or higher. - if (op->value.type().is_float() && - (op->value.type().bits() == 32 || - (op->value.type().bits() == 64 && - target.has_feature(Target::CUDACapability61)))) { - Expr val_expr = op->value; - Expr equiv_load = Load::make(op->value.type(), op->name, op->index, Buffer<>(), op->param, op->predicate, op->alignment); - Expr delta = simplify(common_subexpression_elimination(op->value - equiv_load)); - // For atomicAdd, we check if op->value - store[index] is independent of store. - bool is_atomic_add = !expr_uses_var(delta, op->name); - if (is_atomic_add) { - Value *ptr = codegen_buffer_pointer(op->name, op->value.type(), op->index); - Value *val = codegen(delta); - llvm::Function *intrin = nullptr; - if (op->value.type().bits() == 32) { - intrin = module->getFunction("llvm.nvvm.atomic.load.add.f32.p0f32"); - internal_assert(intrin) << "Could not find atomic intrinsics llvm.nvvm.atomic.load.add.f32.p0f32\n"; - } else { - internal_assert(op->value.type().bits() == 64); - intrin = module->getFunction("llvm.nvvm.atomic.load.add.f64.p0f64"); - internal_assert(intrin) << "Could not find atomic intrinsics llvm.nvvm.atomic.load.add.f64.p0f64\n"; - } - value = builder->CreateCall(intrin, {ptr, val}); - return; - } - } -#endif } // Do aligned 4-wide 32-bit stores as a single i128 store. 
@@ -640,11 +607,7 @@ vector CodeGen_PTX_Dev::compile_to_src() { // Ask the target to add backend passes as necessary. bool fail = target_machine->addPassesToEmitFile(module_pass_manager, ostream, nullptr, -#if LLVM_VERSION >= 100 ::llvm::CGFT_AssemblyFile, -#else - TargetMachine::CGFT_AssemblyFile, -#endif true); if (fail) { internal_error << "Failed to set up passes to emit PTX source\n"; diff --git a/src/Introspection.cpp b/src/Introspection.cpp index c1eb64d39464..58498c799f23 100644 --- a/src/Introspection.cpp +++ b/src/Introspection.cpp @@ -49,11 +49,7 @@ inline T load_misaligned(const T *p) { return result; } -#if LLVM_VERSION >= 100 typedef uint64_t llvm_offset_t; -#else -typedef uint32_t llvm_offset_t; -#endif } // namespace @@ -959,14 +955,9 @@ class DebugSections { for (llvm::object::section_iterator iter = obj->section_begin(); iter != obj->section_end(); ++iter) { -#if LLVM_VERSION >= 100 auto expected_name = iter->getName(); internal_assert(expected_name); llvm::StringRef name = expected_name.get(); -#else - llvm::StringRef name; - iter->getName(name); -#endif debug(2) << "Section: " << name.str() << "\n"; // ignore errors, just leave strings empty auto e = iter->getContents(); diff --git a/src/LLVM_Headers.h b/src/LLVM_Headers.h index 8758e5535704..36e732bd73a4 100644 --- a/src/LLVM_Headers.h +++ b/src/LLVM_Headers.h @@ -1,10 +1,10 @@ #ifndef HALIDE_LLVM_HEADERS_H #define HALIDE_LLVM_HEADERS_H -#if LLVM_VERSION >= 90 +#if LLVM_VERSION >= 100 // We're good to go #else -#error "Compiling Halide requires LLVM 9.0 or newer" +#error "Compiling Halide requires LLVM 10.0 or newer" #endif // This seems to be required by some LLVM header, which is likely an LLVM bug. 
@@ -29,25 +29,23 @@ #include #include +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" +#include #include #include #include #include #include +#include #include #include -#include -#if LLVM_VERSION >= 100 -#include -#endif -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include -#include #include #include +#include +#include #include #include #include @@ -75,11 +73,9 @@ #include #include #include -#if LLVM_VERSION >= 100 #ifdef WITH_HEXAGON #include #endif -#endif #include #include #include diff --git a/src/LLVM_Output.cpp b/src/LLVM_Output.cpp index 5500676dd0eb..541329a710cc 100644 --- a/src/LLVM_Output.cpp +++ b/src/LLVM_Output.cpp @@ -337,12 +337,7 @@ std::unique_ptr clone_module(const llvm::Module &module_in) { } // namespace void emit_file(const llvm::Module &module_in, Internal::LLVMOStream &out, -#if LLVM_VERSION >= 100 - llvm::CodeGenFileType file_type -#else - llvm::TargetMachine::CodeGenFileType file_type -#endif -) { + llvm::CodeGenFileType file_type) { Internal::debug(1) << "emit_file.Compiling to native code...\n"; Internal::debug(2) << "Target triple: " << module_in.getTargetTriple() << "\n"; @@ -403,19 +398,11 @@ std::unique_ptr compile_module_to_llvm_module(const Module &module } void compile_llvm_module_to_object(llvm::Module &module, Internal::LLVMOStream &out) { -#if LLVM_VERSION >= 100 emit_file(module, out, llvm::CGFT_ObjectFile); -#else - emit_file(module, out, llvm::TargetMachine::CGFT_ObjectFile); -#endif } void compile_llvm_module_to_assembly(llvm::Module &module, Internal::LLVMOStream &out) { -#if LLVM_VERSION >= 100 emit_file(module, out, llvm::CGFT_AssemblyFile); -#else - emit_file(module, out, llvm::TargetMachine::CGFT_AssemblyFile); -#endif } void compile_llvm_module_to_llvm_bitcode(llvm::Module &module, Internal::LLVMOStream &out) { diff --git a/src/LLVM_Runtime_Linker.cpp 
b/src/LLVM_Runtime_Linker.cpp index e55291de17ae..de2ec0adbdcb 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -270,68 +270,28 @@ llvm::DataLayout get_data_layout_for_target(Target target) { if (target.arch == Target::X86) { if (target.bits == 32) { if (target.os == Target::OSX) { -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:128-n8:16:32-S128"); -#else - return llvm::DataLayout("e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"); -#endif } else if (target.os == Target::IOS) { -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:128-n8:16:32-S128"); -#else - return llvm::DataLayout("e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"); -#endif } else if (target.os == Target::Windows && !target.has_feature(Target::JIT)) { -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32"); -#else - return llvm::DataLayout("e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"); -#endif } else if (target.os == Target::Windows) { -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32"); -#else - return llvm::DataLayout("e-m:e-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"); -#endif } else { // Linux/Android -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128"); -#else - return llvm::DataLayout("e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"); -#endif } } else { // 64-bit if (target.os == Target::OSX) { -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"); -#else - return llvm::DataLayout("e-m:o-i64:64-f80:128-n8:16:32:64-S128"); -#endif } else if (target.os == Target::IOS) { -#if LLVM_VERSION >= 100 return 
llvm::DataLayout("e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"); -#else - return llvm::DataLayout("e-m:o-i64:64-f80:128-n8:16:32:64-S128"); -#endif } else if (target.os == Target::Windows && !target.has_feature(Target::JIT)) { -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"); -#else - return llvm::DataLayout("e-m:w-i64:64-f80:128-n8:16:32:64-S128"); -#endif } else if (target.os == Target::Windows) { -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"); -#else - return llvm::DataLayout("e-m:e-i64:64-f80:128-n8:16:32:64-S128"); -#endif } else { -#if LLVM_VERSION >= 100 return llvm::DataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"); -#else - return llvm::DataLayout("e-m:e-i64:64-f80:128-n8:16:32:64-S128"); -#endif } } } else if (target.arch == Target::ARM) { diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index ca33db97510e..1692f9b07f34 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -1264,18 +1264,11 @@ std::vector compile_to_wasm(const Module &module, const std::string &fn_na if (!lld::wasm::link(lld_args, /*CanExitEarly*/ false, llvm::outs(), llvm::errs())) { internal_error << "lld::wasm::link failed\n"; } -#elif LLVM_VERSION >= 100 - std::string lld_errs_string; - llvm::raw_string_ostream lld_errs(lld_errs_string); - - if (!lld::wasm::link(lld_args, /*CanExitEarly*/ false, llvm::outs(), llvm::errs())) { - internal_error << "lld::wasm::link failed: (" << lld_errs.str() << ")\n"; - } #else std::string lld_errs_string; llvm::raw_string_ostream lld_errs(lld_errs_string); - if (!lld::wasm::link(lld_args, /*CanExitEarly*/ false, lld_errs)) { + if (!lld::wasm::link(lld_args, /*CanExitEarly*/ false, llvm::outs(), llvm::errs())) { internal_error << "lld::wasm::link failed: (" << lld_errs.str() << ")\n"; } #endif diff --git 
a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp index 38e2933dad09..f57a0d1ea5bd 100644 --- a/test/correctness/simd_op_check.cpp +++ b/test/correctness/simd_op_check.cpp @@ -235,53 +235,51 @@ class SimdOpCheck : public SimdOpCheckTest { check("pabsd", 2 * w, abs(i32_1)); } - if (Halide::Internal::get_llvm_version() >= 90) { - // Horizontal ops. Our support for them uses intrinsics - // from LLVM 9+. - - // Paradoxically, haddps is a bad way to do horizontal - // adds down to a single scalar on most x86. A better - // sequence (according to Peter Cordes on stackoverflow) - // is movshdup, addps, movhlps, addss. haddps is still - // good if you're only partially reducing and your result - // is at least one native vector, if only to save code - // size, but LLVM really really tries to avoid it and - // replace it with shuffles whenever it can, so we won't - // test for it. - // - // See: - // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 - - // For reducing down to a scalar we expect to see addps - // and movshdup. We'll sniff for the movshdup. - check("movshdup", 1, sum(in_f32(RDom(0, 2) + 2 * x))); - check("movshdup", 1, sum(in_f32(RDom(0, 4) + 4 * x))); - check("movshdup", 1, sum(in_f32(RDom(0, 16) + 16 * x))); - - // The integer horizontal add operations are pretty - // terrible on all x86 variants, and LLVM does its best to - // avoid generating those too, so we won't test that here - // either. - - // Min reductions should use phminposuw when - // possible. This only exists for u16. X86 is weird. - check("phminposuw", 1, minimum(in_u16(RDom(0, 8) + 8 * x))); - - // Max reductions can use the same instruction by first - // flipping the bits. - check("phminposuw", 1, maximum(in_u16(RDom(0, 8) + 8 * x))); - - // Reductions over signed ints can flip the sign bit - // before and after (equivalent to adding 128). 
- check("phminposuw", 1, minimum(in_i16(RDom(0, 8) + 8 * x))); - check("phminposuw", 1, maximum(in_i16(RDom(0, 8) + 8 * x))); - - // Reductions over 8-bit ints can widen first - check("phminposuw", 1, minimum(in_u8(RDom(0, 16) + 16 * x))); - check("phminposuw", 1, maximum(in_u8(RDom(0, 16) + 16 * x))); - check("phminposuw", 1, minimum(in_i8(RDom(0, 16) + 16 * x))); - check("phminposuw", 1, maximum(in_i8(RDom(0, 16) + 16 * x))); - } + // Horizontal ops. Our support for them uses intrinsics + // from LLVM 9+ (always available now that LLVM 10 is the minimum). + + // Paradoxically, haddps is a bad way to do horizontal + // adds down to a single scalar on most x86. A better + // sequence (according to Peter Cordes on stackoverflow) + // is movshdup, addps, movhlps, addss. haddps is still + // good if you're only partially reducing and your result + // is at least one native vector, if only to save code + // size, but LLVM really really tries to avoid it and + // replace it with shuffles whenever it can, so we won't + // test for it. + // + // See: + // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 + + // For reducing down to a scalar we expect to see addps + // and movshdup. We'll sniff for the movshdup. + check("movshdup", 1, sum(in_f32(RDom(0, 2) + 2 * x))); + check("movshdup", 1, sum(in_f32(RDom(0, 4) + 4 * x))); + check("movshdup", 1, sum(in_f32(RDom(0, 16) + 16 * x))); + + // The integer horizontal add operations are pretty + // terrible on all x86 variants, and LLVM does its best to + // avoid generating those too, so we won't test that here + // either. + + // Min reductions should use phminposuw when + // possible. This only exists for u16. X86 is weird. + check("phminposuw", 1, minimum(in_u16(RDom(0, 8) + 8 * x))); + + // Max reductions can use the same instruction by first + // flipping the bits. 
+ check("phminposuw", 1, maximum(in_u16(RDom(0, 8) + 8 * x))); + + // Reductions over signed ints can flip the sign bit + // before and after (equivalent to adding 32768 for 16-bit values). + check("phminposuw", 1, minimum(in_i16(RDom(0, 8) + 8 * x))); + check("phminposuw", 1, maximum(in_i16(RDom(0, 8) + 8 * x))); + + // Reductions over 8-bit ints can widen first + check("phminposuw", 1, minimum(in_u8(RDom(0, 16) + 16 * x))); + check("phminposuw", 1, maximum(in_u8(RDom(0, 16) + 16 * x))); + check("phminposuw", 1, minimum(in_i8(RDom(0, 16) + 16 * x))); + check("phminposuw", 1, maximum(in_i8(RDom(0, 16) + 16 * x))); } // SSE 4.1 @@ -938,113 +936,110 @@ class SimdOpCheck : public SimdOpCheckTest { // VORR X - Bitwise OR // check("vorr", bool1 | bool2); - if (Halide::Internal::get_llvm_version() >= 100) { - - for (int f : {2, 4}) { - RDom r(0, f); - - // A summation reduction that starts at something - // non-trivial, to avoid llvm simplifying accumulating - // widening summations into just widening summations. - auto sum_ = [&](Expr e) { - Func f; - f(x) = cast(e.type(), 123); - f(x) += e; - return f(x); - }; - - // VPADD I, F - Pairwise Add - check(arm32 ? "vpadd.i8" : "addp", 16, sum_(in_i8(f * x + r))); - check(arm32 ? "vpadd.i8" : "addp", 16, sum_(in_u8(f * x + r))); - check(arm32 ? "vpadd.i16" : "addp", 8, sum_(in_i16(f * x + r))); - check(arm32 ? "vpadd.i16" : "addp", 8, sum_(in_u16(f * x + r))); - check(arm32 ? "vpadd.i32" : "addp", 4, sum_(in_i32(f * x + r))); - check(arm32 ? "vpadd.i32" : "addp", 4, sum_(in_u32(f * x + r))); - check(arm32 ? "vpadd.f32" : "addp", 4, sum_(in_f32(f * x + r))); - // In 32-bit, we don't have a pairwise op for doubles, - // and expect to just get vadd instructions on d - // registers. - check(arm32 ? "vadd.f64" : "addp", 4, sum_(in_f64(f * x + r))); - - if (f == 2) { - // VPADAL I - Pairwise Add and Accumulate Long - - // If we're reducing by a factor of two, we can - // use the forms with an accumulator - check(arm32 ? 
"vpadal.s8" : "sadalp", 16, sum_(i16(in_i8(f * x + r)))); - check(arm32 ? "vpadal.u8" : "uadalp", 16, sum_(i16(in_u8(f * x + r)))); - check(arm32 ? "vpadal.u8" : "uadalp", 16, sum_(u16(in_u8(f * x + r)))); - - check(arm32 ? "vpadal.s16" : "sadalp", 8, sum_(i32(in_i16(f * x + r)))); - check(arm32 ? "vpadal.u16" : "uadalp", 8, sum_(i32(in_u16(f * x + r)))); - check(arm32 ? "vpadal.u16" : "uadalp", 8, sum_(u32(in_u16(f * x + r)))); - - check(arm32 ? "vpadal.s32" : "sadalp", 4, sum_(i64(in_i32(f * x + r)))); - check(arm32 ? "vpadal.u32" : "uadalp", 4, sum_(i64(in_u32(f * x + r)))); - check(arm32 ? "vpadal.u32" : "uadalp", 4, sum_(u64(in_u32(f * x + r)))); - } else { - // VPADDL I - Pairwise Add Long - - // If we're reducing by more than that, that's not - // possible. - check(arm32 ? "vpaddl.s8" : "saddlp", 16, sum_(i16(in_i8(f * x + r)))); - check(arm32 ? "vpaddl.u8" : "uaddlp", 16, sum_(i16(in_u8(f * x + r)))); - check(arm32 ? "vpaddl.u8" : "uaddlp", 16, sum_(u16(in_u8(f * x + r)))); - - check(arm32 ? "vpaddl.s16" : "saddlp", 8, sum_(i32(in_i16(f * x + r)))); - check(arm32 ? "vpaddl.u16" : "uaddlp", 8, sum_(i32(in_u16(f * x + r)))); - check(arm32 ? "vpaddl.u16" : "uaddlp", 8, sum_(u32(in_u16(f * x + r)))); - - check(arm32 ? "vpaddl.s32" : "saddlp", 4, sum_(i64(in_i32(f * x + r)))); - check(arm32 ? "vpaddl.u32" : "uaddlp", 4, sum_(i64(in_u32(f * x + r)))); - check(arm32 ? "vpaddl.u32" : "uaddlp", 4, sum_(u64(in_u32(f * x + r)))); - - // If we're widening the type by a factor of four - // as well as reducing by a factor of four, we - // expect vpaddl followed by vpadal - check(arm32 ? "vpaddl.s8" : "saddlp", 8, sum_(i32(in_i8(f * x + r)))); - check(arm32 ? "vpaddl.u8" : "uaddlp", 8, sum_(i32(in_u8(f * x + r)))); - check(arm32 ? "vpaddl.u8" : "uaddlp", 8, sum_(u32(in_u8(f * x + r)))); - check(arm32 ? "vpaddl.s16" : "saddlp", 4, sum_(i64(in_i16(f * x + r)))); - check(arm32 ? "vpaddl.u16" : "uaddlp", 4, sum_(i64(in_u16(f * x + r)))); - check(arm32 ? 
"vpaddl.u16" : "uaddlp", 4, sum_(u64(in_u16(f * x + r)))); - - // Note that when going from u8 to i32 like this, - // the vpaddl is unsigned and the vpadal is a - // signed, because the intermediate type is u16 - check(arm32 ? "vpadal.s16" : "sadalp", 8, sum_(i32(in_i8(f * x + r)))); - check(arm32 ? "vpadal.u16" : "uadalp", 8, sum_(i32(in_u8(f * x + r)))); - check(arm32 ? "vpadal.u16" : "uadalp", 8, sum_(u32(in_u8(f * x + r)))); - check(arm32 ? "vpadal.s32" : "sadalp", 4, sum_(i64(in_i16(f * x + r)))); - check(arm32 ? "vpadal.u32" : "uadalp", 4, sum_(i64(in_u16(f * x + r)))); - check(arm32 ? "vpadal.u32" : "uadalp", 4, sum_(u64(in_u16(f * x + r)))); - } - - // VPMAX I, F - Pairwise Maximum - check(arm32 ? "vpmax.s8" : "smaxp", 16, maximum(in_i8(f * x + r))); - check(arm32 ? "vpmax.u8" : "umaxp", 16, maximum(in_u8(f * x + r))); - check(arm32 ? "vpmax.s16" : "smaxp", 8, maximum(in_i16(f * x + r))); - check(arm32 ? "vpmax.u16" : "umaxp", 8, maximum(in_u16(f * x + r))); - check(arm32 ? "vpmax.s32" : "smaxp", 4, maximum(in_i32(f * x + r))); - check(arm32 ? "vpmax.u32" : "umaxp", 4, maximum(in_u32(f * x + r))); - - // VPMIN I, F - Pairwise Minimum - check(arm32 ? "vpmin.s8" : "sminp", 16, minimum(in_i8(f * x + r))); - check(arm32 ? "vpmin.u8" : "uminp", 16, minimum(in_u8(f * x + r))); - check(arm32 ? "vpmin.s16" : "sminp", 8, minimum(in_i16(f * x + r))); - check(arm32 ? "vpmin.u16" : "uminp", 8, minimum(in_u16(f * x + r))); - check(arm32 ? "vpmin.s32" : "sminp", 4, minimum(in_i32(f * x + r))); - check(arm32 ? "vpmin.u32" : "uminp", 4, minimum(in_u32(f * x + r))); + for (int f : {2, 4}) { + RDom r(0, f); + + // A summation reduction that starts at something + // non-trivial, to avoid llvm simplifying accumulating + // widening summations into just widening summations. + auto sum_ = [&](Expr e) { + Func f; + f(x) = cast(e.type(), 123); + f(x) += e; + return f(x); + }; + + // VPADD I, F - Pairwise Add + check(arm32 ? 
"vpadd.i8" : "addp", 16, sum_(in_i8(f * x + r))); + check(arm32 ? "vpadd.i8" : "addp", 16, sum_(in_u8(f * x + r))); + check(arm32 ? "vpadd.i16" : "addp", 8, sum_(in_i16(f * x + r))); + check(arm32 ? "vpadd.i16" : "addp", 8, sum_(in_u16(f * x + r))); + check(arm32 ? "vpadd.i32" : "addp", 4, sum_(in_i32(f * x + r))); + check(arm32 ? "vpadd.i32" : "addp", 4, sum_(in_u32(f * x + r))); + check(arm32 ? "vpadd.f32" : "addp", 4, sum_(in_f32(f * x + r))); + // In 32-bit, we don't have a pairwise op for doubles, + // and expect to just get vadd instructions on d + // registers. + check(arm32 ? "vadd.f64" : "addp", 4, sum_(in_f64(f * x + r))); + + if (f == 2) { + // VPADAL I - Pairwise Add and Accumulate Long + + // If we're reducing by a factor of two, we can + // use the forms with an accumulator + check(arm32 ? "vpadal.s8" : "sadalp", 16, sum_(i16(in_i8(f * x + r)))); + check(arm32 ? "vpadal.u8" : "uadalp", 16, sum_(i16(in_u8(f * x + r)))); + check(arm32 ? "vpadal.u8" : "uadalp", 16, sum_(u16(in_u8(f * x + r)))); + + check(arm32 ? "vpadal.s16" : "sadalp", 8, sum_(i32(in_i16(f * x + r)))); + check(arm32 ? "vpadal.u16" : "uadalp", 8, sum_(i32(in_u16(f * x + r)))); + check(arm32 ? "vpadal.u16" : "uadalp", 8, sum_(u32(in_u16(f * x + r)))); + + check(arm32 ? "vpadal.s32" : "sadalp", 4, sum_(i64(in_i32(f * x + r)))); + check(arm32 ? "vpadal.u32" : "uadalp", 4, sum_(i64(in_u32(f * x + r)))); + check(arm32 ? "vpadal.u32" : "uadalp", 4, sum_(u64(in_u32(f * x + r)))); + } else { + // VPADDL I - Pairwise Add Long + + // If we're reducing by more than that, that's not + // possible. + check(arm32 ? "vpaddl.s8" : "saddlp", 16, sum_(i16(in_i8(f * x + r)))); + check(arm32 ? "vpaddl.u8" : "uaddlp", 16, sum_(i16(in_u8(f * x + r)))); + check(arm32 ? "vpaddl.u8" : "uaddlp", 16, sum_(u16(in_u8(f * x + r)))); + + check(arm32 ? "vpaddl.s16" : "saddlp", 8, sum_(i32(in_i16(f * x + r)))); + check(arm32 ? "vpaddl.u16" : "uaddlp", 8, sum_(i32(in_u16(f * x + r)))); + check(arm32 ? 
"vpaddl.u16" : "uaddlp", 8, sum_(u32(in_u16(f * x + r)))); + + check(arm32 ? "vpaddl.s32" : "saddlp", 4, sum_(i64(in_i32(f * x + r)))); + check(arm32 ? "vpaddl.u32" : "uaddlp", 4, sum_(i64(in_u32(f * x + r)))); + check(arm32 ? "vpaddl.u32" : "uaddlp", 4, sum_(u64(in_u32(f * x + r)))); + + // If we're widening the type by a factor of four + // as well as reducing by a factor of four, we + // expect vpaddl followed by vpadal + check(arm32 ? "vpaddl.s8" : "saddlp", 8, sum_(i32(in_i8(f * x + r)))); + check(arm32 ? "vpaddl.u8" : "uaddlp", 8, sum_(i32(in_u8(f * x + r)))); + check(arm32 ? "vpaddl.u8" : "uaddlp", 8, sum_(u32(in_u8(f * x + r)))); + check(arm32 ? "vpaddl.s16" : "saddlp", 4, sum_(i64(in_i16(f * x + r)))); + check(arm32 ? "vpaddl.u16" : "uaddlp", 4, sum_(i64(in_u16(f * x + r)))); + check(arm32 ? "vpaddl.u16" : "uaddlp", 4, sum_(u64(in_u16(f * x + r)))); + + // Note that when going from u8 to i32 like this, + // the vpaddl is unsigned and the vpadal is a + // signed, because the intermediate type is u16 + check(arm32 ? "vpadal.s16" : "sadalp", 8, sum_(i32(in_i8(f * x + r)))); + check(arm32 ? "vpadal.u16" : "uadalp", 8, sum_(i32(in_u8(f * x + r)))); + check(arm32 ? "vpadal.u16" : "uadalp", 8, sum_(u32(in_u8(f * x + r)))); + check(arm32 ? "vpadal.s32" : "sadalp", 4, sum_(i64(in_i16(f * x + r)))); + check(arm32 ? "vpadal.u32" : "uadalp", 4, sum_(i64(in_u16(f * x + r)))); + check(arm32 ? "vpadal.u32" : "uadalp", 4, sum_(u64(in_u16(f * x + r)))); } - // UDOT/SDOT - if (target.has_feature(Target::ARMDotProd)) { - for (int f : {4, 8}) { - RDom r(0, f); - for (int v : {2, 4}) { - check("udot", v, sum(u32(in_u8(f * x + r)) * in_u8(f * x + r + 32))); - check("sdot", v, sum(i32(in_i8(f * x + r)) * in_i8(f * x + r + 32))); - } + // VPMAX I, F - Pairwise Maximum + check(arm32 ? "vpmax.s8" : "smaxp", 16, maximum(in_i8(f * x + r))); + check(arm32 ? "vpmax.u8" : "umaxp", 16, maximum(in_u8(f * x + r))); + check(arm32 ? 
"vpmax.s16" : "smaxp", 8, maximum(in_i16(f * x + r))); + check(arm32 ? "vpmax.u16" : "umaxp", 8, maximum(in_u16(f * x + r))); + check(arm32 ? "vpmax.s32" : "smaxp", 4, maximum(in_i32(f * x + r))); + check(arm32 ? "vpmax.u32" : "umaxp", 4, maximum(in_u32(f * x + r))); + + // VPMIN I, F - Pairwise Minimum + check(arm32 ? "vpmin.s8" : "sminp", 16, minimum(in_i8(f * x + r))); + check(arm32 ? "vpmin.u8" : "uminp", 16, minimum(in_u8(f * x + r))); + check(arm32 ? "vpmin.s16" : "sminp", 8, minimum(in_i16(f * x + r))); + check(arm32 ? "vpmin.u16" : "uminp", 8, minimum(in_u16(f * x + r))); + check(arm32 ? "vpmin.s32" : "sminp", 4, minimum(in_i32(f * x + r))); + check(arm32 ? "vpmin.u32" : "uminp", 4, minimum(in_u32(f * x + r))); + } + + // UDOT/SDOT + if (target.has_feature(Target::ARMDotProd)) { + for (int f : {4, 8}) { + RDom r(0, f); + for (int v : {2, 4}) { + check("udot", v, sum(u32(in_u8(f * x + r)) * in_u8(f * x + r + 32))); + check("sdot", v, sum(i32(in_i8(f * x + r)) * in_i8(f * x + r + 32))); } } } From e623f94d2dd1b70d2a9eb95fc717902ed03822fe Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 16 Oct 2020 11:00:35 -0700 Subject: [PATCH 2/2] Update CMakeLists.txt --- dependencies/llvm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index 41567e972260..05d231332401 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -17,7 +17,7 @@ message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") if (LLVM_PACKAGE_VERSION VERSION_LESS 10.0) - message(FATAL_ERROR "LLVM version must be 9.0 or newer") + message(FATAL_ERROR "LLVM version must be 10.0 or newer") endif () if (LLVM_PACKAGE_VERSION VERSION_GREATER 12.0)