diff --git a/base/sysimg.jl b/base/sysimg.jl
index d9851dbf30a0d2..f81b7b6c2f6a7d 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -400,6 +400,26 @@ end
 INCLUDE_STATE = 3 # include = include_from_node1
 include("precompile.jl")
 
+# NOTE(review): presumably temporary fixtures for the function-cloning pass
+# (`test_clone_f` has a loop, `test_clone_g` calls it) -- confirm before merging.
+@noinline function test_clone_f(a)
+    s = zero(eltype(a))
+    @inbounds @simd for i in 1:length(a)
+        s += a[i]
+    end
+    return s
+end
+
+@noinline function test_clone_g(a, n)
+    s = zero(eltype(a))
+    for i in 1:n
+        s += test_clone_f(a)
+    end
+    return s
+end
+
+test_clone_g(Float64[], 1)
+
 end # baremodule Base
 
 using Base
diff --git a/src/Makefile b/src/Makefile
index d6b69c00ad1636..c8565f95817968 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -54,7 +54,7 @@ endif
 LLVMLINK :=
 
 ifeq ($(JULIACODEGEN),LLVM)
-SRCS += codegen jitlayers disasm debuginfo llvm-simdloop llvm-ptls llvm-gcroot cgmemmgr
+SRCS += codegen jitlayers disasm debuginfo llvm-simdloop llvm-ptls llvm-gcroot llvm-mv cgmemmgr
 FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir)
 LLVM_LIBS := all
 ifeq ($(USE_POLLY),1)
diff --git a/src/dump.c b/src/dump.c
index 4738fc74bd204c..d985dffdab8b95 100644
--- a/src/dump.c
+++ b/src/dump.c
@@ -221,6 +221,15 @@ JL_DLLEXPORT int jl_running_on_valgrind(void)
     return RUNNING_ON_VALGRIND;
 }
 
+// Pack two 32-bit CPUID register values into one 64-bit word as `hi:lo`.
+// The parameters are unsigned 32-bit on purpose: callers pass `int32_t`
+// registers, and a wider parameter would sign-extend a set bit 31 of `lo`
+// into the `hi` half.
+STATIC_INLINE uint64_t i32_to_i64(uint32_t hi, uint32_t lo)
+{
+    return ((uint64_t)hi << 32) | lo;
+}
+
 static void jl_load_sysimg_so(void)
 {
 #ifndef _OS_WINDOWS_
@@ -242,6 +251,34 @@ static void jl_load_sysimg_so(void)
         *sysimg_gvars[tls_offset_idx - 1] =
             (jl_value_t*)(uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset);
 #endif
+    // If the image exports `jl_dispatch_sysimg_fvars`, pass it this CPU's
+    // CPUID feature bits and install the function pointers it hands back.
+    // NOTE(review): `jl_cpuid`/`jl_cpuidex` are used unconditionally here --
+    // confirm they are available on every target this file builds for.
+    typedef void (*dispatch_t)(uint64_t, uint64_t, uint64_t, size_t*, void***, size_t**);
+    dispatch_t dispatchf = (dispatch_t)jl_dlsym(jl_sysimg_handle,
+                                                "jl_dispatch_sysimg_fvars");
+    if (dispatchf) {
+        int32_t info[4];
+        jl_cpuid(info, 1);
+        int32_t infoex[4];
+        jl_cpuidex(infoex, 7, 0);
+        uint64_t mask = i32_to_i64(info[3], info[2]); // EDX:ECX
+        uint64_t emask1 = i32_to_i64(infoex[1], infoex[2]); // EBX:ECX
+        uint64_t emask2 = i32_to_i64(infoex[3], 0); // EDX:0
+        size_t nfunc = 0;
+        void **fptrs = NULL;
+        size_t *fidxs = NULL;
+        dispatchf(mask, emask1, emask2, &nfunc, &fptrs, &fidxs);
+        if (nfunc && fptrs && fidxs) {
+            // NOTE(review): each `fidxs[i]` is used unchecked as an index into
+            // `sysimg_fvars` -- the table size is not visible here.
+            for (size_t i = 0; i < nfunc; i++) {
+                size_t fi = fidxs[i];
+                sysimg_fvars[fi] = fptrs[i];
+            }
+        }
+    }
     const char *cpu_target = (const char*)jl_dlsym(jl_sysimg_handle, "jl_sysimg_cpu_target");
     if (strcmp(cpu_target,jl_options.cpu_target) != 0)
         jl_error("Julia and the system image were compiled for different architectures.\n"
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index d1981ff476f7b5..3bc6d62b0e395e 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -172,6 +172,7 @@ void addOptimizationPasses(PassManager *PM)
     // Let the InstCombine pass remove the unnecessary load of
     // safepoint address first
     PM->add(createLowerPTLSPass(imaging_mode));
+    PM->add(createJuliaMVPass());
     PM->add(createSROAPass());                 // Break up aggregate allocas
 #ifndef INSTCOMBINE_BUG
     PM->add(createInstructionCombiningPass()); // Cleanup for scalarrepl.
@@ -1088,7 +1089,7 @@ static void jl_gen_llvm_globaldata(llvm::Module *mod, ValueToValueMapTy &VMap,
 
     ArrayType *fvars_type = ArrayType::get(T_pvoidfunc, jl_sysimg_fvars.size());
     addComdat(new GlobalVariable(*mod, fvars_type,
-                                 true,
+                                 false,
                                  GlobalVariable::ExternalLinkage,
                                  MapValue(ConstantArray::get(fvars_type, ArrayRef<Constant*>(jl_sysimg_fvars)), VMap),
                                  "jl_sysimg_fvars"));
diff --git a/src/jitlayers.h b/src/jitlayers.h
index 73817ef527f710..062463a65e1dfd 100644
--- a/src/jitlayers.h
+++ b/src/jitlayers.h
@@ -248,6 +248,7 @@ JL_DLLEXPORT extern LLVMContext &jl_LLVMContext;
 
 Pass *createLowerPTLSPass(bool imaging_mode);
 Pass *createLowerGCFramePass();
+Pass *createJuliaMVPass();
 // Whether the Function is an llvm or julia intrinsic.
 static inline bool isIntrinsicFunction(Function *F)
 {
diff --git a/src/llvm-mv.cpp b/src/llvm-mv.cpp
new file mode 100644
index 00000000000000..f590b55c7c8fb1
--- /dev/null
+++ b/src/llvm-mv.cpp
@@ -0,0 +1,338 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+// Function multi-versioning
+#define DEBUG_TYPE "julia_mv"
+#undef DEBUG
+
+// LLVM pass to clone function for different archs
+
+#include "llvm-version.h"
+#include "support/dtypes.h"
+
+// NOTE(review): the header names inside <...> were missing from the patch text;
+// the ones below are reconstructed from what this file uses -- confirm.
+#include <llvm/Pass.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/MDBuilder.h>
+#if JL_LLVM_VERSION >= 30700
+#include <llvm/IR/LegacyPassManager.h>
+#else
+#include <llvm/PassManager.h>
+#endif
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include "fix_llvm_assert.h"
+
+#include "julia.h"
+#include "julia_internal.h"
+
+#include <unordered_map>
+#include <vector>
+
+using namespace llvm;
+
+extern std::pair<MDNode*,MDNode*> tbaa_make_child(const char *name, MDNode *parent=nullptr, bool isConstant=false);
+extern "C" void jl_dump_llvm_value(void *v);
+
+namespace {
+
+// Module pass that clones selected functions listed in `jl_sysimg_fvars` into
+// `.avx2` variants and emits `jl_dispatch_sysimg_fvars`, which reports the
+// clone table when the supplied CPU feature masks match.
+struct JuliaMV: public ModulePass {
+    static char ID;
+    JuliaMV()
+        : ModulePass(ID)
+    {}
+
+private:
+    bool runOnModule(Module &M) override;
+    void getAnalysisUsage(AnalysisUsage &AU) const override
+    {
+        // The pass adds functions and rewrites call sites, so it must not
+        // claim to preserve any analysis.
+        AU.addRequired<LoopInfoWrapperPass>();
+    }
+    bool shouldClone(Function &F);
+    bool checkUses(Function &F, Constant *fary);
+    bool checkUses(Function &F, Constant *V, Constant *fary, bool &inFVars);
+    bool checkConstantUse(Function &F, Constant *V, Constant *fary, bool &inFVars);
+};
+
+// Returns true when `F` has a body and either contains a loop or calls an
+// `llvm.muladd.*` / `llvm.fma.*` intrinsic.
+bool JuliaMV::shouldClone(Function &F)
+{
+    if (F.empty())
+        return false;
+    auto &LI = getAnalysis<LoopInfoWrapperPass>(F).getLoopInfo();
+    if (!LI.empty())
+        return true;
+    for (auto &bb: F) {
+        for (auto &I: bb) {
+            if (auto call = dyn_cast<CallInst>(&I)) {
+                if (auto callee = call->getCalledFunction()) {
+                    auto name = callee->getName();
+                    if (name.startswith("llvm.muladd.") || name.startswith("llvm.fma.")) {
+                        return true;
+                    }
+                }
+            }
+        }
+    }
+    return false;
+}
+
+// Returns true when every non-instruction user of `F` leads (possibly through
+// bitcast constant expressions) to `fary`, and `fary` is reached at least once.
+bool JuliaMV::checkUses(Function &F, Constant *fary)
+{
+    bool inFVars = false;
+    bool res = checkUses(F, &F, fary, inFVars);
+    return res && inFVars;
+}
+
+// A constant user is acceptable if it is `fary` itself (recorded in `inFVars`)
+// or a bitcast expression whose own users are all acceptable.
+bool JuliaMV::checkConstantUse(Function &F, Constant *V, Constant *fary, bool &inFVars)
+{
+    if (V == fary) {
+        inFVars = true;
+        return true;
+    }
+    if (auto cexpr = dyn_cast<ConstantExpr>(V)) {
+        if (cexpr->getOpcode() == Instruction::BitCast) {
+            return checkUses(F, V, fary, inFVars);
+        }
+    }
+    return false;
+}
+
+// Walks the users of `V`: instructions are always allowed, anything else must
+// be a constant accepted by checkConstantUse().
+bool JuliaMV::checkUses(Function &F, Constant *V, Constant *fary, bool &inFVars)
+{
+    for (auto *user: V->users()) {
+        if (isa<Instruction>(user))
+            continue;
+        auto *C = dyn_cast<Constant>(user);
+        if (!C || !checkConstantUse(F, C, fary, inFVars)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Strips bitcast constant expressions and returns the underlying function, or
+// nullptr if `v` is not one.
+static Function *getFunction(Value *v)
+{
+    if (auto f = dyn_cast<Function>(v))
+        return f;
+    if (auto c = dyn_cast<ConstantExpr>(v)) {
+        if (c->getOpcode() == Instruction::BitCast) {
+            return getFunction(c->getOperand(0));
+        }
+    }
+    return nullptr;
+}
+
+// Adds a "target-features" attribute to `F` made of the hard-coded AVX2
+// feature list followed by whatever feature string `F` already carried.
+static void addFeatures(Function *F)
+{
+    auto attr = F->getFnAttribute("target-features");
+    std::string feature =
+        "+avx2,+avx,+fma,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3";
+    if (attr.isStringAttribute()) {
+        feature += ",";
+        feature += attr.getValueAsString();
+    }
+    F->addFnAttr("target-features", feature);
+}
+
+// Clones every eligible function reachable from `jl_sysimg_fvars`, routes
+// uses from non-cloned functions through the `jl_sysimg_fvars` slot, and
+// emits `jl_dispatch_sysimg_fvars`. Returns whether the module was modified.
+bool JuliaMV::runOnModule(Module &M)
+{
+    GlobalVariable *fvars = M.getGlobalVariable("jl_sysimg_fvars");
+    // This makes sure this only runs during sysimg generation.
+    // Nothing has been modified on these paths, so report "unchanged".
+    if (!fvars || !fvars->hasInitializer())
+        return false;
+    auto *fary = dyn_cast<ConstantArray>(fvars->getInitializer());
+    if (!fary)
+        return false;
+    MDNode *tbaa_const = tbaa_make_child("jtbaa_const", nullptr, true).first;
+    LLVMContext &ctx = M.getContext();
+    ValueToValueMapTy VMap;
+    // Declare an (empty) `.avx2` twin for every function we want to clone.
+    for (auto &F: M) {
+        if (shouldClone(F) && checkUses(F, fary)) {
+            Function *NF = Function::Create(cast<FunctionType>(F.getValueType()),
+                                            F.getLinkage(), F.getName() + ".avx2", &M);
+            NF->copyAttributesFrom(&F);
+            VMap[&F] = NF;
+        }
+    }
+    // Remember the slot each cloned function occupies in `jl_sysimg_fvars`.
+    std::unordered_map<Function*,size_t> idx_map;
+    size_t nf = fary->getNumOperands();
+    for (size_t i = 0; i < nf; i++) {
+        if (Function *ele = getFunction(fary->getOperand(i))) {
+            auto it = VMap.find(ele);
+            if (it != VMap.end()) {
+                idx_map[ele] = i;
+            }
+        }
+    }
+    // Fill in the clone bodies and tag them with the AVX2 feature set.
+    for (auto &I: idx_map) {
+        auto oldF = I.first;
+        auto newF = cast<Function>(VMap[oldF]);
+        Function::arg_iterator DestI = newF->arg_begin();
+        for (Function::const_arg_iterator J = oldF->arg_begin(); J != oldF->arg_end(); ++J) {
+            DestI->setName(J->getName());
+            VMap[&*J] = &*DestI++;
+        }
+        SmallVector<ReturnInst*,8> Returns;
+        CloneFunctionInto(newF, oldF, VMap, false, Returns);
+        addFeatures(newF);
+    }
+    std::vector<Constant*> ptrs;
+    std::vector<Constant*> idxs;
+    auto T_void = Type::getVoidTy(ctx);
+    auto T_pvoidfunc = FunctionType::get(T_void, false)->getPointerTo();
+    auto T_size = (sizeof(size_t) == 8 ? Type::getInt64Ty(ctx) : Type::getInt32Ty(ctx));
+    auto zero = ConstantInt::get(T_size, 0);
+    for (auto &I: idx_map) {
+        auto oldF = I.first;
+        auto idx = I.second;
+        auto newF = cast<Function>(VMap[oldF]);
+        ptrs.push_back(ConstantExpr::getBitCast(newF, T_pvoidfunc));
+        auto offset = ConstantInt::get(T_size, idx);
+        idxs.push_back(offset);
+        // Snapshot the users first: replaceUsesOfWith() below edits oldF's use
+        // list and would invalidate a live users() iteration.
+        std::vector<Instruction*> sites;
+        for (auto user: oldF->users()) {
+            auto inst = dyn_cast<Instruction>(user);
+            if (!inst)
+                continue;
+            auto encloseF = inst->getParent()->getParent();
+            // Functions that have a clone keep their direct reference.
+            if (VMap.find(encloseF) != VMap.end())
+                continue;
+            sites.push_back(inst);
+        }
+        for (auto inst: sites) {
+            // Index with {0, idx}: the leading 0 steps through the pointer to
+            // the array, the second index selects the element. A single index
+            // would address whole arrays past `jl_sysimg_fvars`.
+            Value *gep_idx[] = {zero, offset};
+            auto slot = GetElementPtrInst::Create(fary->getType(), fvars, gep_idx, "", inst);
+            Instruction *ptr = new LoadInst(slot, "", inst);
+            ptr->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
+            ptr = new BitCastInst(ptr, oldF->getType(), "", inst);
+            inst->replaceUsesOfWith(oldF, ptr);
+        }
+    }
+    ArrayType *fvars_type = ArrayType::get(T_pvoidfunc, ptrs.size());
+    auto ptr_gv = new GlobalVariable(M, fvars_type, true, GlobalVariable::InternalLinkage,
+                                     ConstantArray::get(fvars_type, ptrs));
+    ArrayType *idxs_type = ArrayType::get(T_size, idxs.size());
+    auto idx_gv = new GlobalVariable(M, idxs_type, true, GlobalVariable::InternalLinkage,
+                                     ConstantArray::get(idxs_type, idxs));
+
+    // void jl_dispatch_sysimg_fvars(i64 mask, i64 emask1, i64 emask2,
+    //                               size_t *nfunc, fptrs **out, idxs **out)
+    std::vector<Type*> dispatch_args(0);
+    dispatch_args.push_back(Type::getInt64Ty(ctx)); // Feature mask
+    dispatch_args.push_back(Type::getInt64Ty(ctx)); // Extended feature mask1
+    dispatch_args.push_back(Type::getInt64Ty(ctx)); // Extended feature mask2
+    dispatch_args.push_back(T_size->getPointerTo());
+    dispatch_args.push_back(fvars_type->getPointerTo()->getPointerTo());
+    dispatch_args.push_back(idxs_type->getPointerTo()->getPointerTo());
+    Function *dispatchF = Function::Create(FunctionType::get(T_void, dispatch_args, false),
+                                           Function::ExternalLinkage,
+                                           "jl_dispatch_sysimg_fvars", &M);
+    IRBuilder<> builder(ctx);
+    BasicBlock *b0 = BasicBlock::Create(ctx, "top", dispatchF);
+    builder.SetInsertPoint(b0);
+    DebugLoc noDbg;
+    builder.SetCurrentDebugLocation(noDbg);
+
+    std::vector<Value*> args;
+    for (auto &arg: dispatchF->args())
+        args.push_back(&arg);
+
+    auto sz_arg = args[3];
+    auto fvars_arg = args[4];
+    auto idxs_arg = args[5];
+
+    // Hard code for now
+    auto bit = [] (unsigned n) { return uint64_t(1) << n; };
+    // EDX:ECX of CPUID leaf 1 (ECX: SSE3, SSSE3, FMA, SSE4.1, SSE4.2, POPCNT, AVX)
+    uint64_t mask = bit(0) | bit(9) | bit(12) | bit(19) | bit(20) | bit(23) | bit(28);
+    // EBX:ECX of CPUID leaf 7 (EBX bit 5: AVX2)
+    uint64_t emask1 = bit(5 + 32);
+    // EDX:0
+    uint64_t emask2 = 0;
+
+    builder.CreateStore(ConstantInt::get(T_size, ptrs.size()), sz_arg);
+
+    // Emits `(v & bits) == bits`, i.e. "all required feature bits are set".
+    auto createMaskCmp = [&] (Value *v, uint64_t bits) {
+        auto maskv = ConstantInt::get(v->getType(), bits);
+        return builder.CreateICmpEQ(builder.CreateAnd(v, maskv), maskv);
+    };
+
+    auto match_mask = createMaskCmp(args[0], mask);
+    auto match_emask1 = createMaskCmp(args[1], emask1);
+    auto match_emask2 = createMaskCmp(args[2], emask2);
+
+    auto match = builder.CreateAnd(match_mask, match_emask1);
+    match = builder.CreateAnd(match, match_emask2);
+
+    BasicBlock *match_bb = BasicBlock::Create(ctx, "match");
+    BasicBlock *fail_bb = BasicBlock::Create(ctx, "fail");
+    builder.CreateCondBr(match, match_bb, fail_bb);
+
+    // All required bits present: hand back the clone table and its slot indices.
+    dispatchF->getBasicBlockList().push_back(match_bb);
+    builder.SetInsertPoint(match_bb);
+    builder.CreateStore(ptr_gv, fvars_arg);
+    builder.CreateStore(idx_gv, idxs_arg);
+    builder.CreateRetVoid();
+
+    // Otherwise: hand back null tables.
+    dispatchF->getBasicBlockList().push_back(fail_bb);
+    builder.SetInsertPoint(fail_bb);
+    builder.CreateStore(ConstantPointerNull::get(fvars_type->getPointerTo()), fvars_arg);
+    builder.CreateStore(ConstantPointerNull::get(idxs_type->getPointerTo()), idxs_arg);
+    builder.CreateRetVoid();
+
+    // jl_dump_llvm_value(dispatchF);
+    // jl_dump_llvm_value(ptr_gv);
+    // jl_dump_llvm_value(idx_gv);
+
+    return true;
+}
+
+char JuliaMV::ID = 0;
+static RegisterPass<JuliaMV> X("JuliaMV", "JuliaMV Pass",
+                               false /* Only looks at CFG */,
+                               false /* Analysis Pass */);
+
+}
+
+Pass *createJuliaMVPass()
+{
+    return new JuliaMV();
+}