diff --git a/releases/4.0.0/patches_external/01-Adds-a-special-field-into-value-class.patch b/releases/4.0.0/patches_external/01-Adds-a-special-field-into-value-class.patch deleted file mode 100644 index 3bafb4b..0000000 --- a/releases/4.0.0/patches_external/01-Adds-a-special-field-into-value-class.patch +++ /dev/null @@ -1,59 +0,0 @@ -# Description : Adds a special field into llvm::Value class, -# so it could be used for 'table' based mapping of Value attributes -# When Fixed in Open Source : Need Always -# P4V CL : 557956(3.8.0), 642680(4.0.0) -# Category : Enhanchement -# Author : ktrifuno -# Open Source Classification : Internal Only -# Impact : Compile Time -# Failing Workload/Commandline : -# Notes : Change to improve compilation time, not needed for functionality -# or runtime performance. Might try to convince LLVM community to -# add this field, but it would rather be rejected, since it has no -# use besides IGC. LLVM guys are using the LLVM Table Gen -# intermediate representation for carrying on the codegen tasks, -# thus no need for ID. 
- -diff -Naur --strip-trailing-cr a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h ---- a/include/llvm/IR/Value.h 2017-10-23 09:53:04.880214746 -0400 -+++ b/include/llvm/IR/Value.h 2017-10-23 12:47:32.456804658 -0400 -@@ -116,6 +116,17 @@ - unsigned HasHungOffUses : 1; - unsigned HasDescriptor : 1; - -+ /// INTEL specific extension -+ unsigned int ID : 32 ; -+ unsigned unused_pack : 32; //added to handle packing -+ -+public: -+ /// INTEL specific extension -+ unsigned int GetID() const { return ID; } -+ -+ /// INTEL specific extension -+ void SetID(unsigned int _ID) { ID = _ID; } -+ /// End of Intel specific extension - private: - template // UseT == 'Use' or 'const Use' - class use_iterator_impl -diff -Naur --strip-trailing-cr a/lib/IR/Value.cpp b/lib/IR/Value.cpp ---- a/lib/IR/Value.cpp 2017-10-23 09:53:05.011214741 -0400 -+++ b/lib/IR/Value.cpp 2017-10-23 12:47:32.457804658 -0400 -@@ -48,7 +48,8 @@ - Value::Value(Type *ty, unsigned scid) - : VTy(checkType(ty)), UseList(nullptr), SubclassID(scid), - HasValueHandle(0), SubclassOptionalData(0), SubclassData(0), -- NumUserOperands(0), IsUsedByMD(false), HasName(false) { -+ NumUserOperands(0), IsUsedByMD(false), HasName(false), -+ ID(std::numeric_limits::max()){ - // FIXME: Why isn't this in the subclass gunk?? - // Note, we cannot call isa before the CallInst has been - // constructed. 
-@@ -59,7 +60,7 @@ - (SubclassID < ConstantFirstVal || SubclassID > ConstantLastVal)) - assert((VTy->isFirstClassType() || VTy->isVoidTy()) && - "Cannot create non-first-class values except for constants!"); -- static_assert(sizeof(Value) == 3 * sizeof(void *) + 2 * sizeof(unsigned), -+ static_assert(sizeof(Value) == 3 * sizeof(void *) + 4 * sizeof(unsigned), - "Value too big"); - } - diff --git a/releases/4.0.0/patches_external/04-stripPointerCastsAndOffsets-should-not.patch b/releases/4.0.0/patches_external/04-stripPointerCastsAndOffsets-should-not.patch deleted file mode 100644 index a07ed6f..0000000 --- a/releases/4.0.0/patches_external/04-stripPointerCastsAndOffsets-should-not.patch +++ /dev/null @@ -1,27 +0,0 @@ -# Description : stripPointerCastsAndOffsets should not process across AddrSpaceCast -# When Fixed in Open Source : Need Always for OCL -# P4V CL : 552346(3.8.0), 642680(4.0.0) -# Category : Bug -# Author : jliu29 -# Open Source Classification : Internal Only -# Impact : -# Failing Workload/Commandline : -# Notes : I cannot upstream this one, since it exposes LLVM's other GAS issues, -# including several unit tests. This patch is not a must-have, and IGC -# can function even w/o this patch. - -diff -Naur --strip-trailing-cr a/lib/IR/Value.cpp b/lib/IR/Value.cpp ---- a/lib/IR/Value.cpp 2017-10-24 10:42:55.731712686 -0400 -+++ b/lib/IR/Value.cpp 2017-10-24 10:43:09.940712129 -0400 -@@ -467,6 +467,11 @@ - V = GEP->getPointerOperand(); - } else if (Operator::getOpcode(V) == Instruction::BitCast || - Operator::getOpcode(V) == Instruction::AddrSpaceCast) { -+ // Do not process across AddrSpaceCast, since its result could -+ // be a nullptr depending on the meaning of src and dest addrspace. 
-+ if (Operator::getOpcode(V) == Instruction::AddrSpaceCast) { -+ return V; -+ } - V = cast<Operator>(V)->getOperand(0); - } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { - if (StripKind == PSK_ZeroIndices || GA->isInterposable()) diff --git a/releases/4.0.0/patches_external/1-stripPointerCastsAndOffsets-should-not.patch b/releases/4.0.0/patches_external/1-stripPointerCastsAndOffsets-should-not.patch new file mode 100644 index 0000000..7eb993e --- /dev/null +++ b/releases/4.0.0/patches_external/1-stripPointerCastsAndOffsets-should-not.patch @@ -0,0 +1,17 @@ +# Description : stripPointerCastsAndOffsets should not process across AddrSpaceCast + +diff -Naur --strip-trailing-cr a/lib/IR/Value.cpp b/lib/IR/Value.cpp +--- a/lib/IR/Value.cpp 2016-12-07 13:47:32.000000000 -0800 ++++ b/lib/IR/Value.cpp 2018-03-12 11:25:42.752625100 -0700 +@@ -466,6 +466,11 @@ + V = GEP->getPointerOperand(); + } else if (Operator::getOpcode(V) == Instruction::BitCast || + Operator::getOpcode(V) == Instruction::AddrSpaceCast) { ++ // Do not process across AddrSpaceCast, since its result could ++ // be a nullptr depending on the meaning of src and dest addrspace. ++ if (Operator::getOpcode(V) == Instruction::AddrSpaceCast) { ++ return V; ++ } + V = cast<Operator>(V)->getOperand(0); + } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { + if (StripKind == PSK_ZeroIndices || GA->isInterposable()) diff --git a/releases/4.0.0/patches_external/13-managed-static-mem-leak-fix.patch b/releases/4.0.0/patches_external/13-managed-static-mem-leak-fix.patch deleted file mode 100644 index 302288e..0000000 --- a/releases/4.0.0/patches_external/13-managed-static-mem-leak-fix.patch +++ /dev/null @@ -1,28 +0,0 @@ -# Description : Memory leak fix for Managed Static Mutex -# When Fixed in Open Source : Needed always -# P4V CL : 706045(4.0.0) -# Category : Bugfix -# Author : juanrod2 -# Open Source Classification : Internal Only -# Impact : -# Failing Workload/Commandline : -# Notes : cleaning a mutex inside ManagedStatic llvm class. 
- -diff -Naur --strip-trailing-cr a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp ---- a/lib/Support/ManagedStatic.cpp 2016-06-29 08:04:07.000000000 -0700 -+++ b/lib/Support/ManagedStatic.cpp 2017-10-25 08:35:21.030615300 -0700 -@@ -81,8 +81,13 @@ - - /// llvm_shutdown - Deallocate and destroy all ManagedStatic variables. - void llvm::llvm_shutdown() { -- MutexGuard Lock(*getManagedStaticMutex()); -+ getManagedStaticMutex()->lock(); - - while (StaticList) - StaticList->destroy(); -+ -+ getManagedStaticMutex()->unlock(); -+ -+ delete ManagedStaticMutex; -+ ManagedStaticMutex = nullptr; - } diff --git a/releases/4.0.0/patches_external/14-unwind-fix.patch b/releases/4.0.0/patches_external/14-unwind-fix.patch deleted file mode 100644 index b3d0ad5..0000000 --- a/releases/4.0.0/patches_external/14-unwind-fix.patch +++ /dev/null @@ -1,169 +0,0 @@ -diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h -index 5638717..77bac7e 100644 ---- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h -+++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h -@@ -64,17 +64,17 @@ public: - ~RTDyldMemoryManager() override; - - /// Register EH frames in the current process. -- static void registerEHFramesInProcess(uint8_t *Addr, size_t Size); -+ static void registerEHFramesInProcess(uint8_t *Addr, uint64_t LoadAddr, size_t Size); - - /// Deregister EH frames in the current proces. 
-- static void deregisterEHFramesInProcess(uint8_t *Addr, size_t Size); -+ static void deregisterEHFramesInProcess(uint8_t *Addr, uint64_t LoadAddr, size_t Size); - - void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override { -- registerEHFramesInProcess(Addr, Size); -+ registerEHFramesInProcess(Addr, LoadAddr, Size); - } - - void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override { -- deregisterEHFramesInProcess(Addr, Size); -+ deregisterEHFramesInProcess(Addr, LoadAddr, Size); - } - - /// This method returns the address of the specified function or variable in -diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp -index de73fbd..aa222b2 100644 ---- a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp -+++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp -@@ -28,6 +28,10 @@ - #include - #endif - -+#ifdef _WIN32 -+#include -+#endif -+ - namespace llvm { - - RTDyldMemoryManager::~RTDyldMemoryManager() {} -@@ -94,7 +98,7 @@ static const char *processFDE(const char *Entry, bool isDeregister) { - // This implementation handles frame registration for local targets. - // Memory managers for remote targets should re-implement this function - // and use the LoadAddr parameter. --void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr, -+void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) { - // On OS X OS X __register_frame takes a single FDE as an argument. 
- // See http://lists.llvm.org/pipermail/llvm-dev/2013-April/061737.html -@@ -106,7 +110,7 @@ void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr, - } while(P != End); - } - --void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, -+void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) { - const char *P = (const char *)Addr; - const char *End = P + Size; -@@ -115,9 +119,22 @@ void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, - } while(P != End); - } - -+#elif defined(_WIN64) -+typedef BOOLEAN(*PFN_RTL_ADD_FUNCTION_TABLE)(PRUNTIME_FUNCTION, DWORD, DWORD64); -+ -+void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr, uint64_t LoadAddr, -+ size_t Size) { -+ PFN_RTL_ADD_FUNCTION_TABLE pfnRtlAddFunctionTable = (PFN_RTL_ADD_FUNCTION_TABLE)GetProcAddress(GetModuleHandle(L"kernel32.dll"), "RtlAddFunctionTable"); -+ if (pfnRtlAddFunctionTable) pfnRtlAddFunctionTable((PRUNTIME_FUNCTION)Addr, Size/sizeof(RUNTIME_FUNCTION), LoadAddr); -+} -+ -+void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, uint64_t LoadAddr, -+ size_t Size) { -+} -+ - #else - --void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr, -+void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) { - // On Linux __register_frame takes a single argument: - // a pointer to the start of the .eh_frame section. 
-@@ -127,7 +144,7 @@ void RTDyldMemoryManager::registerEHFramesInProcess(uint8_t *Addr, - __register_frame(Addr); - } - --void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, -+void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) { - __deregister_frame(Addr); - } -diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h -index 109beb3..c7e176e 100644 ---- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h -+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h -@@ -40,6 +40,16 @@ public: - return 6; // 2-byte jmp instruction + 32-bit relative address - } - -+ // Walks all the sections and returns the section with the lowest address -+ uint64_t getBaseAddress() -+ { -+ uint64_t ImageBase = -1; -+ for (int i = 0, e = Sections.size(); i != e; ++i) { -+ ImageBase = std::min(ImageBase, Sections[i].getLoadAddress()); -+ } -+ return ImageBase; -+ } -+ - // The target location for the relocation is described by RE.SectionID and - // RE.Offset. RE.SectionID can be used to find the SectionEntry. Each - // SectionEntry has three members describing its location. -@@ -85,14 +95,24 @@ public: - } - - case COFF::IMAGE_REL_AMD64_ADDR32NB: { -- // Note ADDR32NB requires a well-established notion of -- // image base. This address must be less than or equal -- // to every section's load address, and all sections must be -- // within a 32 bit offset from the base. -- // -- // For now we just set these to zero. -- writeBytesUnaligned(0, Target, 4); -- break; -+ // Note ADDR32NB requires a well-established notion of -+ // image base. This address must be less than or equal -+ // to every section's load address, and all sections must be -+ // within a 32 bit offset from the base. 
-+ -+ //Finding image load base -+ uint64_t ImageBase = getBaseAddress(); -+ -+ //Trying to calculate RVA -+ //TODO: see in the comment above -+ uint64_t Result = 0; -+ if (ImageBase && Value >= ImageBase) { -+ Result = RE.Addend + Value - ImageBase; -+ } -+ -+ assert(ImageBase && Value >= ImageBase && "Unable to perform ADDR32NB relocation"); -+ writeBytesUnaligned(Result, Target, 4); -+ break; - } - - case COFF::IMAGE_REL_AMD64_ADDR64: { -@@ -187,9 +207,11 @@ public: - void registerEHFrames() override { - for (auto const &EHFrameSID : UnregisteredEHFrameSections) { - uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); -- uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); -+ //uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); -+ // Find base address -+ uint64_t ImageBase = getBaseAddress(); - size_t EHFrameSize = Sections[EHFrameSID].getSize(); -- MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); -+ MemMgr.registerEHFrames(EHFrameAddr, ImageBase, EHFrameSize); - RegisteredEHFrameSections.push_back(EHFrameSID); - } - UnregisteredEHFrameSections.clear(); -@@ -207,7 +229,7 @@ public: - return errorCodeToError(EC); - // Note unwind info is split across .pdata and .xdata, so this - // may not be sufficiently general for all users. 
-- if (Name == ".xdata") { -+ if (Name == ".pdata") { - UnregisteredEHFrameSections.push_back(SectionPair.second); - } - } diff --git a/releases/4.0.0/patches_external/2_1-enable-aggressive-combining.patch b/releases/4.0.0/patches_external/2_1-enable-aggressive-combining.patch new file mode 100644 index 0000000..c653634 --- /dev/null +++ b/releases/4.0.0/patches_external/2_1-enable-aggressive-combining.patch @@ -0,0 +1,19 @@ +# Description : Enable aggressive (gep (gep base, idx0), idx1) combining + +diff -Naur --strip-trailing-cr a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp +--- a/lib/Transforms/InstCombine/InstructionCombining.cpp 2017-10-24 08:51:48.146973901 -0400 ++++ b/lib/Transforms/InstCombine/InstructionCombining.cpp 2017-10-24 08:52:36.064972024 -0400 +@@ -1573,11 +1573,13 @@ + // normalized. + if (SO1->getType() != GO1->getType()) + return nullptr; ++#if 0 + // Only do the combine when GO1 and SO1 are both constants. Only in + // this case, we are sure the cost after the merge is never more than + // that before the merge. 
+ if (!isa<Constant>(GO1) || !isa<Constant>(SO1)) + return nullptr; ++#endif + Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum"); + } + diff --git a/releases/4.0.0/patches_external/03-completely-turn-off-code-sinking-in-InstructionCombining.patch b/releases/4.0.0/patches_external/2_2-completely-turn-off-code-sinking-in-InstructionCombining.patch similarity index 74% rename from releases/4.0.0/patches_external/03-completely-turn-off-code-sinking-in-InstructionCombining.patch rename to releases/4.0.0/patches_external/2_2-completely-turn-off-code-sinking-in-InstructionCombining.patch index 2bc300a..415066d 100644 --- a/releases/4.0.0/patches_external/03-completely-turn-off-code-sinking-in-InstructionCombining.patch +++ b/releases/4.0.0/patches_external/2_2-completely-turn-off-code-sinking-in-InstructionCombining.patch @@ -2,12 +2,6 @@ # This works around the issue with non-uniform constant-buffer # index with an OGL test. The non-uniform buffer indexing is # created due to code sinking. -# Type : Backport from LLVM 3.8 -# P4V CL : 552325(3.8.0), 642679(4.0.0) -# Category : Custom -# Author : gangche1 -# Open Source Classification : Internal only -# Status : Performance related, need not be ported to opensoure repository. 
diff -Naur --strip-trailing-cr a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp --- a/lib/Transforms/InstCombine/InstructionCombining.cpp 2017-10-23 09:48:44.308224955 -0400 diff --git a/releases/4.0.0/patches_external/08-SimplifyCFG-SinkThenElseCodeToEnd-does-not-sink-code.patch b/releases/4.0.0/patches_external/3_1-SimplifyCFG-SinkThenElseCodeToEnd-does-not-sink-code.patch similarity index 89% rename from releases/4.0.0/patches_external/08-SimplifyCFG-SinkThenElseCodeToEnd-does-not-sink-code.patch rename to releases/4.0.0/patches_external/3_1-SimplifyCFG-SinkThenElseCodeToEnd-does-not-sink-code.patch index 3e0ef63..3503166 100644 --- a/releases/4.0.0/patches_external/08-SimplifyCFG-SinkThenElseCodeToEnd-does-not-sink-code.patch +++ b/releases/4.0.0/patches_external/3_1-SimplifyCFG-SinkThenElseCodeToEnd-does-not-sink-code.patch @@ -1,17 +1,5 @@ # Description : SimplifyCFG::SinkThenElseCodeToEnd does not sink code similiar # to LLVM 3.8 -# When Fixed in Open Source : Needed until we have resolved gen intrinsic handling with -# LLVM 4.0 and resolved code sinking. -# P4V CL : 688188(4.0.0) -# Category : Optimization -# Author : pmistry -# Open Source Classification : Internal Only -# Impact : Performance/Functional -# Failing Workload/Commandline : Ashes of Singularity Escalation.dx12-g1 -# Notes : SimplifyCFG::SinkThenElseCodeToEnd does not sink code similiar to -# LLVM 3.8 in certain cases. In case of GenIntrinsics it sinks and the -# same is not handled properly by IGC passes later on. 
Benchmark that -# exposed the issue was Ashes of Singularity Escalation.dx12-g1 diff -Naur --strip-trailing-cr a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp --- a/lib/Transforms/Utils/SimplifyCFG.cpp 2017-10-23 09:53:04.934214744 -0400 diff --git a/releases/4.0.0/patches_external/10-Added-checks-in-SimplifyCFG-GVN-and-Earl.patch b/releases/4.0.0/patches_external/3_2-Added-checks-in-SimplifyCFG-GVN-and-Earl.patch similarity index 81% rename from releases/4.0.0/patches_external/10-Added-checks-in-SimplifyCFG-GVN-and-Earl.patch rename to releases/4.0.0/patches_external/3_2-Added-checks-in-SimplifyCFG-GVN-and-Earl.patch index 7439db4..db6171a 100644 --- a/releases/4.0.0/patches_external/10-Added-checks-in-SimplifyCFG-GVN-and-Earl.patch +++ b/releases/4.0.0/patches_external/3_2-Added-checks-in-SimplifyCFG-GVN-and-Earl.patch @@ -1,17 +1,6 @@ # Description : Added checks in SimplifyCFG, GVN, and EarlyCSE to prevent hoisting # or sinking convergent functions out of or into control flow (all # wave intrinsics are marked as convergent). -# When Fixed in Open Source : Needed till a cleaner solution is worked out. -# P4V CL : 696122(4.0.0) -# Category : Custom -# Author : djwoo -# Open Source Classification : Common -# Impact : Functional -# Failing Workload/Commandline : TE.exe D3DConf_12_Core.dll /enablewttlogging /appendwttlogging /name:DXILConfTest::WaveIntrinsics* -# Notes : Need further guidance from the community on whether the convergent -# function attribute should cover this case or whether there needs to -# be a new mechanism to handle it. The current suggestion of adding -# inline asm at every relevant call site is hackish. 
diff -Naur --strip-trailing-cr a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp --- a/lib/Transforms/Scalar/EarlyCSE.cpp 2017-10-23 14:47:17.025523189 -0400 diff --git a/releases/4.0.0/patches_external/12-universal-driver.patch b/releases/4.0.0/patches_external/4_1-universal-driver.patch similarity index 61% rename from releases/4.0.0/patches_external/12-universal-driver.patch rename to releases/4.0.0/patches_external/4_1-universal-driver.patch index 5de339f..f2771d6 100644 --- a/releases/4.0.0/patches_external/12-universal-driver.patch +++ b/releases/4.0.0/patches_external/4_1-universal-driver.patch @@ -1,14 +1,4 @@ # Description : Universal Driver Enabling -# When Fixed in Open Source : Needed always -# P4V CL : 706504(4.0.0) -# Category : Build -# Author : juanrod2 -# Open Source Classification : Internal Only -# Impact : -# Failing Workload/Commandline : -# Notes : Process::GetArgumentVector for Windows has been -# commented out since it is calling CommandLineToArgvW -# which is not Universal API compliant. 
diff -Naur --strip-trailing-cr a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc --- a/lib/Support/Windows/Process.inc 2017-10-27 09:51:01.756680018 -0400 diff --git a/releases/4.0.0/patches_external/02-Add-Reassoc-Contract-ApproxFunc-to-FMF.patch b/releases/4.0.0/patches_external/Add-Reassoc-Contract-ApproxFunc-to-FMF.patch similarity index 99% rename from releases/4.0.0/patches_external/02-Add-Reassoc-Contract-ApproxFunc-to-FMF.patch rename to releases/4.0.0/patches_external/Add-Reassoc-Contract-ApproxFunc-to-FMF.patch index ded1655..410fc3f 100644 --- a/releases/4.0.0/patches_external/02-Add-Reassoc-Contract-ApproxFunc-to-FMF.patch +++ b/releases/4.0.0/patches_external/Add-Reassoc-Contract-ApproxFunc-to-FMF.patch @@ -1,7 +1,4 @@ # Description : Adds AllowReassoc, AllowContract and ApproxFunc to FastMathFlags -# When Fixed in LLVM Core : 22fddd859bee89a7950cb3dfbeb72cd6072656ee (03/29/2017), -# : 00e900afdbd5dc97330de6bc0b8b09db1dcac9f7 (11/06/2017) -# Category : Enhanchement diff --git a/docs/LangRef.rst b/docs/LangRef.rst --- a/docs/LangRef.rst diff --git a/releases/4.0.0/patches_external/06-Be-conservative-when-splitting-loop.patch b/releases/4.0.0/patches_external/Be-conservative-when-splitting-loop.patch similarity index 93% rename from releases/4.0.0/patches_external/06-Be-conservative-when-splitting-loop.patch rename to releases/4.0.0/patches_external/Be-conservative-when-splitting-loop.patch index 240614c..996eff2 100644 --- a/releases/4.0.0/patches_external/06-Be-conservative-when-splitting-loop.patch +++ b/releases/4.0.0/patches_external/Be-conservative-when-splitting-loop.patch @@ -1,12 +1,4 @@ # Description : Be conservative when splitting loop -# When Fixed in Open Source : Need Always -# P4V CL : 603927(3.8.0), 642695,653096,699503(4.0.0) -# Category : -# Author : hliao -# Open Source Classification : Internal Only -# Impact : -# Failing Workload/Commandline : -# Notes : diff -Naur --strip-trailing-cr 
a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp --- a/lib/Transforms/Utils/LoopSimplify.cpp 2017-10-23 09:53:04.933214744 -0400 diff --git a/releases/4.0.0/patches_external/09-Enabling-test-Offset32-Regression-Fix.patch b/releases/4.0.0/patches_external/Enabling-test-Offset32-Regression-Fix.patch similarity index 74% rename from releases/4.0.0/patches_external/09-Enabling-test-Offset32-Regression-Fix.patch rename to releases/4.0.0/patches_external/Enabling-test-Offset32-Regression-Fix.patch index d5f22d4..f6303f7 100644 --- a/releases/4.0.0/patches_external/09-Enabling-test-Offset32-Regression-Fix.patch +++ b/releases/4.0.0/patches_external/Enabling-test-Offset32-Regression-Fix.patch @@ -1,15 +1,4 @@ # Description : Limit recursion depth of constant evolving. -# When Fixed in Open Source : LLVM 5.0.0 - rL291927 -# P4V CL : 700172(4.0.0) -# Category : Bug -# Author : juanrod2 -# Open Source Classification : Common -# Impact : Functional -# Failing Workload/Commandline : scenario: gta-x, operation: -run:Offset32 -# Notes : Limit recursion depth of constant evolving For a loop body with -# VERY complicated exit condition evaluation, constant evolving -# may run out of stack on platforms such as Windows. Need to limit -# the recursion depth. 
diff -Naur --strip-trailing-cr a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp --- a/lib/Analysis/ScalarEvolution.cpp 2017-10-23 09:53:04.925214744 -0400 diff --git a/releases/4.0.0/patches_external/11-Fix-crash-due-to-bad-bitcast.patch b/releases/4.0.0/patches_external/Fix-crash-due-to-bad-bitcast.patch similarity index 66% rename from releases/4.0.0/patches_external/11-Fix-crash-due-to-bad-bitcast.patch rename to releases/4.0.0/patches_external/Fix-crash-due-to-bad-bitcast.patch index c8cc82d..1250ff3 100644 --- a/releases/4.0.0/patches_external/11-Fix-crash-due-to-bad-bitcast.patch +++ b/releases/4.0.0/patches_external/Fix-crash-due-to-bad-bitcast.patch @@ -1,17 +1,4 @@ # Description : [SROA] Fix crash due to bad bitcast -# When Fixed in Open Source : LLVM 5.0.0 - rL304585 -# P4V CL : 706504(4.0.0) -# Category : Bug -# Author : gkluczek -# Open Source Classification : Common -# Impact : Functional -# Failing Workload/Commandline : Sandra stress tests when PageFaults are ENABLED -# Notes : As shown in the test case, SROA was crashing when trying to split -# stores (to the alloca) of loads (from anywhere), because it assumed -# the pointer operand to the loads and stores had to have the same -# address space. This isn't the case. Make sure to use the correct -# pointer type for both the load and the store. 
- diff -Naur --strip-trailing-cr a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp --- a/lib/Transforms/Scalar/SROA.cpp 2017-10-23 15:06:46.139477387 -0400 diff --git a/releases/4.0.0/patches_external/07-Temporarily-disable-the-combination-on-b.patch b/releases/4.0.0/patches_external/Temporarily-disable-the-combination-on-b.patch similarity index 71% rename from releases/4.0.0/patches_external/07-Temporarily-disable-the-combination-on-b.patch rename to releases/4.0.0/patches_external/Temporarily-disable-the-combination-on-b.patch index be60b79..7813fa1 100644 --- a/releases/4.0.0/patches_external/07-Temporarily-disable-the-combination-on-b.patch +++ b/releases/4.0.0/patches_external/Temporarily-disable-the-combination-on-b.patch @@ -1,10 +1,4 @@ # Description : Temporarily disable the combination on bitcasts through PHI -# When Fixed in Open Source : Needed till a cleaner solution is worked out. -# Category : Optimization -# Impact : Performance -# Notes : Under certain code pattern, that opt creates lots of redundant -# PHI nodes which is hard to eliminated or coalesced and hence -# increase register pressure significantly. 
diff -Naur --strip-trailing-cr a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp 2016-12-14 03:57:17.000000000 -0800 diff --git a/releases/4.0.0/patches_external/05-noduplicate-is-mainly-used-to.patch b/releases/4.0.0/patches_external/noduplicate-is-mainly-used-to.patch similarity index 79% rename from releases/4.0.0/patches_external/05-noduplicate-is-mainly-used-to.patch rename to releases/4.0.0/patches_external/noduplicate-is-mainly-used-to.patch index 9e5fd8c..40e46fc 100644 --- a/releases/4.0.0/patches_external/05-noduplicate-is-mainly-used-to.patch +++ b/releases/4.0.0/patches_external/noduplicate-is-mainly-used-to.patch @@ -1,14 +1,6 @@ # Description : `noduplicate` is mainly used to prevent code motion of barrier() # implementation. However, loop unrolling is safe to igore that # as it won't change dominance significant even that call is duplicated. -# When Fixed in Open Source : Need Always -# P4V CL : 603915 (3.8.0), 642693(4.0.0) -# Category : Optimization -# Author : hliao/spillow -# Open Source Classification : Internal Only -# Impact : Performance -# Failing Workload/Commandline : -# Notes : Some loops won't unroll when they should without this change. diff -Naur --strip-trailing-cr a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp 2017-10-23 09:53:04.937214744 -0400 diff --git a/releases/4.0.0/patches_external/non-recursive-sink-hoist-region.patch b/releases/4.0.0/patches_external/non-recursive-sink-hoist-region.patch new file mode 100644 index 0000000..c609705 --- /dev/null +++ b/releases/4.0.0/patches_external/non-recursive-sink-hoist-region.patch @@ -0,0 +1,239 @@ +# Description : Large CFGs can cause a stack overflow due to recursive step +# for each basic block in a region. 
Instead create a worklist and iterate +# to limit the stack usage + +diff -Naur --strip-trailing-cr a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp +--- a/lib/Transforms/Scalar/LICM.cpp 2017-02-21 11:01:56.000000000 -0800 ++++ b/lib/Transforms/Scalar/LICM.cpp 2018-03-14 12:43:23.452319700 -0700 +@@ -323,6 +323,30 @@ + return Changed; + } + ++// Does a BFS from a given node to all of its children inside a given loop. ++// The returned vector of nodes includes the starting point. ++static SmallVector<DomTreeNode *, 16> ++collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) { ++ SmallVector<DomTreeNode *, 16> Worklist; ++ auto add_region_to_worklist = [&](DomTreeNode *DTN) { ++ // Only include subregions in the top level loop. ++ BasicBlock *BB = DTN->getBlock(); ++ if (CurLoop->contains(BB)) ++ Worklist.push_back(DTN); ++ }; ++ ++ add_region_to_worklist(N); ++ ++ for (size_t I = 0; I < Worklist.size(); I++) { ++ DomTreeNode *DTN = Worklist[I]; ++ for (DomTreeNode *Child : DTN->getChildren()) ++ add_region_to_worklist(Child); ++ } ++ ++ return Worklist; ++} ++ ++ + /// Walk the specified region of the CFG (defined by all blocks dominated by + /// the specified block, and that are in the current loop) in reverse depth + /// first order w.r.t the DominatorTree. This allows us to visit uses before +@@ -338,51 +362,53 @@ + CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && + "Unexpected input to sinkRegion"); + +- BasicBlock *BB = N->getBlock(); +- // If this subregion is not in the top level loop at all, exit. +- if (!CurLoop->contains(BB)) +- return false; ++ // We want to visit children before parents. We will enqueue all the parents ++ // before their children in the worklist and process the worklist in reverse ++ // order. ++ SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop); ++ + +- // We are processing blocks in reverse dfo, so process children first. 
+ bool Changed = false; +- const std::vector<DomTreeNode *> &Children = N->getChildren(); +- for (DomTreeNode *Child : Children) +- Changed |= +- sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE); +- +- // Only need to process the contents of this block if it is not part of a +- // subloop (which would already have been processed). +- if (inSubLoop(BB, CurLoop, LI)) +- return Changed; +- +- for (BasicBlock::iterator II = BB->end(); II != BB->begin();) { +- Instruction &I = *--II; +- +- // If the instruction is dead, we would try to sink it because it isn't used +- // in the loop, instead, just delete it. +- if (isInstructionTriviallyDead(&I, TLI)) { +- DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); +- ++II; +- CurAST->deleteValue(&I); +- I.eraseFromParent(); +- Changed = true; ++ for (DomTreeNode *DTN : reverse(Worklist)) { ++ BasicBlock *BB = DTN->getBlock(); ++ ++ ++ ++ // Only need to process the contents of this block if it is not part of a ++ // subloop (which would already have been processed). ++ if (inSubLoop(BB, CurLoop, LI)) + continue; +- } + +- // Check to see if we can sink this instruction to the exit blocks +- // of the loop. We can do this if the all users of the instruction are +- // outside of the loop. In this case, it doesn't even matter if the +- // operands of the instruction are loop invariant. +- // +- if (isNotUsedInLoop(I, CurLoop, SafetyInfo) && +- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) { +- ++II; +- Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE); ++ for (BasicBlock::iterator II = BB->end(); II != BB->begin();) { ++ Instruction &I = *--II; ++ ++ // If the instruction is dead, we would try to sink it because it isn't used ++ // in the loop, instead, just delete it. 
++ if (isInstructionTriviallyDead(&I, TLI)) { ++ DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); ++ ++II; ++ CurAST->deleteValue(&I); ++ I.eraseFromParent(); ++ Changed = true; ++ continue; ++ } ++ ++ // Check to see if we can sink this instruction to the exit blocks ++ // of the loop. We can do this if the all users of the instruction are ++ // outside of the loop. In this case, it doesn't even matter if the ++ // operands of the instruction are loop invariant. ++ // ++ if (isNotUsedInLoop(I, CurLoop, SafetyInfo) && ++ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) { ++ ++II; ++ Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE); ++ } + } + } + return Changed; + } + ++ + /// Walk the specified region of the CFG (defined by all blocks dominated by + /// the specified block, and that are in the current loop) in depth first + /// order w.r.t the DominatorTree. This allows us to visit definitions before +@@ -397,50 +423,73 @@ + CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && + "Unexpected input to hoistRegion"); + +- BasicBlock *BB = N->getBlock(); ++ // We want to visit parents before children. We will enqueue all the parents ++ // before their children in the worklist and process the worklist in order. ++ SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop); ++ + +- // If this subregion is not in the top level loop at all, exit. +- if (!CurLoop->contains(BB)) +- return false; + +- // Only need to process the contents of this block if it is not part of a +- // subloop (which would already have been processed). + bool Changed = false; +- if (!inSubLoop(BB, CurLoop, LI)) +- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { +- Instruction &I = *II++; +- // Try constant folding this instruction. If all the operands are +- // constants, it is technically hoistable, but it would be better to just +- // fold it. 
+- if (Constant *C = ConstantFoldInstruction( +- &I, I.getModule()->getDataLayout(), TLI)) { +- DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); +- CurAST->copyValue(&I, C); +- I.replaceAllUsesWith(C); +- if (isInstructionTriviallyDead(&I, TLI)) { +- CurAST->deleteValue(&I); ++ for (DomTreeNode *DTN : Worklist) { ++ BasicBlock *BB = DTN->getBlock(); ++ // Only need to process the contents of this block if it is not part of a ++ // subloop (which would already have been processed). ++ ++ if (!inSubLoop(BB, CurLoop, LI)) ++ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { ++ Instruction &I = *II++; ++ // Try constant folding this instruction. If all the operands are ++ // constants, it is technically hoistable, but it would be better to ++ // just fold it. ++ if (Constant *C = ConstantFoldInstruction( ++ &I, I.getModule()->getDataLayout(), TLI)) { ++ DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); ++ CurAST->copyValue(&I, C); ++ I.replaceAllUsesWith(C); ++ if (isInstructionTriviallyDead(&I, TLI)) { ++ CurAST->deleteValue(&I); ++ I.eraseFromParent(); ++ } ++ Changed = true; ++ continue; ++ } ++ ++ // Attempt to remove floating point division out of the loop by ++ // converting it to a reciprocal multiplication. 
++ if (I.getOpcode() == Instruction::FDiv && ++ CurLoop->isLoopInvariant(I.getOperand(1)) && ++ I.hasAllowReciprocal()) { ++ auto Divisor = I.getOperand(1); ++ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); ++ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); ++ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); ++ ReciprocalDivisor->insertBefore(&I); ++ ++ auto Product = ++ BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); ++ Product->setFastMathFlags(I.getFastMathFlags()); ++ Product->insertAfter(&I); ++ I.replaceAllUsesWith(Product); + I.eraseFromParent(); ++ ++ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); ++ Changed = true; ++ continue; + } +- Changed = true; +- continue; +- } + +- // Try hoisting the instruction out to the preheader. We can only do this +- // if all of the operands of the instruction are loop invariant and if it +- // is safe to hoist the instruction. +- // +- if (CurLoop->hasLoopInvariantOperands(&I) && +- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && +- isSafeToExecuteUnconditionally( +- I, DT, CurLoop, SafetyInfo, ORE, +- CurLoop->getLoopPreheader()->getTerminator())) +- Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); +- } ++ // Try hoisting the instruction out to the preheader. We can only do ++ // this if all of the operands of the instruction are loop invariant and ++ // if it is safe to hoist the instruction. ++ // ++ if (CurLoop->hasLoopInvariantOperands(&I) && ++ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && ++ isSafeToExecuteUnconditionally( ++ I, DT, CurLoop, SafetyInfo, ORE, ++ CurLoop->getLoopPreheader()->getTerminator())) ++ Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); ++ } ++ } + +- const std::vector &Children = N->getChildren(); +- for (DomTreeNode *Child : Children) +- Changed |= +- hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE); + return Changed; + } +