From efe92105ebde230c94974536d3f62aea1a72321d Mon Sep 17 00:00:00 2001
From: Jan Vorlicek
Date: Sat, 25 Sep 2021 00:06:53 +0200
Subject: [PATCH 01/10] Reimplement stubs to improve performance

This change implements `FixupPrecodeStub`, `PrecodeStub`, `CallCountingStub` and the VSD stubs `LookupStub`, `DispatchStub` and `ResolveStub` using a new mechanism with fixed code and separate RW data. The `LoaderHeap` was updated with a new kind of allocation that interleaves code and data pages to support this mechanism.

The JIT now generates code that uses an indirection slot to jump to methods that use `FixupPrecode`, improving performance of the ASP.NET plaintext benchmark by 3-4% depending on the target platform (measured on x64 Windows / Linux and arm64 Linux).

I have also removed the Holders, as the stubs are naturally properly aligned due to the way they are allocated.

There is now only a single variant of each stub; the long / short variants are gone because they are no longer needed - the indirect jumps we use now are not range limited.

Most of the stub code is now target agnostic, and the originally per-target implementations now live in a single place shared by all targets. Only a few constants are defined as target specific.

The code for the stubs is no longer generated as bytes by C++ code; it is written in asm and compiled. These precompiled templates are then used as the source to copy the code from. x86 is a bit more complex because it doesn't support PC-relative indirect addressing, so we need to relocate all accesses to the data slots when generating the code pages.

As a further improvement, we could generate just a single page of the code and then map it many times. This is left for future work.

ARM64 Unix differs from the other targets / platforms in that various page sizes are in use. The asm templates are therefore generated for 4k..64k page sizes and the right variant is picked at runtime based on the page size reported by the OS.

This also removes a lot of writable mappings created for modifications of the stub code when W^X is enabled; in the plaintext benchmark they were reduced by 75%. That results in a significant reduction of .NET application startup time with W^X enabled.

I think the `LoaderHeap` would benefit from some refactoring, but I'd prefer leaving it for a follow up. It seems that for the sake of the review, it is better to keep it as is.

The change also implements logging of the number of mappings and their exact locations. This helped me to drive the work and I am planning to use it for further changes. It can be removed in the future once we reach a final state.

There are still opportunities for improvement, but these stubs allowed me to shave off the most significant portion of the mappings.
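To make the data-page indirection easier to picture, here is a minimal, self-contained C++ sketch of the addressing scheme described above. It is an illustration only, not code from this change: the names (StubData, GetStubData, kStubCodePageSize, kStubSize) and the constant values are assumptions for the sketch, and the real stubs are small asm templates whose code is identical for every instance.

    #include <cstdint>
    #include <cstdio>
    #include <new>
    #include <vector>

    // Assumed constants for the sketch; the real values come from the OS page
    // size and the per-stub allocation granularity of the interleaved LoaderHeap.
    constexpr std::size_t kStubCodePageSize = 0x1000;
    constexpr std::size_t kStubSize         = 32;

    // Hypothetical per-stub writable data: one slot per stub, placed at the same
    // offset on the RW data page as the stub occupies on the RX code page.
    struct StubData
    {
        void* Target;      // the stub's code jumps through this indirection
        void* MethodDesc;  // extra per-stub datum owned by the stub
    };

    // Because the data page is mapped exactly one code page after the code page,
    // a stub finds its data slot at "its own address + code page size". The stub
    // code is therefore the same for every stub and never needs patching.
    inline StubData* GetStubData(void* stubCode)
    {
        return reinterpret_cast<StubData*>(
            static_cast<std::uint8_t*>(stubCode) + kStubCodePageSize);
    }

    int main()
    {
        // Simulate one interleaved pair: [code page][data page] back to back.
        // (In the runtime these are an RX page and an RW page committed together.)
        std::vector<std::uint8_t> pair(2 * kStubCodePageSize);
        std::uint8_t* codePage = pair.data();

        // Pretend the third stub on the code page was just allocated ...
        void* stub = codePage + 2 * kStubSize;

        // ... and retarget it by writing only to its data slot on the RW page.
        StubData* data = new (GetStubData(stub)) StubData{};
        data->Target = reinterpret_cast<void*>(0x1234);

        std::printf("stub %p -> data slot %p (target %p)\n",
                    stub, static_cast<void*>(data), data->Target);
        return 0;
    }

Retargeting a stub touches only the RW data page, which is why the RX code pages no longer need to be remapped writable and why the number of writable mappings under W^X drops.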
--- src/coreclr/inc/daccess.h | 1 - src/coreclr/inc/executableallocator.h | 54 +- src/coreclr/inc/holder.h | 15 +- src/coreclr/inc/loaderheap.h | 42 +- src/coreclr/minipal/Windows/doublemapping.cpp | 4 +- src/coreclr/utilcode/executableallocator.cpp | 301 ++++--- src/coreclr/utilcode/loaderheap.cpp | 267 +++++-- src/coreclr/vm/CMakeLists.txt | 10 +- src/coreclr/vm/amd64/AsmHelpers.asm | 24 - src/coreclr/vm/amd64/asmconstants.h | 57 ++ src/coreclr/vm/amd64/cgenamd64.cpp | 58 -- src/coreclr/vm/amd64/cgencpu.h | 331 -------- src/coreclr/vm/amd64/theprestubamd64.S | 1 - src/coreclr/vm/amd64/thunktemplates.S | 78 ++ src/coreclr/vm/amd64/thunktemplates.asm | 80 ++ src/coreclr/vm/amd64/unixasmhelpers.S | 23 - src/coreclr/vm/amd64/virtualcallstubcpu.hpp | 751 +----------------- src/coreclr/vm/appdomain.cpp | 1 - src/coreclr/vm/arm/asmconstants.h | 57 ++ src/coreclr/vm/arm/asmhelpers.S | 23 - src/coreclr/vm/arm/asmhelpers.asm | 23 - src/coreclr/vm/arm/cgencpu.h | 360 +-------- src/coreclr/vm/arm/stubs.cpp | 503 +----------- src/coreclr/vm/arm/thunktemplates.S | 114 +++ src/coreclr/vm/arm/thunktemplates.asm | 115 +++ src/coreclr/vm/arm/virtualcallstubcpu.hpp | 323 +------- src/coreclr/vm/arm64/asmconstants.h | 72 +- src/coreclr/vm/arm64/asmhelpers.S | 19 - src/coreclr/vm/arm64/asmhelpers.asm | 21 - src/coreclr/vm/arm64/cgencpu.h | 375 --------- src/coreclr/vm/arm64/stubs.cpp | 129 +-- src/coreclr/vm/arm64/thunktemplates.S | 95 +++ src/coreclr/vm/arm64/thunktemplates.asm | 93 +++ src/coreclr/vm/arm64/virtualcallstubcpu.hpp | 421 +--------- src/coreclr/vm/callcounting.cpp | 122 ++- src/coreclr/vm/callcounting.h | 121 +++ src/coreclr/vm/ceeload.cpp | 2 +- src/coreclr/vm/ceemain.cpp | 30 +- src/coreclr/vm/cgensys.h | 6 - src/coreclr/vm/codeman.cpp | 10 +- src/coreclr/vm/codeman.h | 7 + src/coreclr/vm/comcallablewrapper.cpp | 4 +- src/coreclr/vm/common.h | 24 + src/coreclr/vm/corhost.cpp | 5 + src/coreclr/vm/dynamicmethod.cpp | 8 +- src/coreclr/vm/gccover.cpp | 24 +- src/coreclr/vm/i386/AsmMacros.inc | 23 + src/coreclr/vm/i386/asmconstants.h | 56 ++ src/coreclr/vm/i386/asmhelpers.S | 20 - src/coreclr/vm/i386/asmhelpers.asm | 21 - src/coreclr/vm/i386/cgencpu.h | 207 ----- src/coreclr/vm/i386/cgenx86.cpp | 57 -- src/coreclr/vm/i386/excepx86.cpp | 15 +- src/coreclr/vm/i386/jitinterfacex86.cpp | 12 +- src/coreclr/vm/i386/stublinkerx86.cpp | 287 +------ src/coreclr/vm/i386/stublinkerx86.h | 242 +----- src/coreclr/vm/i386/thunktemplates.S | 124 +++ src/coreclr/vm/i386/thunktemplates.asm | 127 +++ src/coreclr/vm/i386/virtualcallstubcpu.hpp | 650 +-------------- src/coreclr/vm/jitinterface.cpp | 10 +- src/coreclr/vm/loaderallocator.cpp | 37 +- src/coreclr/vm/loaderallocator.hpp | 16 + src/coreclr/vm/method.cpp | 28 +- src/coreclr/vm/method.hpp | 6 +- src/coreclr/vm/peimage.cpp | 2 +- src/coreclr/vm/precode.cpp | 462 +++++++---- src/coreclr/vm/precode.h | 397 +++++++-- src/coreclr/vm/stublink.cpp | 4 +- src/coreclr/vm/stubmgr.cpp | 29 +- src/coreclr/vm/stubmgr.h | 24 +- src/coreclr/vm/virtualcallstub.cpp | 728 ++++++++++------- src/coreclr/vm/virtualcallstub.h | 435 ++++++++-- 72 files changed, 3540 insertions(+), 5683 deletions(-) create mode 100644 src/coreclr/vm/amd64/thunktemplates.S create mode 100644 src/coreclr/vm/amd64/thunktemplates.asm create mode 100644 src/coreclr/vm/arm/thunktemplates.S create mode 100644 src/coreclr/vm/arm/thunktemplates.asm create mode 100644 src/coreclr/vm/arm64/thunktemplates.S create mode 100644 src/coreclr/vm/arm64/thunktemplates.asm create mode 100644 
src/coreclr/vm/i386/thunktemplates.S create mode 100644 src/coreclr/vm/i386/thunktemplates.asm diff --git a/src/coreclr/inc/daccess.h b/src/coreclr/inc/daccess.h index a1e812276d853..5ad8b99b67b36 100644 --- a/src/coreclr/inc/daccess.h +++ b/src/coreclr/inc/daccess.h @@ -614,7 +614,6 @@ typedef struct _DacGlobals #endif // TARGET_ARM ULONG fn__ThePreStubPatchLabel; - ULONG fn__PrecodeFixupThunk; #ifdef FEATURE_COMINTEROP ULONG fn__Unknown_AddRef; ULONG fn__Unknown_AddRefSpecial; diff --git a/src/coreclr/inc/executableallocator.h b/src/coreclr/inc/executableallocator.h index 04dfdf031b41f..eeb572837a82a 100644 --- a/src/coreclr/inc/executableallocator.h +++ b/src/coreclr/inc/executableallocator.h @@ -15,6 +15,8 @@ #ifndef DACCESS_COMPILE +#define LOG_EXECUTABLE_ALLOCATOR_STATISTICS + // This class is responsible for allocation of all the executable memory in the runtime. class ExecutableAllocator { @@ -49,7 +51,17 @@ class ExecutableAllocator }; typedef void (*FatalErrorHandler)(UINT errorCode, LPCWSTR pszMessage); - +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + static int64_t g_mapTimeSum; + static int64_t g_mapTimeWithLockSum; + static int64_t g_unmapTimeSum; + static int64_t g_unmapTimeWithLockSum; + static int64_t g_mapFindRXTimeSum; + static int64_t g_mapCreateTimeSum; + + static int64_t g_releaseCount; + static int64_t g_reserveCount; +#endif // Instance of the allocator static ExecutableAllocator* g_instance; @@ -142,8 +154,28 @@ class ExecutableAllocator // Initialize the allocator instance bool Initialize(); +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + static CRITSEC_COOKIE s_LoggerCriticalSection; + + struct LogEntry + { + const char* source; + const char* function; + int line; + int count; + }; + + static LogEntry s_usageLog[256]; + static int s_logMaxIndex; +#endif + public: +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + static void LogUsage(const char* source, int line, const char* function); + static void DumpHolderUsage(); +#endif + // Return the ExecuteAllocator singleton instance static ExecutableAllocator* Instance(); @@ -201,6 +233,8 @@ class ExecutableAllocator void UnmapRW(void* pRW); }; +#define ExecutableWriterHolder ExecutableWriterHolderNoLog + // Holder class to map read-execute memory as read-write so that it can be modified without using read-write-execute mapping. // At the moment the implementation is dummy, returning the same addresses for both cases and expecting them to be read-write-execute. // The class uses the move semantics to ensure proper unmapping in case of re-assigning of the holder value. 
@@ -274,6 +308,24 @@ class ExecutableWriterHolder { return m_addressRW; } + + void AssignExecutableWriterHolder(T* addressRX, size_t size) + { + *this = ExecutableWriterHolder(addressRX, size); + } }; +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS +#undef ExecutableWriterHolder +#ifdef TARGET_UNIX +#define ExecutableWriterHolder ExecutableAllocator::LogUsage(__FILE__, __LINE__, __PRETTY_FUNCTION__); ExecutableWriterHolderNoLog +#define AssignExecutableWriterHolder(addressRX, size) AssignExecutableWriterHolder(addressRX, size); ExecutableAllocator::LogUsage(__FILE__, __LINE__, __PRETTY_FUNCTION__); +#else +#define ExecutableWriterHolder ExecutableAllocator::LogUsage(__FILE__, __LINE__, __FUNCTION__); ExecutableWriterHolderNoLog +#define AssignExecutableWriterHolder(addressRX, size) AssignExecutableWriterHolder(addressRX, size); ExecutableAllocator::LogUsage(__FILE__, __LINE__, __FUNCTION__); +#endif +#else +#define ExecutableWriterHolder ExecutableWriterHolderNoLog +#endif + #endif // !DACCESS_COMPILE diff --git a/src/coreclr/inc/holder.h b/src/coreclr/inc/holder.h index 4ec7b106cc0e8..a0ff213fd6031 100644 --- a/src/coreclr/inc/holder.h +++ b/src/coreclr/inc/holder.h @@ -934,15 +934,24 @@ using NonVMComHolder = SpecializedWrapper<_TYPE, DoTheRelease<_TYPE>>; // } // foo->DecRef() on out of scope // //----------------------------------------------------------------------------- + template -class ExecutableWriterHolder; +class ExecutableWriterHolderNoLog; -template +class ExecutableAllocator; + +template FORCEINLINE void StubRelease(TYPE* value) { if (value) { - ExecutableWriterHolder stubWriterHolder(value, sizeof(TYPE)); +#ifdef TARGET_UNIX + LOGGER::LogUsage(__FILE__, __LINE__, __PRETTY_FUNCTION__); +#else + LOGGER::LogUsage(__FILE__, __LINE__, __FUNCTION__); +#endif + + ExecutableWriterHolderNoLog stubWriterHolder(value, sizeof(TYPE)); stubWriterHolder.GetRW()->DecRef(); } } diff --git a/src/coreclr/inc/loaderheap.h b/src/coreclr/inc/loaderheap.h index 42b9caa6330f3..324cf2f161c50 100644 --- a/src/coreclr/inc/loaderheap.h +++ b/src/coreclr/inc/loaderheap.h @@ -191,6 +191,15 @@ class UnlockedLoaderHeap friend class ClrDataAccess; #endif +public: + + enum class HeapKind + { + Data, + Executable, + Interleaved + }; + private: // Linked list of ClrVirtualAlloc'd pages PTR_LoaderHeapBlock m_pFirstBlock; @@ -208,12 +217,16 @@ class UnlockedLoaderHeap // When we need to commit pages from our reserved list, number of bytes to commit at a time DWORD m_dwCommitBlockSize; + // For interleaved heap (RX pages interleaved with RW ones), this specifies the allocation granularity, + // which is the individual code block size + DWORD m_dwGranularity; + // Range list to record memory ranges in RangeList * m_pRangeList; size_t m_dwTotalAlloc; - DWORD m_Options; + HeapKind m_kind; LoaderHeapFreeBlock *m_pFirstFreeBlock; @@ -263,6 +276,7 @@ class UnlockedLoaderHeap public: BOOL m_fExplicitControl; // Am I a LoaderHeap or an ExplicitControlLoaderHeap? 
+ void (*m_codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX); #ifdef DACCESS_COMPILE public: @@ -283,7 +297,9 @@ class UnlockedLoaderHeap const BYTE* dwReservedRegionAddress, SIZE_T dwReservedRegionSize, RangeList *pRangeList = NULL, - BOOL fMakeExecutable = FALSE); + HeapKind kind = HeapKind::Data, + void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX) = NULL, + DWORD dwGranularity = 1); ~UnlockedLoaderHeap(); #endif @@ -400,6 +416,7 @@ class UnlockedLoaderHeap } BOOL IsExecutable(); + BOOL IsInterleaved(); public: #ifdef _DEBUG @@ -443,14 +460,18 @@ class LoaderHeap : public UnlockedLoaderHeap, public ILoaderHeapBackout LoaderHeap(DWORD dwReserveBlockSize, DWORD dwCommitBlockSize, RangeList *pRangeList = NULL, - BOOL fMakeExecutable = FALSE, - BOOL fUnlocked = FALSE + UnlockedLoaderHeap::HeapKind kind = UnlockedLoaderHeap::HeapKind::Data, + BOOL fUnlocked = FALSE, + void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX) = NULL, + DWORD dwGranularity = 1 ) : UnlockedLoaderHeap(dwReserveBlockSize, dwCommitBlockSize, NULL, 0, pRangeList, - fMakeExecutable), + kind, + codePageGenerator, + dwGranularity), m_CriticalSection(fUnlocked ? NULL : CreateLoaderHeapLock()) { WRAPPER_NO_CONTRACT; @@ -463,15 +484,18 @@ class LoaderHeap : public UnlockedLoaderHeap, public ILoaderHeapBackout const BYTE* dwReservedRegionAddress, SIZE_T dwReservedRegionSize, RangeList *pRangeList = NULL, - BOOL fMakeExecutable = FALSE, - BOOL fUnlocked = FALSE + UnlockedLoaderHeap::HeapKind kind = UnlockedLoaderHeap::HeapKind::Data, + BOOL fUnlocked = FALSE, + void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX) = NULL, + DWORD dwGranularity = 1 ) : UnlockedLoaderHeap(dwReserveBlockSize, dwCommitBlockSize, dwReservedRegionAddress, dwReservedRegionSize, pRangeList, - fMakeExecutable), + kind, + codePageGenerator, dwGranularity), m_CriticalSection(fUnlocked ? NULL : CreateLoaderHeapLock()) { WRAPPER_NO_CONTRACT; @@ -776,7 +800,7 @@ class ExplicitControlLoaderHeap : public UnlockedLoaderHeap ) : UnlockedLoaderHeap(0, 0, NULL, 0, pRangeList, - fMakeExecutable) + fMakeExecutable ? 
UnlockedLoaderHeap::HeapKind::Executable : UnlockedLoaderHeap::HeapKind::Data) { WRAPPER_NO_CONTRACT; m_fExplicitControl = TRUE; diff --git a/src/coreclr/minipal/Windows/doublemapping.cpp b/src/coreclr/minipal/Windows/doublemapping.cpp index e265f1d139ad0..0d7033b567056 100644 --- a/src/coreclr/minipal/Windows/doublemapping.cpp +++ b/src/coreclr/minipal/Windows/doublemapping.cpp @@ -184,8 +184,8 @@ void *VMToOSInterface::CommitDoubleMappedMemory(void* pStart, size_t size, bool bool VMToOSInterface::ReleaseDoubleMappedMemory(void *mapperHandle, void* pStart, size_t offset, size_t size) { - // Zero the memory before the unmapping - VirtualAlloc(pStart, size, MEM_COMMIT, PAGE_READWRITE); + LPVOID result = VirtualAlloc(pStart, size, MEM_COMMIT, PAGE_READWRITE); + assert(result != NULL); memset(pStart, 0, size); return UnmapViewOfFile(pStart); } diff --git a/src/coreclr/utilcode/executableallocator.cpp b/src/coreclr/utilcode/executableallocator.cpp index 49431b6ecce74..2f094739618f0 100644 --- a/src/coreclr/utilcode/executableallocator.cpp +++ b/src/coreclr/utilcode/executableallocator.cpp @@ -17,9 +17,90 @@ BYTE * ExecutableAllocator::g_preferredRangeMax; bool ExecutableAllocator::g_isWXorXEnabled = false; ExecutableAllocator::FatalErrorHandler ExecutableAllocator::g_fatalErrorHandler = NULL; - ExecutableAllocator* ExecutableAllocator::g_instance = NULL; +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS +int64_t ExecutableAllocator::g_mapTimeSum = 0; +int64_t ExecutableAllocator::g_mapTimeWithLockSum = 0; +int64_t ExecutableAllocator::g_unmapTimeSum = 0; +int64_t ExecutableAllocator::g_unmapTimeWithLockSum = 0; +int64_t ExecutableAllocator::g_mapFindRXTimeSum = 0; +int64_t ExecutableAllocator::g_mapCreateTimeSum = 0; +int64_t ExecutableAllocator::g_releaseCount = 0; +int64_t ExecutableAllocator::g_reserveCount = 0; + +ExecutableAllocator::LogEntry ExecutableAllocator::s_usageLog[256]; +int ExecutableAllocator::s_logMaxIndex = 0; +CRITSEC_COOKIE ExecutableAllocator::s_LoggerCriticalSection; + +class StopWatch +{ + LARGE_INTEGER m_start; + int64_t* m_accumulator; + +public: + StopWatch(int64_t* accumulator) : m_accumulator(accumulator) + { + QueryPerformanceCounter(&m_start); + } + + ~StopWatch() + { + LARGE_INTEGER end; + QueryPerformanceCounter(&end); + + InterlockedExchangeAdd64(m_accumulator, end.QuadPart - m_start.QuadPart); + } +}; + +void ExecutableAllocator::LogUsage(const char* source, int line, const char* function) +{ + CRITSEC_Holder csh(s_LoggerCriticalSection); + + for (int i = 0; i < s_logMaxIndex; i++) + { + if (s_usageLog[i].source == source && s_usageLog[i].line == line) + { + s_usageLog[i].count++; + return; + } + } + + int i = s_logMaxIndex; + s_logMaxIndex++; + s_usageLog[i].source = source; + s_usageLog[i].function = function; + s_usageLog[i].line = line; + s_usageLog[i].count = 1; +} + +void ExecutableAllocator::DumpHolderUsage() +{ + CRITSEC_Holder csh(s_LoggerCriticalSection); + + LARGE_INTEGER freq; + QueryPerformanceFrequency(&freq); + + fprintf(stderr, "Map time with lock sum: %I64dms\n", g_mapTimeWithLockSum / (freq.QuadPart / 1000)); + fprintf(stderr, "Map time sum: %I64dms\n", g_mapTimeSum / (freq.QuadPart / 1000)); + fprintf(stderr, "Map find RX time sum: %I64dms\n", g_mapFindRXTimeSum / (freq.QuadPart / 1000)); + fprintf(stderr, "Map create time sum: %I64dms\n", g_mapCreateTimeSum / (freq.QuadPart / 1000)); + fprintf(stderr, "Unmap time with lock sum: %I64dms\n", g_unmapTimeWithLockSum / (freq.QuadPart / 1000)); + fprintf(stderr, "Unmap time sum: %I64dms\n", 
g_unmapTimeSum / (freq.QuadPart / 1000)); + + fprintf(stderr, "Reserve count: %I64d\n", g_reserveCount); + fprintf(stderr, "Release count: %I64d\n", g_releaseCount); + + fprintf(stderr, "ExecutableWriterHolder usage:\n"); + + for (int i = 0; i < s_logMaxIndex; i++) + { + fprintf(stderr, "Count: %d at %s:%d in %s\n", s_usageLog[i].count, s_usageLog[i].source, s_usageLog[i].line, s_usageLog[i].function); + } +} + +#endif // LOG_EXECUTABLE_ALLOCATOR_STATISTICS + bool ExecutableAllocator::IsDoubleMappingEnabled() { LIMITED_METHOD_CONTRACT; @@ -154,6 +235,9 @@ HRESULT ExecutableAllocator::StaticInitialize(FatalErrorHandler fatalErrorHandle return E_FAIL; } +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + s_LoggerCriticalSection = ClrCreateCriticalSection(CrstExecutableAllocatorLock, CrstFlags(CRST_UNSAFE_ANYMODE | CRST_DEBUGGER_THREAD)); +#endif return S_OK; } @@ -212,7 +296,11 @@ void* ExecutableAllocator::FindRWBlock(void* baseRX, size_t size) { if (pBlock->baseRX <= baseRX && ((size_t)baseRX + size) <= ((size_t)pBlock->baseRX + pBlock->size)) { - pBlock->refCount++; +#ifdef TARGET_64BIT + InterlockedIncrement64((LONG64*)& pBlock->refCount); +#else + InterlockedIncrement((LONG*)&pBlock->refCount); +#endif UpdateCachedMapping(pBlock); return (BYTE*)pBlock->baseRW + ((size_t)baseRX - (size_t)pBlock->baseRX); @@ -226,14 +314,6 @@ bool ExecutableAllocator::AddRWBlock(void* baseRW, void* baseRX, size_t size) { LIMITED_METHOD_CONTRACT; - for (BlockRW* pBlock = m_pFirstBlockRW; pBlock != NULL; pBlock = pBlock->next) - { - if (pBlock->baseRX <= baseRX && ((size_t)baseRX + size) <= ((size_t)pBlock->baseRX + pBlock->size)) - { - break; - } - } - // The new "nothrow" below failure is handled as fail fast since it is not recoverable PERMANENT_CONTRACT_VIOLATION(FaultViolation, ReasonContractInfrastructure); @@ -340,45 +420,49 @@ void ExecutableAllocator::Release(void* pRX) { LIMITED_METHOD_CONTRACT; +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + InterlockedIncrement64(&g_releaseCount); +#endif + if (IsDoubleMappingEnabled()) { - CRITSEC_Holder csh(m_CriticalSection); - - // Locate the RX block corresponding to the pRX and remove it from the linked list - BlockRX* pBlock; - BlockRX* pPrevBlock = NULL; - - for (pBlock = m_pFirstBlockRX; pBlock != NULL; pBlock = pBlock->next) - { - if (pRX == pBlock->baseRX) - { - if (pPrevBlock == NULL) - { - m_pFirstBlockRX = pBlock->next; - } - else - { - pPrevBlock->next = pBlock->next; - } - - break; - } - pPrevBlock = pBlock; - } - - if (pBlock != NULL) - { + CRITSEC_Holder csh(m_CriticalSection); + + // Locate the RX block corresponding to the pRX and remove it from the linked list + BlockRX* pBlock; + BlockRX* pPrevBlock = NULL; + + for (pBlock = m_pFirstBlockRX; pBlock != NULL; pBlock = pBlock->next) + { + if (pRX == pBlock->baseRX) + { + if (pPrevBlock == NULL) + { + m_pFirstBlockRX = pBlock->next; + } + else + { + pPrevBlock->next = pBlock->next; + } + + break; + } + pPrevBlock = pBlock; + } + + if (pBlock != NULL) + { VMToOSInterface::ReleaseDoubleMappedMemory(m_doubleMemoryMapperHandle, pRX, pBlock->offset, pBlock->size); // Put the released block into the free block list - pBlock->baseRX = NULL; - pBlock->next = m_pFirstFreeBlockRX; - m_pFirstFreeBlockRX = pBlock; - } - else - { - // The block was not found, which should never happen. 
- g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("The RX block to release was not found")); - } + pBlock->baseRX = NULL; + pBlock->next = m_pFirstFreeBlockRX; + m_pFirstFreeBlockRX = pBlock; + } + else + { + // The block was not found, which should never happen. + g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("The RX block to release was not found")); + } } else { @@ -386,54 +470,40 @@ void ExecutableAllocator::Release(void* pRX) } } -// Find a free block with the closest size >= the requested size. +// Find a free block with the size == the requested size. // Returns NULL if no such block exists. ExecutableAllocator::BlockRX* ExecutableAllocator::FindBestFreeBlock(size_t size) { LIMITED_METHOD_CONTRACT; BlockRX* pPrevBlock = NULL; - BlockRX* pPrevBestBlock = NULL; - BlockRX* pBestBlock = NULL; BlockRX* pBlock = m_pFirstFreeBlockRX; while (pBlock != NULL) { - if (pBlock->size >= size) + if (pBlock->size == size) { - if (pBestBlock != NULL) - { - if (pBlock->size < pBestBlock->size) - { - pPrevBestBlock = pPrevBlock; - pBestBlock = pBlock; - } - } - else - { - pPrevBestBlock = pPrevBlock; - pBestBlock = pBlock; - } + break; } pPrevBlock = pBlock; pBlock = pBlock->next; } - if (pBestBlock != NULL) + if (pBlock != NULL) { - if (pPrevBestBlock != NULL) + if (pPrevBlock != NULL) { - pPrevBestBlock->next = pBestBlock->next; + pPrevBlock->next = pBlock->next; } else { - m_pFirstFreeBlockRX = pBestBlock->next; + m_pFirstFreeBlockRX = pBlock->next; } - pBestBlock->next = NULL; + pBlock->next = NULL; } - return pBestBlock; + return pBlock; } // Allocate a new block of executable memory and the related descriptor structure. @@ -491,6 +561,10 @@ void* ExecutableAllocator::ReserveWithinRange(size_t size, const void* loAddress { LIMITED_METHOD_CONTRACT; +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + InterlockedIncrement64(&g_reserveCount); +#endif + _ASSERTE((size & (Granularity() - 1)) == 0); if (IsDoubleMappingEnabled()) { @@ -499,9 +573,9 @@ void* ExecutableAllocator::ReserveWithinRange(size_t size, const void* loAddress bool isFreeBlock; BlockRX* block = AllocateBlock(size, &isFreeBlock); if (block == NULL) - { - return NULL; - } + { + return NULL; + } void *result = VMToOSInterface::ReserveDoubleMappedMemory(m_doubleMemoryMapperHandle, block->offset, size, loAddress, hiAddress); @@ -537,6 +611,10 @@ void* ExecutableAllocator::Reserve(size_t size) { LIMITED_METHOD_CONTRACT; +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + InterlockedIncrement64(&g_reserveCount); +#endif + _ASSERTE((size & (Granularity() - 1)) == 0); BYTE *result = NULL; @@ -582,14 +660,14 @@ void* ExecutableAllocator::Reserve(size_t size) { if (IsDoubleMappingEnabled()) { - CRITSEC_Holder csh(m_CriticalSection); + CRITSEC_Holder csh(m_CriticalSection); - bool isFreeBlock; + bool isFreeBlock; BlockRX* block = AllocateBlock(size, &isFreeBlock); - if (block == NULL) - { - return NULL; - } + if (block == NULL) + { + return NULL; + } result = (BYTE*)VMToOSInterface::ReserveDoubleMappedMemory(m_doubleMemoryMapperHandle, block->offset, size, 0, 0); @@ -625,6 +703,10 @@ void* ExecutableAllocator::ReserveAt(void* baseAddressRX, size_t size) { LIMITED_METHOD_CONTRACT; +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + InterlockedIncrement64(&g_reserveCount); +#endif + _ASSERTE((size & (Granularity() - 1)) == 0); if (IsDoubleMappingEnabled()) @@ -670,30 +752,45 @@ void* ExecutableAllocator::MapRW(void* pRX, size_t size) return pRX; } +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + StopWatch swAll(&g_mapTimeWithLockSum); +#endif + CRITSEC_Holder 
csh(m_CriticalSection); - void* result = FindRWBlock(pRX, size); - if (result != NULL) - { - return result; - } +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + StopWatch sw(&g_mapTimeSum); +#endif + + void* result = FindRWBlock(pRX, size); + if (result != NULL) + { + return result; + } +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + StopWatch sw2(&g_mapFindRXTimeSum); +#endif for (BlockRX* pBlock = m_pFirstBlockRX; pBlock != NULL; pBlock = pBlock->next) { if (pRX >= pBlock->baseRX && ((size_t)pRX + size) <= ((size_t)pBlock->baseRX + pBlock->size)) { - // Offset of the RX address in the originally allocated block - size_t offset = (size_t)pRX - (size_t)pBlock->baseRX; - // Offset of the RX address that will start the newly mapped block - size_t mapOffset = ALIGN_DOWN(offset, Granularity()); - // Size of the block we will map - size_t mapSize = ALIGN_UP(offset - mapOffset + size, Granularity()); - void* pRW = VMToOSInterface::GetRWMapping(m_doubleMemoryMapperHandle, (BYTE*)pBlock->baseRX + mapOffset, pBlock->offset + mapOffset, mapSize); - - if (pRW == NULL) - { - g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("Failed to create RW mapping for RX memory")); - } + // Offset of the RX address in the originally allocated block + size_t offset = (size_t)pRX - (size_t)pBlock->baseRX; + // Offset of the RX address that will start the newly mapped block + size_t mapOffset = ALIGN_DOWN(offset, Granularity()); + // Size of the block we will map + size_t mapSize = ALIGN_UP(offset - mapOffset + size, Granularity()); + +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + StopWatch sw2(&g_mapCreateTimeSum); +#endif + void* pRW = VMToOSInterface::GetRWMapping(m_doubleMemoryMapperHandle, (BYTE*)pBlock->baseRX + mapOffset, pBlock->offset + mapOffset, mapSize); + + if (pRW == NULL) + { + g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("Failed to create RW mapping for RX memory")); + } AddRWBlock(pRW, (BYTE*)pBlock->baseRX + mapOffset, mapSize); @@ -720,6 +817,10 @@ void ExecutableAllocator::UnmapRW(void* pRW) { LIMITED_METHOD_CONTRACT; +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + StopWatch swAll(&g_unmapTimeWithLockSum); +#endif + if (!IsDoubleMappingEnabled()) { return; @@ -728,13 +829,17 @@ void ExecutableAllocator::UnmapRW(void* pRW) CRITSEC_Holder csh(m_CriticalSection); _ASSERTE(pRW != NULL); +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + StopWatch swNoLock(&g_unmapTimeSum); +#endif + void* unmapAddress = NULL; size_t unmapSize; - if (!RemoveRWBlock(pRW, &unmapAddress, &unmapSize)) - { - g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("The RW block to unmap was not found")); - } + if (!RemoveRWBlock(pRW, &unmapAddress, &unmapSize)) + { + g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("The RW block to unmap was not found")); + } if (unmapAddress && !VMToOSInterface::ReleaseRWMapping(unmapAddress, unmapSize)) { diff --git a/src/coreclr/utilcode/loaderheap.cpp b/src/coreclr/utilcode/loaderheap.cpp index 51e39de70ecf9..7638031add7db 100644 --- a/src/coreclr/utilcode/loaderheap.cpp +++ b/src/coreclr/utilcode/loaderheap.cpp @@ -8,8 +8,6 @@ #define DONOT_DEFINE_ETW_CALLBACK #include "eventtracebase.h" -#define LHF_EXECUTABLE 0x1 - #ifndef DACCESS_COMPILE INDEBUG(DWORD UnlockedLoaderHeap::s_dwNumInstancesOfLoaderHeaps = 0;) @@ -728,15 +726,25 @@ struct LoaderHeapFreeBlock } #endif - void* pMemRW = pMem; - ExecutableWriterHolder memWriterHolder; - if (pHeap->IsExecutable()) +#ifdef DEBUG + if (!pHeap->IsInterleaved()) { - memWriterHolder = ExecutableWriterHolder(pMem, dwTotalSize); - pMemRW = memWriterHolder.GetRW(); + void* pMemRW = 
pMem; + ExecutableWriterHolderNoLog memWriterHolder; + if (pHeap->IsExecutable()) + { + memWriterHolder.AssignExecutableWriterHolder(pMem, dwTotalSize); + pMemRW = memWriterHolder.GetRW(); + } + + memset(pMemRW, 0xcc, dwTotalSize); } + else + { + memset((BYTE*)pMem + GetOsPageSize(), 0xcc, dwTotalSize); + } +#endif // DEBUG - INDEBUG(memset(pMemRW, 0xcc, dwTotalSize);) LoaderHeapFreeBlock *pNewBlock = new (nothrow) LoaderHeapFreeBlock; // If we fail allocating the LoaderHeapFreeBlock, ignore the failure and don't insert the free block at all. if (pNewBlock != NULL) @@ -793,10 +801,10 @@ struct LoaderHeapFreeBlock if (pResult) { void *pResultRW = pResult; - ExecutableWriterHolder resultWriterHolder; + ExecutableWriterHolderNoLog resultWriterHolder; if (pHeap->IsExecutable()) { - resultWriterHolder = ExecutableWriterHolder(pResult, dwSize); + resultWriterHolder.AssignExecutableWriterHolder(pResult, dwSize); pResultRW = resultWriterHolder.GetRW(); } // Callers of loaderheap assume allocated memory is zero-inited so we must preserve this invariant! @@ -828,10 +836,10 @@ struct LoaderHeapFreeBlock size_t dwCombinedSize = dwSize + pNextBlock->m_dwSize; LoaderHeapFreeBlock *pNextNextBlock = pNextBlock->m_pNext; void *pMemRW = pFreeBlock->m_pBlockAddress; - ExecutableWriterHolder memWriterHolder; + ExecutableWriterHolderNoLog memWriterHolder; if (pHeap->IsExecutable()) { - memWriterHolder = ExecutableWriterHolder(pFreeBlock->m_pBlockAddress, dwCombinedSize); + memWriterHolder.AssignExecutableWriterHolder(pFreeBlock->m_pBlockAddress, dwCombinedSize); pMemRW = memWriterHolder.GetRW(); } INDEBUG(memset(pMemRW, 0xcc, dwCombinedSize);) @@ -875,18 +883,23 @@ inline size_t AllocMem_TotalSize(size_t dwRequestedSize, UnlockedLoaderHeap *pHe LIMITED_METHOD_CONTRACT; size_t dwSize = dwRequestedSize; + + // Interleaved heap cannot ad any extra to the requested size + if (!pHeap->IsInterleaved()) + { #ifdef _DEBUG - dwSize += LOADER_HEAP_DEBUG_BOUNDARY; - dwSize = ((dwSize + ALLOC_ALIGN_CONSTANT) & (~ALLOC_ALIGN_CONSTANT)); + dwSize += LOADER_HEAP_DEBUG_BOUNDARY; + dwSize = ((dwSize + ALLOC_ALIGN_CONSTANT) & (~ALLOC_ALIGN_CONSTANT)); #endif - if (!pHeap->m_fExplicitControl) - { + if (!pHeap->m_fExplicitControl) + { #ifdef _DEBUG - dwSize += sizeof(LoaderHeapValidationTag); + dwSize += sizeof(LoaderHeapValidationTag); #endif + } + dwSize = ((dwSize + ALLOC_ALIGN_CONSTANT) & (~ALLOC_ALIGN_CONSTANT)); } - dwSize = ((dwSize + ALLOC_ALIGN_CONSTANT) & (~ALLOC_ALIGN_CONSTANT)); return dwSize; } @@ -919,7 +932,9 @@ UnlockedLoaderHeap::UnlockedLoaderHeap(DWORD dwReserveBlockSize, const BYTE* dwReservedRegionAddress, SIZE_T dwReservedRegionSize, RangeList *pRangeList, - BOOL fMakeExecutable) + HeapKind kind, + void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX), + DWORD dwGranularity) { CONTRACTL { @@ -943,6 +958,8 @@ UnlockedLoaderHeap::UnlockedLoaderHeap(DWORD dwReserveBlockSize, // Round to VIRTUAL_ALLOC_RESERVE_GRANULARITY m_dwTotalAlloc = 0; + m_dwGranularity = dwGranularity; + #ifdef _DEBUG m_dwDebugWastedBytes = 0; s_dwNumInstancesOfLoaderHeaps++; @@ -952,10 +969,10 @@ UnlockedLoaderHeap::UnlockedLoaderHeap(DWORD dwReserveBlockSize, m_fStubUnwindInfoUnregistered= FALSE; #endif - m_Options = 0; + m_kind = kind; - if (fMakeExecutable) - m_Options |= LHF_EXECUTABLE; + _ASSERTE((kind != HeapKind::Interleaved) || (codePageGenerator != NULL)); + m_codePageGenerator = codePageGenerator; m_pFirstFreeBlock = NULL; @@ -1059,12 +1076,6 @@ size_t UnlockedLoaderHeap::GetBytesAvailReservedRegion() return 0; } -#define 
SETUP_NEW_BLOCK(pData, dwSizeToCommit, dwSizeToReserve) \ - m_pPtrToEndOfCommittedRegion = (BYTE *) (pData) + (dwSizeToCommit); \ - m_pAllocPtr = (BYTE *) (pData); \ - m_pEndReservedRegion = (BYTE *) (pData) + (dwSizeToReserve); - - #ifndef DACCESS_COMPILE void ReleaseReservedMemory(BYTE* value) @@ -1132,6 +1143,7 @@ BOOL UnlockedLoaderHeap::UnlockedReservePages(size_t dwSizeToCommit) pData = (BYTE *)ExecutableAllocator::Instance()->Reserve(dwSizeToReserve); if (pData == NULL) { + _ASSERTE(!"Unable to reserve memory range for a loaderheap"); return FALSE; } } @@ -1143,26 +1155,44 @@ BOOL UnlockedLoaderHeap::UnlockedReservePages(size_t dwSizeToCommit) // and notify the user to provide more reserved mem. _ASSERTE((dwSizeToCommit <= dwSizeToReserve) && "Loaderheap tried to commit more memory than reserved by user"); - if (pData == NULL) + if (!fReleaseMemory) { - //_ASSERTE(!"Unable to ClrVirtualAlloc reserve in a loaderheap"); - return FALSE; + pData.SuppressRelease(); } - if (!fReleaseMemory) + size_t dwSizeToCommitPart = dwSizeToCommit; + if (IsInterleaved()) { - pData.SuppressRelease(); + // For interleaved heaps, we perform two commits, each being half of the requested size + dwSizeToCommitPart /= 2; } // Commit first set of pages, since it will contain the LoaderHeapBlock - void *pTemp = ExecutableAllocator::Instance()->Commit(pData, dwSizeToCommit, (m_Options & LHF_EXECUTABLE)); + void *pTemp = ExecutableAllocator::Instance()->Commit(pData, dwSizeToCommitPart, IsExecutable()); if (pTemp == NULL) { - //_ASSERTE(!"Unable to ClrVirtualAlloc commit in a loaderheap"); + _ASSERTE(!"Unable to commit a loaderheap code page"); return FALSE; } + if (IsInterleaved()) + { + _ASSERTE(dwSizeToCommitPart == GetOsPageSize()); + + void *pTemp = ExecutableAllocator::Instance()->Commit((BYTE*)pData + dwSizeToCommitPart, dwSizeToCommitPart, FALSE); + if (pTemp == NULL) + { + _ASSERTE(!"Unable to commit a loaderheap data page"); + + return FALSE; + } + + ExecutableWriterHolder codePageWriterHolder(pData, GetOsPageSize()); + m_codePageGenerator(codePageWriterHolder.GetRW(), pData); + FlushInstructionCache(GetCurrentProcess(), pData, GetOsPageSize()); + } + // Record reserved range in range list, if one is specified // Do this AFTER the commit - otherwise we'll have bogus ranges included. if (m_pRangeList != NULL) @@ -1193,7 +1223,14 @@ BOOL UnlockedLoaderHeap::UnlockedReservePages(size_t dwSizeToCommit) // Add to the linked list m_pFirstBlock = pNewBlock; - SETUP_NEW_BLOCK(pData, dwSizeToCommit, dwSizeToReserve); + if (IsInterleaved()) + { + dwSizeToCommit /= 2; + } + + m_pPtrToEndOfCommittedRegion = (BYTE *) (pData) + (dwSizeToCommit); \ + m_pAllocPtr = (BYTE *) (pData); \ + m_pEndReservedRegion = (BYTE *) (pData) + (dwSizeToReserve); return TRUE; } @@ -1216,30 +1253,108 @@ BOOL UnlockedLoaderHeap::GetMoreCommittedPages(size_t dwMinSize) // If we have memory we can use, what are you doing here! _ASSERTE(dwMinSize > (SIZE_T)(m_pPtrToEndOfCommittedRegion - m_pAllocPtr)); + if (IsInterleaved()) + { + // This mode interleaves data and code pages 1:1. So the code size is required to be smaller than + // or equal to the page size to ensure that the code range is consecutive. + _ASSERTE(dwMinSize <= GetOsPageSize()); + // For interleaved heap, we always get two memory pages - one for code and one for data + dwMinSize = 2 * GetOsPageSize(); + } + // Does this fit in the reserved region? 
if (dwMinSize <= (size_t)(m_pEndReservedRegion - m_pAllocPtr)) { - SIZE_T dwSizeToCommit = (m_pAllocPtr + dwMinSize) - m_pPtrToEndOfCommittedRegion; + SIZE_T dwSizeToCommit; - if (dwSizeToCommit < m_dwCommitBlockSize) - dwSizeToCommit = min((SIZE_T)(m_pEndReservedRegion - m_pPtrToEndOfCommittedRegion), (SIZE_T)m_dwCommitBlockSize); + if (IsInterleaved()) + { + // For interleaved heaps, the allocation cannot cross page boundary since there are data and executable + // pages interleaved in a 1:1 fashion. + dwSizeToCommit = dwMinSize; + } + else + { + dwSizeToCommit = (m_pAllocPtr + dwMinSize) - m_pPtrToEndOfCommittedRegion; + } + + size_t unusedRemainder = (size_t)((BYTE*)m_pPtrToEndOfCommittedRegion - m_pAllocPtr); - // Round to page size - dwSizeToCommit = ALIGN_UP(dwSizeToCommit, GetOsPageSize()); + if (IsInterleaved()) + { + // The end of commited region for interleaved heaps points to the end of the executable + // page and the data pages goes right after that. So we skip the data page here. + m_pPtrToEndOfCommittedRegion += GetOsPageSize(); + } + else + { + if (dwSizeToCommit < m_dwCommitBlockSize) + dwSizeToCommit = min((SIZE_T)(m_pEndReservedRegion - m_pPtrToEndOfCommittedRegion), (SIZE_T)m_dwCommitBlockSize); + + // Round to page size + dwSizeToCommit = ALIGN_UP(dwSizeToCommit, GetOsPageSize()); + } + + size_t dwSizeToCommitPart = dwSizeToCommit; + if (IsInterleaved()) + { + // For interleaved heaps, we perform two commits, each being half of the requested size + dwSizeToCommitPart /= 2; + } // Yes, so commit the desired number of reserved pages - void *pData = ExecutableAllocator::Instance()->Commit(m_pPtrToEndOfCommittedRegion, dwSizeToCommit, (m_Options & LHF_EXECUTABLE)); + void *pData = ExecutableAllocator::Instance()->Commit(m_pPtrToEndOfCommittedRegion, dwSizeToCommitPart, IsExecutable()); if (pData == NULL) + { + _ASSERTE(!"Unable to commit a loaderheap page"); return FALSE; + } + + if (IsInterleaved()) + { + // Commit a data page after the code page + ExecutableAllocator::Instance()->Commit(m_pPtrToEndOfCommittedRegion + dwSizeToCommitPart, dwSizeToCommitPart, FALSE); + + ExecutableWriterHolder codePageWriterHolder((BYTE*)pData, GetOsPageSize()); + m_codePageGenerator(codePageWriterHolder.GetRW(), (BYTE*)pData); + FlushInstructionCache(GetCurrentProcess(), pData, GetOsPageSize()); + // If the remaning bytes are large enough to allocate data of the allocation granularity, add them to the free + // block list. + // Otherwise the remaining bytes that are available will be wasted. + if (unusedRemainder >= m_dwGranularity) + { + LoaderHeapFreeBlock::InsertFreeBlock(&m_pFirstFreeBlock, m_pAllocPtr, unusedRemainder, this); + } + else + { + INDEBUG(m_dwDebugWastedBytes += unusedRemainder;) + } + + // For interleaved heaps, further allocations will start from the newly committed page as they cannot + // cross page boundary. + m_pAllocPtr = (BYTE*)pData; + } + + m_pPtrToEndOfCommittedRegion += dwSizeToCommitPart; m_dwTotalAlloc += dwSizeToCommit; - m_pPtrToEndOfCommittedRegion += dwSizeToCommit; return TRUE; } - // Need to allocate a new set of reserved pages - INDEBUG(m_dwDebugWastedBytes += (size_t)(m_pPtrToEndOfCommittedRegion - m_pAllocPtr);) + // Need to allocate a new set of reserved pages that will be located likely at a nonconsecutive virtual address. + // If the remaning bytes are large enough to allocate data of the allocation granularity, add them to the free + // block list. + // Otherwise the remaining bytes that are available will be wasted. 
+ size_t unusedRemainder = (size_t)(m_pPtrToEndOfCommittedRegion - m_pAllocPtr); + if (unusedRemainder >= AllocMem_TotalSize(m_dwGranularity, this)) + { + LoaderHeapFreeBlock::InsertFreeBlock(&m_pFirstFreeBlock, m_pAllocPtr, unusedRemainder, this); + } + else + { + INDEBUG(m_dwDebugWastedBytes += (size_t)(m_pPtrToEndOfCommittedRegion - m_pAllocPtr);) + } // Note, there are unused reserved pages at end of current region -can't do much about that // Provide dwMinSize here since UnlockedReservePages will round up the commit size again @@ -1321,7 +1436,7 @@ void *UnlockedLoaderHeap::UnlockedAllocMem_NoThrow(size_t dwSize INCONTRACT(_ASSERTE(!ARE_FAULTS_FORBIDDEN())); #ifdef RANDOMIZE_ALLOC - if (!m_fExplicitControl) + if (!m_fExplicitControl && !IsInterleaved()) dwSize += s_random.Next() % 256; #endif @@ -1346,10 +1461,10 @@ void *UnlockedLoaderHeap::UnlockedAllocMem_NoThrow(size_t dwSize { #ifdef _DEBUG BYTE *pAllocatedBytes = (BYTE*)pData; - ExecutableWriterHolder dataWriterHolder; - if (m_Options & LHF_EXECUTABLE) + ExecutableWriterHolderNoLog dataWriterHolder; + if (IsExecutable()) { - dataWriterHolder = ExecutableWriterHolder(pData, dwSize); + dataWriterHolder.AssignExecutableWriterHolder(pData, dwSize); pAllocatedBytes = (BYTE *)dataWriterHolder.GetRW(); } @@ -1363,7 +1478,7 @@ void *UnlockedLoaderHeap::UnlockedAllocMem_NoThrow(size_t dwSize "LoaderHeap must return zero-initialized memory"); } - if (!m_fExplicitControl) + if (!m_fExplicitControl && !IsInterleaved()) { LoaderHeapValidationTag *pTag = AllocMem_GetTag(pAllocatedBytes, dwRequestedSize); pTag->m_allocationType = kAllocMem; @@ -1425,6 +1540,7 @@ void UnlockedLoaderHeap::UnlockedBackoutMem(void *pMem, } #ifdef _DEBUG + if (!IsInterleaved()) { DEBUG_ONLY_REGION(); @@ -1511,7 +1627,7 @@ void UnlockedLoaderHeap::UnlockedBackoutMem(void *pMem, size_t dwSize = AllocMem_TotalSize(dwRequestedSize, this); #ifdef _DEBUG - if (m_dwDebugFlags & kCallTracing) + if ((m_dwDebugFlags & kCallTracing) && !IsInterleaved()) { DEBUG_ONLY_REGION(); @@ -1533,17 +1649,25 @@ void UnlockedLoaderHeap::UnlockedBackoutMem(void *pMem, if (m_pAllocPtr == ( ((BYTE*)pMem) + dwSize )) { - void *pMemRW = pMem; - ExecutableWriterHolder memWriterHolder; - if (m_Options & LHF_EXECUTABLE) + if (IsInterleaved()) { - memWriterHolder = ExecutableWriterHolder(pMem, dwSize); - pMemRW = memWriterHolder.GetRW(); + // Clear the RW page + memset((BYTE*)pMem + GetOsPageSize(), 0x00, dwSize); // Fill freed region with 0 } + else + { + void *pMemRW = pMem; + ExecutableWriterHolderNoLog memWriterHolder; + if (IsExecutable()) + { + memWriterHolder.AssignExecutableWriterHolder(pMem, dwSize); + pMemRW = memWriterHolder.GetRW(); + } - // Cool. This was the last block allocated. We can just undo the allocation instead - // of going to the freelist. - memset(pMemRW, 0x00, dwSize); // Fill freed region with 0 + // Cool. This was the last block allocated. We can just undo the allocation instead + // of going to the freelist. + memset(pMemRW, 0x00, dwSize); // Fill freed region with 0 + } m_pAllocPtr = (BYTE*)pMem; } else @@ -1588,6 +1712,7 @@ void *UnlockedLoaderHeap::UnlockedAllocAlignedMem_NoThrow(size_t dwRequestedSiz PRECONDITION( alignment != 0 ); PRECONDITION(0 == (alignment & (alignment - 1))); // require power of 2 + PRECONDITION((dwRequestedSize % m_dwGranularity) == 0); POSTCONDITION( (RETVAL) ? 
(0 == ( ((UINT_PTR)(RETVAL)) & (alignment - 1))) : // If non-null, pointer must be aligned (pdwExtra == NULL || 0 == *pdwExtra) // or else *pdwExtra must be set to 0 @@ -1632,6 +1757,11 @@ void *UnlockedLoaderHeap::UnlockedAllocAlignedMem_NoThrow(size_t dwRequestedSiz pResult = m_pAllocPtr; size_t extra = alignment - ((size_t)pResult & ((size_t)alignment - 1)); + if ((IsInterleaved())) + { + _ASSERTE(alignment == 1); + extra = 0; + } // On DEBUG, we force a non-zero extra so people don't forget to adjust for it on backout #ifndef _DEBUG @@ -1655,10 +1785,10 @@ void *UnlockedLoaderHeap::UnlockedAllocAlignedMem_NoThrow(size_t dwRequestedSiz #ifdef _DEBUG BYTE *pAllocatedBytes = (BYTE *)pResult; - ExecutableWriterHolder resultWriterHolder; - if (m_Options & LHF_EXECUTABLE) + ExecutableWriterHolderNoLog resultWriterHolder; + if (IsExecutable()) { - resultWriterHolder = ExecutableWriterHolder(pResult, dwSize - extra); + resultWriterHolder.AssignExecutableWriterHolder(pResult, dwSize - extra); pAllocatedBytes = (BYTE *)resultWriterHolder.GetRW(); } @@ -1667,7 +1797,7 @@ void *UnlockedLoaderHeap::UnlockedAllocAlignedMem_NoThrow(size_t dwRequestedSiz memset(pAllocatedBytes + dwRequestedSize, 0xee, LOADER_HEAP_DEBUG_BOUNDARY); #endif - if (dwRequestedSize != 0) + if (dwRequestedSize != 0 && !IsInterleaved()) { _ASSERTE_MSG(pAllocatedBytes[0] == 0 && memcmp(pAllocatedBytes, pAllocatedBytes + 1, dwRequestedSize - 1) == 0, "LoaderHeap must return zero-initialized memory"); @@ -1689,7 +1819,7 @@ void *UnlockedLoaderHeap::UnlockedAllocAlignedMem_NoThrow(size_t dwRequestedSiz EtwAllocRequest(this, pResult, dwSize); - if (!m_fExplicitControl) + if (!m_fExplicitControl && !IsInterleaved()) { LoaderHeapValidationTag *pTag = AllocMem_GetTag(pAllocatedBytes - extra, dwRequestedSize + extra); pTag->m_allocationType = kAllocMem; @@ -1789,7 +1919,12 @@ void *UnlockedLoaderHeap::UnlockedAllocMemForCode_NoThrow(size_t dwHeaderSize, s BOOL UnlockedLoaderHeap::IsExecutable() { - return (m_Options & LHF_EXECUTABLE); + return (m_kind == HeapKind::Executable) || IsInterleaved(); +} + +BOOL UnlockedLoaderHeap::IsInterleaved() +{ + return m_kind == HeapKind::Interleaved; } #ifdef DACCESS_COMPILE @@ -2081,7 +2216,7 @@ void LoaderHeapSniffer::ValidateFreeList(UnlockedLoaderHeap *pHeap) ( ((UINT_PTR)pProbeThis) - ((UINT_PTR)(pPrevEvent->m_pMem)) + pPrevEvent->m_dwSize ) < 1024) { message.AppendASCII("\nThis block is located close to the corruption point. 
"); - if (pPrevEvent->QuietValidate()) + if (!pHeap->IsInterleaved() && pPrevEvent->QuietValidate()) { message.AppendASCII("If it was overrun, it might have caused this."); } diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt index 08a5bb92a66ed..b9147ba7ca93f 100644 --- a/src/coreclr/vm/CMakeLists.txt +++ b/src/coreclr/vm/CMakeLists.txt @@ -665,6 +665,7 @@ if(CLR_CMAKE_TARGET_ARCH_AMD64) ${ARCH_SOURCES_DIR}/PInvokeStubs.asm ${ARCH_SOURCES_DIR}/RedirectedHandledJITCase.asm ${ARCH_SOURCES_DIR}/ThePreStubAMD64.asm + ${ARCH_SOURCES_DIR}/thunktemplates.asm ${ARCH_SOURCES_DIR}/Context.asm ${ARCH_SOURCES_DIR}/ExternalMethodFixupThunk.asm ${ARCH_SOURCES_DIR}/UMThunkStub.asm @@ -681,6 +682,7 @@ elseif(CLR_CMAKE_TARGET_ARCH_I386) ${ARCH_SOURCES_DIR}/gmsasm.asm ${ARCH_SOURCES_DIR}/jithelp.asm ${ARCH_SOURCES_DIR}/PInvokeStubs.asm + ${ARCH_SOURCES_DIR}/thunktemplates.asm ) set(VM_HEADERS_WKS_ARCH_ASM @@ -693,6 +695,7 @@ elseif(CLR_CMAKE_TARGET_ARCH_ARM) ${ARCH_SOURCES_DIR}/ehhelpers.asm ${ARCH_SOURCES_DIR}/patchedcode.asm ${ARCH_SOURCES_DIR}/PInvokeStubs.asm + ${ARCH_SOURCES_DIR}/thunktemplates.asm ) set(VM_HEADERS_WKS_ARCH_ASM @@ -704,6 +707,7 @@ elseif(CLR_CMAKE_TARGET_ARCH_ARM64) ${ARCH_SOURCES_DIR}/CallDescrWorkerARM64.asm ${ARCH_SOURCES_DIR}/CrtHelpers.asm ${ARCH_SOURCES_DIR}/PInvokeStubs.asm + ${ARCH_SOURCES_DIR}/thunktemplates.asm ) set(VM_HEADERS_WKS_ARCH_ASM @@ -726,6 +730,7 @@ else(CLR_CMAKE_TARGET_WIN32) ${ARCH_SOURCES_DIR}/jithelpers_slow.S ${ARCH_SOURCES_DIR}/pinvokestubs.S ${ARCH_SOURCES_DIR}/theprestubamd64.S + ${ARCH_SOURCES_DIR}/thunktemplates.S ${ARCH_SOURCES_DIR}/unixasmhelpers.S ${ARCH_SOURCES_DIR}/umthunkstub.S ${ARCH_SOURCES_DIR}/virtualcallstubamd64.S @@ -738,7 +743,8 @@ else(CLR_CMAKE_TARGET_WIN32) ${ARCH_SOURCES_DIR}/gmsasm.S ${ARCH_SOURCES_DIR}/pinvokestubs.S ${ARCH_SOURCES_DIR}/umthunkstub.S - ) + ${ARCH_SOURCES_DIR}/thunktemplates.S + ) elseif(CLR_CMAKE_TARGET_ARCH_ARM) set(VM_SOURCES_WKS_ARCH_ASM ${ARCH_SOURCES_DIR}/asmhelpers.S @@ -746,6 +752,7 @@ else(CLR_CMAKE_TARGET_WIN32) ${ARCH_SOURCES_DIR}/ehhelpers.S ${ARCH_SOURCES_DIR}/patchedcode.S ${ARCH_SOURCES_DIR}/pinvokestubs.S + ${ARCH_SOURCES_DIR}/thunktemplates.S ) elseif(CLR_CMAKE_TARGET_ARCH_ARM64) set(VM_SOURCES_WKS_ARCH_ASM @@ -753,6 +760,7 @@ else(CLR_CMAKE_TARGET_WIN32) ${ARCH_SOURCES_DIR}/calldescrworkerarm64.S ${ARCH_SOURCES_DIR}/crthelpers.S ${ARCH_SOURCES_DIR}/pinvokestubs.S + ${ARCH_SOURCES_DIR}/thunktemplates.S ) endif() diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 0fd77a277f58b..273f8173415c6 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -239,30 +239,6 @@ NESTED_ENTRY JIT_RareDisableHelper, _TEXT NESTED_END JIT_RareDisableHelper, _TEXT -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; -;; PrecodeFixupThunk -;; -;; The call in fixup precode initally points to this function. -;; The pupose of this function is to load the MethodDesc and forward the call the prestub. -;; -; EXTERN_C VOID __stdcall PrecodeFixupThunk(); -LEAF_ENTRY PrecodeFixupThunk, _TEXT - - pop rax ; Pop the return address. It points right after the call instruction in the precode. 
- - ; Inline computation done by FixupPrecode::GetMethodDesc() - movzx r10,byte ptr [rax+2] ; m_PrecodeChunkIndex - movzx r11,byte ptr [rax+1] ; m_MethodDescChunkIndex - mov rax,qword ptr [rax+r10*8+3] - lea METHODDESC_REGISTER,[rax+r11*8] - - ; Tail call to prestub - jmp ThePreStub - -LEAF_END PrecodeFixupThunk, _TEXT - - ; extern "C" void setFPReturn(int fpSize, INT64 retVal); LEAF_ENTRY setFPReturn, _TEXT cmp ecx, 4 diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index 9d7d3159842b1..5bbdff5576ea6 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -566,6 +566,63 @@ ASMCONSTANTS_C_ASSERT(CallDescrData__returnValue == offsetof(CallDescrD ASMCONSTANTS_C_ASSERT(OFFSETOF__TransitionBlock__m_argumentRegisters == offsetof(TransitionBlock, m_argumentRegisters)) #endif // UNIX_AMD64_ABI +#define FixupPrecodeData__Target 0x00 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__Target == offsetof(FixupPrecodeData, Target)) + +#define FixupPrecodeData__MethodDesc 0x08 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__MethodDesc == offsetof(FixupPrecodeData, MethodDesc)) + +#define FixupPrecodeData__PrecodeFixupThunk 0x10 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__PrecodeFixupThunk == offsetof(FixupPrecodeData, PrecodeFixupThunk)) + +#define StubPrecodeData__Target 0x08 +ASMCONSTANTS_C_ASSERT(StubPrecodeData__Target == offsetof(StubPrecodeData, Target)) + +#define StubPrecodeData__MethodDesc 0x00 +ASMCONSTANTS_C_ASSERT(StubPrecodeData__MethodDesc == offsetof(StubPrecodeData, MethodDesc)) + +#define CallCountingStubData__RemainingCallCountCell 0x00 +ASMCONSTANTS_C_ASSERT(CallCountingStubData__RemainingCallCountCell == offsetof(CallCountingStubData, RemainingCallCountCell)) + +#define CallCountingStubData__TargetForMethod 0x08 +ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForMethod == offsetof(CallCountingStubData, TargetForMethod)) + +#define CallCountingStubData__TargetForThresholdReached 0x10 +ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForThresholdReached == offsetof(CallCountingStubData, TargetForThresholdReached)) + +#define LookupStubData__DispatchToken 0x00 +ASMCONSTANTS_C_ASSERT(LookupStubData__DispatchToken == offsetof(LookupStubData, DispatchToken)) + +#define LookupStubData__ResolveWorkerTarget 0x08 +ASMCONSTANTS_C_ASSERT(LookupStubData__ResolveWorkerTarget == offsetof(LookupStubData, ResolveWorkerTarget)) + +#define DispatchStubData__ExpectedMT 0x00 +ASMCONSTANTS_C_ASSERT(DispatchStubData__ExpectedMT == offsetof(DispatchStubData, ExpectedMT)) + +#define DispatchStubData__ImplTarget 0x08 +ASMCONSTANTS_C_ASSERT(DispatchStubData__ImplTarget == offsetof(DispatchStubData, ImplTarget)) + +#define DispatchStubData__FailTarget 0x10 +ASMCONSTANTS_C_ASSERT(DispatchStubData__FailTarget == offsetof(DispatchStubData, FailTarget)) + +#define ResolveStubData__HashedToken 0x08 +ASMCONSTANTS_C_ASSERT(ResolveStubData__HashedToken == offsetof(ResolveStubData, HashedToken)) + +#define ResolveStubData__CacheAddress 0x00 +ASMCONSTANTS_C_ASSERT(ResolveStubData__CacheAddress == offsetof(ResolveStubData, CacheAddress)) + +#define ResolveStubData__Token 0x10 +ASMCONSTANTS_C_ASSERT(ResolveStubData__Token == offsetof(ResolveStubData, Token)) + +#define ResolveStubData__Counter 0x0c +ASMCONSTANTS_C_ASSERT(ResolveStubData__Counter == offsetof(ResolveStubData, Counter)) + +#define ResolveStubData__ResolveWorkerTarget 0x18 +ASMCONSTANTS_C_ASSERT(ResolveStubData__ResolveWorkerTarget == offsetof(ResolveStubData, ResolveWorkerTarget)) + +#define 
CALL_STUB_CACHE_MASK_ASM 0xfff +ASMCONSTANTS_C_ASSERT(CALL_STUB_CACHE_MASK_ASM == CALL_STUB_CACHE_MASK) + #undef ASMCONSTANTS_RUNTIME_ASSERT #undef ASMCONSTANTS_C_ASSERT #ifndef UNIX_AMD64_ABI diff --git a/src/coreclr/vm/amd64/cgenamd64.cpp b/src/coreclr/vm/amd64/cgenamd64.cpp index 86c74f6e7d344..1e2e3f64b460a 100644 --- a/src/coreclr/vm/amd64/cgenamd64.cpp +++ b/src/coreclr/vm/amd64/cgenamd64.cpp @@ -690,64 +690,6 @@ INT32 rel32UsingPreallocatedJumpStub(INT32 UNALIGNED * pRel32, PCODE target, PCO _ASSERTE(FitsInI4(offset)); return static_cast(offset); } - -BOOL DoesSlotCallPrestub(PCODE pCode) -{ - CONTRACTL { - NOTHROW; - GC_NOTRIGGER; - PRECONDITION(pCode != GetPreStubEntryPoint()); - } CONTRACTL_END; - - // AMD64 has the following possible sequences for prestub logic: - // 1. slot -> temporary entrypoint -> prestub - // 2. slot -> precode -> prestub - // 3. slot -> precode -> jumprel64 (jump stub) -> prestub - // 4. slot -> precode -> jumprel64 (NGEN case) -> prestub - -#ifdef HAS_COMPACT_ENTRYPOINTS - if (MethodDescChunk::GetMethodDescFromCompactEntryPoint(pCode, TRUE) != NULL) - { - return TRUE; - } -#endif - - if (!IS_ALIGNED(pCode, PRECODE_ALIGNMENT)) - { - return FALSE; - } - -#ifdef HAS_FIXUP_PRECODE - if (*PTR_BYTE(pCode) == X86_INSTR_CALL_REL32) - { - // Note that call could have been patched to jmp in the meantime - pCode = rel32Decode(pCode+1); - - // JumpStub - if (isJumpRel64(pCode)) { - pCode = decodeJump64(pCode); - } - - return pCode == (TADDR)PrecodeFixupThunk; - } -#endif - - if (*PTR_USHORT(pCode) != X86_INSTR_MOV_R10_IMM64 || // mov rax,XXXX - *PTR_BYTE(pCode+10) != X86_INSTR_NOP || // nop - *PTR_BYTE(pCode+11) != X86_INSTR_JMP_REL32) // jmp rel32 - { - return FALSE; - } - pCode = rel32Decode(pCode+12); - - // JumpStub - if (isJumpRel64(pCode)) { - pCode = decodeJump64(pCode); - } - - return pCode == GetPreStubEntryPoint(); -} - // // Some AMD64 assembly functions have one or more DWORDS at the end of the function // that specify the offsets where significant instructions are diff --git a/src/coreclr/vm/amd64/cgencpu.h b/src/coreclr/vm/amd64/cgencpu.h index 33589c27bae39..d562627856295 100644 --- a/src/coreclr/vm/amd64/cgencpu.h +++ b/src/coreclr/vm/amd64/cgencpu.h @@ -53,8 +53,6 @@ EXTERN_C void FastCallFinalizeWorker(Object *obj, PCODE funcPtr); #define HAS_NDIRECT_IMPORT_PRECODE 1 #define HAS_FIXUP_PRECODE 1 -#define HAS_FIXUP_PRECODE_CHUNKS 1 -#define FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS 1 // ThisPtrRetBufPrecode one is necessary for closed delegates over static methods with return buffer #define HAS_THISPTR_RETBUF_PRECODE 1 @@ -513,334 +511,5 @@ inline BOOL ClrFlushInstructionCache(LPCVOID pCodeAddr, size_t sizeOfCode) #define JIT_GetSharedNonGCStaticBaseNoCtor JIT_GetSharedNonGCStaticBaseNoCtor_SingleAppDomain -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Call counting - -#ifdef FEATURE_TIERED_COMPILATION - -#define DISABLE_COPY(T) \ - T(const T &) = delete; \ - T &operator =(const T &) = delete - -typedef UINT16 CallCount; -typedef DPTR(CallCount) PTR_CallCount; - -//////////////////////////////////////////////////////////////// -// CallCountingStub - -class CallCountingStub; -typedef DPTR(const CallCountingStub) PTR_CallCountingStub; - -class CallCountingStub -{ -public: - static const SIZE_T Alignment = sizeof(void *); - -#ifndef DACCESS_COMPILE -protected: - static const PCODE TargetForThresholdReached; - - CallCountingStub() = default; - -public: - 
static const CallCountingStub *From(TADDR stubIdentifyingToken); - - PCODE GetEntryPoint() const - { - WRAPPER_NO_CONTRACT; - return PINSTRToPCODE((TADDR)this); - } -#endif // !DACCESS_COMPILE - -public: - PTR_CallCount GetRemainingCallCountCell() const; - PCODE GetTargetForMethod() const; - -#ifndef DACCESS_COMPILE -protected: - template static INT_PTR GetRelativeOffset(const T *relRef, PCODE target) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(T) != 0); - static_assert_no_msg(sizeof(T) <= sizeof(void *)); - static_assert_no_msg((sizeof(T) & (sizeof(T) - 1)) == 0); // is a power of 2 - _ASSERTE(relRef != nullptr); - - TADDR targetAddress = PCODEToPINSTR(target); - _ASSERTE(targetAddress != NULL); - return (INT_PTR)targetAddress - (INT_PTR)(relRef + 1); - } -#endif - -protected: - template static PCODE GetTarget(const T *relRef) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); - _ASSERTE(relRef != nullptr); - - return PINSTRToPCODE((INT_PTR)(relRef + 1) + *relRef); - } - - DISABLE_COPY(CallCountingStub); -}; - -//////////////////////////////////////////////////////////////// -// CallCountingStubShort - -class CallCountingStubShort; -typedef DPTR(const CallCountingStubShort) PTR_CallCountingStubShort; -class CallCountingStubLong; -typedef DPTR(const CallCountingStubLong) PTR_CallCountingStubLong; - -#pragma pack(push, 1) -class CallCountingStubShort : public CallCountingStub -{ -private: - const UINT8 m_part0[2]; - CallCount *const m_remainingCallCountCell; - const UINT8 m_part1[5]; - const INT32 m_rel32TargetForMethod; - const UINT8 m_part2[1]; - const INT32 m_rel32TargetForThresholdReached; - const UINT8 m_alignmentPadding[0]; - -#ifndef DACCESS_COMPILE -public: - CallCountingStubShort(CallCountingStubShort* stubRX, CallCount *remainingCallCountCell, PCODE targetForMethod) - : m_part0{ 0x48, 0xb8}, // mov rax, - m_remainingCallCountCell(remainingCallCountCell), // - m_part1{ 0x66, 0xff, 0x08, // dec word ptr [rax] - 0x0f, 0x85}, // jnz - m_rel32TargetForMethod( // - GetRelative32BitOffset( - &stubRX->m_rel32TargetForMethod, - targetForMethod)), - m_part2{ 0xe8}, // call - m_rel32TargetForThresholdReached( // - GetRelative32BitOffset( - &stubRX->m_rel32TargetForThresholdReached, - TargetForThresholdReached)), - // (rip == stub-identifying token) - m_alignmentPadding{} - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(CallCountingStubShort) % Alignment == 0); - _ASSERTE(remainingCallCountCell != nullptr); - _ASSERTE(PCODEToPINSTR(targetForMethod) != NULL); - } - - static bool Is(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg((offsetof(CallCountingStubShort, m_alignmentPadding[0]) & 1) == 0); - - return (stubIdentifyingToken & 1) == 0; - } - - static const CallCountingStubShort *From(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(stubIdentifyingToken)); - _ASSERTE(stubIdentifyingToken % Alignment == offsetof(CallCountingStubShort, m_alignmentPadding[0]) % Alignment); - - const CallCountingStubShort *stub = - (const CallCountingStubShort *)(stubIdentifyingToken - offsetof(CallCountingStubShort, m_alignmentPadding[0])); - _ASSERTE(IS_ALIGNED(stub, Alignment)); - return stub; - } -#endif // !DACCESS_COMPILE - -public: - static bool Is(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - return dac_cast(callCountingStub)->m_part1[4] == 0x85; - } - - static PTR_CallCountingStubShort From(PTR_CallCountingStub callCountingStub) - { - 
WRAPPER_NO_CONTRACT; - _ASSERTE(Is(callCountingStub)); - - return dac_cast(callCountingStub); - } - - PCODE GetTargetForMethod() const - { - WRAPPER_NO_CONTRACT; - return GetTarget(&m_rel32TargetForMethod); - } - -#ifndef DACCESS_COMPILE -private: - static bool CanUseRelative32BitOffset(const INT32 *rel32Ref, PCODE target) - { - WRAPPER_NO_CONTRACT; - - INT_PTR relativeOffset = GetRelativeOffset(rel32Ref, target); - return (INT32)relativeOffset == relativeOffset; - } - -public: - static bool CanUseFor(const void *allocationAddress, PCODE targetForMethod) - { - WRAPPER_NO_CONTRACT; - - const CallCountingStubShort *fakeStub = (const CallCountingStubShort *)allocationAddress; - return - CanUseRelative32BitOffset(&fakeStub->m_rel32TargetForMethod, targetForMethod) && - CanUseRelative32BitOffset(&fakeStub->m_rel32TargetForThresholdReached, TargetForThresholdReached); - } - -private: - static INT32 GetRelative32BitOffset(const INT32 *rel32Ref, PCODE target) - { - WRAPPER_NO_CONTRACT; - - INT_PTR relativeOffset = GetRelativeOffset(rel32Ref, target); - _ASSERTE((INT32)relativeOffset == relativeOffset); - return (INT32)relativeOffset; - } -#endif // !DACCESS_COMPILE - - friend CallCountingStub; - friend CallCountingStubLong; - DISABLE_COPY(CallCountingStubShort); -}; -#pragma pack(pop) - -//////////////////////////////////////////////////////////////// -// CallCountingStubLong - -#pragma pack(push, 1) -class CallCountingStubLong : public CallCountingStub -{ -private: - const UINT8 m_part0[2]; - CallCount *const m_remainingCallCountCell; - const UINT8 m_part1[7]; - const PCODE m_targetForMethod; - const UINT8 m_part2[4]; - const PCODE m_targetForThresholdReached; - const UINT8 m_part3[2]; - const UINT8 m_alignmentPadding[1]; - -#ifndef DACCESS_COMPILE -public: - CallCountingStubLong(CallCount *remainingCallCountCell, PCODE targetForMethod) - : m_part0{ 0x48, 0xb8}, // mov rax, - m_remainingCallCountCell(remainingCallCountCell), // - m_part1{ 0x66, 0xff, 0x08, // dec word ptr [rax] - 0x74, 0x0c, // jz L0 - 0x48, 0xb8}, // mov rax, - m_targetForMethod(targetForMethod), // - m_part2{ 0xff, 0xe0, // jmp rax - 0x48, 0xb8}, // L0: mov rax, - m_targetForThresholdReached(TargetForThresholdReached), // - m_part3{ 0xff, 0xd0}, // call rax - // (rip == stub-identifying token) - m_alignmentPadding{ 0xcc} // int 3 - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(CallCountingStubLong) % Alignment == 0); - static_assert_no_msg(sizeof(CallCountingStubLong) > sizeof(CallCountingStubShort)); - _ASSERTE(remainingCallCountCell != nullptr); - _ASSERTE(PCODEToPINSTR(targetForMethod) != NULL); - } - - static bool Is(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg((offsetof(CallCountingStubLong, m_alignmentPadding[0]) & 1) != 0); - - return (stubIdentifyingToken & 1) != 0; - } - - static const CallCountingStubLong *From(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(stubIdentifyingToken)); - _ASSERTE(stubIdentifyingToken % Alignment == offsetof(CallCountingStubLong, m_alignmentPadding[0]) % Alignment); - - const CallCountingStubLong *stub = - (const CallCountingStubLong *)(stubIdentifyingToken - offsetof(CallCountingStubLong, m_alignmentPadding[0])); - _ASSERTE(IS_ALIGNED(stub, Alignment)); - return stub; - } -#endif // !DACCESS_COMPILE - -public: - static bool Is(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(offsetof(CallCountingStubShort, m_part1[4]) == offsetof(CallCountingStubLong, m_part1[4])); - 
static_assert_no_msg(sizeof(CallCountingStubShort::m_part1[4]) == sizeof(CallCountingStubLong::m_part1[4])); - - return dac_cast(callCountingStub)->m_part1[4] == 0x0c; - } - - static PTR_CallCountingStubLong From(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(callCountingStub)); - - return dac_cast(callCountingStub); - } - - PCODE GetTargetForMethod() const - { - WRAPPER_NO_CONTRACT; - return m_targetForMethod; - } - - friend CallCountingStub; - DISABLE_COPY(CallCountingStubLong); -}; -#pragma pack(pop) - -//////////////////////////////////////////////////////////////// -// CallCountingStub definitions - -#ifndef DACCESS_COMPILE -inline const CallCountingStub *CallCountingStub::From(TADDR stubIdentifyingToken) -{ - WRAPPER_NO_CONTRACT; - _ASSERTE(stubIdentifyingToken != NULL); - - return - CallCountingStubShort::Is(stubIdentifyingToken) - ? (const CallCountingStub *)CallCountingStubShort::From(stubIdentifyingToken) - : (const CallCountingStub *)CallCountingStubLong::From(stubIdentifyingToken); -} -#endif - -inline PTR_CallCount CallCountingStub::GetRemainingCallCountCell() const -{ - WRAPPER_NO_CONTRACT; - static_assert_no_msg( - offsetof(CallCountingStubShort, m_remainingCallCountCell) == - offsetof(CallCountingStubLong, m_remainingCallCountCell)); - - return PTR_CallCount(dac_cast(this)->m_remainingCallCountCell); -} - -inline PCODE CallCountingStub::GetTargetForMethod() const -{ - WRAPPER_NO_CONTRACT; - - return - CallCountingStubShort::Is(PTR_CallCountingStub(this)) - ? CallCountingStubShort::From(PTR_CallCountingStub(this))->GetTargetForMethod() - : CallCountingStubLong::From(PTR_CallCountingStub(this))->GetTargetForMethod(); -} - -//////////////////////////////////////////////////////////////// - -#undef DISABLE_COPY - -#endif // FEATURE_TIERED_COMPILATION - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #endif // __cgencpu_h__ diff --git a/src/coreclr/vm/amd64/theprestubamd64.S b/src/coreclr/vm/amd64/theprestubamd64.S index 82ddc075de6fe..dd02f70780e2f 100644 --- a/src/coreclr/vm/amd64/theprestubamd64.S +++ b/src/coreclr/vm/amd64/theprestubamd64.S @@ -26,4 +26,3 @@ LEAF_ENTRY ThePreStubPatch, _TEXT PATCH_LABEL ThePreStubPatchLabel ret LEAF_END ThePreStubPatch, _TEXT - diff --git a/src/coreclr/vm/amd64/thunktemplates.S b/src/coreclr/vm/amd64/thunktemplates.S new file mode 100644 index 0000000000000..d2b1cea9108c2 --- /dev/null +++ b/src/coreclr/vm/amd64/thunktemplates.S @@ -0,0 +1,78 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
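The templates that follow all rely on one layout convention: each stub's writable data is placed on the page that immediately follows the page holding its code, at the same offset within the page, so the code can reach its data through a fixed displacement from its own address (RIP-relative on x64, pc-relative literal loads on arm/arm64). A minimal C++ sketch of that convention is below; the FixupPrecodeData field names match the FixupPrecodeData__Target / __MethodDesc / __PrecodeFixupThunk offsets asserted in asmconstants.h, but the pointer types, the constant and the helper are illustrative assumptions rather than runtime code.

    #include <cstdint>

    // Matches PAGE_SIZE = 4096 as defined in the amd64 templates; an assumption for this sketch.
    static const uintptr_t kStubCodePageSize = 4096;

    // Field names mirror the asmconstants offsets; void* stands in for the runtime's
    // PCODE / MethodDesc* types.
    struct FixupPrecodeData
    {
        void* Target;             // where the precode currently jumps
        void* MethodDesc;         // method identity handed to the prestub
        void* PrecodeFixupThunk;  // prestub helper used until the method is compiled
    };

    // The RW data block sits exactly one page after the RX stub code, which is what
    // DATA_SLOT(stub, field) = stubCode + PAGE_SIZE + fieldOffset encodes in the templates.
    inline FixupPrecodeData* GetFixupPrecodeData(uintptr_t stubCodeAddress)
    {
        return reinterpret_cast<FixupPrecodeData*>(stubCodeAddress + kStubCodePageSize);
    }

Because every code page is a copy of the same template, only the data page has to be written when a new stub is handed out on targets that can address the slots pc-relatively.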
+ +.intel_syntax noprefix +#include "unixasmmacros.inc" +#include "asmconstants.h" + +PAGE_SIZE = 4096 + +#define DATA_SLOT(stub, field) (stub##Code + PAGE_SIZE + stub##Data__##field) + +LEAF_ENTRY StubPrecodeCode, _TEXT + mov r10, [rip + DATA_SLOT(StubPrecode, MethodDesc)] + jmp [rip + DATA_SLOT(StubPrecode, Target)] +LEAF_END_MARKED StubPrecodeCode, _TEXT + +LEAF_ENTRY FixupPrecodeCode, _TEXT + jmp [rip + DATA_SLOT(FixupPrecode, Target)] +PATCH_LABEL FixupPrecodeCode_Fixup + mov r10, [rip + DATA_SLOT(FixupPrecode, MethodDesc)] + jmp [rip + DATA_SLOT(FixupPrecode, PrecodeFixupThunk)] +LEAF_END_MARKED FixupPrecodeCode, _TEXT + +LEAF_ENTRY CallCountingStubCode, _TEXT + mov rax,QWORD PTR [rip + DATA_SLOT(CallCountingStub, RemainingCallCountCell)] + dec WORD PTR [rax] + je LOCAL_LABEL(CountReachedZero) + jmp QWORD PTR [rip + DATA_SLOT(CallCountingStub, TargetForMethod)] + LOCAL_LABEL(CountReachedZero): + call QWORD PTR [rip + DATA_SLOT(CallCountingStub, TargetForThresholdReached)] +LEAF_END_MARKED CallCountingStubCode, _TEXT + +LEAF_ENTRY LookupStubCode, _TEXT + push QWORD PTR [rip + DATA_SLOT(LookupStub, DispatchToken)] + jmp QWORD PTR [rip + DATA_SLOT(LookupStub, ResolveWorkerTarget)] +LEAF_END_MARKED LookupStubCode, _TEXT + +LEAF_ENTRY DispatchStubCode, _TEXT + mov rax,QWORD PTR [rip + DATA_SLOT(DispatchStub, ExpectedMT)] +PATCH_LABEL DispatchStubCode_ThisDeref + cmp QWORD PTR [rdi],rax; + jne LOCAL_LABEL(Fail) + jmp QWORD PTR [rip + DATA_SLOT(DispatchStub, ImplTarget)] + LOCAL_LABEL(Fail): + jmp QWORD PTR [rip + DATA_SLOT(DispatchStub, FailTarget)] +LEAF_END_MARKED DispatchStubCode, _TEXT + +LEAF_ENTRY ResolveStubCode, _TEXT +PATCH_LABEL ResolveStubCode_ResolveEntry + LOCAL_LABEL(Resolve): + push rdx + mov r10,QWORD PTR [rip + DATA_SLOT(ResolveStub, CacheAddress)] +PATCH_LABEL ResolveStubCode_ThisDeref + mov rax,QWORD PTR [rdi] + mov rdx,rax + shr rax,12 + add rax,rdx + xor eax,DWORD PTR [rip + DATA_SLOT(ResolveStub, HashedToken)] + and eax, CALL_STUB_CACHE_MASK_ASM * 8 + mov rax,QWORD PTR [r10+rax*1] + mov r10,QWORD PTR [rip + DATA_SLOT(ResolveStub, Token)] + cmp rdx,QWORD PTR [rax] + jne LOCAL_LABEL(Miss) + cmp r10,QWORD PTR [rax+8] + jne LOCAL_LABEL(Miss) + pop rdx + jmp QWORD PTR [rax+0x10] +PATCH_LABEL ResolveStubCode_FailEntry + add DWORD PTR [rip + DATA_SLOT(ResolveStub, Counter)], -1 + jge Resolve + or r11, 1 // SDF_ResolveBackPatch +PATCH_LABEL ResolveStubCode_SlowEntry + push rdx + mov r10,QWORD PTR [rip + DATA_SLOT(ResolveStub, Token)] + LOCAL_LABEL(Miss): + push rax + jmp QWORD PTR [rip + DATA_SLOT(ResolveStub, ResolveWorkerTarget)] +LEAF_END_MARKED ResolveStubCode, _TEXT diff --git a/src/coreclr/vm/amd64/thunktemplates.asm b/src/coreclr/vm/amd64/thunktemplates.asm new file mode 100644 index 0000000000000..076e957f2b552 --- /dev/null +++ b/src/coreclr/vm/amd64/thunktemplates.asm @@ -0,0 +1,80 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. 
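For reference, the cache probe performed by ResolveStubCode (in the GAS template above and the MASM template that follows) reads in C++ roughly as the sketch below. The ResolveCacheElem layout is inferred from the byte offsets the asm uses (compares at +0 and +8, jump through +0x10), and ProbeResolveCache is a hypothetical helper for illustration, not runtime code.

    #include <cstddef>
    #include <cstdint>

    // Layout inferred from the asm; the runtime's own ResolveCacheElem is defined elsewhere.
    struct ResolveCacheElem
    {
        size_t pMT;     // receiver's MethodTable, compared against rdx
        size_t token;   // dispatch token, compared against r10
        size_t target;  // implementation the stub tail-jumps to on a hit
    };

    static const size_t CALL_STUB_CACHE_MASK = 0xfff;  // kept equal to CALL_STUB_CACHE_MASK_ASM

    // cache:       the ResolveStub's CacheAddress data slot (an array of ResolveCacheElem*)
    // mt:          MethodTable of 'this', loaded at ResolveStubCode_ThisDeref
    // hashedToken: the ResolveStub's HashedToken data slot
    inline ResolveCacheElem* ProbeResolveCache(ResolveCacheElem** cache, size_t mt, uint32_t hashedToken)
    {
        size_t hash = (mt >> 12) + mt;                                      // shr 12 / add
        hash ^= hashedToken;                                                // xor with HashedToken
        size_t byteOffset = hash & (CALL_STUB_CACHE_MASK * sizeof(void*));  // and with MASK * 8
        return *reinterpret_cast<ResolveCacheElem**>(
            reinterpret_cast<uintptr_t>(cache) + byteOffset);               // mov rax, [r10 + rax]
    }

The hash itself is unchanged from the byte-emitted ResolveStub removed further down; what changes is that the per-stub values (cache address, hashed token, token, counter and the worker targets) are now loaded from the stub's data page instead of being baked into its instructions.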
+ +include +include AsmConstants.inc + +PAGE_SIZE = 4096 + +DATA_SLOT macro stub, field + exitm @CatStr(stub, , stub, , field) +endm + +LEAF_ENTRY StubPrecodeCode, _TEXT + mov r10, QWORD PTR [DATA_SLOT(StubPrecode, MethodDesc)] + jmp QWORD PTR [DATA_SLOT(StubPrecode, Target)] +LEAF_END_MARKED StubPrecodeCode, _TEXT + +LEAF_ENTRY FixupPrecodeCode, _TEXT + jmp QWORD PTR [DATA_SLOT(FixupPrecode, Target)] +PATCH_LABEL FixupPrecodeCode_Fixup + mov r10, QWORD PTR [DATA_SLOT(FixupPrecode, MethodDesc)] + jmp QWORD PTR [DATA_SLOT(FixupPrecode, PrecodeFixupThunk)] +LEAF_END_MARKED FixupPrecodeCode, _TEXT + +LEAF_ENTRY CallCountingStubCode, _TEXT + mov rax,QWORD PTR [DATA_SLOT(CallCountingStub, RemainingCallCountCell)] + dec WORD PTR [rax] + je CountReachedZero + jmp QWORD PTR [DATA_SLOT(CallCountingStub, TargetForMethod)] + CountReachedZero: + call QWORD PTR [DATA_SLOT(CallCountingStub, TargetForThresholdReached)] +LEAF_END_MARKED CallCountingStubCode, _TEXT + +LEAF_ENTRY LookupStubCode, _TEXT + push QWORD PTR [DATA_SLOT(LookupStub, DispatchToken)] + jmp QWORD PTR [DATA_SLOT(LookupStub, ResolveWorkerTarget)] +LEAF_END_MARKED LookupStubCode, _TEXT + +LEAF_ENTRY DispatchStubCode, _TEXT + mov rax,QWORD PTR [DATA_SLOT(DispatchStub, ExpectedMT)] +PATCH_LABEL DispatchStubCode_ThisDeref + cmp QWORD PTR [rcx],rax; + jne Fail + jmp QWORD PTR [DATA_SLOT(DispatchStub, ImplTarget)] + Fail: + jmp QWORD PTR [DATA_SLOT(DispatchStub, FailTarget)] +LEAF_END_MARKED DispatchStubCode, _TEXT + +LEAF_ENTRY ResolveStubCode, _TEXT +PATCH_LABEL ResolveStubCode_ResolveEntry + push rdx + mov r10,QWORD PTR [DATA_SLOT(ResolveStub, CacheAddress)] +PATCH_LABEL ResolveStubCode_ThisDeref + mov rax,QWORD PTR [rcx] + mov rdx,rax + shr rax,12 + add rax,rdx + xor eax,DWORD PTR [DATA_SLOT(ResolveStub, HashedToken)] + and eax, CALL_STUB_CACHE_MASK_ASM * 8 + mov rax,QWORD PTR [r10+rax*1] + mov r10,QWORD PTR [DATA_SLOT(ResolveStub, Token)] + cmp rdx,QWORD PTR [rax] + jne Miss + cmp r10,QWORD PTR [rax+8] + jne Miss + pop rdx + jmp QWORD PTR [rax+10h] +PATCH_LABEL ResolveStubCode_FailEntry + add DWORD PTR [DATA_SLOT(ResolveStub, Counter)], -1 + jge ResolveStubCode + or r11, 1; SDF_ResolveBackPatch +PATCH_LABEL ResolveStubCode_SlowEntry + push rdx + mov r10,QWORD PTR [DATA_SLOT(ResolveStub, Token)] +Miss: + push rax + jmp QWORD PTR [DATA_SLOT(ResolveStub, ResolveWorkerTarget)] +LEAF_END_MARKED ResolveStubCode, _TEXT + + end diff --git a/src/coreclr/vm/amd64/unixasmhelpers.S b/src/coreclr/vm/amd64/unixasmhelpers.S index 5d9cd711df7d6..7848d068a82cf 100644 --- a/src/coreclr/vm/amd64/unixasmhelpers.S +++ b/src/coreclr/vm/amd64/unixasmhelpers.S @@ -5,29 +5,6 @@ #include "unixasmmacros.inc" #include "asmconstants.h" -////////////////////////////////////////////////////////////////////////// -// -// PrecodeFixupThunk -// -// The call in fixup precode initally points to this function. -// The pupose of this function is to load the MethodDesc and forward the call the prestub. -// -// EXTERN_C VOID __stdcall PrecodeFixupThunk() -LEAF_ENTRY PrecodeFixupThunk, _TEXT - - pop rax // Pop the return address. It points right after the call instruction in the precode. 
- - // Inline computation done by FixupPrecode::GetMethodDesc() - movzx r10,byte ptr [rax+2] // m_PrecodeChunkIndex - movzx r11,byte ptr [rax+1] // m_MethodDescChunkIndex - mov rax,qword ptr [rax+r10*8+3] - lea METHODDESC_REGISTER,[rax+r11*8] - - // Tail call to prestub - jmp C_FUNC(ThePreStub) - -LEAF_END PrecodeFixupThunk, _TEXT - // EXTERN_C int __fastcall HelperMethodFrameRestoreState( // INDEBUG_COMMA(HelperMethodFrame *pFrame) // MachState *pState diff --git a/src/coreclr/vm/amd64/virtualcallstubcpu.hpp b/src/coreclr/vm/amd64/virtualcallstubcpu.hpp index d579633e527c5..12490953e8c41 100644 --- a/src/coreclr/vm/amd64/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/amd64/virtualcallstubcpu.hpp @@ -15,457 +15,16 @@ #ifndef _VIRTUAL_CALL_STUB_AMD64_H #define _VIRTUAL_CALL_STUB_AMD64_H +#define DISPATCH_STUB_FIRST_WORD 0x8B48 +#define DISPATCH_STUB_THIRD_BYTE 0x05 +#define RESOLVE_STUB_FIRST_WORD 0x4C52 +#define LOOKUP_STUB_FIRST_WORD 0x35FF +#define VTABLECALL_STUB_FIRST_WORD 0x8B48 + #include "dbginterface.h" //#define STUB_LOGGING - #pragma pack(push, 1) -// since we are placing code, we want byte packing of the structs - -// Codes of the instruction in the stub where the instruction access violation -// is converted to NullReferenceException at the caller site. -#ifdef UNIX_AMD64_ABI -#define X64_INSTR_CMP_IND_THIS_REG_RAX 0x073948 -#define X64_INSTR_MOV_RAX_IND_THIS_REG 0x078b48 -#else // UNIX_AMD64_ABI -#define X64_INSTR_CMP_IND_THIS_REG_RAX 0x013948 -#define X64_INSTR_MOV_RAX_IND_THIS_REG 0x018b48 -#endif // UNIX_AMD64_ABI - -#define USES_LOOKUP_STUBS 1 - -/********************************************************************************************* -Stubs that contain code are all part of larger structs called Holders. There is a -Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are -essentially an implementation trick that allowed rearranging the code sequences more -easily while trying out different alternatives, and for dealing with any alignment -issues in a way that was mostly immune to the actually code sequences. These Holders -should be revisited when the stub code sequences are fixed, since in many cases they -add extra space to a stub that is not really needed. - -Stubs are placed in cache and hash tables. Since unaligned access of data in memory -is very slow, the keys used in those tables should be aligned. The things used as keys -typically also occur in the generated code, e.g. a token as an immediate part of an instruction. -For now, to avoid alignment computations as different code strategies are tried out, the key -fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction -streams aligned so that the immediate fields fall on aligned boundaries. -*/ - -#if USES_LOOKUP_STUBS - -struct LookupStub; -struct LookupHolder; - -/*LookupStub************************************************************************************** -Virtual and interface call sites are initially setup to point at LookupStubs. -This is because the runtime type of the pointer is not yet known, -so the target cannot be resolved. Note: if the jit is able to determine the runtime type -of the pointer, it should be generating a direct call not a virtual or interface call. 
-This stub pushes a lookup token onto the stack to identify the sought after method, and then -jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and -transfer of control to the appropriate target method implementation, perhaps patching of the call site -along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs -get quickly changed to point to another kind of stub. -*/ -struct LookupStub -{ - inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } - - inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } - -private: - friend struct LookupHolder; - - // The lookup entry point starts with a nop in order to allow us to quickly see - // if the stub is lookup stub or a dispatch stub. We can read thye first byte - // of a stub to find out what kind of a stub we have. - - BYTE _entryPoint [3]; // 90 nop - // 48 B8 mov rax, - size_t _token; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part2 [3]; // 50 push rax - // 48 B8 mov rax, - size_t _resolveWorkerAddr; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part3 [2]; // FF E0 jmp rax -}; - -/* LookupHolders are the containers for LookupStubs, they provide for any alignment of -stubs as necessary. In the case of LookupStubs, alignment is necessary since -LookupStubs are placed in a hash table keyed by token. */ -struct LookupHolder -{ - static void InitializeStatic(); - - void Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken); - - LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static LookupHolder* FromLookupEntry(PCODE lookupEntry); - -private: - friend struct LookupStub; - - LookupStub _stub; -}; - -#endif // USES_LOOKUP_STUBS - -struct DispatchStub; -struct DispatchStubShort; -struct DispatchStubLong; -struct DispatchHolder; - -/*DispatchStub************************************************************************************** -The structure of a full dispatch stub in memory is a DispatchStub followed contiguously in memory -by either a DispatchStubShort of a DispatchStubLong. DispatchStubShort is used when the resolve -stub (failTarget()) is reachable by a rel32 (DISPL) jump. We make a pretty good effort to make sure -that the stub heaps are set up so that this is the case. If we allocate enough stubs that the heap -end up allocating in a new block that is further away than a DISPL jump can go, then we end up using -a DispatchStubLong which is bigger but is a full 64-bit jump. */ - -/*DispatchStubShort********************************************************************************* -This is the logical continuation of DispatchStub for the case when the failure target is within -a rel32 jump (DISPL). 
*/ -struct DispatchStubShort -{ - friend struct DispatchHolder; - friend struct DispatchStub; - - static BOOL isShortStub(LPCBYTE pCode); - inline PCODE implTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _implTarget; } - - inline TADDR implTargetSlot() const - { - LIMITED_METHOD_CONTRACT; - return (TADDR)&_implTarget; - } - - inline PCODE failTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) &_failDispl + sizeof(DISPL) + _failDispl; } - -private: - BYTE part1 [2]; // 48 B8 mov rax, - size_t _implTarget; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part2[2]; // 0f 85 jne - DISPL _failDispl; // xx xx xx xx failEntry ;must be forward jmp for perf reasons - BYTE part3 [2]; // FF E0 jmp rax -}; - -#define DispatchStubShort_offsetof_failDisplBase (offsetof(DispatchStubLong, _failDispl) + sizeof(DISPL)) - -inline BOOL DispatchStubShort::isShortStub(LPCBYTE pCode) -{ - LIMITED_METHOD_CONTRACT; - return reinterpret_cast(pCode)->part2[0] == 0x0f; -} - - -/*DispatchStubLong********************************************************************************** -This is the logical continuation of DispatchStub for the case when the failure target is not -reachable by a rel32 jump (DISPL). */ -struct DispatchStubLong -{ - friend struct DispatchHolder; - friend struct DispatchStub; - - static inline BOOL isLongStub(LPCBYTE pCode); - inline PCODE implTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _implTarget; } - - inline TADDR implTargetSlot() const - { - LIMITED_METHOD_CONTRACT; - return (TADDR)&_implTarget; - } - - inline PCODE failTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _failTarget; } - -private: - BYTE part1[2]; // 48 B8 mov rax, - size_t _implTarget; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part2 [1]; // 75 jne - BYTE _failDispl; // xx failLabel - BYTE part3 [2]; // FF E0 jmp rax - // failLabel: - BYTE part4 [2]; // 48 B8 mov rax, - size_t _failTarget; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part5 [2]; // FF E0 jmp rax -}; - -#define DispatchStubLong_offsetof_failDisplBase (offsetof(DispatchStubLong, _failDispl) + sizeof(BYTE)) -#define DispatchStubLong_offsetof_failLabel (offsetof(DispatchStubLong, part4[0])) - -inline BOOL DispatchStubLong::isLongStub(LPCBYTE pCode) -{ - LIMITED_METHOD_CONTRACT; - return reinterpret_cast(pCode)->part2[0] == 0x75; -} - -/*DispatchStub************************************************************************************** -Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. -A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). -If the calling frame does in fact have the type be of the expected type, then -control is transfered to the target address, the method implementation. If not, -then control is transfered to the fail address, a fail stub (see below) where a polymorphic -lookup is done to find the correct address to go to. - -implementation note: Order, choice of instructions, and branch directions -should be carefully tuned since it can have an inordinate effect on performance. Particular -attention needs to be paid to the effects on the BTB and branch prediction, both in the small -and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. 
-Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched -to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important -that the branch prediction staticly predict this, which means it must be a forward jump. The alternative -is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" -is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier -to control the placement of the stubs than control the placement of the jitted code and the stubs. */ -struct DispatchStub -{ - friend struct DispatchHolder; - - enum DispatchStubType - { - e_TYPE_SHORT, - e_TYPE_LONG, - }; - - inline DispatchStubType type() const - { - LIMITED_METHOD_CONTRACT; - CONSISTENCY_CHECK(DispatchStubShort::isShortStub(reinterpret_cast(this + 1)) - || DispatchStubLong::isLongStub(reinterpret_cast(this + 1))); - return DispatchStubShort::isShortStub((BYTE *)(this + 1)) ? e_TYPE_SHORT : e_TYPE_LONG; - } - - inline static size_t size(DispatchStubType type) - { - STATIC_CONTRACT_LEAF; - return sizeof(DispatchStub) + - ((type == e_TYPE_SHORT) ? sizeof(DispatchStubShort) : sizeof(DispatchStubLong)); - } - - inline PCODE entryPoint() const { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } - inline size_t expectedMT() const { LIMITED_METHOD_CONTRACT; return _expectedMT; } - inline size_t size() const { WRAPPER_NO_CONTRACT; return size(type()); } - - inline PCODE implTarget() const - { - LIMITED_METHOD_CONTRACT; - if (type() == e_TYPE_SHORT) - return getShortStub()->implTarget(); - else - return getLongStub()->implTarget(); - } - - inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const - { - LIMITED_METHOD_CONTRACT; - _ASSERTE(slotTypeRef != nullptr); - - *slotTypeRef = EntryPointSlots::SlotType_Executable; - if (type() == e_TYPE_SHORT) - return getShortStub()->implTargetSlot(); - else - return getLongStub()->implTargetSlot(); - } - - inline PCODE failTarget() const - { - if (type() == e_TYPE_SHORT) - return getShortStub()->failTarget(); - else - return getLongStub()->failTarget(); - } - -private: - inline DispatchStubShort const *getShortStub() const - { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this + 1); } - - inline DispatchStubLong const *getLongStub() const - { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this + 1); } - - BYTE _entryPoint [2]; // 48 B8 mov rax, - size_t _expectedMT; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part1 [3]; // 48 39 XX cmp [THIS_REG], rax - BYTE nopOp; // 90 nop ; 1-byte nop to align _implTarget - - // Followed by either DispatchStubShort or DispatchStubLong, depending - // on whether we were able to make a rel32 or had to make an abs64 jump - // to the resolve stub on failure. - -}; - -/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of -stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both -are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, -since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently -(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify -alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. 
-While the token field can be logically gotten by following the failure target to the failEntryPoint -of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. -This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct -for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when -they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). -*/ - -/* @workaround for ee resolution - Since the EE does not currently have a resolver function that -does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are -using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable -is in fact written. Hence we have moved target out into the holder and aligned it so we can -atomically update it. When we get a resolver function that does what we want, we can drop this field, -and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ -struct DispatchHolder -{ - static void InitializeStatic(); - - void Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT, - DispatchStub::DispatchStubType type); - - static size_t GetHolderSize(DispatchStub::DispatchStubType type) - { STATIC_CONTRACT_WRAPPER; return DispatchStub::size(type); } - - static BOOL CanShortJumpDispatchStubReachFailTarget(PCODE failTarget, LPCBYTE stubMemory) - { - STATIC_CONTRACT_WRAPPER; - LPCBYTE pFrom = stubMemory + sizeof(DispatchStub) + DispatchStubShort_offsetof_failDisplBase; - size_t cbRelJump = failTarget - (PCODE)pFrom; - return FitsInI4(cbRelJump); - } - - DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } - - static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry); - -private: - // DispatchStub follows here. It is dynamically sized on allocation - // because it could be a DispatchStubLong or a DispatchStubShort -}; - -struct ResolveStub; -struct ResolveHolder; - -/*ResolveStub************************************************************************************** -Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only -one resolver stub built for any given token, even though there may be many call sites that -use that token and many distinct types that are used in the calling call frames. A resolver stub -actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their -expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should -be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, -even though they are actually allocated as a single contiguous block of memory. These pieces are: - -A ResolveStub has two entry points: - -FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does -a check to see how often we are actually failing. If failures are frequent, control transfers to the -patch piece to cause the call site to be changed from a mostly monomorphic callsite -(calls dispatch stub) to a polymorphic callsize (calls resolve stub). If failures are rare, control -transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter -every time it is entered. The ee at various times will add a large chunk to the counter. 
- -ResolveEntry - does a lookup via in a cache by hashing the actual type of the calling frame s - and the token identifying the (contract,method) pair desired. If found, control is transfered -to the method implementation. If not found in the cache, the token is pushed and the ee is entered via -the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since -there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. -The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, -as well as its speed. It turns out it is very important to make the hash function sensitive to all -of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before -making any changes to the code sequences here, it is very important to measure and tune them as perf -can vary greatly, in unexpected ways, with seeming minor changes. - -Implementation note - Order, choice of instructions, and branch directions -should be carefully tuned since it can have an inordinate effect on performance. Particular -attention needs to be paid to the effects on the BTB and branch prediction, both in the small -and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. -Note that this stub is called in highly polymorphic cases, but the cache should have been sized -and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should -mostly be going down the cache hit route, and it is important that this be statically predicted as so. -Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically -gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries -is important. */ - -struct ResolveStub -{ - inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; } - inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint[0]; } - inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; } - - inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } - inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } - inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } - inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } - -private: - friend struct ResolveHolder; - - BYTE _resolveEntryPoint[3];// resolveStub: - // 52 push rdx - // 49 BA mov r10, - size_t _cacheAddress; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part1 [15]; // 48 8B XX mov rax, [THIS_REG] ; Compute hash = ((MT + MT>>12) ^ prehash) - // 48 8B D0 mov rdx, rax ; rdx <- current MethodTable - // 48 C1 E8 0C shr rax, 12 - // 48 03 C2 add rax, rdx - // 48 35 xor rax, - UINT32 _hashedToken; // xx xx xx xx hashedtoken ; xor with pre-hashed token - BYTE part2 [2]; // 48 25 and rax, - UINT32 mask; // xx xx xx xx cache_mask ; and with cache mask - BYTE part3 [6]; // 4A 8B 04 10 mov rax, [r10 + rax] ; get cache entry address - // 49 BA mov r10, - size_t _token; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part4 [3]; // 48 3B 50 cmp rdx, [rax+ ; compare our MT vs. 
cache MT - BYTE mtOffset; // xx ResolverCacheElem.pMT] - BYTE part5 [1]; // 75 jne - BYTE toMiss1; // xx miss ; must be forward jump, for perf reasons - BYTE part6 [3]; // 4C 3B 50 cmp r10, [rax+ ; compare our token vs. cache token - BYTE tokenOffset; // xx ResolverCacheElem.token] - BYTE part7 [1]; // 75 jne - BYTE toMiss2; // xx miss ; must be forward jump, for perf reasons - BYTE part8 [3]; // 48 8B 40 mov rax, [rax+ ; setup rax with method impl address - BYTE targetOffset; // xx ResolverCacheElem.target] - BYTE part9 [3]; // 5A pop rdx - // FF E0 jmp rax - // failStub: - BYTE _failEntryPoint [2]; // 48 B8 mov rax, - INT32* _pCounter; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part11 [4]; // 83 00 FF add dword ptr [rax], -1 - // 7d jnl - BYTE toResolveStub1; // xx resolveStub - BYTE part12 [4]; // 49 83 CB 01 or r11, 1 - BYTE _slowEntryPoint [3]; // 52 slow: push rdx - // 49 BA mov r10, - size_t _tokenSlow; // xx xx xx xx xx xx xx xx 64-bit address -// BYTE miss [5]; // 5A miss: pop rdx ; don't pop rdx -// // 41 52 push r10 ; don't push r10 leave it setup with token - BYTE miss [3]; // 50 push rax ; push ptr to cache elem - // 48 B8 mov rax, - size_t _resolveWorker; // xx xx xx xx xx xx xx xx 64-bit address - BYTE part10 [2]; // FF E0 jmp rax -}; - -/* ResolveHolders are the containers for ResolveStubs, They provide -for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by -the token for which they are built. Efficiency of access requires that this token be aligned. -For now, we have copied that field into the ResolveHolder itself, if the resolve stub is arranged such that -any of its inlined tokens (non-prehashed) is aligned, then the token field in the ResolveHolder -is not needed. */ -struct ResolveHolder -{ - static void InitializeStatic(); - - void Initialize(ResolveHolder* pResolveHolderRX, - PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32* counterAddr); - - ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static ResolveHolder* FromFailEntry(PCODE resolveEntry); - static ResolveHolder* FromResolveEntry(PCODE resolveEntry); - -private: - ResolveStub _stub; -}; /*VTableCallStub************************************************************************************** These are jump stubs that perform a vtable-base virtual call. These stubs assume that an object is placed @@ -511,6 +70,7 @@ struct VTableCallHolder VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } + size_t size() { return stub()->size(); } static size_t GetHolderSize(unsigned slot) { STATIC_CONTRACT_WRAPPER; @@ -529,12 +89,6 @@ struct VTableCallHolder #ifdef DECLARE_DATA -LookupStub lookupInit; -DispatchStub dispatchInit; -DispatchStubShort dispatchShortInit; -DispatchStubLong dispatchLongInit; -ResolveStub resolveInit; - #define INSTR_INT3 0xcc #define INSTR_NOP 0x90 @@ -542,287 +96,8 @@ ResolveStub resolveInit; #include "asmconstants.h" -#ifdef STUB_LOGGING -extern size_t g_lookup_inline_counter; -extern size_t g_call_inline_counter; -extern size_t g_miss_inline_counter; -extern size_t g_call_cache_counter; -extern size_t g_miss_cache_counter; -#endif - -/* Template used to generate the stub. We generate a stub by allocating a block of - memory and copy the template over it and just update the specific fields that need - to be changed. 
-*/ - -void LookupHolder::InitializeStatic() -{ - static_assert_no_msg((sizeof(LookupHolder) % sizeof(void*)) == 0); - - // The first instruction of a LookupStub is nop - // and we use it in order to differentiate the first two bytes - // of a LookupStub and a ResolveStub - lookupInit._entryPoint [0] = INSTR_NOP; - lookupInit._entryPoint [1] = 0x48; - lookupInit._entryPoint [2] = 0xB8; - lookupInit._token = 0xcccccccccccccccc; - lookupInit.part2 [0] = 0x50; - lookupInit.part2 [1] = 0x48; - lookupInit.part2 [2] = 0xB8; - lookupInit._resolveWorkerAddr = 0xcccccccccccccccc; - lookupInit.part3 [0] = 0xFF; - lookupInit.part3 [1] = 0xE0; -} - -void LookupHolder::Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken) -{ - _stub = lookupInit; - - //fill in the stub specific fields - _stub._token = dispatchToken; - _stub._resolveWorkerAddr = (size_t) resolveWorkerTarget; -} - -/* Template used to generate the stub. We generate a stub by allocating a block of - memory and copy the template over it and just update the specific fields that need - to be changed. -*/ - -void DispatchHolder::InitializeStatic() -{ - // Check that _implTarget is aligned in the DispatchStub for backpatching - static_assert_no_msg(((sizeof(DispatchStub) + offsetof(DispatchStubShort, _implTarget)) % sizeof(void *)) == 0); - static_assert_no_msg(((sizeof(DispatchStub) + offsetof(DispatchStubLong, _implTarget)) % sizeof(void *)) == 0); - - static_assert_no_msg(((sizeof(DispatchStub) + sizeof(DispatchStubShort)) % sizeof(void*)) == 0); - static_assert_no_msg(((sizeof(DispatchStub) + sizeof(DispatchStubLong)) % sizeof(void*)) == 0); - static_assert_no_msg((DispatchStubLong_offsetof_failLabel - DispatchStubLong_offsetof_failDisplBase) < INT8_MAX); - - // Common dispatch stub initialization - dispatchInit._entryPoint [0] = 0x48; - dispatchInit._entryPoint [1] = 0xB8; - dispatchInit._expectedMT = 0xcccccccccccccccc; - dispatchInit.part1 [0] = X64_INSTR_CMP_IND_THIS_REG_RAX & 0xff; - dispatchInit.part1 [1] = (X64_INSTR_CMP_IND_THIS_REG_RAX >> 8) & 0xff; - dispatchInit.part1 [2] = (X64_INSTR_CMP_IND_THIS_REG_RAX >> 16) & 0xff; - dispatchInit.nopOp = 0x90; - - // Short dispatch stub initialization - dispatchShortInit.part1 [0] = 0x48; - dispatchShortInit.part1 [1] = 0xb8; - dispatchShortInit._implTarget = 0xcccccccccccccccc; - dispatchShortInit.part2 [0] = 0x0F; - dispatchShortInit.part2 [1] = 0x85; - dispatchShortInit._failDispl = 0xcccccccc; - dispatchShortInit.part3 [0] = 0xFF; - dispatchShortInit.part3 [1] = 0xE0; - - // Long dispatch stub initialization - dispatchLongInit.part1 [0] = 0x48; - dispatchLongInit.part1 [1] = 0xb8; - dispatchLongInit._implTarget = 0xcccccccccccccccc; - dispatchLongInit.part2 [0] = 0x75; - dispatchLongInit._failDispl = BYTE(DispatchStubLong_offsetof_failLabel - DispatchStubLong_offsetof_failDisplBase); - dispatchLongInit.part3 [0] = 0xFF; - dispatchLongInit.part3 [1] = 0xE0; - // failLabel: - dispatchLongInit.part4 [0] = 0x48; - dispatchLongInit.part4 [1] = 0xb8; - dispatchLongInit._failTarget = 0xcccccccccccccccc; - dispatchLongInit.part5 [0] = 0xFF; - dispatchLongInit.part5 [1] = 0xE0; -}; - -void DispatchHolder::Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT, - DispatchStub::DispatchStubType type) -{ - // - // Initialize the common area - // - - // initialize the static data - *stub() = dispatchInit; - - // fill in the dynamic data - stub()->_expectedMT = expectedMT; - - // - // Initialize the short/long areas - 
// - if (type == DispatchStub::e_TYPE_SHORT) - { - DispatchStubShort *shortStubRW = const_cast(stub()->getShortStub()); - DispatchStubShort *shortStubRX = const_cast(pDispatchHolderRX->stub()->getShortStub()); - - // initialize the static data - *shortStubRW = dispatchShortInit; - - // fill in the dynamic data - size_t displ = (failTarget - ((PCODE) &shortStubRX->_failDispl + sizeof(DISPL))); - CONSISTENCY_CHECK(FitsInI4(displ)); - shortStubRW->_failDispl = (DISPL) displ; - shortStubRW->_implTarget = (size_t) implTarget; - CONSISTENCY_CHECK((PCODE)&shortStubRX->_failDispl + sizeof(DISPL) + shortStubRX->_failDispl == failTarget); - } - else - { - CONSISTENCY_CHECK(type == DispatchStub::e_TYPE_LONG); - DispatchStubLong *longStub = const_cast(stub()->getLongStub()); - - // initialize the static data - *longStub = dispatchLongInit; - - // fill in the dynamic data - longStub->_implTarget = implTarget; - longStub->_failTarget = failTarget; - } -} - -/* Template used to generate the stub. We generate a stub by allocating a block of - memory and copy the template over it and just update the specific fields that need - to be changed. -*/ - -void ResolveHolder::InitializeStatic() -{ - static_assert_no_msg((sizeof(ResolveHolder) % sizeof(void*)) == 0); - - resolveInit._resolveEntryPoint [0] = 0x52; - resolveInit._resolveEntryPoint [1] = 0x49; - resolveInit._resolveEntryPoint [2] = 0xBA; - resolveInit._cacheAddress = 0xcccccccccccccccc; - resolveInit.part1 [ 0] = X64_INSTR_MOV_RAX_IND_THIS_REG & 0xff; - resolveInit.part1 [ 1] = (X64_INSTR_MOV_RAX_IND_THIS_REG >> 8) & 0xff; - resolveInit.part1 [ 2] = (X64_INSTR_MOV_RAX_IND_THIS_REG >> 16) & 0xff; - resolveInit.part1 [ 3] = 0x48; - resolveInit.part1 [ 4] = 0x8B; - resolveInit.part1 [ 5] = 0xD0; - resolveInit.part1 [ 6] = 0x48; - resolveInit.part1 [ 7] = 0xC1; - resolveInit.part1 [ 8] = 0xE8; - resolveInit.part1 [ 9] = CALL_STUB_CACHE_NUM_BITS; - resolveInit.part1 [10] = 0x48; - resolveInit.part1 [11] = 0x03; - resolveInit.part1 [12] = 0xC2; - resolveInit.part1 [13] = 0x48; - resolveInit.part1 [14] = 0x35; -// Review truncation from unsigned __int64 to UINT32 of a constant value. 
-#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable:4305 4309) -#endif // defined(_MSC_VER) - - resolveInit._hashedToken = 0xcccccccc; - -#if defined(_MSC_VER) -#pragma warning(pop) -#endif // defined(_MSC_VER) - - resolveInit.part2 [ 0] = 0x48; - resolveInit.part2 [ 1] = 0x25; - resolveInit.mask = CALL_STUB_CACHE_MASK*sizeof(void *); - resolveInit.part3 [0] = 0x4A; - resolveInit.part3 [1] = 0x8B; - resolveInit.part3 [2] = 0x04; - resolveInit.part3 [3] = 0x10; - resolveInit.part3 [4] = 0x49; - resolveInit.part3 [5] = 0xBA; - resolveInit._token = 0xcccccccccccccccc; - resolveInit.part4 [0] = 0x48; - resolveInit.part4 [1] = 0x3B; - resolveInit.part4 [2] = 0x50; - resolveInit.mtOffset = offsetof(ResolveCacheElem,pMT) & 0xFF; - resolveInit.part5 [0] = 0x75; - resolveInit.toMiss1 = (offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss1)+1)) & 0xFF; - resolveInit.part6 [0] = 0x4C; - resolveInit.part6 [1] = 0x3B; - resolveInit.part6 [2] = 0x50; - resolveInit.tokenOffset = offsetof(ResolveCacheElem,token) & 0xFF; - resolveInit.part7 [0] = 0x75; - resolveInit.toMiss2 = (offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss2)+1)) & 0xFF; - resolveInit.part8 [0] = 0x48; - resolveInit.part8 [1] = 0x8B; - resolveInit.part8 [2] = 0x40; - resolveInit.targetOffset = offsetof(ResolveCacheElem,target) & 0xFF; - resolveInit.part9 [0] = 0x5A; - resolveInit.part9 [1] = 0xFF; - resolveInit.part9 [2] = 0xE0; - resolveInit._failEntryPoint [0] = 0x48; - resolveInit._failEntryPoint [1] = 0xB8; - resolveInit._pCounter = (INT32*) (size_t) 0xcccccccccccccccc; - resolveInit.part11 [0] = 0x83; - resolveInit.part11 [1] = 0x00; - resolveInit.part11 [2] = 0xFF; - resolveInit.part11 [3] = 0x7D; - resolveInit.toResolveStub1 = (offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, toResolveStub1)+1)) & 0xFF; - resolveInit.part12 [0] = 0x49; - resolveInit.part12 [1] = 0x83; - resolveInit.part12 [2] = 0xCB; - resolveInit.part12 [3] = 0x01; - resolveInit._slowEntryPoint [0] = 0x52; - resolveInit._slowEntryPoint [1] = 0x49; - resolveInit._slowEntryPoint [2] = 0xBA; - resolveInit._tokenSlow = 0xcccccccccccccccc; - resolveInit.miss [0] = 0x50; - resolveInit.miss [1] = 0x48; - resolveInit.miss [2] = 0xB8; - resolveInit._resolveWorker = 0xcccccccccccccccc; - resolveInit.part10 [0] = 0xFF; - resolveInit.part10 [1] = 0xE0; -}; - -void ResolveHolder::Initialize(ResolveHolder* pResolveHolderRX, - PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32* counterAddr) -{ - _stub = resolveInit; - - //fill in the stub specific fields - _stub._cacheAddress = (size_t) cacheAddr; - _stub._hashedToken = hashedToken << LOG2_PTRSIZE; - _stub._token = dispatchToken; - _stub._tokenSlow = dispatchToken; - _stub._resolveWorker = (size_t) resolveWorkerTarget; - _stub._pCounter = counterAddr; -} - -ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) -{ - LIMITED_METHOD_CONTRACT; - ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); - _ASSERTE(resolveHolder->_stub._resolveEntryPoint[1] == resolveInit._resolveEntryPoint[1]); - return resolveHolder; -} - #endif // DACCESS_COMPILE -LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry) -{ - LIMITED_METHOD_CONTRACT; - LookupHolder* lookupHolder = (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); - _ASSERTE(lookupHolder->_stub._entryPoint[2] == 
lookupInit._entryPoint[2]); - return lookupHolder; -} - - -DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry) -{ - LIMITED_METHOD_CONTRACT; - DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchStub, _entryPoint) ); - _ASSERTE(dispatchHolder->stub()->_entryPoint[1] == dispatchInit._entryPoint[1]); - return dispatchHolder; -} - - -ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) -{ - LIMITED_METHOD_CONTRACT; - ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); - _ASSERTE(resolveHolder->_stub._resolveEntryPoint[1] == resolveInit._resolveEntryPoint[1]); - return resolveHolder; -} - void VTableCallHolder::Initialize(unsigned slot) { unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE; @@ -885,23 +160,19 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s WORD firstWord = *((WORD*) stubStartAddress); - if (firstWord == 0xB848) + if (firstWord == DISPATCH_STUB_FIRST_WORD && *((BYTE*)stubStartAddress + 2) == DISPATCH_STUB_THIRD_BYTE) { stubKind = SK_DISPATCH; } - else if (firstWord == 0x4890) + else if (firstWord == LOOKUP_STUB_FIRST_WORD) { stubKind = SK_LOOKUP; } - else if (firstWord == 0x4952) + else if (firstWord == RESOLVE_STUB_FIRST_WORD) { stubKind = SK_RESOLVE; } - else if (firstWord == 0x48F8) - { - stubKind = SK_LOOKUP; - } - else if (firstWord == 0x8B48) + else if (firstWord == VTABLECALL_STUB_FIRST_WORD) { stubKind = SK_VTABLECALL; } diff --git a/src/coreclr/vm/appdomain.cpp b/src/coreclr/vm/appdomain.cpp index 5171b57c565da..55076560a46a4 100644 --- a/src/coreclr/vm/appdomain.cpp +++ b/src/coreclr/vm/appdomain.cpp @@ -3176,7 +3176,6 @@ DomainAssembly * AppDomain::FindAssembly(PEAssembly * pPEAssembly, FindAssemblyO if (pManifestFile && pManifestFile->Equals(pPEAssembly)) { - // Caller already has PEAssembly, so we can give DomainAssembly away freely without added reference return pDomainAssembly.GetValue(); } } diff --git a/src/coreclr/vm/arm/asmconstants.h b/src/coreclr/vm/arm/asmconstants.h index 7f2ffa77923d7..06da653d85fa1 100644 --- a/src/coreclr/vm/arm/asmconstants.h +++ b/src/coreclr/vm/arm/asmconstants.h @@ -223,5 +223,62 @@ ASMCONSTANTS_C_ASSERT(InlinedCallFrame__m_pThread == offsetof(InlinedCallFrame, #define InlinedCallFrame__m_pSPAfterProlog 0x1C ASMCONSTANTS_C_ASSERT(InlinedCallFrame__m_pSPAfterProlog == offsetof(InlinedCallFrame, m_pSPAfterProlog)) +#define FixupPrecodeData__Target 0x00 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__Target == offsetof(FixupPrecodeData, Target)) + +#define FixupPrecodeData__MethodDesc 0x04 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__MethodDesc == offsetof(FixupPrecodeData, MethodDesc)) + +#define FixupPrecodeData__PrecodeFixupThunk 0x08 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__PrecodeFixupThunk == offsetof(FixupPrecodeData, PrecodeFixupThunk)) + +#define StubPrecodeData__MethodDesc 0x00 +ASMCONSTANTS_C_ASSERT(StubPrecodeData__MethodDesc == offsetof(StubPrecodeData, MethodDesc)) + +#define StubPrecodeData__Target 0x04 +ASMCONSTANTS_C_ASSERT(StubPrecodeData__Target == offsetof(StubPrecodeData, Target)) + +#define CallCountingStubData__RemainingCallCountCell 0x00 +ASMCONSTANTS_C_ASSERT(CallCountingStubData__RemainingCallCountCell == offsetof(CallCountingStubData, RemainingCallCountCell)) + +#define CallCountingStubData__TargetForMethod 0x04 
+ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForMethod == offsetof(CallCountingStubData, TargetForMethod)) + +#define CallCountingStubData__TargetForThresholdReached 0x08 +ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForThresholdReached == offsetof(CallCountingStubData, TargetForThresholdReached)) + +#define LookupStubData__DispatchToken 0x00 +ASMCONSTANTS_C_ASSERT(LookupStubData__DispatchToken == offsetof(LookupStubData, DispatchToken)) + +#define LookupStubData__ResolveWorkerTarget 0x04 +ASMCONSTANTS_C_ASSERT(LookupStubData__ResolveWorkerTarget == offsetof(LookupStubData, ResolveWorkerTarget)) + +#define DispatchStubData__ExpectedMT 0x00 +ASMCONSTANTS_C_ASSERT(DispatchStubData__ExpectedMT == offsetof(DispatchStubData, ExpectedMT)) + +#define DispatchStubData__ImplTarget 0x04 +ASMCONSTANTS_C_ASSERT(DispatchStubData__ImplTarget == offsetof(DispatchStubData, ImplTarget)) + +#define DispatchStubData__FailTarget 0x08 +ASMCONSTANTS_C_ASSERT(DispatchStubData__FailTarget == offsetof(DispatchStubData, FailTarget)) + +#define ResolveStubData__HashedToken 0x04 +ASMCONSTANTS_C_ASSERT(ResolveStubData__HashedToken == offsetof(ResolveStubData, HashedToken)) + +#define ResolveStubData__CacheAddress 0x00 +ASMCONSTANTS_C_ASSERT(ResolveStubData__CacheAddress == offsetof(ResolveStubData, CacheAddress)) + +#define ResolveStubData__Token 0x0c +ASMCONSTANTS_C_ASSERT(ResolveStubData__Token == offsetof(ResolveStubData, Token)) + +#define ResolveStubData__Counter 0x08 +ASMCONSTANTS_C_ASSERT(ResolveStubData__Counter == offsetof(ResolveStubData, Counter)) + +#define ResolveStubData__ResolveWorkerTarget 0x10 +ASMCONSTANTS_C_ASSERT(ResolveStubData__ResolveWorkerTarget == offsetof(ResolveStubData, ResolveWorkerTarget)) + +#define CALL_STUB_CACHE_MASK_ASM 0xfff +ASMCONSTANTS_C_ASSERT(CALL_STUB_CACHE_MASK_ASM == CALL_STUB_CACHE_MASK) + #undef ASMCONSTANTS_RUNTIME_ASSERT #undef ASMCONSTANTS_C_ASSERT diff --git a/src/coreclr/vm/arm/asmhelpers.S b/src/coreclr/vm/arm/asmhelpers.S index f49ed946bfec7..84dc9783630e6 100644 --- a/src/coreclr/vm/arm/asmhelpers.S +++ b/src/coreclr/vm/arm/asmhelpers.S @@ -261,29 +261,6 @@ ThePreStubPatchLabel: NESTED_END NDirectImportThunk, _TEXT -// ------------------------------------------------------------------ -// The call in fixup precode initally points to this function. -// The pupose of this function is to load the MethodDesc and forward the call the prestub. - NESTED_ENTRY PrecodeFixupThunk, _TEXT, NoHandler - - // r12 = FixupPrecode * - - PROLOG_PUSH "{r0-r1}" - - // Inline computation done by FixupPrecode::GetMethodDesc() - ldrb r0, [r12, #3] // m_PrecodeChunkIndex - ldrb r1, [r12, #2] // m_MethodDescChunkIndex - - add r12,r12,r0,lsl #3 - add r0,r12,r0,lsl #2 - ldr r0, [r0,#8] - add r12,r0,r1,lsl #2 - - EPILOG_POP "{r0-r1}" - b C_FUNC(ThePreStub) - - NESTED_END PrecodeFixupThunk, _TEXT - // ------------------------------------------------------------------ // void ResolveWorkerAsmStub(r0, r1, r2, r3, r4:IndirectionCellAndFlags, r12:DispatchToken) // diff --git a/src/coreclr/vm/arm/asmhelpers.asm b/src/coreclr/vm/arm/asmhelpers.asm index 0afdbf444f2a1..d550137316b69 100644 --- a/src/coreclr/vm/arm/asmhelpers.asm +++ b/src/coreclr/vm/arm/asmhelpers.asm @@ -311,29 +311,6 @@ ThePreStubPatchLabel NESTED_END -; ------------------------------------------------------------------ -; The call in fixup precode initally points to this function. -; The pupose of this function is to load the MethodDesc and forward the call the prestub. 
- NESTED_ENTRY PrecodeFixupThunk - - ; r12 = FixupPrecode * - - PROLOG_PUSH {r0-r1} - - ; Inline computation done by FixupPrecode::GetMethodDesc() - ldrb r0, [r12, #3] ; m_PrecodeChunkIndex - ldrb r1, [r12, #2] ; m_MethodDescChunkIndex - - add r12,r12,r0,lsl #3 - add r0,r12,r0,lsl #2 - ldr r0, [r0,#8] - add r12,r0,r1,lsl #2 - - EPILOG_POP {r0-r1} - EPILOG_BRANCH ThePreStub - - NESTED_END - ; ------------------------------------------------------------------ ; void ResolveWorkerAsmStub(r0, r1, r2, r3, r4:IndirectionCellAndFlags, r12:DispatchToken) ; diff --git a/src/coreclr/vm/arm/cgencpu.h b/src/coreclr/vm/arm/cgencpu.h index feafd7335cc90..598e11c9f4112 100644 --- a/src/coreclr/vm/arm/cgencpu.h +++ b/src/coreclr/vm/arm/cgencpu.h @@ -16,7 +16,10 @@ #define DATA_ALIGNMENT 4 #define DISPATCH_STUB_FIRST_WORD 0xf8d0 +#define DISPATCH_STUB_THIRD_WORD 0xb420 #define RESOLVE_STUB_FIRST_WORD 0xf8d0 +#define RESOLVE_STUB_THIRD_WORD 0xb460 +#define LOOKUP_STUB_FIRST_WORD 0xf8df class MethodDesc; class FramedMethodFrame; @@ -66,7 +69,6 @@ EXTERN_C void getFPReturn(int fpSize, INT64 *pRetVal); EXTERN_C void setFPReturn(int fpSize, INT64 retVal); #define HAS_FIXUP_PRECODE 1 -#define HAS_FIXUP_PRECODE_CHUNKS 1 // ThisPtrRetBufPrecode one is necessary for closed delegates over static methods with return buffer #define HAS_THISPTR_RETBUF_PRECODE 1 @@ -1021,202 +1023,10 @@ inline BOOL ClrFlushInstructionCache(LPCVOID pCodeAddr, size_t sizeOfCode) // Note: If you introduce new precode implementation below, then please // update PrecodeStubManager::CheckIsStub_Internal to account for it. -EXTERN_C VOID STDCALL PrecodeFixupThunk(); - -#define PRECODE_ALIGNMENT sizeof(void*) -#define SIZEOF_PRECODE_BASE CODE_SIZE_ALIGN -#define OFFSETOF_PRECODE_TYPE 0 - -// Invalid precode type -struct InvalidPrecode { - static const int Type = 0; -}; - -struct StubPrecode { - - static const int Type = 0xdf; - - // ldr r12, [pc, #8] ; =m_pMethodDesc - // ldr pc, [pc, #0] ; =m_pTarget - // dcd pTarget - // dcd pMethodDesc - WORD m_rgCode[4]; - TADDR m_pTarget; - TADDR m_pMethodDesc; - - void Init(StubPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator); - - TADDR GetMethodDesc() - { - LIMITED_METHOD_DAC_CONTRACT; - return m_pMethodDesc; - } - - PCODE GetTarget() - { - LIMITED_METHOD_DAC_CONTRACT; - return m_pTarget; - } - -#ifndef DACCESS_COMPILE - void ResetTargetInterlocked() - { - CONTRACTL - { - THROWS; - GC_NOTRIGGER; - } - CONTRACTL_END; - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(StubPrecode)); - InterlockedExchange((LONG*)&precodeWriterHolder.GetRW()->m_pTarget, (LONG)GetPreStubEntryPoint()); - } - - BOOL SetTargetInterlocked(TADDR target, TADDR expected) - { - CONTRACTL - { - THROWS; - GC_NOTRIGGER; - } - CONTRACTL_END; - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(StubPrecode)); - return (TADDR)InterlockedCompareExchange( - (LONG*)&precodeWriterHolder.GetRW()->m_pTarget, (LONG)target, (LONG)expected) == expected; - } -#endif // !DACCESS_COMPILE - -}; -typedef DPTR(StubPrecode) PTR_StubPrecode; - - -struct NDirectImportPrecode { - - static const int Type = 0xe0; - - // ldr r12, [pc, #4] ; =m_pMethodDesc - // ldr pc, [pc, #4] ; =m_pTarget - // dcd pMethodDesc - // dcd pTarget - WORD m_rgCode[4]; - TADDR m_pMethodDesc; // Notice that the fields are reversed compared to StubPrecode. Precode::GetType - // takes advantage of this to detect NDirectImportPrecode. 
- TADDR m_pTarget; - - void Init(NDirectImportPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator); - - TADDR GetMethodDesc() - { - LIMITED_METHOD_DAC_CONTRACT; - return m_pMethodDesc; - } - - PCODE GetTarget() - { - LIMITED_METHOD_DAC_CONTRACT; - return m_pTarget; - } - - LPVOID GetEntrypoint() - { - LIMITED_METHOD_CONTRACT; - return (LPVOID)(dac_cast(this) + THUMB_CODE); - } - -}; -typedef DPTR(NDirectImportPrecode) PTR_NDirectImportPrecode; - - -struct FixupPrecode { - - static const int Type = 0xfc; - - // mov r12, pc - // ldr pc, [pc, #4] ; =m_pTarget - // dcb m_MethodDescChunkIndex - // dcb m_PrecodeChunkIndex - // dcd m_pTarget - WORD m_rgCode[3]; - BYTE m_MethodDescChunkIndex; - BYTE m_PrecodeChunkIndex; - TADDR m_pTarget; - - void Init(FixupPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator, int iMethodDescChunkIndex = 0, int iPrecodeChunkIndex = 0); - - TADDR GetBase() - { - LIMITED_METHOD_CONTRACT; - SUPPORTS_DAC; - - return dac_cast(this) + (m_PrecodeChunkIndex + 1) * sizeof(FixupPrecode); - } - - size_t GetSizeRW() - { - LIMITED_METHOD_CONTRACT; - - return GetBase() + sizeof(void*) - dac_cast(this); - } - - TADDR GetMethodDesc(); - - PCODE GetTarget() - { - LIMITED_METHOD_DAC_CONTRACT; - return m_pTarget; - } - -#ifndef DACCESS_COMPILE - void ResetTargetInterlocked() - { - CONTRACTL - { - THROWS; - GC_TRIGGERS; - } - CONTRACTL_END; - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(FixupPrecode)); - InterlockedExchange((LONG*)&precodeWriterHolder.GetRW()->m_pTarget, (LONG)GetEEFuncEntryPoint(PrecodeFixupThunk)); - } - - BOOL SetTargetInterlocked(TADDR target, TADDR expected) - { - CONTRACTL - { - THROWS; - GC_TRIGGERS; - } - CONTRACTL_END; - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(FixupPrecode)); - return (TADDR)InterlockedCompareExchange( - (LONG*)&precodeWriterHolder.GetRW()->m_pTarget, (LONG)target, (LONG)expected) == expected; - } -#endif // !DACCESS_COMPILE - - static BOOL IsFixupPrecodeByASM(PCODE addr) - { - PTR_WORD pInstr = dac_cast(PCODEToPINSTR(addr)); - - return - (pInstr[0] == 0x46fc) && - (pInstr[1] == 0xf8df) && - (pInstr[2] == 0xf004); - } - -#ifdef DACCESS_COMPILE - void EnumMemoryRegions(CLRDataEnumMemoryFlags flags); -#endif -}; -typedef DPTR(FixupPrecode) PTR_FixupPrecode; - - // Precode to shuffle this and retbuf for closed delegates over static methods with return buffer struct ThisPtrRetBufPrecode { - static const int Type = 0x84; + static const int Type = 0x46; // mov r12, r0 // mov r0, r1 @@ -1294,166 +1104,4 @@ inline size_t GetARMInstructionLength(PBYTE pInstr) return GetARMInstructionLength(*(WORD*)pInstr); } - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Call counting - -#ifdef FEATURE_TIERED_COMPILATION - -#define DISABLE_COPY(T) \ - T(const T &) = delete; \ - T &operator =(const T &) = delete - -typedef UINT16 CallCount; -typedef DPTR(CallCount) PTR_CallCount; - -//////////////////////////////////////////////////////////////// -// CallCountingStub - -class CallCountingStub; -typedef DPTR(const CallCountingStub) PTR_CallCountingStub; - -class CallCountingStub -{ -public: - static const SIZE_T Alignment = sizeof(void *); - -#ifndef DACCESS_COMPILE -protected: - static const PCODE TargetForThresholdReached; - - CallCountingStub() = default; - -public: - static const CallCountingStub *From(TADDR stubIdentifyingToken); - - PCODE GetEntryPoint() const - { - WRAPPER_NO_CONTRACT; - return 
PINSTRToPCODE((TADDR)this); - } -#endif - -public: - PTR_CallCount GetRemainingCallCountCell() const; - PCODE GetTargetForMethod() const; - - DISABLE_COPY(CallCountingStub); -}; - -//////////////////////////////////////////////////////////////// -// CallCountingStubShort - -class CallCountingStubShort; -typedef DPTR(const CallCountingStubShort) PTR_CallCountingStubShort; - -#pragma pack(push, 1) -class CallCountingStubShort : public CallCountingStub -{ -private: - const UINT16 m_part0[16]; - CallCount *const m_remainingCallCountCell; - const PCODE m_targetForMethod; - const PCODE m_targetForThresholdReached; - -#ifndef DACCESS_COMPILE -public: - CallCountingStubShort(CallCountingStubShort* stubRX, CallCount *remainingCallCountCell, PCODE targetForMethod) - : m_part0{ 0xb401, // push {r0} - 0xf8df, 0xc01c, // ldr r12, [pc, #(m_remainingCallCountCell)] - 0xf8bc, 0x0000, // ldrh r0, [r12] - 0x1e40, // subs r0, r0, #1 - 0xf8ac, 0x0000, // strh r0, [r12] - 0xbc01, // pop {r0} - 0xd001, // beq L0 - 0xf8df, 0xf00c, // ldr pc, [pc, #(m_targetForMethod)] - 0xf2af, 0x0c1c, // L0: adr r12, #(this) - // (r12 == stub-identifying token == this) - 0xf8df, 0xf008}, // ldr pc, [pc, #(m_targetForThresholdReached)] - m_remainingCallCountCell(remainingCallCountCell), - m_targetForMethod(targetForMethod), - m_targetForThresholdReached(TargetForThresholdReached) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(CallCountingStubShort) % Alignment == 0); - _ASSERTE(remainingCallCountCell != nullptr); - _ASSERTE(PCODEToPINSTR(targetForMethod) != NULL); - } - - static bool Is(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - return true; - } - - static const CallCountingStubShort *From(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(stubIdentifyingToken)); - - const CallCountingStubShort *stub = (const CallCountingStubShort *)stubIdentifyingToken; - _ASSERTE(IS_ALIGNED(stub, Alignment)); - return stub; - } -#endif - -public: - static bool Is(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - return true; - } - - static PTR_CallCountingStubShort From(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(callCountingStub)); - - return dac_cast(callCountingStub); - } - - PCODE GetTargetForMethod() const - { - WRAPPER_NO_CONTRACT; - return m_targetForMethod; - } - - friend CallCountingStub; - DISABLE_COPY(CallCountingStubShort); -}; -#pragma pack(pop) - -//////////////////////////////////////////////////////////////// -// CallCountingStub definitions - -#ifndef DACCESS_COMPILE -inline const CallCountingStub *CallCountingStub::From(TADDR stubIdentifyingToken) -{ - WRAPPER_NO_CONTRACT; - _ASSERTE(stubIdentifyingToken != NULL); - - return CallCountingStubShort::From(stubIdentifyingToken); -} -#endif - -inline PTR_CallCount CallCountingStub::GetRemainingCallCountCell() const -{ - WRAPPER_NO_CONTRACT; - return PTR_CallCount(dac_cast(this)->m_remainingCallCountCell); -} - -inline PCODE CallCountingStub::GetTargetForMethod() const -{ - WRAPPER_NO_CONTRACT; - return CallCountingStubShort::From(PTR_CallCountingStub(this))->GetTargetForMethod(); -} - -//////////////////////////////////////////////////////////////// - -#undef DISABLE_COPY - -#endif // FEATURE_TIERED_COMPILATION - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - #endif // __cgencpu_h__ diff --git a/src/coreclr/vm/arm/stubs.cpp b/src/coreclr/vm/arm/stubs.cpp index 57e478633668f..75922effad94f 100644 --- 
a/src/coreclr/vm/arm/stubs.cpp +++ b/src/coreclr/vm/arm/stubs.cpp @@ -348,10 +348,10 @@ void CopyWriteBarrier(PCODE dstCode, PCODE srcCode, PCODE endCode) size_t size = (PBYTE)end - (PBYTE)src; - ExecutableWriterHolder writeBarrierWriterHolder; + ExecutableWriterHolderNoLog writeBarrierWriterHolder; if (IsWriteBarrierCopyEnabled()) { - writeBarrierWriterHolder = ExecutableWriterHolder((void*)dst, size); + writeBarrierWriterHolder.AssignExecutableWriterHolder((void*)dst, size); dst = (TADDR)writeBarrierWriterHolder.GetRW(); } @@ -458,10 +458,10 @@ void UpdateGCWriteBarriers(bool postGrow = false) if(to) { to = (PBYTE)PCODEToPINSTR((PCODE)GetWriteBarrierCodeLocation(to)); - ExecutableWriterHolder barrierWriterHolder; + ExecutableWriterHolderNoLog barrierWriterHolder; if (IsWriteBarrierCopyEnabled()) { - barrierWriterHolder = ExecutableWriterHolder(to, barrierSize); + barrierWriterHolder.AssignExecutableWriterHolder(to, barrierSize); to = barrierWriterHolder.GetRW(); } GWB_PATCH_OFFSET(g_lowest_address); @@ -721,98 +721,8 @@ void HelperMethodFrame::UpdateRegDisplay(const PREGDISPLAY pRD) pRD->pCurrentContextPointers->Lr = NULL; } -TADDR FixupPrecode::GetMethodDesc() -{ - LIMITED_METHOD_DAC_CONTRACT; - - // This lookup is also manually inlined in PrecodeFixupThunk assembly code - TADDR base = *PTR_TADDR(GetBase()); - if (base == NULL) - return NULL; - return base + (m_MethodDescChunkIndex * MethodDesc::ALIGNMENT); -} - -#ifdef DACCESS_COMPILE -void FixupPrecode::EnumMemoryRegions(CLRDataEnumMemoryFlags flags) -{ - SUPPORTS_DAC; - DacEnumMemoryRegion(dac_cast(this), sizeof(FixupPrecode)); - - DacEnumMemoryRegion(GetBase(), sizeof(TADDR)); -} -#endif // DACCESS_COMPILE - #ifndef DACCESS_COMPILE -void StubPrecode::Init(StubPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) -{ - WRAPPER_NO_CONTRACT; - - int n = 0; - - m_rgCode[n++] = 0xf8df; // ldr r12, [pc, #8] - m_rgCode[n++] = 0xc008; - m_rgCode[n++] = 0xf8df; // ldr pc, [pc, #0] - m_rgCode[n++] = 0xf000; - - _ASSERTE(n == ARRAY_SIZE(m_rgCode)); - - m_pTarget = GetPreStubEntryPoint(); - m_pMethodDesc = (TADDR)pMD; -} - -void NDirectImportPrecode::Init(NDirectImportPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) -{ - WRAPPER_NO_CONTRACT; - - int n = 0; - - m_rgCode[n++] = 0xf8df; // ldr r12, [pc, #4] - m_rgCode[n++] = 0xc004; - m_rgCode[n++] = 0xf8df; // ldr pc, [pc, #4] - m_rgCode[n++] = 0xf004; - - _ASSERTE(n == ARRAY_SIZE(m_rgCode)); - - m_pMethodDesc = (TADDR)pMD; - m_pTarget = GetEEFuncEntryPoint(NDirectImportThunk); -} - -void FixupPrecode::Init(FixupPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator, int iMethodDescChunkIndex /*=0*/, int iPrecodeChunkIndex /*=0*/) -{ - WRAPPER_NO_CONTRACT; - - m_rgCode[0] = 0x46fc; // mov r12, pc - m_rgCode[1] = 0xf8df; // ldr pc, [pc, #4] - m_rgCode[2] = 0xf004; - - // Initialize chunk indices only if they are not initialized yet. This is necessary to make MethodDesc::Reset work. 
- if (m_PrecodeChunkIndex == 0) - { - _ASSERTE(FitsInU1(iPrecodeChunkIndex)); - m_PrecodeChunkIndex = static_cast(iPrecodeChunkIndex); - } - - if (iMethodDescChunkIndex != -1) - { - if (m_MethodDescChunkIndex == 0) - { - _ASSERTE(FitsInU1(iMethodDescChunkIndex)); - m_MethodDescChunkIndex = static_cast(iMethodDescChunkIndex); - } - - if (*(void**)GetBase() == NULL) - *(void**)GetBase() = (BYTE*)pMD - (iMethodDescChunkIndex * MethodDesc::ALIGNMENT); - } - - _ASSERTE(GetMethodDesc() == (TADDR)pMD); - - if (pLoaderAllocator != NULL) - { - m_pTarget = GetEEFuncEntryPoint(PrecodeFixupThunk); - } -} - void ThisPtrRetBufPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) { WRAPPER_NO_CONTRACT; @@ -842,20 +752,20 @@ Rough pseudo-code of interface dispatching: // jitted code calls *indirectionCell switch (*indirectionCell) { - case LookupHolder._stub: + case LookupStub: // ResolveWorkerAsmStub: - *indirectionCell = DispatchHolder._stub; + *indirectionCell = DispatchStub; call ResolveWorkerStatic, jump to target method; - case DispatchHolder._stub: + case DispatchStub: if (r0.methodTable == expectedMethodTable) jump to target method; - // ResolveHolder._stub._failEntryPoint: - jump to case ResolveHolder._stub._resolveEntryPoint; - case ResolveHolder._stub._resolveEntryPoint: + // ResolveStub._failEntryPoint: + jump to case ResolveStub._resolveEntryPoint; + case ResolveStub._resolveEntryPoint: if (r0.methodTable in hashTable) jump to target method; - // ResolveHolder._stub._slowEntryPoint: + // ResolveStub._slowEntryPoint: // ResolveWorkerChainLookupAsmStub: // ResolveWorkerAsmStub: - if (_failEntryPoint called too many times) *indirectionCell = ResolveHolder._stub._resolveEntryPoint; + if (_failEntryPoint called too many times) *indirectionCell = ResolveStub._resolveEntryPoint; call ResolveWorkerStatic, jump to target method; } @@ -863,395 +773,6 @@ Note that ResolveWorkerChainLookupAsmStub currently points directly to ResolveWorkerAsmStub; in the future, this could be separate. */ -void LookupHolder::Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken) -{ - // Called directly by JITTED code - // See ResolveWorkerAsmStub - - // ldr r12, [pc + 8] ; #_token - _stub._entryPoint[0] = 0xf8df; - _stub._entryPoint[1] = 0xc008; - // ldr pc, [pc] ; #_resolveWorkerTarget - _stub._entryPoint[2] = 0xf8df; - _stub._entryPoint[3] = 0xf000; - - _stub._resolveWorkerTarget = resolveWorkerTarget; - _stub._token = dispatchToken; - _ASSERTE(4 == LookupStub::entryPointLen); -} - -void DispatchHolder::Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT) -{ - // Called directly by JITTED code - // DispatchHolder._stub._entryPoint(r0:object, r1, r2, r3, r4:IndirectionCell) - // { - // if (r0.methodTable == this._expectedMT) (this._implTarget)(r0, r1, r2, r3); - // else (this._failTarget)(r0, r1, r2, r3, r4); - // } - - int n = 0; - WORD offset; - - // We rely on the stub entry-point being DWORD aligned (so we can tell whether any subsequent WORD is - // DWORD-aligned or not, which matters in the calculation of PC-relative offsets). - _ASSERTE(((UINT_PTR)_stub._entryPoint & 0x3) == 0); - -// Compute a PC-relative offset for use in an instruction encoding. Must call this prior to emitting the -// instruction halfword to which it applies. For thumb-2 encodings the offset must be computed before emitting -// the first of the halfwords. 
-#undef PC_REL_OFFSET -#define PC_REL_OFFSET(_field) (WORD)(offsetof(DispatchStub, _field) - ((offsetof(DispatchStub, _entryPoint) + sizeof(*DispatchStub::_entryPoint) * (n + 2)) & 0xfffffffc)) - - // r0 : object. It can be null as well. - // when it is null the code causes an AV. This AV is seen by the VM's personality routine - // and it converts it into nullRef. We want the AV to happen before modifying the stack so that we can get the - // call stack in windbg at the point of AV. So therefore "ldr r12, [r0]" should be the first instruction. - - // ldr r12, [r0 + #Object.m_pMethTab] - _stub._entryPoint[n++] = DISPATCH_STUB_FIRST_WORD; - _stub._entryPoint[n++] = 0xc000; - - // push {r5} - _stub._entryPoint[n++] = 0xb420; - - // ldr r5, [pc + #_expectedMT] - offset = PC_REL_OFFSET(_expectedMT); - _ASSERTE((offset & 0x3) == 0); - _stub._entryPoint[n++] = 0x4d00 | (offset >> 2); - - // cmp r5, r12 - _stub._entryPoint[n++] = 0x4565; - - // pop {r5} - _stub._entryPoint[n++] = 0xbc20; - - // bne failTarget - _stub._entryPoint[n++] = 0xd101; - - // ldr pc, [pc + #_implTarget] - offset = PC_REL_OFFSET(_implTarget); - _stub._entryPoint[n++] = 0xf8df; - _stub._entryPoint[n++] = 0xf000 | offset; - - // failTarget: - // ldr pc, [pc + #_failTarget] - offset = PC_REL_OFFSET(_failTarget); - _stub._entryPoint[n++] = 0xf8df; - _stub._entryPoint[n++] = 0xf000 | offset; - - // nop - insert padding - _stub._entryPoint[n++] = 0xbf00; - - _ASSERTE(n == DispatchStub::entryPointLen); - - // Make sure that the data members below are aligned - _ASSERTE((n & 1) == 0); - - _stub._expectedMT = DWORD(expectedMT); - _stub._failTarget = failTarget; - _stub._implTarget = implTarget; -} - -void ResolveHolder::Initialize(ResolveHolder* pResolveHolderRX, - PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32 * counterAddr) -{ - // Called directly by JITTED code - // ResolveStub._resolveEntryPoint(r0:Object*, r1, r2, r3, r4:IndirectionCellAndFlags) - // { - // MethodTable mt = r0.m_pMethTab; - // int i = ((mt + mt >> 12) ^ this._hashedToken) & this._cacheMask - // ResolveCacheElem e = this._cacheAddress + i - // do - // { - // if (mt == e.pMT && this._token == e.token) (e.target)(r0, r1, r2, r3); - // e = e.pNext; - // } while (e != null) - // (this._slowEntryPoint)(r0, r1, r2, r3, r4); - // } - // - - int n = 0; - WORD offset; - - // We rely on the stub entry-point being DWORD aligned (so we can tell whether any subsequent WORD is - // DWORD-aligned or not, which matters in the calculation of PC-relative offsets). - _ASSERTE(((UINT_PTR)_stub._resolveEntryPoint & 0x3) == 0); - -// Compute a PC-relative offset for use in an instruction encoding. Must call this prior to emitting the -// instruction halfword to which it applies. For thumb-2 encodings the offset must be computed before emitting -// the first of the halfwords. 
-#undef PC_REL_OFFSET -#define PC_REL_OFFSET(_field) (WORD)(offsetof(ResolveStub, _field) - ((offsetof(ResolveStub, _resolveEntryPoint) + sizeof(*ResolveStub::_resolveEntryPoint) * (n + 2)) & 0xfffffffc)) - - // ldr r12, [r0 + #Object.m_pMethTab] - _stub._resolveEntryPoint[n++] = RESOLVE_STUB_FIRST_WORD; - _stub._resolveEntryPoint[n++] = 0xc000; - - // ;; We need two scratch registers, r5 and r6 - // push {r5,r6} - _stub._resolveEntryPoint[n++] = 0xb460; - - // ;; Compute i = ((mt + mt >> 12) ^ this._hashedToken) & this._cacheMask - - // add r6, r12, r12 lsr #12 - _stub._resolveEntryPoint[n++] = 0xeb0c; - _stub._resolveEntryPoint[n++] = 0x361c; - - // ldr r5, [pc + #_hashedToken] - offset = PC_REL_OFFSET(_hashedToken); - _ASSERTE((offset & 0x3) == 0); - _stub._resolveEntryPoint[n++] = 0x4d00 | (offset >> 2); - - // eor r6, r6, r5 - _stub._resolveEntryPoint[n++] = 0xea86; - _stub._resolveEntryPoint[n++] = 0x0605; - - // ldr r5, [pc + #_cacheMask] - offset = PC_REL_OFFSET(_cacheMask); - _ASSERTE((offset & 0x3) == 0); - _stub._resolveEntryPoint[n++] = 0x4d00 | (offset >> 2); - - // and r6, r6, r5 - _stub._resolveEntryPoint[n++] = 0xea06; - _stub._resolveEntryPoint[n++] = 0x0605; - - // ;; ResolveCacheElem e = this._cacheAddress + i - // ldr r5, [pc + #_cacheAddress] - offset = PC_REL_OFFSET(_cacheAddress); - _ASSERTE((offset & 0x3) == 0); - _stub._resolveEntryPoint[n++] = 0x4d00 | (offset >> 2); - - // ldr r6, [r5 + r6] ;; r6 = e = this._cacheAddress + i - _stub._resolveEntryPoint[n++] = 0x59ae; - - // ;; do { - int loop = n; - - // ;; Check mt == e.pMT - // ldr r5, [r6 + #ResolveCacheElem.pMT] - offset = offsetof(ResolveCacheElem, pMT); - _ASSERTE(offset <= 124 && (offset & 0x3) == 0); - _stub._resolveEntryPoint[n++] = 0x6835 | (offset<< 4); - - // cmp r12, r5 - _stub._resolveEntryPoint[n++] = 0x45ac; - - // bne nextEntry - _stub._resolveEntryPoint[n++] = 0xd108; - - // ;; Check this._token == e.token - // ldr r5, [pc + #_token] - offset = PC_REL_OFFSET(_token); - _ASSERTE((offset & 0x3) == 0); - _stub._resolveEntryPoint[n++] = 0x4d00 | (offset>>2); - - // ldr r12, [r6 + #ResolveCacheElem.token] - offset = offsetof(ResolveCacheElem, token); - _stub._resolveEntryPoint[n++] = 0xf8d6; - _stub._resolveEntryPoint[n++] = 0xc000 | offset; - - // cmp r12, r5 - _stub._resolveEntryPoint[n++] = 0x45ac; - - // bne nextEntry - _stub._resolveEntryPoint[n++] = 0xd103; - - // ldr r12, [r6 + #ResolveCacheElem.target] ;; r12 : e.target - offset = offsetof(ResolveCacheElem, target); - _stub._resolveEntryPoint[n++] = 0xf8d6; - _stub._resolveEntryPoint[n++] = 0xc000 | offset; - - // ;; Restore r5 and r6 - // pop {r5,r6} - _stub._resolveEntryPoint[n++] = 0xbc60; - - // ;; Branch to e.target - // bx r12 ;; (e.target)(r0,r1,r2,r3) - _stub._resolveEntryPoint[n++] = 0x4760; - - // nextEntry: - // ;; e = e.pNext; - // ldr r6, [r6 + #ResolveCacheElem.pNext] - offset = offsetof(ResolveCacheElem, pNext); - _ASSERTE(offset <=124 && (offset & 0x3) == 0); - _stub._resolveEntryPoint[n++] = 0x6836 | (offset << 4); - - // ;; } while(e != null); - // cbz r6, slowEntryPoint - _stub._resolveEntryPoint[n++] = 0xb116; - - // ldr r12, [r0 + #Object.m_pMethTab] - _stub._resolveEntryPoint[n++] = 0xf8d0; - _stub._resolveEntryPoint[n++] = 0xc000; - - // b loop - offset = (WORD)((loop - (n + 2)) * sizeof(WORD)); - offset = (offset >> 1) & 0x07ff; - _stub._resolveEntryPoint[n++] = 0xe000 | offset; - - // slowEntryPoint: - // pop {r5,r6} - _stub._resolveEntryPoint[n++] = 0xbc60; - - // nop for alignment - _stub._resolveEntryPoint[n++] = 
0xbf00; - - // the slow entry point be DWORD-aligned (see _ASSERTE below) insert nops if necessary . - - // ARMSTUB TODO: promotion - - // fall through to slow case - _ASSERTE(_stub._resolveEntryPoint + n == _stub._slowEntryPoint); - _ASSERTE(n == ResolveStub::resolveEntryPointLen); - - // ResolveStub._slowEntryPoint(r0:MethodToken, r1, r2, r3, r4:IndirectionCellAndFlags) - // { - // r12 = this._tokenSlow; - // this._resolveWorkerTarget(r0, r1, r2, r3, r4, r12); - // } - - // The following macro relies on this entry point being DWORD-aligned. We've already asserted that the - // overall stub is aligned above, just need to check that the preceding stubs occupy an even number of - // WORD slots. - _ASSERTE((n & 1) == 0); - -#undef PC_REL_OFFSET -#define PC_REL_OFFSET(_field) (WORD)(offsetof(ResolveStub, _field) - ((offsetof(ResolveStub, _slowEntryPoint) + sizeof(*ResolveStub::_slowEntryPoint) * (n + 2)) & 0xfffffffc)) - - n = 0; - - // ldr r12, [pc + #_tokenSlow] - offset = PC_REL_OFFSET(_tokenSlow); - _stub._slowEntryPoint[n++] = 0xf8df; - _stub._slowEntryPoint[n++] = 0xc000 | offset; - - // ldr pc, [pc + #_resolveWorkerTarget] - offset = PC_REL_OFFSET(_resolveWorkerTarget); - _stub._slowEntryPoint[n++] = 0xf8df; - _stub._slowEntryPoint[n++] = 0xf000 | offset; - - _ASSERTE(n == ResolveStub::slowEntryPointLen); - - // ResolveStub._failEntryPoint(r0:MethodToken, r1, r2, r3, r4:IndirectionCellAndFlags) - // { - // if(--*(this._pCounter) < 0) r4 = r4 | SDF_ResolveBackPatch; - // this._resolveEntryPoint(r0, r1, r2, r3, r4); - // } - - // The following macro relies on this entry point being DWORD-aligned. We've already asserted that the - // overall stub is aligned above, just need to check that the preceding stubs occupy an even number of - // WORD slots. 
- _ASSERTE((n & 1) == 0); - -#undef PC_REL_OFFSET -#define PC_REL_OFFSET(_field) (WORD)(offsetof(ResolveStub, _field) - ((offsetof(ResolveStub, _failEntryPoint) + sizeof(*ResolveStub::_failEntryPoint) * (n + 2)) & 0xfffffffc)) - - n = 0; - - // push {r5} - _stub._failEntryPoint[n++] = 0xb420; - - // ldr r5, [pc + #_pCounter] - offset = PC_REL_OFFSET(_pCounter); - _ASSERTE((offset & 0x3) == 0); - _stub._failEntryPoint[n++] = 0x4d00 | (offset >>2); - - // ldr r12, [r5] - _stub._failEntryPoint[n++] = 0xf8d5; - _stub._failEntryPoint[n++] = 0xc000; - - // subs r12, r12, #1 - _stub._failEntryPoint[n++] = 0xf1bc; - _stub._failEntryPoint[n++] = 0x0c01; - - // str r12, [r5] - _stub._failEntryPoint[n++] = 0xf8c5; - _stub._failEntryPoint[n++] = 0xc000; - - // pop {r5} - _stub._failEntryPoint[n++] = 0xbc20; - - // bge resolveEntryPoint - _stub._failEntryPoint[n++] = 0xda01; - - // or r4, r4, SDF_ResolveBackPatch - _ASSERTE(SDF_ResolveBackPatch < 256); - _stub._failEntryPoint[n++] = 0xf044; - _stub._failEntryPoint[n++] = 0x0400 | SDF_ResolveBackPatch; - - // resolveEntryPoint: - // b _resolveEntryPoint - offset = (WORD)(offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, _failEntryPoint) + sizeof(*ResolveStub::_failEntryPoint) * (n + 2))); - _ASSERTE((offset & 1) == 0); - offset = (offset >> 1) & 0x07ff; - _stub._failEntryPoint[n++] = 0xe000 | offset; - - // nop for alignment - _stub._failEntryPoint[n++] = 0xbf00; - - _ASSERTE(n == ResolveStub::failEntryPointLen); - - _stub._pCounter = counterAddr; - _stub._hashedToken = hashedToken << LOG2_PTRSIZE; - _stub._cacheAddress = (size_t) cacheAddr; - _stub._token = dispatchToken; - _stub._tokenSlow = dispatchToken; - _stub._resolveWorkerTarget = resolveWorkerTarget; - _stub._cacheMask = CALL_STUB_CACHE_MASK * sizeof(void*); - - _ASSERTE(resolveWorkerTarget == (PCODE)ResolveWorkerChainLookupAsmStub); - _ASSERTE(patcherTarget == NULL); -} - -BOOL DoesSlotCallPrestub(PCODE pCode) -{ - PTR_WORD pInstr = dac_cast(PCODEToPINSTR(pCode)); - -#ifdef HAS_COMPACT_ENTRYPOINTS - if (MethodDescChunk::GetMethodDescFromCompactEntryPoint(pCode, TRUE) != NULL) - { - return TRUE; - } -#endif // HAS_COMPACT_ENTRYPOINTS - - // FixupPrecode - if (pInstr[0] == 0x46fc && // // mov r12, pc - pInstr[1] == 0xf8df && - pInstr[2] == 0xf004) - { - PCODE pTarget = dac_cast(pInstr)->m_pTarget; - - // Check for jump stub (NGen case) - if (isJump(pTarget)) - { - pTarget = decodeJump(pTarget); - } - - return pTarget == (TADDR)PrecodeFixupThunk; - } - - // StubPrecode - if (pInstr[0] == 0xf8df && // ldr r12, [pc + 8] - pInstr[1] == 0xc008 && - pInstr[2] == 0xf8df && // ldr pc, [pc] - pInstr[3] == 0xf000) - { - PCODE pTarget = dac_cast(pInstr)->m_pTarget; - - // Check for jump stub (NGen case) - if (isJump(pTarget)) - { - pTarget = decodeJump(pTarget); - } - - return pTarget == GetPreStubEntryPoint(); - } - - return FALSE; -} - Stub *GenerateInitPInvokeFrameHelper() { CONTRACT(Stub*) diff --git a/src/coreclr/vm/arm/thunktemplates.S b/src/coreclr/vm/arm/thunktemplates.S new file mode 100644 index 0000000000000..e2b4dda34c0ea --- /dev/null +++ b/src/coreclr/vm/arm/thunktemplates.S @@ -0,0 +1,114 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
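+
+// Layout note: DATA_SLOT(stub, field) below expands to
+// stub##Code + PAGE_SIZE + stub##Data__##field, so each stub template loads
+// its parameters with a PC-relative load from a data slot that is expected
+// to be mapped exactly one page (PAGE_SIZE bytes) after the stub's code.
+// The stub##Data__##field offsets are assumed to be defined in
+// asmconstants.h, which is included below.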
+ +#include "unixasmmacros.inc" +#include "asmconstants.h" + +.syntax unified +.thumb + + .align 4 + +PAGE_SIZE = 4096 + +#define DATA_SLOT(stub, field) stub##Code + PAGE_SIZE + stub##Data__##field + + LEAF_ENTRY StubPrecodeCode + ldr r12, DATA_SLOT(StubPrecode, MethodDesc) + ldr pc, DATA_SLOT(StubPrecode, Target) + LEAF_END_MARKED StubPrecodeCode + + .align 4 + + LEAF_ENTRY FixupPrecodeCode + ldr pc, DATA_SLOT(FixupPrecode, Target) + ldr r12, DATA_SLOT(FixupPrecode, MethodDesc) + ldr pc, DATA_SLOT(FixupPrecode, PrecodeFixupThunk) + LEAF_END_MARKED FixupPrecodeCode + + .align 4 + + LEAF_ENTRY CallCountingStubCode + push {r0} + ldr r12, DATA_SLOT(CallCountingStub, RemainingCallCountCell) + ldrh r0, [r12] + subs r0, r0, #1 + strh r0, [r12] + pop {r0} + beq LOCAL_LABEL(CountReachedZero) + ldr pc, DATA_SLOT(CallCountingStub, TargetForMethod) +LOCAL_LABEL(CountReachedZero): + adr r12, CallCountingStubCode + ldr pc, DATA_SLOT(CallCountingStub, TargetForThresholdReached) + LEAF_END_MARKED CallCountingStubCode + + .align 4 + + LEAF_ENTRY LookupStubCode + ldr r12, DATA_SLOT(LookupStub, DispatchToken) + ldr pc, DATA_SLOT(LookupStub, ResolveWorkerTarget) + LEAF_END_MARKED LookupStubCode + + .align 4 + + LEAF_ENTRY DispatchStubCode + PATCH_LABEL DispatchStubCode_ThisDeref + ldr r12, [r0] + push {r5} + ldr r5, DATA_SLOT(DispatchStub, ExpectedMT) + cmp r5, r12 + pop {r5} + bne LOCAL_LABEL(FailTarget) + ldr pc, DATA_SLOT(DispatchStub, ImplTarget) +LOCAL_LABEL(FailTarget): + ldr pc, DATA_SLOT(DispatchStub, FailTarget) + LEAF_END_MARKED DispatchStubCode + + .align 4 + + LEAF_ENTRY ResolveStubCode + PATCH_LABEL ResolveStubCode_ResolveEntry + PATCH_LABEL ResolveStubCode_ThisDeref + ldr r12, [r0] + push {r5, r6} + add r6, r12, r12, lsr #12 + ldr r5, DATA_SLOT(ResolveStub, HashedToken) + eor r6, r6, r5 + mov r5, #CALL_STUB_CACHE_MASK_ASM * 4 + and r6, r6, r5 + ldr r5, DATA_SLOT(ResolveStub, CacheAddress) + ldr r6, [r5, r6] +LOCAL_LABEL(Loop): + ldr r5, [r6] + cmp r12, r5 + bne LOCAL_LABEL(NextEntry) + ldr r5, DATA_SLOT(ResolveStub, Token) + ldr r12, [r6, #4] + cmp r12, r5 + bne LOCAL_LABEL(NextEntry) + ldr r12, [r6, #8] + pop {r5, r6} + bx r12 +LOCAL_LABEL(NextEntry): + ldr r6, [r6, #12] + cbz r6, LOCAL_LABEL(Slow) + ldr r12, [r0] + b LOCAL_LABEL(Loop) +LOCAL_LABEL(Slow): + pop {r5, r6} + nop + ldr r12, DATA_SLOT(ResolveStub, Token) + ldr pc, DATA_SLOT(ResolveStub, ResolveWorkerTarget) + PATCH_LABEL ResolveStubCode_FailEntry + push {r5} + adr r5, DATA_SLOT(ResolveStub, Counter) + ldr r12, [r5] + subs r12, r12, #1 + str r12, [r5] + pop {r5} + bge ResolveStubCode + orr r4, r4, #1 // SDF_ResolveBackPatch + b ResolveStubCode + LEAF_END_MARKED ResolveStubCode + + diff --git a/src/coreclr/vm/arm/thunktemplates.asm b/src/coreclr/vm/arm/thunktemplates.asm new file mode 100644 index 0000000000000..f8ae8043903db --- /dev/null +++ b/src/coreclr/vm/arm/thunktemplates.asm @@ -0,0 +1,115 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. 
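+
+; Windows (armasm) counterpart of thunktemplates.S. DATA_SLOT(stub, field)
+; again resolves to stub##Code + PAGE_SIZE + stub##Data__##field, i.e. each
+; stub is expected to find its data one page after its code.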
+ +#include "ksarm.h" +#include "asmconstants.h" +#include "asmmacros.h" + + + TEXTAREA + + ALIGN 4 + + PAGE_SIZE = 4096 + + #define DATA_SLOT(stub, field) stub##Code + PAGE_SIZE + stub##Data__##field + + LEAF_ENTRY StubPrecodeCode + ldr r12, DATA_SLOT(StubPrecode, MethodDesc) + ldr pc, DATA_SLOT(StubPrecode, Target) + LEAF_END_MARKED StubPrecodeCode + + ALIGN 4 + + LEAF_ENTRY FixupPrecodeCode + ldr pc, DATA_SLOT(FixupPrecode, Target) + ldr r12, DATA_SLOT(FixupPrecode, MethodDesc) + ldr pc, DATA_SLOT(FixupPrecode, PrecodeFixupThunk) + LEAF_END_MARKED FixupPrecodeCode + + ALIGN 4 + + LEAF_ENTRY CallCountingStubCode + push {r0} + ldr r12, DATA_SLOT(CallCountingStub, RemainingCallCountCell) + ldrh r0, [r12] + subs r0, r0, #1 + strh r0, [r12] + pop {r0} + beq CountReachedZero + ldr pc, DATA_SLOT(CallCountingStub, TargetForMethod) +CountReachedZero + adr r12, CallCountingStubCode + ldr pc, DATA_SLOT(CallCountingStub, TargetForThresholdReached) + LEAF_END_MARKED CallCountingStubCode + + ALIGN 4 + + LEAF_ENTRY LookupStubCode + ldr r12, DATA_SLOT(LookupStub, DispatchToken) + ldr pc, DATA_SLOT(LookupStub, ResolveWorkerTarget) + LEAF_END_MARKED LookupStubCode + + ALIGN 4 + + LEAF_ENTRY DispatchStubCode + PATCH_LABEL DispatchStubCode_ThisDeref + ldr r12, [r0] + push {r5} + ldr r5, DATA_SLOT(DispatchStub, ExpectedMT) + cmp r5, r12 + pop {r5} + bne FailTarget + ldr pc, DATA_SLOT(DispatchStub, ImplTarget) +FailTarget + ldr pc, DATA_SLOT(DispatchStub, FailTarget) + LEAF_END_MARKED DispatchStubCode + + ALIGN 4 + + LEAF_ENTRY ResolveStubCode + PATCH_LABEL ResolveStubCode_ResolveEntry + PATCH_LABEL ResolveStubCode_ThisDeref + ldr r12, [r0] + push {r5, r6} + add r6, r12, r12 lsr #12 + ldr r5, DATA_SLOT(ResolveStub, HashedToken) + eor r6, r6, r5 + mov r5, #CALL_STUB_CACHE_MASK_ASM * 4 + and r6, r6, r5 + ldr r5, DATA_SLOT(ResolveStub, CacheAddress) + ldr r6, [r5, r6] +Loop + ldr r5, [r6] + cmp r12, r5 + bne NextEntry + ldr r5, DATA_SLOT(ResolveStub, Token) + ldr r12, [r6, #4] + cmp r12, r5 + bne NextEntry + ldr r12, [r6, #8] + pop {r5, r6} + bx r12 +NextEntry + ldr r6, [r6, #12] + cbz r6, Slow + ldr r12, [r0] + b Loop +Slow + pop {r5, r6} + nop + ldr r12, DATA_SLOT(ResolveStub, Token) + ldr pc, DATA_SLOT(ResolveStub, ResolveWorkerTarget) + PATCH_LABEL ResolveStubCode_FailEntry + push {r5} + adr r5, DATA_SLOT(ResolveStub, Counter) + ldr r12, [r5] + subs r12, r12, #1 + str r12, [r5] + pop {r5} + bge ResolveStubCode + orr r4, r4, #1; SDF_ResolveBackPatch + b ResolveStubCode + LEAF_END_MARKED ResolveStubCode + + END diff --git a/src/coreclr/vm/arm/virtualcallstubcpu.hpp b/src/coreclr/vm/arm/virtualcallstubcpu.hpp index 041c8267d1812..7e8f6b4baf022 100644 --- a/src/coreclr/vm/arm/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/arm/virtualcallstubcpu.hpp @@ -14,265 +14,6 @@ #include // Since we are placing code, we want byte packing of the structs -#define USES_LOOKUP_STUBS 1 - -/********************************************************************************************* -Stubs that contain code are all part of larger structs called Holders. There is a -Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are -essentially an implementation trick that allowed rearranging the code sequences more -easily while trying out different alternatives, and for dealing with any alignment -issues in a way that was mostly immune to the actually code sequences. 
These Holders -should be revisited when the stub code sequences are fixed, since in many cases they -add extra space to a stub that is not really needed. - -Stubs are placed in cache and hash tables. Since unaligned access of data in memory -is very slow, the keys used in those tables should be aligned. The things used as keys -typically also occur in the generated code, e.g. a token as an immediate part of an instruction. -For now, to avoid alignment computations as different code strategies are tried out, the key -fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction -streams aligned so that the immediate fields fall on aligned boundaries. -*/ - -#if USES_LOOKUP_STUBS - -struct LookupStub; -struct LookupHolder; - -/*LookupStub************************************************************************************** -Virtual and interface call sites are initially setup to point at LookupStubs. -This is because the runtime type of the pointer is not yet known, -so the target cannot be resolved. Note: if the jit is able to determine the runtime type -of the pointer, it should be generating a direct call not a virtual or interface call. -This stub pushes a lookup token onto the stack to identify the sought after method, and then -jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and -transfer of control to the appropriate target method implementation, perhaps patching of the call site -along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs -get quickly changed to point to another kind of stub. -*/ -struct LookupStub -{ - inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0] + THUMB_CODE; } - inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } - -private: - friend struct LookupHolder; - const static int entryPointLen = 4; - - WORD _entryPoint[entryPointLen]; - PCODE _resolveWorkerTarget; // xx xx xx xx target address - size_t _token; // xx xx xx xx 32-bit constant -}; - -/* LookupHolders are the containers for LookupStubs, they provide for any alignment of -stubs as necessary. In the case of LookupStubs, alignment is necessary since -LookupStubs are placed in a hash table keyed by token. */ -struct LookupHolder -{ - static void InitializeStatic() { LIMITED_METHOD_CONTRACT; } - - void Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken); - - LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static LookupHolder* FromLookupEntry(PCODE lookupEntry); - -private: - friend struct LookupStub; - - LookupStub _stub; -}; - - -#endif // USES_LOOKUP_STUBS - -struct DispatchStub; -struct DispatchHolder; - -/*DispatchStub************************************************************************************** -Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. -A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). -If the calling frame does in fact have the type be of the expected type, then -control is transfered to the target address, the method implementation. If not, -then control is transfered to the fail address, a fail stub (see below) where a polymorphic -lookup is done to find the correct address to go to. 
- -implementation note: Order, choice of instructions, and branch directions -should be carefully tuned since it can have an inordinate effect on performance. Particular -attention needs to be paid to the effects on the BTB and branch prediction, both in the small -and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. -Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched -to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important -that the branch prediction staticly predict this, which means it must be a forward jump. The alternative -is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" -is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier -to control the placement of the stubs than control the placement of the jitted code and the stubs. */ -struct DispatchStub -{ - inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)(&_entryPoint[0]) + THUMB_CODE; } - - inline size_t expectedMT() { LIMITED_METHOD_CONTRACT; return _expectedMT; } - inline PCODE implTarget() { LIMITED_METHOD_CONTRACT; return _implTarget; } - - inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const - { - LIMITED_METHOD_CONTRACT; - _ASSERTE(slotTypeRef != nullptr); - - *slotTypeRef = EntryPointSlots::SlotType_Executable; - return (TADDR)&_implTarget; - } - - inline PCODE failTarget() { LIMITED_METHOD_CONTRACT; return _failTarget; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(DispatchStub); } - -private: - friend struct DispatchHolder; - const static int entryPointLen = 12; - - WORD _entryPoint[entryPointLen]; - size_t _expectedMT; - PCODE _failTarget; - PCODE _implTarget; -}; - -/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of -stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both -are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, -since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently -o(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify -alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. -While the token field can be logically gotten by following the failure target to the failEntryPoint -of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. -This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct -for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when -they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). -*/ - -/* @workaround for ee resolution - Since the EE does not currently have a resolver function that -does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are -using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable -is in fact written. Hence we have moved target out into the holder and aligned it so we can -atomically update it. 
When we get a resolver function that does what we want, we can drop this field, -and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ -struct DispatchHolder -{ - static void InitializeStatic() - { - LIMITED_METHOD_CONTRACT; - - // Check that _implTarget is aligned in the DispatchHolder for backpatching - static_assert_no_msg(((offsetof(DispatchHolder, _stub) + offsetof(DispatchStub, _implTarget)) % sizeof(void *)) == 0); - } - - void Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT); - - DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry); - -private: - //force expectedMT to be aligned since used as key in hash tables. - DispatchStub _stub; -}; - -struct ResolveStub; -struct ResolveHolder; - -/*ResolveStub************************************************************************************** -Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only -one resolver stub built for any given token, even though there may be many call sites that -use that token and many distinct types that are used in the calling call frames. A resolver stub -actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their -expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should -be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, -even though they are actually allocated as a single contiguous block of memory. These pieces are: - -A ResolveStub has two entry points: - -FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does -a check to see how often we are actually failing. If failures are frequent, control transfers to the -patch piece to cause the call site to be changed from a mostly monomorphic callsite -(calls dispatch stub) to a polymorphic callsize (calls resolve stub). If failures are rare, control -transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter -every time it is entered. The ee at various times will add a large chunk to the counter. - -ResolveEntry - does a lookup via in a cache by hashing the actual type of the calling frame s - and the token identifying the (contract,method) pair desired. If found, control is transfered -to the method implementation. If not found in the cache, the token is pushed and the ee is entered via -the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since -there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. -The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, -as well as its speed. It turns out it is very important to make the hash function sensitive to all -of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before -making any changes to the code sequences here, it is very important to measure and tune them as perf -can vary greatly, in unexpected ways, with seeming minor changes. - -Implementation note - Order, choice of instructions, and branch directions -should be carefully tuned since it can have an inordinate effect on performance. 
Particular -attention needs to be paid to the effects on the BTB and branch prediction, both in the small -and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. -Note that this stub is called in highly polymorphic cases, but the cache should have been sized -and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should -mostly be going down the cache hit route, and it is important that this be statically predicted as so. -Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically -gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries -is important. */ - -struct ResolveStub -{ - inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)(&_failEntryPoint[0]) + THUMB_CODE; } - inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)(&_resolveEntryPoint[0]) + THUMB_CODE; } - inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)(&_slowEntryPoint[0]) + THUMB_CODE; } - - inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } - inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } - inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } - inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } - -private: - friend struct ResolveHolder; - const static int resolveEntryPointLen = 32; - const static int slowEntryPointLen = 4; - const static int failEntryPointLen = 14; - - WORD _resolveEntryPoint[resolveEntryPointLen]; - WORD _slowEntryPoint[slowEntryPointLen]; - WORD _failEntryPoint[failEntryPointLen]; - INT32* _pCounter; - UINT32 _hashedToken; - size_t _cacheAddress; // lookupCache - size_t _token; - size_t _tokenSlow; - PCODE _resolveWorkerTarget; - UINT32 _cacheMask; -}; - -/* ResolveHolders are the containers for ResolveStubs, They provide -for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by -the token for which they are built. Efficiency of access requires that this token be aligned. -For now, we have copied that field into the ResolveHolder itself, if the resolve stub is arranged such that -any of its inlined tokens (non-prehashed) is aligned, then the token field in the ResolveHolder -is not needed. */ -struct ResolveHolder -{ - static void InitializeStatic() { LIMITED_METHOD_CONTRACT; } - - void Initialize(ResolveHolder* pResolveHolderRX, - PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32 * counterAddr); - - ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static ResolveHolder* FromFailEntry(PCODE failEntry); - static ResolveHolder* FromResolveEntry(PCODE resolveEntry); - -private: - ResolveStub _stub; -}; - /*VTableCallStub************************************************************************************** These are jump stubs that perform a vtable-base virtual call. These stubs assume that an object is placed in the first argument register (this pointer). 
From there, the stub extracts the MethodTable pointer, followed by the @@ -329,6 +70,7 @@ struct VTableCallHolder VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } + size_t size() { return stub()->size(); } static size_t GetHolderSize(unsigned slot) { STATIC_CONTRACT_WRAPPER; @@ -360,64 +102,8 @@ struct VTableCallHolder #ifndef DACCESS_COMPILE -#ifdef STUB_LOGGING -extern size_t g_lookup_inline_counter; -extern size_t g_mono_call_counter; -extern size_t g_mono_miss_counter; -extern size_t g_poly_call_counter; -extern size_t g_poly_miss_counter; -#endif - TADDR StubDispatchFrame_MethodFrameVPtr; -LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry) -{ - lookupEntry = lookupEntry & ~THUMB_CODE; - return (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); -} - - -/* Template used to generate the stub. We generate a stub by allocating a block of - memory and copy the template over it and just update the specific fields that need - to be changed. -*/ -DispatchStub dispatchInit; - -DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry) -{ - LIMITED_METHOD_CONTRACT; - dispatchEntry = dispatchEntry & ~THUMB_CODE; - DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchHolder, _stub) - offsetof(DispatchStub, _entryPoint) ); - // _ASSERTE(dispatchHolder->_stub._entryPoint[0] == dispatchInit._entryPoint[0]); - return dispatchHolder; -} - - -/* Template used to generate the stub. We generate a stub by allocating a block of - memory and copy the template over it and just update the specific fields that need - to be changed. -*/ - -ResolveStub resolveInit; - -ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) -{ - LIMITED_METHOD_CONTRACT; - failEntry = failEntry & ~THUMB_CODE; - ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); - // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); - return resolveHolder; -} - -ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) -{ - LIMITED_METHOD_CONTRACT; - resolveEntry = resolveEntry & ~THUMB_CODE; - ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); - // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); - return resolveHolder; -} - void MovRegImm(BYTE* p, int reg, TADDR imm); void VTableCallHolder::Initialize(unsigned slot) @@ -515,19 +201,20 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s if (stubKind == SK_UNKNOWN) { //Assuming that RESOLVE_STUB_FIRST_WORD & DISPATCH_STUB_FIRST_WORD have same values + _ASSERTE(RESOLVE_STUB_FIRST_WORD == DISPATCH_STUB_FIRST_WORD); if (firstWord == DISPATCH_STUB_FIRST_WORD) { WORD thirdWord = ((WORD*)pInstr)[2]; - if (thirdWord == 0xf84d) + if (thirdWord == DISPATCH_STUB_THIRD_WORD) { stubKind = SK_DISPATCH; } - else if (thirdWord == 0xb460) + else if (thirdWord == RESOLVE_STUB_THIRD_WORD) { stubKind = SK_RESOLVE; } } - else if (firstWord == 0xf8df) + else if (firstWord == LOOKUP_STUB_FIRST_WORD) { stubKind = SK_LOOKUP; } diff --git a/src/coreclr/vm/arm64/asmconstants.h b/src/coreclr/vm/arm64/asmconstants.h index 5f5560fbc25f3..4945b7d462ef5 100644 --- a/src/coreclr/vm/arm64/asmconstants.h +++ b/src/coreclr/vm/arm64/asmconstants.h @@ -165,18 +165,20 @@ 
ASMCONSTANTS_C_ASSERT(SIZEOF__FaultingExceptionFrame == sizeof(FaultingEx ASMCONSTANTS_C_ASSERT(FaultingExceptionFrame__m_fFilterExecuted == offsetof(FaultingExceptionFrame, m_fFilterExecuted)); #define SIZEOF__FixupPrecode 24 -#define Offset_PrecodeChunkIndex 15 -#define Offset_MethodDescChunkIndex 14 +//#define Offset_PrecodeChunkIndex 15 +//#define Offset_MethodDescChunkIndex 14 #define MethodDesc_ALIGNMENT_SHIFT 3 -#define FixupPrecode_ALIGNMENT_SHIFT_1 3 -#define FixupPrecode_ALIGNMENT_SHIFT_2 4 +//#define FixupPrecode_ALIGNMENT_SHIFT_1 3 +//#define FixupPrecode_ALIGNMENT_SHIFT_2 4 ASMCONSTANTS_C_ASSERT(SIZEOF__FixupPrecode == sizeof(FixupPrecode)); -ASMCONSTANTS_C_ASSERT(Offset_PrecodeChunkIndex == offsetof(FixupPrecode, m_PrecodeChunkIndex)); -ASMCONSTANTS_C_ASSERT(Offset_MethodDescChunkIndex == offsetof(FixupPrecode, m_MethodDescChunkIndex)); +//ASMCONSTANTS_C_ASSERT(Offset_PrecodeChunkIndex == offsetof(FixupPrecode, m_PrecodeChunkIndex)); +//ASMCONSTANTS_C_ASSERT(Offset_MethodDescChunkIndex == offsetof(FixupPrecode, m_MethodDescChunkIndex)); ASMCONSTANTS_C_ASSERT(MethodDesc_ALIGNMENT_SHIFT == MethodDesc::ALIGNMENT_SHIFT); -ASMCONSTANTS_C_ASSERT((1< precodeWriterHolder(this, sizeof(StubPrecode)); - InterlockedExchange64((LONGLONG*)&precodeWriterHolder.GetRW()->m_pTarget, (TADDR)GetPreStubEntryPoint()); - } - - BOOL SetTargetInterlocked(TADDR target, TADDR expected) - { - CONTRACTL - { - THROWS; - GC_NOTRIGGER; - } - CONTRACTL_END; - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(StubPrecode)); - return (TADDR)InterlockedCompareExchange64( - (LONGLONG*)&precodeWriterHolder.GetRW()->m_pTarget, (TADDR)target, (TADDR)expected) == expected; - } -#endif // !DACCESS_COMPILE - -}; -typedef DPTR(StubPrecode) PTR_StubPrecode; - - -struct NDirectImportPrecode { - - static const int Type = 0x8B; - - // adr x11, #16 ; Notice that x11 register is used to differentiate the stub from StubPrecode which uses x9 - // ldp x10,x12,[x11] ; =m_pTarget,m_pMethodDesc - // br x10 - // 4 byte padding for 8 byte allignement - // dcd pTarget - // dcd pMethodDesc - DWORD m_rgCode[4]; - TADDR m_pTarget; - TADDR m_pMethodDesc; - - void Init(NDirectImportPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator); - - TADDR GetMethodDesc() - { - LIMITED_METHOD_DAC_CONTRACT; - return m_pMethodDesc; - } - - PCODE GetTarget() - { - LIMITED_METHOD_DAC_CONTRACT; - return m_pTarget; - } - - LPVOID GetEntrypoint() - { - LIMITED_METHOD_CONTRACT; - return this; - } - -}; -typedef DPTR(NDirectImportPrecode) PTR_NDirectImportPrecode; - - -struct FixupPrecode { - - static const int Type = 0x0C; - - // adr x12, #0 - // ldr x11, [pc, #12] ; =m_pTarget - // br x11 - // dcb m_MethodDescChunkIndex - // dcb m_PrecodeChunkIndex - // 2 byte padding - // dcd m_pTarget - - - UINT32 m_rgCode[3]; - BYTE padding[2]; - BYTE m_MethodDescChunkIndex; - BYTE m_PrecodeChunkIndex; - TADDR m_pTarget; - - void Init(FixupPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator, int iMethodDescChunkIndex = 0, int iPrecodeChunkIndex = 0); - void InitCommon() - { - WRAPPER_NO_CONTRACT; - int n = 0; - - m_rgCode[n++] = 0x1000000C; // adr x12, #0 - m_rgCode[n++] = 0x5800006B; // ldr x11, [pc, #12] ; =m_pTarget - - _ASSERTE((UINT32*)&m_pTarget == &m_rgCode[n + 2]); - - m_rgCode[n++] = 0xD61F0160; // br x11 - - _ASSERTE(n == ARRAY_SIZE(m_rgCode)); - } - - TADDR GetBase() - { - LIMITED_METHOD_CONTRACT; - SUPPORTS_DAC; - - return dac_cast(this) + (m_PrecodeChunkIndex + 1) * sizeof(FixupPrecode); - } - - size_t 
GetSizeRW() - { - LIMITED_METHOD_CONTRACT; - - return GetBase() + sizeof(void*) - dac_cast(this); - } - - TADDR GetMethodDesc(); - - PCODE GetTarget() - { - LIMITED_METHOD_DAC_CONTRACT; - return m_pTarget; - } - -#ifndef DACCESS_COMPILE - void ResetTargetInterlocked() - { - CONTRACTL - { - THROWS; - GC_NOTRIGGER; - } - CONTRACTL_END; - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(FixupPrecode)); - InterlockedExchange64((LONGLONG*)&precodeWriterHolder.GetRW()->m_pTarget, (TADDR)GetEEFuncEntryPoint(PrecodeFixupThunk)); - } - - BOOL SetTargetInterlocked(TADDR target, TADDR expected) - { - CONTRACTL - { - THROWS; - GC_NOTRIGGER; - } - CONTRACTL_END; - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(FixupPrecode)); - return (TADDR)InterlockedCompareExchange64( - (LONGLONG*)&precodeWriterHolder.GetRW()->m_pTarget, (TADDR)target, (TADDR)expected) == expected; - } -#endif // !DACCESS_COMPILE - - static BOOL IsFixupPrecodeByASM(PCODE addr) - { - PTR_DWORD pInstr = dac_cast(PCODEToPINSTR(addr)); - return - (pInstr[0] == 0x1000000C) && - (pInstr[1] == 0x5800006B) && - (pInstr[2] == 0xD61F0160); - } - -#ifdef DACCESS_COMPILE - void EnumMemoryRegions(CLRDataEnumMemoryFlags flags); -#endif -}; -typedef DPTR(FixupPrecode) PTR_FixupPrecode; - - // Precode to shuffle this and retbuf for closed delegates over static methods with return buffer struct ThisPtrRetBufPrecode { @@ -804,165 +590,4 @@ struct ThisPtrRetBufPrecode { }; typedef DPTR(ThisPtrRetBufPrecode) PTR_ThisPtrRetBufPrecode; -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Call counting - -#ifdef FEATURE_TIERED_COMPILATION - -#define DISABLE_COPY(T) \ - T(const T &) = delete; \ - T &operator =(const T &) = delete - -typedef UINT16 CallCount; -typedef DPTR(CallCount) PTR_CallCount; - -//////////////////////////////////////////////////////////////// -// CallCountingStub - -class CallCountingStub; -typedef DPTR(const CallCountingStub) PTR_CallCountingStub; - -class CallCountingStub -{ -public: - static const SIZE_T Alignment = sizeof(void *); - -#ifndef DACCESS_COMPILE -protected: - static const PCODE TargetForThresholdReached; - - CallCountingStub() = default; - -public: - static const CallCountingStub *From(TADDR stubIdentifyingToken); - - PCODE GetEntryPoint() const - { - WRAPPER_NO_CONTRACT; - return PINSTRToPCODE((TADDR)this); - } -#endif // !DACCESS_COMPILE - -public: - PTR_CallCount GetRemainingCallCountCell() const; - PCODE GetTargetForMethod() const; - - DISABLE_COPY(CallCountingStub); -}; - -//////////////////////////////////////////////////////////////// -// CallCountingStubShort - -class CallCountingStubShort; -typedef DPTR(const CallCountingStubShort) PTR_CallCountingStubShort; - -#pragma pack(push, 1) -class CallCountingStubShort : public CallCountingStub -{ -private: - const UINT32 m_part0[10]; - CallCount *const m_remainingCallCountCell; - const PCODE m_targetForMethod; - const PCODE m_targetForThresholdReached; - -#ifndef DACCESS_COMPILE -public: - CallCountingStubShort(CallCountingStubShort* stubRX, CallCount *remainingCallCountCell, PCODE targetForMethod) - : m_part0{ 0x58000149, // ldr x9, [pc, #(m_remainingCallCountCell)] - 0x7940012a, // ldrh w10, [x9] - 0x7100054a, // subs w10, w10, #1 - 0x7900012a, // strh w10, [x9] - 0x54000060, // beq L0 - 0x580000e9, // ldr x9, [pc, #(m_targetForMethod)] - 0xd61f0120, // br x9 - 0x10ffff2a, // L0: adr x10, #(this) - // (x10 == stub-identifying token == this) - 0x580000c9, // ldr x9, 
[pc, #(m_targetForThresholdReached)] - 0xd61f0120}, // br x9 - m_remainingCallCountCell(remainingCallCountCell), - m_targetForMethod(targetForMethod), - m_targetForThresholdReached(TargetForThresholdReached) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(CallCountingStubShort) % Alignment == 0); - _ASSERTE(remainingCallCountCell != nullptr); - _ASSERTE(PCODEToPINSTR(targetForMethod) != NULL); - } - - static bool Is(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - return true; - } - - static const CallCountingStubShort *From(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(stubIdentifyingToken)); - - const CallCountingStubShort *stub = (const CallCountingStubShort *)stubIdentifyingToken; - _ASSERTE(IS_ALIGNED(stub, Alignment)); - return stub; - } -#endif // !DACCESS_COMPILE - -public: - static bool Is(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - return true; - } - - static PTR_CallCountingStubShort From(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(callCountingStub)); - - return dac_cast(callCountingStub); - } - - PCODE GetTargetForMethod() const - { - WRAPPER_NO_CONTRACT; - return m_targetForMethod; - } - - friend CallCountingStub; - DISABLE_COPY(CallCountingStubShort); -}; -#pragma pack(pop) - -//////////////////////////////////////////////////////////////// -// CallCountingStub definitions - -#ifndef DACCESS_COMPILE -inline const CallCountingStub *CallCountingStub::From(TADDR stubIdentifyingToken) -{ - WRAPPER_NO_CONTRACT; - _ASSERTE(stubIdentifyingToken != NULL); - - return CallCountingStubShort::From(stubIdentifyingToken); -} -#endif - -inline PTR_CallCount CallCountingStub::GetRemainingCallCountCell() const -{ - WRAPPER_NO_CONTRACT; - return PTR_CallCount(dac_cast(this)->m_remainingCallCountCell); -} - -inline PCODE CallCountingStub::GetTargetForMethod() const -{ - WRAPPER_NO_CONTRACT; - return CallCountingStubShort::From(PTR_CallCountingStub(this))->GetTargetForMethod(); -} - -//////////////////////////////////////////////////////////////// - -#undef DISABLE_COPY - -#endif // FEATURE_TIERED_COMPILATION - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - #endif // __cgencpu_h__ diff --git a/src/coreclr/vm/arm64/stubs.cpp b/src/coreclr/vm/arm64/stubs.cpp index fbd222a9b2aba..fcbb0aba8fc2d 100644 --- a/src/coreclr/vm/arm64/stubs.cpp +++ b/src/coreclr/vm/arm64/stubs.cpp @@ -543,93 +543,7 @@ void HelperMethodFrame::UpdateRegDisplay(const PREGDISPLAY pRD) ClearRegDisplayArgumentAndScratchRegisters(pRD); } -TADDR FixupPrecode::GetMethodDesc() -{ - LIMITED_METHOD_DAC_CONTRACT; - - // This lookup is also manually inlined in PrecodeFixupThunk assembly code - TADDR base = *PTR_TADDR(GetBase()); - if (base == NULL) - return NULL; - return base + (m_MethodDescChunkIndex * MethodDesc::ALIGNMENT); -} - -#ifdef DACCESS_COMPILE -void FixupPrecode::EnumMemoryRegions(CLRDataEnumMemoryFlags flags) -{ - SUPPORTS_DAC; - DacEnumMemoryRegion(dac_cast(this), sizeof(FixupPrecode)); - - DacEnumMemoryRegion(GetBase(), sizeof(TADDR)); -} -#endif // DACCESS_COMPILE - #ifndef DACCESS_COMPILE -void StubPrecode::Init(StubPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) -{ - WRAPPER_NO_CONTRACT; - - int n = 0; - - m_rgCode[n++] = 0x10000089; // adr x9, #16 - m_rgCode[n++] = 0xA940312A; // ldp x10,x12,[x9] - m_rgCode[n++] = 0xD61F0140; // br x10 - - _ASSERTE(n+1 == ARRAY_SIZE(m_rgCode)); - - m_pTarget = 
GetPreStubEntryPoint(); - m_pMethodDesc = (TADDR)pMD; -} - -void NDirectImportPrecode::Init(NDirectImportPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) -{ - WRAPPER_NO_CONTRACT; - - int n = 0; - - m_rgCode[n++] = 0x1000008B; // adr x11, #16 - m_rgCode[n++] = 0xA940316A; // ldp x10,x12,[x11] - m_rgCode[n++] = 0xD61F0140; // br x10 - - _ASSERTE(n+1 == ARRAY_SIZE(m_rgCode)); - - m_pTarget = GetEEFuncEntryPoint(NDirectImportThunk); - m_pMethodDesc = (TADDR)pMD; -} - -void FixupPrecode::Init(FixupPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator, int iMethodDescChunkIndex /*=0*/, int iPrecodeChunkIndex /*=0*/) -{ - WRAPPER_NO_CONTRACT; - - InitCommon(); - - // Initialize chunk indices only if they are not initialized yet. This is necessary to make MethodDesc::Reset work. - if (m_PrecodeChunkIndex == 0) - { - _ASSERTE(FitsInU1(iPrecodeChunkIndex)); - m_PrecodeChunkIndex = static_cast(iPrecodeChunkIndex); - } - - if (iMethodDescChunkIndex != -1) - { - if (m_MethodDescChunkIndex == 0) - { - _ASSERTE(FitsInU1(iMethodDescChunkIndex)); - m_MethodDescChunkIndex = static_cast(iMethodDescChunkIndex); - } - - if (*(void**)GetBase() == NULL) - *(void**)GetBase() = (BYTE*)pMD - (iMethodDescChunkIndex * MethodDesc::ALIGNMENT); - } - - _ASSERTE(pPrecodeRX->GetMethodDesc() == (TADDR)pMD); - - if (pLoaderAllocator != NULL) - { - m_pTarget = GetEEFuncEntryPoint(PrecodeFixupThunk); - } -} - void ThisPtrRetBufPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) { WRAPPER_NO_CONTRACT; @@ -652,45 +566,6 @@ void ThisPtrRetBufPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocat m_pMethodDesc = (TADDR)pMD; } -BOOL DoesSlotCallPrestub(PCODE pCode) -{ - PTR_DWORD pInstr = dac_cast(PCODEToPINSTR(pCode)); - - //FixupPrecode -#if defined(HAS_FIXUP_PRECODE) - if (FixupPrecode::IsFixupPrecodeByASM(pCode)) - { - PCODE pTarget = dac_cast(pInstr)->m_pTarget; - - if (isJump(pTarget)) - { - pTarget = decodeJump(pTarget); - } - - return pTarget == (TADDR)PrecodeFixupThunk; - } -#endif - - // StubPrecode - if (pInstr[0] == 0x10000089 && // adr x9, #16 - pInstr[1] == 0xA940312A && // ldp x10,x12,[x9] - pInstr[2] == 0xD61F0140) // br x10 - { - PCODE pTarget = dac_cast(pInstr)->m_pTarget; - - if (isJump(pTarget)) - { - pTarget = decodeJump(pTarget); - } - - return pTarget == GetPreStubEntryPoint(); - } - - return FALSE; - -} - - #endif // !DACCESS_COMPILE void UpdateRegDisplayFromCalleeSavedRegisters(REGDISPLAY * pRD, CalleeSavedRegisters * pCalleeSaved) @@ -983,10 +858,10 @@ static void UpdateWriteBarrierState(bool skipEphemeralCheck) { BYTE *writeBarrierCodeStart = GetWriteBarrierCodeLocation((void*)JIT_PatchedCodeStart); BYTE *writeBarrierCodeStartRW = writeBarrierCodeStart; - ExecutableWriterHolder writeBarrierWriterHolder; + ExecutableWriterHolderNoLog writeBarrierWriterHolder; if (IsWriteBarrierCopyEnabled()) { - writeBarrierWriterHolder = ExecutableWriterHolder(writeBarrierCodeStart, (BYTE*)JIT_PatchedCodeLast - (BYTE*)JIT_PatchedCodeStart); + writeBarrierWriterHolder.AssignExecutableWriterHolder(writeBarrierCodeStart, (BYTE*)JIT_PatchedCodeLast - (BYTE*)JIT_PatchedCodeStart); writeBarrierCodeStartRW = writeBarrierWriterHolder.GetRW(); } JIT_UpdateWriteBarrierState(GCHeapUtilities::IsServerHeap(), writeBarrierCodeStartRW - writeBarrierCodeStart); diff --git a/src/coreclr/vm/arm64/thunktemplates.S b/src/coreclr/vm/arm64/thunktemplates.S new file mode 100644 index 0000000000000..a047c9949d197 --- /dev/null +++ b/src/coreclr/vm/arm64/thunktemplates.S @@ -0,0 
+1,95 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#include "unixasmmacros.inc" +#include "asmconstants.h" + +#define DATA_SLOT(stub, field) (C_FUNC(stub##Code\PAGE_SIZE) + \PAGE_SIZE + stub##Data__##field) + + .irp PAGE_SIZE, 4096, 8192, 16384, 32768, 65536 + + LEAF_ENTRY StubPrecodeCode\PAGE_SIZE + ldr x10, DATA_SLOT(StubPrecode, Target) + ldr x12, DATA_SLOT(StubPrecode, MethodDesc) + br x10 + LEAF_END_MARKED StubPrecodeCode\PAGE_SIZE + + LEAF_ENTRY FixupPrecodeCode\PAGE_SIZE + ldr x11, DATA_SLOT(FixupPrecode, Target) + br x11 + ldr x12, DATA_SLOT(FixupPrecode, MethodDesc) + ldr x11, DATA_SLOT(FixupPrecode, PrecodeFixupThunk) + br x11 + LEAF_END_MARKED FixupPrecodeCode\PAGE_SIZE + + LEAF_ENTRY CallCountingStubCode\PAGE_SIZE +LOCAL_LABEL(StubStart\PAGE_SIZE): + ldr x9, DATA_SLOT(CallCountingStub, RemainingCallCountCell) + ldrh w10, [x9] + subs w10, w10, #1 + strh w10, [x9] + beq LOCAL_LABEL(CountReachedZero\PAGE_SIZE) + ldr x9, DATA_SLOT(CallCountingStub, TargetForMethod) + br x9 +LOCAL_LABEL(CountReachedZero\PAGE_SIZE): + adr x10, LOCAL_LABEL(StubStart\PAGE_SIZE) + ldr x9, DATA_SLOT(CallCountingStub, TargetForThresholdReached) + br x9 + LEAF_END_MARKED CallCountingStubCode\PAGE_SIZE + + + LEAF_ENTRY LookupStubCode\PAGE_SIZE + ldr x12, DATA_SLOT(LookupStub, DispatchToken) + ldr x10, DATA_SLOT(LookupStub, ResolveWorkerTarget) + br x10 + LEAF_END_MARKED LookupStubCode\PAGE_SIZE + + LEAF_ENTRY DispatchStubCode\PAGE_SIZE + PATCH_LABEL DispatchStubCode_ThisDeref\PAGE_SIZE + ldr x13, [x0] // methodTable from object in x0 + adr x9, DATA_SLOT(DispatchStub, ExpectedMT) + ldp x10, x12, [x9] // x10 = ExpectedMT & x12 = ImplTarget + cmp x13, x10 + bne LOCAL_LABEL(Fail\PAGE_SIZE) + br x12 +LOCAL_LABEL(Fail\PAGE_SIZE): + ldr x9, DATA_SLOT(DispatchStub, FailTarget) + br x9 + LEAF_END_MARKED DispatchStubCode\PAGE_SIZE + + LEAF_ENTRY ResolveStubCode\PAGE_SIZE + PATCH_LABEL ResolveStubCode_ResolveEntry\PAGE_SIZE + PATCH_LABEL ResolveStubCode_ThisDeref\PAGE_SIZE +LOCAL_LABEL(Resolve\PAGE_SIZE): + ldr x12, [x0] + add x9, x12, x12, lsr #12 + ldr w13, DATA_SLOT(ResolveStub, HashedToken) + eor x9, x9, x13 + and x9, x9, #CALL_STUB_CACHE_MASK_ASM * 8 + ldr x13, DATA_SLOT(ResolveStub, CacheAddress) + ldr x9, [x13, x9] + ldr x15, DATA_SLOT(ResolveStub, Token) + ldr x13, [x9, #ResolveCacheElem__pMT] + cmp x12, x13 + bne LOCAL_LABEL(SlowEntry\PAGE_SIZE) + ldr x13, [x9, #ResolveCacheElem__token] + cmp x15, x13 + bne LOCAL_LABEL(SlowEntry\PAGE_SIZE) + ldr x12, [x9, ResolveCacheElem__target] + br x12 + PATCH_LABEL ResolveStubCode_SlowEntry\PAGE_SIZE +LOCAL_LABEL(SlowEntry\PAGE_SIZE): + ldr x12, DATA_SLOT(ResolveStub, Token) + ldr x13, DATA_SLOT(ResolveStub, ResolveWorkerTarget) + br x13 + PATCH_LABEL ResolveStubCode_FailEntry\PAGE_SIZE + adr x10, DATA_SLOT(ResolveStub, Counter) + ldr w9, [x10] + subs w9, w9, #1 + str w9, [x10] + bge LOCAL_LABEL(Resolve\PAGE_SIZE) + orr x11, x11, #1 // SDF_ResolveBackPatch + b LOCAL_LABEL(Resolve\PAGE_SIZE) + LEAF_END_MARKED ResolveStubCode\PAGE_SIZE + + .endr diff --git a/src/coreclr/vm/arm64/thunktemplates.asm b/src/coreclr/vm/arm64/thunktemplates.asm new file mode 100644 index 0000000000000..812791d6e0a38 --- /dev/null +++ b/src/coreclr/vm/arm64/thunktemplates.asm @@ -0,0 +1,93 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. 
+ +#include "ksarm64.h" +#include "asmconstants.h" +#include "asmmacros.h" + +PAGE_SIZE = 4096 + +#define DATA_SLOT(stub, field) (stub##Code + PAGE_SIZE + stub##Data__##field) + + LEAF_ENTRY StubPrecodeCode + ldr x10, DATA_SLOT(StubPrecode, Target) + ldr x12, DATA_SLOT(StubPrecode, MethodDesc) + br x10 + LEAF_END_MARKED StubPrecodeCode + + LEAF_ENTRY FixupPrecodeCode + ldr x11, DATA_SLOT(FixupPrecode, Target) + br x11 + ldr x12, DATA_SLOT(FixupPrecode, MethodDesc) + ldr x11, DATA_SLOT(FixupPrecode, PrecodeFixupThunk) + br x11 + LEAF_END_MARKED FixupPrecodeCode + + LEAF_ENTRY CallCountingStubCode + ldr x9, DATA_SLOT(CallCountingStub, RemainingCallCountCell) + ldrh w10, [x9] + subs w10, w10, #1 + strh w10, [x9] + beq CountReachedZero + ldr x9, DATA_SLOT(CallCountingStub, TargetForMethod) + br x9 +CountReachedZero + adr x10, CallCountingStubCode + ldr x9, DATA_SLOT(CallCountingStub, TargetForThresholdReached) + br x9 + LEAF_END_MARKED CallCountingStubCode + + + LEAF_ENTRY LookupStubCode + ldr x12, DATA_SLOT(LookupStub, DispatchToken) + ldr x10, DATA_SLOT(LookupStub, ResolveWorkerTarget) + br x10 + LEAF_END_MARKED LookupStubCode + + LEAF_ENTRY DispatchStubCode + PATCH_LABEL DispatchStubCode_ThisDeref + ldr x13, [x0] ; methodTable from object in x0 + adr x9, DATA_SLOT(DispatchStub, ExpectedMT) + ldp x10, x12, [x9] ; x10 = ExpectedMT & x12 = ImplTarget + cmp x13, x10 + bne Fail + br x12 +Fail + ldr x9, DATA_SLOT(DispatchStub, FailTarget) + br x9 + LEAF_END_MARKED DispatchStubCode + + LEAF_ENTRY ResolveStubCode + PATCH_LABEL ResolveStubCode_ResolveEntry + PATCH_LABEL ResolveStubCode_ThisDeref + ldr x12, [x0] + add x9, x12, x12, lsr #12 + ldr w13, DATA_SLOT(ResolveStub, HashedToken) + eor x9, x9, x13 + and x9, x9, #CALL_STUB_CACHE_MASK_ASM * 8 + ldr x13, DATA_SLOT(ResolveStub, CacheAddress) + ldr x9, [x13, x9] + ldr x15, DATA_SLOT(ResolveStub, Token) + ldr x13, [x9, #ResolveCacheElem__pMT] + cmp x12, x13 + bne ResolveStubCode_SlowEntry + ldr x13, [x9, #ResolveCacheElem__token] + cmp x15, x13 + bne ResolveStubCode_SlowEntry + ldr x12, [x9, ResolveCacheElem__target] + br x12 + PATCH_LABEL ResolveStubCode_SlowEntry + ldr x12, DATA_SLOT(ResolveStub, Token) + ldr x13, DATA_SLOT(ResolveStub, ResolveWorkerTarget) + br x13 + PATCH_LABEL ResolveStubCode_FailEntry + adr x10, DATA_SLOT(ResolveStub, Counter) + ldr w9, [x10] + subs w9, w9, #1 + str w9, [x10] + bge ResolveStubCode + orr x11, x11, #1; SDF_ResolveBackPatch + b ResolveStubCode + LEAF_END_MARKED ResolveStubCode + + END diff --git a/src/coreclr/vm/arm64/virtualcallstubcpu.hpp b/src/coreclr/vm/arm64/virtualcallstubcpu.hpp index 4944b198e5212..92e145ace49dd 100644 --- a/src/coreclr/vm/arm64/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/arm64/virtualcallstubcpu.hpp @@ -8,405 +8,10 @@ #define DISPATCH_STUB_FIRST_DWORD 0xf940000d #define RESOLVE_STUB_FIRST_DWORD 0xF940000C +#define LOOKUP_STUB_FIRST_DWORD 0x5800000C +#define LOOKUP_STUB_FIRST_DWORD_MASK 0xFFF07FFF #define VTABLECALL_STUB_FIRST_DWORD 0xF9400009 -struct ARM64EncodeHelpers -{ - inline static DWORD ADR_PATCH(DWORD offset) - { - DWORD immLO = (offset & 0x03)<<29 ; - - if (immLO ==0 ) - return (offset<<3); - else - return immLO<<29 | (offset -immLO)<<3; - } - -}; - -#define USES_LOOKUP_STUBS 1 - -struct LookupStub -{ - inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } - inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } -private : - friend struct LookupHolder; - - 
DWORD _entryPoint[4]; - PCODE _resolveWorkerTarget; - size_t _token; -}; - -struct LookupHolder -{ -private: - LookupStub _stub; -public: - static void InitializeStatic() { } - - void Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken) - { - // adr x9, _resolveWorkerTarget - // ldp x10, x12, [x9] - // br x10 - // _resolveWorkerTarget - // _token - _stub._entryPoint[0] = 0x10000089; - _stub._entryPoint[1] = 0xa940312a; - _stub._entryPoint[2] = 0xd61f0140; - //4th element of _entryPoint array is padding for 8byte alignment - _stub._resolveWorkerTarget = resolveWorkerTarget; - _stub._token = dispatchToken; - } - - LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - static LookupHolder* FromLookupEntry(PCODE lookupEntry) - { - return (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); - } -}; - -struct DispatchStub -{ - inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } - - inline size_t expectedMT() { LIMITED_METHOD_CONTRACT; return _expectedMT; } - inline PCODE implTarget() { LIMITED_METHOD_CONTRACT; return _implTarget; } - - inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const - { - LIMITED_METHOD_CONTRACT; - _ASSERTE(slotTypeRef != nullptr); - - *slotTypeRef = EntryPointSlots::SlotType_Executable; - return (TADDR)&_implTarget; - } - - inline PCODE failTarget() { LIMITED_METHOD_CONTRACT; return _failTarget; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(DispatchStub); } - -private: - friend struct DispatchHolder; - - DWORD _entryPoint[8]; - size_t _expectedMT; - PCODE _implTarget; - PCODE _failTarget; -}; - -struct DispatchHolder -{ - static void InitializeStatic() - { - LIMITED_METHOD_CONTRACT; - - // Check that _implTarget is aligned in the DispatchHolder for backpatching - static_assert_no_msg(((offsetof(DispatchHolder, _stub) + offsetof(DispatchStub, _implTarget)) % sizeof(void *)) == 0); - } - - void Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT) - { - // ldr x13, [x0] ; methodTable from object in x0 - // adr x9, _expectedMT ; _expectedMT is at offset 28 from pc - // ldp x10, x12, [x9] ; x10 = _expectedMT & x12 = _implTarget - // cmp x13, x10 - // bne failLabel - // br x12 - // failLabel - // ldr x9, _failTarget ; _failTarget is at offset 24 from pc - // br x9 - // _expectedMT - // _implTarget - // _failTarget - - _stub._entryPoint[0] = DISPATCH_STUB_FIRST_DWORD; // 0xf940000d - _stub._entryPoint[1] = 0x100000e9; - _stub._entryPoint[2] = 0xa940312a; - _stub._entryPoint[3] = 0xeb0a01bf; - _stub._entryPoint[4] = 0x54000041; - _stub._entryPoint[5] = 0xd61f0180; - _stub._entryPoint[6] = 0x580000c9; - _stub._entryPoint[7] = 0xd61f0120; - - _stub._expectedMT = expectedMT; - _stub._implTarget = implTarget; - _stub._failTarget = failTarget; - } - - DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry) - { - LIMITED_METHOD_CONTRACT; - DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchHolder, _stub) - offsetof(DispatchStub, _entryPoint) ); - return dispatchHolder; - } - -private: - DispatchStub _stub; -}; - -struct ResolveStub -{ - inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; } - inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint[0]; } - inline PCODE slowEntryPoint() { 
LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; } - inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } - inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } - - inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } - inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } - -private: - friend struct ResolveHolder; - const static int resolveEntryPointLen = 17; - const static int slowEntryPointLen = 4; - const static int failEntryPointLen = 8; - - DWORD _resolveEntryPoint[resolveEntryPointLen]; - DWORD _slowEntryPoint[slowEntryPointLen]; - DWORD _failEntryPoint[failEntryPointLen]; - INT32* _pCounter; //Base of the Data Region - size_t _cacheAddress; // lookupCache - size_t _token; - PCODE _resolveWorkerTarget; - UINT32 _hashedToken; -}; - -struct ResolveHolder -{ - static void InitializeStatic() { } - - void Initialize(ResolveHolder* pResolveHolderRX, - PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32 * counterAddr) - { - int n=0; - DWORD offset; - int br_nextEntry[2]; -/******** Rough Convention of used in this routine - ;;x9 hash scratch / current ResolveCacheElem - ;;x10 base address of the data region - ;;x11 indirection cell - ;;x12 MethodTable (from object ref in x0), out: this._token - ;;X13 temp - ;;X15 temp, this._token - ;;cachemask => [CALL_STUB_CACHE_MASK * sizeof(void*)] -*********/ - // Called directly by JITTED code - // ResolveStub._resolveEntryPoint(x0:Object*, x1 ...,r7, x11:IndirectionCellAndFlags) - // { - // MethodTable mt = x0.m_pMethTab; - // int i = ((mt + mt >> 12) ^ this._hashedToken) & _cacheMask - // ResolveCacheElem e = this._cacheAddress + i - // x9 = e = this._cacheAddress + i - // if (mt == e.pMT && this._token == e.token) - // { - // (e.target)(x0, [x1,...,x7 and x8]); - // } - // else - // { - // x12 = this._token; - // (this._slowEntryPoint)(x0, [x1,.., x7 and x8], x9, x11, x12); - // } - // } - // - -#define Dataregionbase _pCounter -#define DATA_OFFSET(_fieldHigh) (DWORD)((offsetof(ResolveStub, _fieldHigh ) - offsetof(ResolveStub, Dataregionbase)) & 0xffffffff) -#define PC_REL_OFFSET(_field) (DWORD)((offsetof(ResolveStub, _field) - (offsetof(ResolveStub, _resolveEntryPoint) + sizeof(*ResolveStub::_resolveEntryPoint) * n)) & 0xffffffff) - - //ldr x12, [x0,#Object.m_pMethTab ] ; methodTable from object in x0 - _stub._resolveEntryPoint[n++] = RESOLVE_STUB_FIRST_DWORD; //0xF940000C - - // ;; Compute i = ((mt + mt >> 12) ^ this._hashedToken) & _cacheMask - - //add x9, x12, x12 lsr #12 - _stub._resolveEntryPoint[n++] = 0x8B4C3189; - - //;;adr x10, #Dataregionbase of ResolveStub - _stub._resolveEntryPoint[n] = 0x1000000A | ARM64EncodeHelpers::ADR_PATCH(PC_REL_OFFSET(Dataregionbase)); - n++; - - //w13- this._hashedToken - //ldr w13, [x10 + DATA_OFFSET(_hashedToken)] - offset = DATA_OFFSET(_hashedToken); - _ASSERTE(offset >=0 && offset%4 == 0); - _stub._resolveEntryPoint[n++] = 0xB940014D | offset<<8; - - //eor x9,x9,x13 - _stub._resolveEntryPoint[n++] = 0xCA0D0129; - - _ASSERTE(CALL_STUB_CACHE_MASK * sizeof(void*) == 0x7FF8); - //x9-i - //and x9,x9,#cachemask - _stub._resolveEntryPoint[n++] = 0x927D2D29; - - //;; ResolveCacheElem e = this._cacheAddress + i - // - //ldr x13, [x10 + DATA_OFFSET(_cacheAddress)] - offset=DATA_OFFSET(_cacheAddress); - _ASSERTE(offset >=0 && offset%8 == 0); - 
_stub._resolveEntryPoint[n++] = 0xF940014D | offset<<7; - - //ldr x9, [x13, x9] ;; x9 = e = this._cacheAddress + i - _stub._resolveEntryPoint[n++] = 0xF86969A9 ; - - //ldr x15, [x10 + DATA_OFFSET(_token)] - offset = DATA_OFFSET(_token); - _ASSERTE(offset >=0 && offset%8 == 0); - _stub._resolveEntryPoint[n++] = 0xF940014F | offset<<7; - - //;; Check mt == e.pMT - // - // - //ldr x13, [x9, #offsetof(ResolveCacheElem, pMT) ] - offset = offsetof(ResolveCacheElem, pMT) & 0x000001ff; - _ASSERTE(offset >=0 && offset%8 == 0); - _stub._resolveEntryPoint[n++] = 0xF940012D | offset<<7; - - //cmp x12, x13 - _stub._resolveEntryPoint[n++] = 0xEB0D019F; - - //;; bne nextEntry - //place holder for the above instruction - br_nextEntry[0]=n++; - - //;; Check this._token == e.token - //x15: this._token - // - //ldr x13, [x9, #offsetof(ResolveCacheElem, token) ] - offset = offsetof(ResolveCacheElem, token) & 0xffffffff; - _ASSERTE(offset >=0 && offset%8 == 0); - _stub._resolveEntryPoint[n++] = 0xF940012D | offset<<7; - - //cmp x15, x13 - _stub._resolveEntryPoint[n++] = 0xEB0D01FF; - - //;; bne nextEntry - //place holder for the above instruction - br_nextEntry[1]=n++; - - //ldr x12, [x9, #offsetof(ResolveCacheElem, target) ] - offset = offsetof(ResolveCacheElem, target) & 0xffffffff; - _ASSERTE(offset >=0 && offset%8 == 0); - _stub._resolveEntryPoint[n++] = 0xF940012C | offset<<7; - - // ;; Branch to e.target - // br x12 - _stub._resolveEntryPoint[n++] = 0xD61F0180; - - //;;nextEntry: - //back patching the call sites as now we know the offset to nextEntry - //bne #offset - for(auto i: br_nextEntry) - { - _stub._resolveEntryPoint[i] = 0x54000001 | ((((n-i)*sizeof(DWORD))<<3) & 0x3FFFFFF); - } - - _ASSERTE(n == ResolveStub::resolveEntryPointLen); - _ASSERTE(_stub._resolveEntryPoint + n == _stub._slowEntryPoint); - - // ResolveStub._slowEntryPoint(x0:MethodToken, [x1..x7 and x8], x11:IndirectionCellAndFlags) - // { - // x12 = this._token; - // this._resolveWorkerTarget(x0, [x1..x7 and x8], x9, x11, x12); - // } - -#undef PC_REL_OFFSET -#define PC_REL_OFFSET(_field) (DWORD)((offsetof(ResolveStub, _field) - (offsetof(ResolveStub, _slowEntryPoint) + sizeof(*ResolveStub::_slowEntryPoint) * n)) & 0xffffffff ) - n = 0; - // ;;slowEntryPoint: - // ;;fall through to the slow case - - //;;adr x10, #Dataregionbase - _stub._slowEntryPoint[n] = 0x1000000A | ARM64EncodeHelpers::ADR_PATCH(PC_REL_OFFSET(Dataregionbase)); - n++; - - //ldr x12, [x10 , DATA_OFFSET(_token)] - offset=DATA_OFFSET(_token); - _ASSERTE(offset >=0 && offset%8 == 0); - _stub._slowEntryPoint[n++] = 0xF940014C | (offset<<7); - - // - //ldr x13, [x10 , DATA_OFFSET(_resolveWorkerTarget)] - offset=DATA_OFFSET(_resolveWorkerTarget); - _ASSERTE(offset >=0 && offset%8 == 0); - _stub._slowEntryPoint[n++] = 0xF940014d | (offset<<7); - - // br x13 - _stub._slowEntryPoint[n++] = 0xD61F01A0; - - _ASSERTE(n == ResolveStub::slowEntryPointLen); - // ResolveStub._failEntryPoint(x0:MethodToken, x1,.., x7 and x8, x11:IndirectionCellAndFlags) - // { - // if(--*(this._pCounter) < 0) x11 = x11 | SDF_ResolveBackPatch; - // this._resolveEntryPoint(x0, [x1..x7 and x8]); - // } - -#undef PC_REL_OFFSET //NOTE Offset can be negative -#define PC_REL_OFFSET(_field) (DWORD)((offsetof(ResolveStub, _field) - (offsetof(ResolveStub, _failEntryPoint) + sizeof(*ResolveStub::_failEntryPoint) * n)) & 0xffffffff) - n = 0; - - //;;failEntryPoint - //;;adr x10, #Dataregionbase - _stub._failEntryPoint[n] = 0x1000000A | ARM64EncodeHelpers::ADR_PATCH(PC_REL_OFFSET(Dataregionbase)); - n++; - - // - 
//ldr x13, [x10] - offset=DATA_OFFSET(_pCounter); - _ASSERTE(offset >=0 && offset%8 == 0); - _stub._failEntryPoint[n++] = 0xF940014D | offset<<7; - - //ldr w9, [x13] - _stub._failEntryPoint[n++] = 0xB94001A9; - //subs w9,w9,#1 - _stub._failEntryPoint[n++] = 0x71000529; - //str w9, [x13] - _stub._failEntryPoint[n++] = 0xB90001A9; - - //;;bge resolveEntryPoint - offset = PC_REL_OFFSET(_resolveEntryPoint); - _stub._failEntryPoint[n++] = 0x5400000A | ((offset <<3)& 0x00FFFFF0) ; - - // ;; orr x11, x11, SDF_ResolveBackPatch - // orr x11, x11, #1 - _ASSERTE(SDF_ResolveBackPatch == 0x1); - _stub._failEntryPoint[n++] = 0xB240016B; - - //;;b resolveEntryPoint: - offset = PC_REL_OFFSET(_resolveEntryPoint); - _stub._failEntryPoint[n++] = 0x14000000 | ((offset>>2) & 0x3FFFFFF); - - _ASSERTE(n == ResolveStub::failEntryPointLen); - _stub._pCounter = counterAddr; - _stub._hashedToken = hashedToken << LOG2_PTRSIZE; - _stub._cacheAddress = (size_t) cacheAddr; - _stub._token = dispatchToken; - _stub._resolveWorkerTarget = resolveWorkerTarget; - - _ASSERTE(resolveWorkerTarget == (PCODE)ResolveWorkerChainLookupAsmStub); - _ASSERTE(patcherTarget == NULL); - -#undef DATA_OFFSET -#undef PC_REL_OFFSET -#undef Dataregionbase - } - - ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static ResolveHolder* FromFailEntry(PCODE failEntry); - static ResolveHolder* FromResolveEntry(PCODE resolveEntry); -private: - ResolveStub _stub; -}; - - /*VTableCallStub************************************************************************************** These are jump stubs that perform a vtable-base virtual call. These stubs assume that an object is placed in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the @@ -469,6 +74,7 @@ struct VTableCallHolder VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } + size_t size() { return stub()->size(); } static size_t GetHolderSize(unsigned slot) { STATIC_CONTRACT_WRAPPER; @@ -490,19 +96,6 @@ struct VTableCallHolder #ifdef DECLARE_DATA #ifndef DACCESS_COMPILE -ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) -{ - LIMITED_METHOD_CONTRACT; - ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); - return resolveHolder; -} - -ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) -{ - LIMITED_METHOD_CONTRACT; - ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); - return resolveHolder; -} void VTableCallHolder::Initialize(unsigned slot) { @@ -599,19 +192,19 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s DWORD firstDword = *((DWORD*) pInstr); - if (firstDword == DISPATCH_STUB_FIRST_DWORD) // assembly of first instruction of DispatchStub : ldr x13, [x0] + if (firstDword == DISPATCH_STUB_FIRST_DWORD) { stubKind = SK_DISPATCH; } - else if (firstDword == RESOLVE_STUB_FIRST_DWORD) // assembly of first instruction of ResolveStub : ldr x12, [x0,#Object.m_pMethTab ] + else if (firstDword == RESOLVE_STUB_FIRST_DWORD) { stubKind = SK_RESOLVE; } - else if (firstDword == VTABLECALL_STUB_FIRST_DWORD) // assembly of first instruction of VTableCallStub : ldr x9, [x0] + else if (firstDword == VTABLECALL_STUB_FIRST_DWORD) { stubKind = SK_VTABLECALL; } - else if (firstDword == 0x10000089) // assembly of first instruction of LookupStub : adr x9, _resolveWorkerTarget + else 
if ((firstDword & LOOKUP_STUB_FIRST_DWORD_MASK) == LOOKUP_STUB_FIRST_DWORD) // The instruction depends on page size, so we mask out the dependent part { stubKind = SK_LOOKUP; } diff --git a/src/coreclr/vm/callcounting.cpp b/src/coreclr/vm/callcounting.cpp index 5a3561c284da8..89da7e13410d1 100644 --- a/src/coreclr/vm/callcounting.cpp +++ b/src/coreclr/vm/callcounting.cpp @@ -118,6 +118,7 @@ PTR_CallCount CallCountingManager::CallCountingInfo::GetRemainingCallCountCell() { WRAPPER_NO_CONTRACT; _ASSERTE(m_stage != Stage::Disabled); + //_ASSERTE(m_callCountingStub != nullptr); return &m_remainingCallCount; } @@ -257,49 +258,93 @@ const CallCountingStub *CallCountingManager::CallCountingStubAllocator::Allocate heap = AllocateHeap(); } - SIZE_T sizeInBytes; - const CallCountingStub *stub; - do + SIZE_T sizeInBytes = sizeof(CallCountingStub); + AllocMemHolder allocationAddressHolder(heap->AllocAlignedMem(sizeInBytes, 1)); + CallCountingStub *stub = (CallCountingStub*)(void*)allocationAddressHolder; + allocationAddressHolder.SuppressRelease(); + stub->Initialize(targetForMethod, remainingCallCountCell); + + return stub; +} + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + #define ENUM_PAGE_SIZE(size) \ + extern "C" void CallCountingStubCode##size(); \ + extern "C" void CallCountingStubCode##size##_End(); + + ENUM_PAGE_SIZES + #undef ENUM_PAGE_SIZE +#else +extern "C" void CallCountingStubCode(); +extern "C" void CallCountingStubCode_End(); +#endif + +#ifdef TARGET_X86 +extern "C" size_t CallCountingStubCode_RemainingCallCountCell_Offset; +extern "C" size_t CallCountingStubCode_TargetForMethod_Offset; +extern "C" size_t CallCountingStubCode_TargetForThresholdReached_Offset; + +#define SYMBOL_VALUE(name) ((size_t)&name) + +#endif + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) +void (*CallCountingStub::CallCountingStubCode)(); +#endif + +#ifndef DACCESS_COMPILE + +void CallCountingStub::StaticInitialize() +{ +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + int pageSize = GetOsPageSize(); + #define ENUM_PAGE_SIZE(size) \ + case size: \ + CallCountingStubCode = CallCountingStubCode##size; \ + _ASSERTE(((BYTE*)CallCountingStubCode##size##_End - (BYTE*)CallCountingStubCode##size) <= CallCountingStub::CodeSize); \ + break; + + switch (pageSize) { - bool forceLongStub = false; - #if defined(_DEBUG) && defined(TARGET_AMD64) - if (s_callCountingStubCount % 2 == 0) - { - forceLongStub = true; - } - #endif + ENUM_PAGE_SIZES + default: + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); + } + #undef ENUM_PAGE_SIZE +#else + _ASSERTE(((BYTE*)CallCountingStubCode_End - (BYTE*)CallCountingStubCode) <= CallCountingStub::CodeSize); +#endif +} - if (!forceLongStub) - { - sizeInBytes = sizeof(CallCountingStubShort); - AllocMemHolder allocationAddressHolder(heap->AllocAlignedMem(sizeInBytes, CallCountingStub::Alignment)); - #ifdef TARGET_AMD64 - if (CallCountingStubShort::CanUseFor(allocationAddressHolder, targetForMethod)) - #endif - { - ExecutableWriterHolder writerHolder(allocationAddressHolder, sizeInBytes); - new(writerHolder.GetRW()) CallCountingStubShort((CallCountingStubShort*)(void*)allocationAddressHolder, remainingCallCountCell, targetForMethod); - stub = (CallCountingStub*)(void*)allocationAddressHolder; - allocationAddressHolder.SuppressRelease(); - break; - } - } +#endif // DACCESS_COMPILE - #ifdef TARGET_AMD64 - sizeInBytes = sizeof(CallCountingStubLong); - void *allocationAddress = (void *)heap->AllocAlignedMem(sizeInBytes, 
CallCountingStub::Alignment); - ExecutableWriterHolder writerHolder(allocationAddress, sizeInBytes); - new(writerHolder.GetRW()) CallCountingStubLong(remainingCallCountCell, targetForMethod); - stub = (CallCountingStub*)allocationAddress; - #else - UNREACHABLE(); - #endif - } while (false); +void CallCountingStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) +{ + int pageSize = GetOsPageSize(); - ClrFlushInstructionCache(stub, sizeInBytes); - return stub; +#ifdef TARGET_X86 + int totalCodeSize = (pageSize / CallCountingStub::CodeSize) * CallCountingStub::CodeSize; + + for (int i = 0; i < totalCodeSize; i += CallCountingStub::CodeSize) + { + memcpy(pageBase + i, (const void*)CallCountingStubCode, CallCountingStub::CodeSize); + + // Set absolute addresses of the slots in the stub + BYTE* pCounterSlot = pageBaseRX + i + pageSize + offsetof(CallCountingStubData, RemainingCallCountCell); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_RemainingCallCountCell_Offset)) = pCounterSlot; + + BYTE* pTargetSlot = pageBaseRX + i + pageSize + offsetof(CallCountingStubData, TargetForMethod); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_TargetForMethod_Offset)) = pTargetSlot; + + BYTE* pCountReachedZeroSlot = pageBaseRX + i + pageSize + offsetof(CallCountingStubData, TargetForThresholdReached); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_TargetForThresholdReached_Offset)) = pCountReachedZeroSlot; + } +#else // TARGET_X86 + FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)CallCountingStubCode), CallCountingStub::CodeSize, pageSize); +#endif } + NOINLINE LoaderHeap *CallCountingManager::CallCountingStubAllocator::AllocateHeap() { CONTRACTL @@ -312,7 +357,7 @@ NOINLINE LoaderHeap *CallCountingManager::CallCountingStubAllocator::AllocateHea _ASSERTE(m_heap == nullptr); - LoaderHeap *heap = new LoaderHeap(0, 0, &m_heapRangeList, true /* fMakeExecutable */, true /* fUnlocked */); + LoaderHeap *heap = new LoaderHeap(0, 0, &m_heapRangeList, UnlockedLoaderHeap::HeapKind::Interleaved, true /* fUnlocked */, CallCountingStub::GenerateCodePage, CallCountingStub::CodeSize); m_heap = heap; return heap; } @@ -437,6 +482,7 @@ void CallCountingManager::StaticInitialize() { WRAPPER_NO_CONTRACT; s_callCountingManagers = PTR_CallCountingManagerHash(new CallCountingManagerHash()); + CallCountingStub::StaticInitialize(); } #endif diff --git a/src/coreclr/vm/callcounting.h b/src/coreclr/vm/callcounting.h index fa0345238c870..1e28c3448e47b 100644 --- a/src/coreclr/vm/callcounting.h +++ b/src/coreclr/vm/callcounting.h @@ -65,6 +65,127 @@ Miscellaneous T(const T &) = delete; \ T &operator =(const T &) = delete +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Call counting + +typedef UINT16 CallCount; +typedef DPTR(CallCount) PTR_CallCount; + +//////////////////////////////////////////////////////////////// +// CallCountingStub + +class CallCountingStub; +typedef DPTR(const CallCountingStub) PTR_CallCountingStub; + +struct CallCountingStubData +{ + PTR_CallCount RemainingCallCountCell; + PCODE TargetForMethod; + PCODE TargetForThresholdReached; +}; + +typedef DPTR(CallCountingStubData) PTR_CallCountingStubData; + +class CallCountingStub +{ +public: +#if defined(TARGET_AMD64) + static const int CodeSize = 24; + static const int StubIdentifyingTokenOffset = 24; +#elif defined(TARGET_X86) + static const int CodeSize = 24; + static const int StubIdentifyingTokenOffset = 22; +#elif 
defined(TARGET_ARM64) + static const int CodeSize = 40; + static const int StubIdentifyingTokenOffset = 0; +#elif defined(TARGET_ARM) + static const int CodeSize = 32; + static const int StubIdentifyingTokenOffset = 0; +#endif + +private: + UINT8 m_code[CodeSize]; + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + static void (*CallCountingStubCode)(); +#endif + +public: + static const SIZE_T Alignment = sizeof(void *); + +protected: + PTR_CallCountingStubData GetData() const + { + return dac_cast(dac_cast(this) + GetOsPageSize()); + } + +#ifndef DACCESS_COMPILE + static const PCODE TargetForThresholdReached; + + CallCountingStub() = default; + +public: + static const CallCountingStub *From(TADDR stubIdentifyingToken); + + PCODE GetEntryPoint() const + { + WRAPPER_NO_CONTRACT; + return PINSTRToPCODE((TADDR)this); + } +#endif // !DACCESS_COMPILE + +public: + +#ifndef DACCESS_COMPILE + void Initialize(PCODE targetForMethod, CallCount* remainingCallCountCell) + { + PTR_CallCountingStubData pStubData = GetData(); + pStubData->RemainingCallCountCell = remainingCallCountCell; + pStubData->TargetForMethod = targetForMethod; + pStubData->TargetForThresholdReached = CallCountingStub::TargetForThresholdReached; + } + + static void StaticInitialize(); +#endif // !DACCESS_COMPILE + + static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); + + PTR_CallCount GetRemainingCallCountCell() const; + PCODE GetTargetForMethod() const; + +protected: + + DISABLE_COPY(CallCountingStub); +}; + +//////////////////////////////////////////////////////////////// +// CallCountingStub definitions + +#ifndef DACCESS_COMPILE +inline const CallCountingStub *CallCountingStub::From(TADDR stubIdentifyingToken) +{ + WRAPPER_NO_CONTRACT; + _ASSERTE(stubIdentifyingToken != NULL); + + const CallCountingStub *stub = (const CallCountingStub *)(stubIdentifyingToken - StubIdentifyingTokenOffset); + + _ASSERTE(IS_ALIGNED(stub, Alignment)); + return stub; +} +#endif + +inline PTR_CallCount CallCountingStub::GetRemainingCallCountCell() const +{ + WRAPPER_NO_CONTRACT; + return GetData()->RemainingCallCountCell; +} + +inline PCODE CallCountingStub::GetTargetForMethod() const +{ + WRAPPER_NO_CONTRACT; + return GetData()->TargetForMethod; +} + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // CallCountingManager diff --git a/src/coreclr/vm/ceeload.cpp b/src/coreclr/vm/ceeload.cpp index c48f2fcb596cb..4d016d3dd6a4a 100644 --- a/src/coreclr/vm/ceeload.cpp +++ b/src/coreclr/vm/ceeload.cpp @@ -4473,7 +4473,7 @@ LoaderHeap *Module::GetThunkHeap() LoaderHeap *pNewHeap = new LoaderHeap(VIRTUAL_ALLOC_RESERVE_GRANULARITY, // DWORD dwReserveBlockSize 0, // DWORD dwCommitBlockSize ThunkHeapStubManager::g_pManager->GetRangeList(), - TRUE); // BOOL fMakeExecutable + UnlockedLoaderHeap::HeapKind::Executable); if (FastInterlockCompareExchangePointer(&m_pThunkHeap, pNewHeap, 0) != 0) { diff --git a/src/coreclr/vm/ceemain.cpp b/src/coreclr/vm/ceemain.cpp index 23e5a77eaa422..8404413beb345 100644 --- a/src/coreclr/vm/ceemain.cpp +++ b/src/coreclr/vm/ceemain.cpp @@ -672,6 +672,19 @@ void EEStartupHelper() CallCountingManager::StaticInitialize(); OnStackReplacementManager::StaticInitialize(); +#ifdef TARGET_UNIX + ExecutableAllocator::InitPreferredRange(); +#else + { + // Record coreclr.dll geometry + PEDecoder pe(GetClrModuleBase()); + + g_runtimeLoadedBaseAddress = (SIZE_T)pe.GetBase(); + g_runtimeVirtualSize = (SIZE_T)pe.GetVirtualSize(); + 
ExecutableAllocator::InitLazyPreferredRange(g_runtimeLoadedBaseAddress, g_runtimeVirtualSize, GetRandomInt(64)); + } +#endif // !TARGET_UNIX + InitThreadManager(); STRESS_LOG0(LF_STARTUP, LL_ALWAYS, "Returned successfully from InitThreadManager"); @@ -807,20 +820,6 @@ void EEStartupHelper() StubManager::InitializeStubManagers(); -#ifdef TARGET_UNIX - ExecutableAllocator::InitPreferredRange(); -#else - { - // Record coreclr.dll geometry - PEDecoder pe(GetClrModuleBase()); - - g_runtimeLoadedBaseAddress = (SIZE_T)pe.GetBase(); - g_runtimeVirtualSize = (SIZE_T)pe.GetVirtualSize(); - ExecutableAllocator::InitLazyPreferredRange(g_runtimeLoadedBaseAddress, g_runtimeVirtualSize, GetRandomInt(64)); - } -#endif // !TARGET_UNIX - - // Set up the cor handle map. This map is used to load assemblies in // memory instead of using the normal system load PEImage::Startup(); @@ -831,7 +830,8 @@ void EEStartupHelper() Stub::Init(); StubLinkerCPU::Init(); - + StubPrecode::StaticInitialize(); + FixupPrecode::StaticInitialize(); InitializeGarbageCollector(); diff --git a/src/coreclr/vm/cgensys.h b/src/coreclr/vm/cgensys.h index ad02efe70d13d..f66614a63d25f 100644 --- a/src/coreclr/vm/cgensys.h +++ b/src/coreclr/vm/cgensys.h @@ -119,12 +119,6 @@ inline bool TargetHasAVXSupport() return false; } - -#ifndef DACCESS_COMPILE -// Given an address in a slot, figure out if the prestub will be called -BOOL DoesSlotCallPrestub(PCODE pCode); -#endif - #ifdef DACCESS_COMPILE // Used by dac/strike to make sense of non-jit/non-jit-helper call targets diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp index 633c135b60a95..668364c00e71f 100644 --- a/src/coreclr/vm/codeman.cpp +++ b/src/coreclr/vm/codeman.cpp @@ -3167,7 +3167,7 @@ JumpStubBlockHeader * EEJitManager::allocJumpStubBlock(MethodDesc* pMD, DWORD n requestInfo.setThrowOnOutOfMemoryWithinRange(throwOnOutOfMemoryWithinRange); TADDR mem; - ExecutableWriterHolder blockWriterHolder; + ExecutableWriterHolderNoLog blockWriterHolder; // Scope the lock { @@ -3187,7 +3187,7 @@ JumpStubBlockHeader * EEJitManager::allocJumpStubBlock(MethodDesc* pMD, DWORD n NibbleMapSetUnlocked(pCodeHeap, mem, TRUE); - blockWriterHolder = ExecutableWriterHolder((JumpStubBlockHeader *)mem, sizeof(JumpStubBlockHeader)); + blockWriterHolder.AssignExecutableWriterHolder((JumpStubBlockHeader *)mem, sizeof(JumpStubBlockHeader)); _ASSERTE(IS_ALIGNED(blockWriterHolder.GetRW(), CODE_SIZE_ALIGN)); } @@ -5253,7 +5253,7 @@ PCODE ExecutionManager::getNextJumpStub(MethodDesc* pMD, PCODE target, JumpStubBlockHeader ** ppHead = &(pJumpStubCache->m_pBlocks); JumpStubBlockHeader * curBlock = *ppHead; - ExecutableWriterHolder curBlockWriterHolder; + ExecutableWriterHolderNoLog curBlockWriterHolder; // allocate a new jumpstub from 'curBlock' if it is not fully allocated // @@ -5269,7 +5269,7 @@ PCODE ExecutionManager::getNextJumpStub(MethodDesc* pMD, PCODE target, { // We will update curBlock->m_used at "DONE" size_t blockSize = sizeof(JumpStubBlockHeader) + (size_t) numJumpStubs * BACK_TO_BACK_JUMP_ALLOCATE_SIZE; - curBlockWriterHolder = ExecutableWriterHolder(curBlock, blockSize); + curBlockWriterHolder.AssignExecutableWriterHolder(curBlock, blockSize); jumpStubRW = (BYTE *)((TADDR)jumpStub + (TADDR)curBlockWriterHolder.GetRW() - (TADDR)curBlock); goto DONE; } @@ -5309,7 +5309,7 @@ PCODE ExecutionManager::getNextJumpStub(MethodDesc* pMD, PCODE target, RETURN(NULL); } - curBlockWriterHolder = ExecutableWriterHolder(curBlock, sizeof(JumpStubBlockHeader) + ((size_t) (curBlock->m_used + 1) * 
BACK_TO_BACK_JUMP_ALLOCATE_SIZE)); + curBlockWriterHolder.AssignExecutableWriterHolder(curBlock, sizeof(JumpStubBlockHeader) + ((size_t) (curBlock->m_used + 1) * BACK_TO_BACK_JUMP_ALLOCATE_SIZE)); jumpStubRW = (BYTE *) curBlockWriterHolder.GetRW() + sizeof(JumpStubBlockHeader) + ((size_t) curBlock->m_used * BACK_TO_BACK_JUMP_ALLOCATE_SIZE); jumpStub = (BYTE *) curBlock + sizeof(JumpStubBlockHeader) + ((size_t) curBlock->m_used * BACK_TO_BACK_JUMP_ALLOCATE_SIZE); diff --git a/src/coreclr/vm/codeman.h b/src/coreclr/vm/codeman.h index e2afd8ce64540..6cb9edf8d429b 100644 --- a/src/coreclr/vm/codeman.h +++ b/src/coreclr/vm/codeman.h @@ -1420,6 +1420,13 @@ class ExecutionManager static unsigned m_LCG_JumpStubBlockFullCount; public: + + static void DumpExecutionManagerUsage() + { + fprintf(stderr, "JumpStub usage count:\n"); + fprintf(stderr, "Normal: %u, LCG: %u\n", m_normal_JumpStubLookup, m_LCG_JumpStubLookup); + } + struct JumpStubCache { JumpStubCache() diff --git a/src/coreclr/vm/comcallablewrapper.cpp b/src/coreclr/vm/comcallablewrapper.cpp index d9fc2fe0ac4d5..938a80d123b17 100644 --- a/src/coreclr/vm/comcallablewrapper.cpp +++ b/src/coreclr/vm/comcallablewrapper.cpp @@ -3299,7 +3299,7 @@ void ComMethodTable::LayOutClassMethodTable() if (!m_pMT->HasGenericClassInstantiationInHierarchy()) { - ExecutableWriterHolder methodDescMemoryWriteableHolder; + ExecutableWriterHolderNoLog methodDescMemoryWriteableHolder; // // Allocate method desc's for the rest of the slots. // @@ -3310,7 +3310,7 @@ void ComMethodTable::LayOutClassMethodTable() pMDMemoryPtr = m_pMT->GetLoaderAllocator()->GetStubHeap()->AllocMem(S_SIZE_T(cbAlloc + sizeof(UINT_PTR))); pMethodDescMemory = pMDMemoryPtr; - methodDescMemoryWriteableHolder = ExecutableWriterHolder(pMethodDescMemory, cbAlloc + sizeof(UINT_PTR)); + methodDescMemoryWriteableHolder.AssignExecutableWriterHolder(pMethodDescMemory, cbAlloc + sizeof(UINT_PTR)); writeableOffset = methodDescMemoryWriteableHolder.GetRW() - pMethodDescMemory; // initialize the method desc memory to zero diff --git a/src/coreclr/vm/common.h b/src/coreclr/vm/common.h index eb37c0a0cac94..ebe1a981e2636 100644 --- a/src/coreclr/vm/common.h +++ b/src/coreclr/vm/common.h @@ -417,6 +417,30 @@ extern DummyGlobalContract ___contract; #endif // defined(_DEBUG) +#define ENUM_PAGE_SIZES \ + ENUM_PAGE_SIZE(4096) \ + ENUM_PAGE_SIZE(8192) \ + ENUM_PAGE_SIZE(16384) \ + ENUM_PAGE_SIZE(32768) \ + ENUM_PAGE_SIZE(65536) + +inline void FillStubCodePage(BYTE* pageBase, const void* code, int codeSize, int pageSize) +{ + int totalCodeSize = (pageSize / codeSize) * codeSize; + + memcpy(pageBase, code, codeSize); + + int i; + for (i = codeSize; i < pageSize / 2; i *= 2) + { + memcpy(pageBase + i, pageBase, i); + } + + if (i != totalCodeSize) + { + memcpy(pageBase + i, pageBase, totalCodeSize - i); + } +} // All files get to see all of these .inl files to make sure all files // get the benefit of inlining. 
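The FillStubCodePage helper added to common.h above fills an interleaved code page with back-to-back copies of a stub template using only a logarithmic number of memcpy calls: it keeps doubling the already-copied prefix while another full copy of it still fits, then finishes the remainder with a single tail copy. A standalone sketch of the same doubling strategy is shown below; the 24-byte pattern and 4096-byte buffer are illustrative stand-ins, not the runtime's actual stub or OS page sizes.

// Standalone sketch (illustrative sizes): fill a buffer with whole copies of a
// pattern by doubling the initialized prefix, then copying the remaining tail.
#include <cassert>
#include <cstring>
#include <vector>

static void FillWithPattern(unsigned char* dst, const unsigned char* pattern, int patternSize, int bufferSize)
{
    // bufferSize is assumed even (OS page sizes are powers of two).
    assert(patternSize > 0 && bufferSize >= patternSize);
    int total = (bufferSize / patternSize) * patternSize;   // whole copies only

    std::memcpy(dst, pattern, patternSize);                 // first copy
    int filled = patternSize;
    while (filled < bufferSize / 2)                         // double while another full prefix fits
    {
        std::memcpy(dst + filled, dst, filled);
        filled *= 2;
    }
    if (filled != total)                                    // tail is at most 'filled' bytes
        std::memcpy(dst + filled, dst, total - filled);
}

int main()
{
    unsigned char stubTemplate[24];                         // stand-in for a stub's code bytes
    for (int i = 0; i < 24; ++i)
        stubTemplate[i] = (unsigned char)i;

    std::vector<unsigned char> page(4096);                  // stand-in for one code page
    FillWithPattern(page.data(), stubTemplate, sizeof(stubTemplate), (int)page.size());

    for (int i = 0; i < (4096 / 24) * 24; ++i)              // every slot holds an identical copy
        assert(page[i] == (unsigned char)(i % 24));
    return 0;
}

The doubling keeps the number of copies small even for 64 KB pages populated with small stubs, which matters because the page is generated every time a new interleaved code/data page pair is committed.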
diff --git a/src/coreclr/vm/corhost.cpp b/src/coreclr/vm/corhost.cpp index 861ec15be77a8..efb35656b5259 100644 --- a/src/coreclr/vm/corhost.cpp +++ b/src/coreclr/vm/corhost.cpp @@ -394,6 +394,11 @@ HRESULT CorHost2::ExecuteAssembly(DWORD dwAppDomainId, UNINSTALL_UNWIND_AND_CONTINUE_HANDLER; UNINSTALL_UNHANDLED_MANAGED_EXCEPTION_TRAP; +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS + ExecutableAllocator::DumpHolderUsage(); + ExecutionManager::DumpExecutionManagerUsage(); +#endif + ErrExit: return hr; diff --git a/src/coreclr/vm/dynamicmethod.cpp b/src/coreclr/vm/dynamicmethod.cpp index a5a9ee44ba8eb..ec37942871689 100644 --- a/src/coreclr/vm/dynamicmethod.cpp +++ b/src/coreclr/vm/dynamicmethod.cpp @@ -503,10 +503,10 @@ HostCodeHeap::TrackAllocation* HostCodeHeap::AllocFromFreeList(size_t header, si // found a block LOG((LF_BCL, LL_INFO100, "Level2 - CodeHeap [0x%p] - Block found, size 0x%X\n", this, pCurrent->size)); - ExecutableWriterHolder previousWriterHolder; + ExecutableWriterHolderNoLog previousWriterHolder; if (pPrevious) { - previousWriterHolder = ExecutableWriterHolder(pPrevious, sizeof(TrackAllocation)); + previousWriterHolder.AssignExecutableWriterHolder(pPrevious, sizeof(TrackAllocation)); } ExecutableWriterHolder currentWriterHolder(pCurrent, sizeof(TrackAllocation)); @@ -585,11 +585,11 @@ void HostCodeHeap::AddToFreeList(TrackAllocation *pBlockToInsert, TrackAllocatio { // found the point of insertion pBlockToInsertRW->pNext = pCurrent; - ExecutableWriterHolder previousWriterHolder; + ExecutableWriterHolderNoLog previousWriterHolder; if (pPrevious) { - previousWriterHolder = ExecutableWriterHolder(pPrevious, sizeof(TrackAllocation)); + previousWriterHolder.AssignExecutableWriterHolder(pPrevious, sizeof(TrackAllocation)); previousWriterHolder.GetRW()->pNext = pBlockToInsert; LOG((LF_BCL, LL_INFO100, "Level2 - CodeHeap [0x%p] - Insert block [%p, 0x%X] -> [%p, 0x%X] -> [%p, 0x%X]\n", this, pPrevious, pPrevious->size, diff --git a/src/coreclr/vm/gccover.cpp b/src/coreclr/vm/gccover.cpp index 8c5a050130685..9499b3efcd65b 100644 --- a/src/coreclr/vm/gccover.cpp +++ b/src/coreclr/vm/gccover.cpp @@ -67,6 +67,26 @@ static MethodDesc* getTargetMethodDesc(PCODE target) return MethodDesc::GetMethodDescFromStubAddr(target, TRUE); } + if (PrecodeStubManager::g_pManager->GetStubPrecodeRangeList()->IsInRange(target)) + { + return (MethodDesc*)((StubPrecode*)PCODEToPINSTR(target))->GetMethodDesc(); + } + + if (PrecodeStubManager::g_pManager->GetFixupPrecodeRangeList()->IsInRange(target)) + { + // If the target slot points to the fixup part of the stub, the actual + // stub starts FixupPrecode::FixupCodeOffset bytes below the target, + // so we need to compensate for it. 
+ target -= FixupPrecode::FixupCodeOffset; + if (!FixupPrecode::IsFixupPrecodeByASM(target)) + { + _ASSERTE(FALSE); // We should never get other precode type here + return nullptr; + } + + return (MethodDesc*)((FixupPrecode*)PCODEToPINSTR(target))->GetMethodDesc(); + } + return nullptr; } @@ -418,7 +438,7 @@ void GCCoverageInfo::SprinkleBreakpoints( #if (defined(TARGET_X86) || defined(TARGET_AMD64)) && USE_DISASSEMBLER BYTE * codeStart = (BYTE *)pCode; - ExecutableWriterHolder codeWriterHolder; + ExecutableWriterHolderNoLog codeWriterHolder; size_t writeableOffset; memcpy(saveAddr, codeStart, codeSize); @@ -432,7 +452,7 @@ void GCCoverageInfo::SprinkleBreakpoints( } else { - codeWriterHolder = ExecutableWriterHolder(codeStart, codeSize); + codeWriterHolder.AssignExecutableWriterHolder(codeStart, codeSize); writeableOffset = codeWriterHolder.GetRW() - codeStart; } diff --git a/src/coreclr/vm/i386/AsmMacros.inc b/src/coreclr/vm/i386/AsmMacros.inc index 6b9eb6eb3fae5..ac77064f0da5c 100644 --- a/src/coreclr/vm/i386/AsmMacros.inc +++ b/src/coreclr/vm/i386/AsmMacros.inc @@ -21,3 +21,26 @@ INLINE_GETTHREAD macro destReg, trashReg add trashReg, SECTIONREL gCurrentThreadInfo mov destReg, [trashReg] endm + +LEAF_ENTRY macro functionName + functionName PROC PUBLIC +endm + +LEAF_END macro functionName + functionName ENDP +endm + +LEAF_END_MARKED macro functionName + LOCAL stackArgsSize, bareFunctionName, endMarkerName + stackArgsSize TEXTEQU @SubStr(functionName, @InStr(,functionName, <@>)) + bareFunctionName TEXTEQU @SubStr(functionName, 1, @SizeStr(functionName)-@SizeStr(%stackArgsSize)) + endMarkerName TEXTEQU @CatStr(%bareFunctionName, <_End@0>) + %endMarkerName: + PUBLIC endMarkerName + functionName ENDP +endm + +PATCH_LABEL macro labelName + labelName: + PUBLIC labelName +endm diff --git a/src/coreclr/vm/i386/asmconstants.h b/src/coreclr/vm/i386/asmconstants.h index b24d70302076f..fc01b0cf139a7 100644 --- a/src/coreclr/vm/i386/asmconstants.h +++ b/src/coreclr/vm/i386/asmconstants.h @@ -330,6 +330,62 @@ ASMCONSTANTS_C_ASSERT(ResolveCacheElem__token == offsetof(ResolveCacheElem, to ASMCONSTANTS_C_ASSERT(ResolveCacheElem__target == offsetof(ResolveCacheElem, target)); ASMCONSTANTS_C_ASSERT(ResolveCacheElem__pNext == offsetof(ResolveCacheElem, pNext)); +#define FixupPrecodeData__Target 0x00 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__Target == offsetof(FixupPrecodeData, Target)) +#define FixupPrecodeData__MethodDesc 0x04 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__MethodDesc == offsetof(FixupPrecodeData, MethodDesc)) +#define FixupPrecodeData__PrecodeFixupThunk 0x08 +ASMCONSTANTS_C_ASSERT(FixupPrecodeData__PrecodeFixupThunk == offsetof(FixupPrecodeData, PrecodeFixupThunk)) + +#define StubPrecodeData__Target 0x04 +ASMCONSTANTS_C_ASSERT(StubPrecodeData__Target == offsetof(StubPrecodeData, Target)) +#define StubPrecodeData__MethodDesc 0x00 +ASMCONSTANTS_C_ASSERT(StubPrecodeData__MethodDesc == offsetof(StubPrecodeData, MethodDesc)) + +#define CallCountingStubData__RemainingCallCountCell 0x00 +ASMCONSTANTS_C_ASSERT(CallCountingStubData__RemainingCallCountCell == offsetof(CallCountingStubData, RemainingCallCountCell)) + +#define CallCountingStubData__TargetForMethod 0x04 +ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForMethod == offsetof(CallCountingStubData, TargetForMethod)) + +#define CallCountingStubData__TargetForThresholdReached 0x08 +ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForThresholdReached == offsetof(CallCountingStubData, TargetForThresholdReached)) + +#define 
LookupStubData__DispatchToken 0x00 +ASMCONSTANTS_C_ASSERT(LookupStubData__DispatchToken == offsetof(LookupStubData, DispatchToken)) + +#define LookupStubData__ResolveWorkerTarget 0x04 +ASMCONSTANTS_C_ASSERT(LookupStubData__ResolveWorkerTarget == offsetof(LookupStubData, ResolveWorkerTarget)) + +#define DispatchStubData__ExpectedMT 0x00 +ASMCONSTANTS_C_ASSERT(DispatchStubData__ExpectedMT == offsetof(DispatchStubData, ExpectedMT)) + +#define DispatchStubData__ImplTarget 0x04 +ASMCONSTANTS_C_ASSERT(DispatchStubData__ImplTarget == offsetof(DispatchStubData, ImplTarget)) + +#define DispatchStubData__FailTarget 0x08 +ASMCONSTANTS_C_ASSERT(DispatchStubData__FailTarget == offsetof(DispatchStubData, FailTarget)) + +#define ResolveStubData__HashedToken 0x04 +ASMCONSTANTS_C_ASSERT(ResolveStubData__HashedToken == offsetof(ResolveStubData, HashedToken)) + +#define ResolveStubData__CacheAddress 0x00 +ASMCONSTANTS_C_ASSERT(ResolveStubData__CacheAddress == offsetof(ResolveStubData, CacheAddress)) + +#define ResolveStubData__Token 0x0c +ASMCONSTANTS_C_ASSERT(ResolveStubData__Token == offsetof(ResolveStubData, Token)) + +#define ResolveStubData__Counter 0x08 +ASMCONSTANTS_C_ASSERT(ResolveStubData__Counter == offsetof(ResolveStubData, Counter)) + +#define ResolveStubData__ResolveWorkerTarget 0x10 +ASMCONSTANTS_C_ASSERT(ResolveStubData__ResolveWorkerTarget == offsetof(ResolveStubData, ResolveWorkerTarget)) + +#define ResolveStubData__PatcherTarget 0x14 +ASMCONSTANTS_C_ASSERT(ResolveStubData__PatcherTarget == offsetof(ResolveStubData, PatcherTarget)) + +#define CALL_STUB_CACHE_MASK_ASM 0xfff +ASMCONSTANTS_C_ASSERT(CALL_STUB_CACHE_MASK_ASM == CALL_STUB_CACHE_MASK) #undef ASMCONSTANTS_C_ASSERT #undef ASMCONSTANTS_RUNTIME_ASSERT diff --git a/src/coreclr/vm/i386/asmhelpers.S b/src/coreclr/vm/i386/asmhelpers.S index ee675c838bc56..ffaa33831a85e 100644 --- a/src/coreclr/vm/i386/asmhelpers.S +++ b/src/coreclr/vm/i386/asmhelpers.S @@ -534,26 +534,6 @@ LEAF_ENTRY NDirectImportThunk, _TEXT jmp eax // Jump to DLL target LEAF_END NDirectImportThunk, _TEXT -// ========================================================================== -// The call in fixup precode initally points to this function. -// The pupose of this function is to load the MethodDesc and forward the call the prestub. -LEAF_ENTRY PrecodeFixupThunk, _TEXT - // Pop the return address. It points right after the call instruction in the precode. - pop eax - push esi - push edi - - // Inline computation done by FixupPrecode::GetMethodDesc() - movzx esi, BYTE PTR [eax + 2] // m_PrecodeChunkIndex - movzx edi, BYTE PTR [eax + 1] // m_MethodDescChunkIndex - mov eax, DWORD PTR [eax + esi*8 +3] - lea eax, [eax + edi*4] - - pop edi - pop esi - jmp C_FUNC(ThePreStub) -LEAF_END PrecodeFixupThunk, _TEXT - // // Used to get the current instruction pointer value // diff --git a/src/coreclr/vm/i386/asmhelpers.asm b/src/coreclr/vm/i386/asmhelpers.asm index 9258b7848f39f..c73a6d8da9051 100644 --- a/src/coreclr/vm/i386/asmhelpers.asm +++ b/src/coreclr/vm/i386/asmhelpers.asm @@ -804,27 +804,6 @@ _NDirectImportThunk@0 proc public jmp eax ; Jump to DLL target _NDirectImportThunk@0 endp -;========================================================================== -; The call in fixup precode initally points to this function. -; The pupose of this function is to load the MethodDesc and forward the call the prestub. -_PrecodeFixupThunk@0 proc public - - pop eax ; Pop the return address. It points right after the call instruction in the precode. 
- push esi - push edi - - ; Inline computation done by FixupPrecode::GetMethodDesc() - movzx esi,byte ptr [eax+2] ; m_PrecodeChunkIndex - movzx edi,byte ptr [eax+1] ; m_MethodDescChunkIndex - mov eax,dword ptr [eax+esi*8+3] - lea eax,[eax+edi*4] - - pop edi - pop esi - jmp _ThePreStub@0 - -_PrecodeFixupThunk@0 endp - ; void __stdcall setFPReturn(int fpSize, INT64 retVal) _setFPReturn@12 proc public mov ecx, [esp+4] diff --git a/src/coreclr/vm/i386/cgencpu.h b/src/coreclr/vm/i386/cgencpu.h index ab4fc5b120bda..1cc63b10d8b9d 100644 --- a/src/coreclr/vm/i386/cgencpu.h +++ b/src/coreclr/vm/i386/cgencpu.h @@ -81,7 +81,6 @@ EXTERN_C void SinglecastDelegateInvokeStub(); #define HAS_NDIRECT_IMPORT_PRECODE 1 #define HAS_FIXUP_PRECODE 1 -#define HAS_FIXUP_PRECODE_CHUNKS 1 // ThisPtrRetBufPrecode one is necessary for closed delegates over static methods with return buffer #define HAS_THISPTR_RETBUF_PRECODE 1 @@ -525,210 +524,4 @@ inline BOOL ClrFlushInstructionCache(LPCVOID pCodeAddr, size_t sizeOfCode) #define JIT_NewCrossContext JIT_NewCrossContext #endif // TARGET_UNIX -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Call counting - -#ifdef FEATURE_TIERED_COMPILATION - -#define DISABLE_COPY(T) \ - T(const T &) = delete; \ - T &operator =(const T &) = delete - -typedef UINT16 CallCount; -typedef DPTR(CallCount) PTR_CallCount; - -//////////////////////////////////////////////////////////////// -// CallCountingStub - -class CallCountingStub; -typedef DPTR(const CallCountingStub) PTR_CallCountingStub; - -class CallCountingStub -{ -public: - static const SIZE_T Alignment = sizeof(void *); - -#ifndef DACCESS_COMPILE -protected: - static const PCODE TargetForThresholdReached; - - CallCountingStub() = default; - -public: - static const CallCountingStub *From(TADDR stubIdentifyingToken); - - PCODE GetEntryPoint() const - { - WRAPPER_NO_CONTRACT; - return PINSTRToPCODE((TADDR)this); - } -#endif // !DACCESS_COMPILE - -public: - PTR_CallCount GetRemainingCallCountCell() const; - PCODE GetTargetForMethod() const; - -#ifndef DACCESS_COMPILE -protected: - template static INT_PTR GetRelativeOffset(const T *relRef, PCODE target) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(T) != 0); - static_assert_no_msg(sizeof(T) <= sizeof(void *)); - static_assert_no_msg((sizeof(T) & (sizeof(T) - 1)) == 0); // is a power of 2 - _ASSERTE(relRef != nullptr); - - TADDR targetAddress = PCODEToPINSTR(target); - _ASSERTE(targetAddress != NULL); - return (INT_PTR)targetAddress - (INT_PTR)(relRef + 1); - } -#endif - -protected: - template static PCODE GetTarget(const T *relRef) - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); - _ASSERTE(relRef != nullptr); - - return PINSTRToPCODE((INT_PTR)(relRef + 1) + *relRef); - } - - DISABLE_COPY(CallCountingStub); -}; - -//////////////////////////////////////////////////////////////// -// CallCountingStubShort - -class CallCountingStubShort; -typedef DPTR(const CallCountingStubShort) PTR_CallCountingStubShort; - -#pragma pack(push, 1) -class CallCountingStubShort : public CallCountingStub -{ -private: - const UINT8 m_part0[1]; - CallCount *const m_remainingCallCountCell; - const UINT8 m_part1[5]; - const INT32 m_rel32TargetForMethod; - const UINT8 m_part2[1]; - const INT32 m_rel32TargetForThresholdReached; - const UINT8 m_alignmentPadding[1]; - -#ifndef DACCESS_COMPILE -public: - CallCountingStubShort(CallCountingStubShort* 
stubRX, CallCount *remainingCallCountCell, PCODE targetForMethod) - : m_part0{ 0xb8}, // mov eax, - m_remainingCallCountCell(remainingCallCountCell), // - m_part1{ 0x66, 0xff, 0x08, // dec word ptr [eax] - 0x0f, 0x85}, // jnz - m_rel32TargetForMethod( // - GetRelative32BitOffset( - &stubRX->m_rel32TargetForMethod, - targetForMethod)), - m_part2{ 0xe8}, // call - m_rel32TargetForThresholdReached( // - GetRelative32BitOffset( - &stubRX->m_rel32TargetForThresholdReached, - TargetForThresholdReached)), - // (eip == stub-identifying token) - m_alignmentPadding{ 0xcc} // int 3 - { - WRAPPER_NO_CONTRACT; - static_assert_no_msg(sizeof(CallCountingStubShort) % Alignment == 0); - _ASSERTE(remainingCallCountCell != nullptr); - _ASSERTE(PCODEToPINSTR(targetForMethod) != NULL); - } - -public: - static bool Is(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - return true; - } - - static const CallCountingStubShort *From(TADDR stubIdentifyingToken) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(stubIdentifyingToken)); - _ASSERTE(stubIdentifyingToken % Alignment == offsetof(CallCountingStubShort, m_alignmentPadding[0]) % Alignment); - - const CallCountingStubShort *stub = - (const CallCountingStubShort *)(stubIdentifyingToken - offsetof(CallCountingStubShort, m_alignmentPadding[0])); - _ASSERTE(IS_ALIGNED(stub, Alignment)); - return stub; - } -#endif // !DACCESS_COMPILE - -public: - static bool Is(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - return true; - } - - static PTR_CallCountingStubShort From(PTR_CallCountingStub callCountingStub) - { - WRAPPER_NO_CONTRACT; - _ASSERTE(Is(callCountingStub)); - - return dac_cast(callCountingStub); - } - - PCODE GetTargetForMethod() const - { - WRAPPER_NO_CONTRACT; - return GetTarget(&m_rel32TargetForMethod); - } - -#ifndef DACCESS_COMPILE -private: - static INT32 GetRelative32BitOffset(const INT32 *rel32Ref, PCODE target) - { - WRAPPER_NO_CONTRACT; - - INT_PTR relativeOffset = GetRelativeOffset(rel32Ref, target); - _ASSERTE((INT32)relativeOffset == relativeOffset); - return (INT32)relativeOffset; - } -#endif - - friend CallCountingStub; - DISABLE_COPY(CallCountingStubShort); -}; -#pragma pack(pop) - -//////////////////////////////////////////////////////////////// -// CallCountingStub definitions - -#ifndef DACCESS_COMPILE -inline const CallCountingStub *CallCountingStub::From(TADDR stubIdentifyingToken) -{ - WRAPPER_NO_CONTRACT; - _ASSERTE(stubIdentifyingToken != NULL); - - return CallCountingStubShort::From(stubIdentifyingToken); -} -#endif - -inline PTR_CallCount CallCountingStub::GetRemainingCallCountCell() const -{ - WRAPPER_NO_CONTRACT; - return PTR_CallCount(dac_cast(this)->m_remainingCallCountCell); -} - -inline PCODE CallCountingStub::GetTargetForMethod() const -{ - WRAPPER_NO_CONTRACT; - return CallCountingStubShort::From(PTR_CallCountingStub(this))->GetTargetForMethod(); -} - -//////////////////////////////////////////////////////////////// - -#undef DISABLE_COPY - -#endif // FEATURE_TIERED_COMPILATION - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - #endif // __cgenx86_h__ diff --git a/src/coreclr/vm/i386/cgenx86.cpp b/src/coreclr/vm/i386/cgenx86.cpp index 83bffa716e0f2..8ced6a6c58a50 100644 --- a/src/coreclr/vm/i386/cgenx86.cpp +++ b/src/coreclr/vm/i386/cgenx86.cpp @@ -1186,63 +1186,6 @@ UMEntryThunk* UMEntryThunk::Decode(LPVOID pCallback) return *(UMEntryThunk**)( 1 + (BYTE*)pCallback ); } -BOOL DoesSlotCallPrestub(PCODE pCode) -{ - CONTRACTL { - 
NOTHROW; - GC_NOTRIGGER; - PRECONDITION(pCode != NULL); - PRECONDITION(pCode != GetPreStubEntryPoint()); - } CONTRACTL_END; - - // x86 has the following possible sequences for prestub logic: - // 1. slot -> temporary entrypoint -> prestub - // 2. slot -> precode -> prestub - // 3. slot -> precode -> jumprel32 (NGEN case) -> prestub - -#ifdef HAS_COMPACT_ENTRYPOINTS - if (MethodDescChunk::GetMethodDescFromCompactEntryPoint(pCode, TRUE) != NULL) - { - return TRUE; - } -#endif // HAS_COMPACT_ENTRYPOINTS - - if (!IS_ALIGNED(pCode, PRECODE_ALIGNMENT)) - { - return FALSE; - } - -#ifdef HAS_FIXUP_PRECODE - if (*PTR_BYTE(pCode) == X86_INSTR_CALL_REL32) - { - // Note that call could have been patched to jmp in the meantime - pCode = rel32Decode(pCode+1); - - // NGEN case - if (*PTR_BYTE(pCode) == X86_INSTR_JMP_REL32) { - pCode = rel32Decode(pCode+1); - } - - return pCode == (TADDR)PrecodeFixupThunk; - } -#endif - - if (*PTR_BYTE(pCode) != X86_INSTR_MOV_EAX_IMM32 || - *PTR_BYTE(pCode+5) != X86_INSTR_MOV_RM_R || - *PTR_BYTE(pCode+7) != X86_INSTR_JMP_REL32) - { - return FALSE; - } - pCode = rel32Decode(pCode+8); - - // NGEN case - if (*PTR_BYTE(pCode) == X86_INSTR_JMP_REL32) { - pCode = rel32Decode(pCode+1); - } - - return pCode == GetPreStubEntryPoint(); -} - #ifdef FEATURE_READYTORUN // diff --git a/src/coreclr/vm/i386/excepx86.cpp b/src/coreclr/vm/i386/excepx86.cpp index 15dd0667dd6c4..418bf7090a775 100644 --- a/src/coreclr/vm/i386/excepx86.cpp +++ b/src/coreclr/vm/i386/excepx86.cpp @@ -3455,11 +3455,13 @@ AdjustContextForVirtualStub( if (sk == VirtualCallStubManager::SK_DISPATCH) { - if (*PTR_WORD(f_IP) != X86_INSTR_CMP_IND_ECX_IMM32) + if (*PTR_WORD(f_IP) != X86_INSTR_CMP_IND_ECX_EAX) { _ASSERTE(!"AV in DispatchStub at unknown instruction"); return FALSE; } + + SetSP(pContext, dac_cast(dac_cast(GetSP(pContext)) + sizeof(void*))); // rollback push eax } else if (sk == VirtualCallStubManager::SK_RESOLVE) @@ -3502,17 +3504,18 @@ AdjustContextForVirtualStub( { ENABLE_FORBID_GC_LOADER_USE_IN_THIS_SCOPE(); - DispatchHolder *holder = DispatchHolder::FromDispatchEntry(f_IP); - MethodTable *pMT = (MethodTable*)holder->stub()->expectedMT(); - DispatchToken token(VirtualCallStubManager::GetTokenFromStubQuick(pMgr, f_IP, sk)); + PCODE dispatchEntry = f_IP - DispatchStub::offsetOfThisDeref(); + DispatchStub *pStub = DispatchStub::FromDispatchEntry(dispatchEntry); + MethodTable *pMT = (MethodTable*)pStub->expectedMT(); + DispatchToken token(VirtualCallStubManager::GetTokenFromStubQuick(pMgr, dispatchEntry, sk)); MethodDesc* pMD = VirtualCallStubManager::GetRepresentativeMethodDescFromToken(token, pMT); stackArgumentsSize = pMD->SizeOfArgStack(); } else { // Compute the stub entry address from the address of failure (location of dereferencing of "this" pointer) - ResolveHolder *holder = ResolveHolder::FromResolveEntry(f_IP - ResolveStub::offsetOfThisDeref()); - stackArgumentsSize = holder->stub()->stackArgumentsSize(); + ResolveStub *pResolveStub = ResolveStub::FromResolveEntry(f_IP - ResolveStub::offsetOfThisDeref()); + stackArgumentsSize = pResolveStub->stackArgumentsSize(); } sp += stackArgumentsSize; diff --git a/src/coreclr/vm/i386/jitinterfacex86.cpp b/src/coreclr/vm/i386/jitinterfacex86.cpp index 0467f347aaacb..641925821ac67 100644 --- a/src/coreclr/vm/i386/jitinterfacex86.cpp +++ b/src/coreclr/vm/i386/jitinterfacex86.cpp @@ -1054,10 +1054,10 @@ void InitJITHelpers1() int reg = c_rgWriteBarrierRegs[iBarrier]; BYTE * pBufRW = pBuf; - ExecutableWriterHolder barrierWriterHolder; + 
ExecutableWriterHolderNoLog barrierWriterHolder; if (IsWriteBarrierCopyEnabled()) { - barrierWriterHolder = ExecutableWriterHolder(pBuf, 34); + barrierWriterHolder.AssignExecutableWriterHolder(pBuf, 34); pBufRW = barrierWriterHolder.GetRW(); } @@ -1206,10 +1206,10 @@ int StompWriteBarrierEphemeral(bool /* isRuntimeSuspended */) BYTE * pBuf = GetWriteBarrierCodeLocation((BYTE *)c_rgWriteBarriers[iBarrier]); BYTE * pBufRW = pBuf; - ExecutableWriterHolder barrierWriterHolder; + ExecutableWriterHolderNoLog barrierWriterHolder; if (IsWriteBarrierCopyEnabled()) { - barrierWriterHolder = ExecutableWriterHolder(pBuf, 42); + barrierWriterHolder.AssignExecutableWriterHolder(pBuf, 42); pBufRW = barrierWriterHolder.GetRW(); } @@ -1275,10 +1275,10 @@ int StompWriteBarrierResize(bool isRuntimeSuspended, bool bReqUpperBoundsCheck) size_t *pfunc; BYTE * pBufRW = pBuf; - ExecutableWriterHolder barrierWriterHolder; + ExecutableWriterHolderNoLog barrierWriterHolder; if (IsWriteBarrierCopyEnabled()) { - barrierWriterHolder = ExecutableWriterHolder(pBuf, 42); + barrierWriterHolder.AssignExecutableWriterHolder(pBuf, 42); pBufRW = barrierWriterHolder.GetRW(); } diff --git a/src/coreclr/vm/i386/stublinkerx86.cpp b/src/coreclr/vm/i386/stublinkerx86.cpp index 74d55fd4544db..44101c0c3b5ad 100644 --- a/src/coreclr/vm/i386/stublinkerx86.cpp +++ b/src/coreclr/vm/i386/stublinkerx86.cpp @@ -4962,289 +4962,6 @@ Thread* __stdcall CreateThreadBlockReturnHr(ComMethodFrame *pFrame) #endif // !DACCESS_COMPILE -#ifdef HAS_FIXUP_PRECODE - -#ifdef HAS_FIXUP_PRECODE_CHUNKS -TADDR FixupPrecode::GetMethodDesc() -{ - LIMITED_METHOD_CONTRACT; - SUPPORTS_DAC; - - // This lookup is also manually inlined in PrecodeFixupThunk assembly code - TADDR base = *PTR_TADDR(GetBase()); - if (base == NULL) - return NULL; - return base + (m_MethodDescChunkIndex * MethodDesc::ALIGNMENT); -} -#endif - -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS -PCODE FixupPrecode::GetDynamicMethodPrecodeFixupJumpStub() -{ - WRAPPER_NO_CONTRACT; - _ASSERTE(((PTR_MethodDesc)GetMethodDesc())->IsLCGMethod()); - - // The precode fixup jump stub is shared by all fixup precodes in a chunk, and immediately follows the MethodDesc. Jump - // stubs cannot be reused currently for the same method: - // - The jump stub's target would change separately from the precode being updated from "call Func" to "jmp Func", both - // changes would have to be done atomically with runtime suspension, which is not done currently - // - When changing the entry point from one version of jitted code to another, the jump stub's target pointer is not - // aligned to 8 bytes in order to be able to do an interlocked update of the target address - // So, when initially the precode intends to be of the form "call PrecodeFixupThunk", if the target address happens to be - // too far for a relative 32-bit jump, it will use the shared precode fixup jump stub. When changing the entry point to - // jitted code, the jump stub associated with the precode is patched, and the precode is updated to use that jump stub. 
- // - // Notes: - // - Dynamic method descs, and hence their precodes and preallocated jump stubs, may be reused for a different method - // (along with reinitializing the precode), but only with a transition where the original method is no longer accessible - // to user code - // - Concurrent calls to a dynamic method that has not yet been jitted may trigger multiple writes to the jump stub - // associated with the precode, but only to the same target address (and while the precode is still pointing to - // PrecodeFixupThunk) - return GetBase() + sizeof(PTR_MethodDesc); -} - -PCODE FixupPrecode::GetDynamicMethodEntryJumpStub() -{ - WRAPPER_NO_CONTRACT; - _ASSERTE(((PTR_MethodDesc)GetMethodDesc())->IsLCGMethod()); - - // m_PrecodeChunkIndex has a value inverted to the order of precodes in memory (the precode at the lowest address has the - // highest index, and the precode at the highest address has the lowest index). To map a precode to its jump stub by memory - // order, invert the precode index to get the jump stub index. Also skip the precode fixup jump stub (see - // GetDynamicMethodPrecodeFixupJumpStub()). - UINT32 count = ((PTR_MethodDesc)GetMethodDesc())->GetMethodDescChunk()->GetCount(); - _ASSERTE(m_PrecodeChunkIndex < count); - SIZE_T jumpStubIndex = count - m_PrecodeChunkIndex; - - return GetBase() + sizeof(PTR_MethodDesc) + jumpStubIndex * BACK_TO_BACK_JUMP_ALLOCATE_SIZE; -} -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - -#ifdef DACCESS_COMPILE -void FixupPrecode::EnumMemoryRegions(CLRDataEnumMemoryFlags flags) -{ - SUPPORTS_DAC; - DacEnumMemoryRegion(dac_cast(this), sizeof(FixupPrecode)); - - DacEnumMemoryRegion(GetBase(), sizeof(TADDR)); -} -#endif // DACCESS_COMPILE - -#endif // HAS_FIXUP_PRECODE - -#ifndef DACCESS_COMPILE - -void rel32SetInterlocked(/*PINT32*/ PVOID pRel32, /*PINT32*/ PVOID pRel32RW, TADDR target, MethodDesc* pMD) -{ - CONTRACTL - { - THROWS; // Creating a JumpStub could throw OutOfMemory - GC_NOTRIGGER; - } - CONTRACTL_END; - - INT32 targetRel32 = rel32UsingJumpStub((INT32*)pRel32, target, pMD); - - _ASSERTE(IS_ALIGNED(pRel32RW, sizeof(INT32))); - FastInterlockExchange((LONG*)pRel32RW, (LONG)targetRel32); -} - -BOOL rel32SetInterlocked(/*PINT32*/ PVOID pRel32, /*PINT32*/ PVOID pRel32RW, TADDR target, TADDR expected, MethodDesc* pMD) -{ - CONTRACTL - { - THROWS; // Creating a JumpStub could throw OutOfMemory - GC_NOTRIGGER; - } - CONTRACTL_END; - - BYTE* callAddrAdj = (BYTE*)pRel32 + 4; - INT32 expectedRel32 = static_cast((BYTE*)expected - callAddrAdj); - - INT32 targetRel32 = rel32UsingJumpStub((INT32*)pRel32, target, pMD); - - _ASSERTE(IS_ALIGNED(pRel32RW, sizeof(INT32))); - return FastInterlockCompareExchange((LONG*)pRel32RW, (LONG)targetRel32, (LONG)expectedRel32) == (LONG)expectedRel32; -} - -void StubPrecode::Init(StubPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator /* = NULL */, - BYTE type /* = StubPrecode::Type */, TADDR target /* = NULL */) -{ - WRAPPER_NO_CONTRACT; - - IN_TARGET_64BIT(m_movR10 = X86_INSTR_MOV_R10_IMM64); // mov r10, pMethodDesc - IN_TARGET_32BIT(m_movEAX = X86_INSTR_MOV_EAX_IMM32); // mov eax, pMethodDesc - m_pMethodDesc = (TADDR)pMD; - IN_TARGET_32BIT(m_mov_rm_r = X86_INSTR_MOV_RM_R); // mov reg,reg - m_type = type; - m_jmp = X86_INSTR_JMP_REL32; // jmp rel32 - - if (pLoaderAllocator != NULL) - { - // Use pMD == NULL in all precode initialization methods to allocate the initial jump stub in non-dynamic heap - // that has the same lifetime like as the precode itself - if (target == NULL) 
- target = GetPreStubEntryPoint(); - m_rel32 = rel32UsingJumpStub(&pPrecodeRX->m_rel32, target, NULL /* pMD */, pLoaderAllocator); - } -} - -#ifdef HAS_NDIRECT_IMPORT_PRECODE - -void NDirectImportPrecode::Init(NDirectImportPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) -{ - WRAPPER_NO_CONTRACT; - StubPrecode::Init(pPrecodeRX, pMD, pLoaderAllocator, NDirectImportPrecode::Type, GetEEFuncEntryPoint(NDirectImportThunk)); -} - -#endif // HAS_NDIRECT_IMPORT_PRECODE - - -#ifdef HAS_FIXUP_PRECODE -void FixupPrecode::Init(FixupPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator, int iMethodDescChunkIndex /*=0*/, int iPrecodeChunkIndex /*=0*/) -{ - WRAPPER_NO_CONTRACT; - - m_op = X86_INSTR_CALL_REL32; // call PrecodeFixupThunk - m_type = FixupPrecode::TypePrestub; - - // Initialize chunk indices only if they are not initialized yet. This is necessary to make MethodDesc::Reset work. - if (m_PrecodeChunkIndex == 0) - { - _ASSERTE(FitsInU1(iPrecodeChunkIndex)); - m_PrecodeChunkIndex = static_cast(iPrecodeChunkIndex); - } - - if (iMethodDescChunkIndex != -1) - { - if (m_MethodDescChunkIndex == 0) - { - _ASSERTE(FitsInU1(iMethodDescChunkIndex)); - m_MethodDescChunkIndex = static_cast(iMethodDescChunkIndex); - } - - if (*(void**)GetBase() == NULL) - *(void**)GetBase() = (BYTE*)pMD - (iMethodDescChunkIndex * MethodDesc::ALIGNMENT); - } - - _ASSERTE(GetMethodDesc() == (TADDR)pMD); - - PCODE target = (PCODE)GetEEFuncEntryPoint(PrecodeFixupThunk); -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - if (pMD->IsLCGMethod()) - { - m_rel32 = rel32UsingPreallocatedJumpStub(&pPrecodeRX->m_rel32, target, pPrecodeRX->GetDynamicMethodPrecodeFixupJumpStub(), GetDynamicMethodPrecodeFixupJumpStub(), false /* emitJump */); - return; - } -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - if (pLoaderAllocator != NULL) - { - m_rel32 = rel32UsingJumpStub(&pPrecodeRX->m_rel32, target, NULL /* pMD */, pLoaderAllocator); - } -} - -void FixupPrecode::ResetTargetInterlocked() -{ - CONTRACTL - { - THROWS; // Creating a JumpStub could throw OutOfMemory - GC_NOTRIGGER; - } - CONTRACTL_END; - - FixupPrecode newValue = *this; - newValue.m_op = X86_INSTR_CALL_REL32; // call PrecodeFixupThunk - newValue.m_type = FixupPrecode::TypePrestub; - - PCODE target = (PCODE)GetEEFuncEntryPoint(PrecodeFixupThunk); - MethodDesc* pMD = (MethodDesc*)GetMethodDesc(); -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - // The entry point of LCG methods cannot revert back to the original entry point, as their jump stubs would have to be - // reused, which is currently not supported. This method is intended for resetting the entry point while the method is - // callable, which implies that the entry point may later be changed again to something else. Currently, this is not done - // for LCG methods. See GetDynamicMethodPrecodeFixupJumpStub() for more. 
- _ASSERTE(!pMD->IsLCGMethod()); -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - - newValue.m_rel32 = rel32UsingJumpStub(&m_rel32, target, pMD); - - _ASSERTE(IS_ALIGNED(this, sizeof(INT64))); - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(FixupPrecode)); - FastInterlockExchangeLong((INT64*)precodeWriterHolder.GetRW(), *(INT64*)&newValue); -} - -BOOL FixupPrecode::SetTargetInterlocked(TADDR target, TADDR expected) -{ - CONTRACTL - { - THROWS; // Creating a JumpStub could throw OutOfMemory - GC_NOTRIGGER; - } - CONTRACTL_END; - - INT64 oldValue = *(INT64*)this; - BYTE* pOldValue = (BYTE*)&oldValue; - - MethodDesc * pMD = (MethodDesc*)GetMethodDesc(); - g_IBCLogger.LogMethodPrecodeWriteAccess(pMD); - -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - // A different jump stub is used for this case, see Init(). This call is unexpected for resetting the entry point. - _ASSERTE(!pMD->IsLCGMethod() || target != (TADDR)GetEEFuncEntryPoint(PrecodeFixupThunk)); -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - - INT64 newValue = oldValue; - BYTE* pNewValue = (BYTE*)&newValue; - - if (pOldValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] == FixupPrecode::TypePrestub) - { - pNewValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] = FixupPrecode::Type; - - pOldValue[offsetof(FixupPrecode, m_op)] = X86_INSTR_CALL_REL32; - pNewValue[offsetof(FixupPrecode, m_op)] = X86_INSTR_JMP_REL32; - } - else if (pOldValue[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP] == FixupPrecode::Type) - { -#ifdef FEATURE_CODE_VERSIONING - // No change needed, jmp is already in place -#else - // Setting the target more than once is unexpected - return FALSE; -#endif - } - else - { - // Pre-existing code doesn't conform to the expectations for a FixupPrecode - return FALSE; - } - -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - ExecutableWriterHolder dynamicMethodEntryJumpStubWriterHolder; - if (pMD->IsLCGMethod()) - { - dynamicMethodEntryJumpStubWriterHolder = ExecutableWriterHolder((void*)GetDynamicMethodEntryJumpStub(), 12); - } -#endif - *(INT32*)(&pNewValue[offsetof(FixupPrecode, m_rel32)]) = -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - pMD->IsLCGMethod() ? - rel32UsingPreallocatedJumpStub(&m_rel32, target, GetDynamicMethodEntryJumpStub(), (PCODE)dynamicMethodEntryJumpStubWriterHolder.GetRW(), true /* emitJump */) : -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - rel32UsingJumpStub(&m_rel32, target, pMD); - - _ASSERTE(IS_ALIGNED(this, sizeof(INT64))); - - ExecutableWriterHolder precodeWriterHolder(this, sizeof(FixupPrecode)); - return FastInterlockCompareExchangeLong((INT64*)precodeWriterHolder.GetRW(), newValue, oldValue) == oldValue; -} - -#endif // HAS_FIXUP_PRECODE - -#endif // !DACCESS_COMPILE - - #ifdef HAS_THISPTR_RETBUF_PRECODE // rel32 jmp target that points back to the jump (infinite loop). 
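The removal above takes out the last x86 path that retargeted a FixupPrecode by rewriting its instruction bytes (an interlocked 8-byte exchange over the call/jmp opcode and its rel32, with extra jump-stub handling for LCG methods). With the fixed code templates added below in thunktemplates.S/.asm, the precode code itself never changes: it jumps through a Target slot in the stub's data page, so changing the entry point only has to update that slot. A minimal sketch of that shape, using hypothetical names that mirror the DATA_SLOT fields (Target, MethodDesc, PrecodeFixupThunk) rather than the actual runtime types:

    #include <atomic>
    #include <cstdint>

    using PCODE = uintptr_t;

    // Sketch only: the per-stub RW data that the fixed FixupPrecode template reads.
    // The real types and update protocol live in precode.h / precode.cpp.
    struct FixupPrecodeDataSketch
    {
        std::atomic<PCODE> Target;            // read by "jmp [Target]"
        void*              MethodDesc;        // read by "mov eax, [MethodDesc]"
        PCODE              PrecodeFixupThunk; // read by "jmp [PrecodeFixupThunk]"
    };

    // Retargeting swaps the data slot; no instruction bytes are rewritten.
    inline bool SetTargetSketch(FixupPrecodeDataSketch* data, PCODE target, PCODE expected)
    {
        return data->Target.compare_exchange_strong(expected, target);
    }

The atomicity that the old path obtained from an 8-byte-aligned FastInterlockCompareExchangeLong over the code now only has to cover a single pointer-sized data slot.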
@@ -5278,8 +4995,12 @@ void ThisPtrRetBufPrecode::Init(MethodDesc* pMD, LoaderAllocator *pLoaderAllocat // This precode is never patched lazily - avoid unnecessary jump stub allocation m_rel32 = REL32_JMP_SELF; + + _ASSERTE(*((BYTE*)this + OFFSETOF_PRECODE_TYPE) == ThisPtrRetBufPrecode::Type); } +IN_TARGET_32BIT(static_assert_no_msg(offsetof(ThisPtrRetBufPrecode, m_movScratchArg0) == OFFSETOF_PRECODE_TYPE);) + BOOL ThisPtrRetBufPrecode::SetTargetInterlocked(TADDR target, TADDR expected) { CONTRACTL diff --git a/src/coreclr/vm/i386/stublinkerx86.h b/src/coreclr/vm/i386/stublinkerx86.h index c719057e97ea3..b1322f3d0dbdc 100644 --- a/src/coreclr/vm/i386/stublinkerx86.h +++ b/src/coreclr/vm/i386/stublinkerx86.h @@ -16,15 +16,12 @@ extern PCODE GetPreStubEntryPoint(); #define X86_INSTR_CALL_REL32 0xE8 // call rel32 #define X86_INSTR_CALL_IND 0x15FF // call dword ptr[addr32] #define X86_INSTR_CALL_IND_EAX 0x10FF // call dword ptr[eax] -#define X86_INSTR_CALL_IND_EAX_OFFSET 0x50FF // call dword ptr[eax + offset] ; where offset follows these 2 bytes -#define X86_INSTR_CALL_EAX 0xD0FF // call eax #define X86_INSTR_JMP_REL32 0xE9 // jmp rel32 #define X86_INSTR_JMP_IND 0x25FF // jmp dword ptr[addr32] #define X86_INSTR_JMP_EAX 0xE0FF // jmp eax #define X86_INSTR_MOV_EAX_IMM32 0xB8 // mov eax, imm32 #define X86_INSTR_MOV_EAX_ECX_IND 0x018b // mov eax, [ecx] -#define X86_INSTR_CMP_IND_ECX_IMM32 0x3981 // cmp [ecx], imm32 -#define X86_INSTR_MOV_RM_R 0x89 // mov r/m,reg +#define X86_INSTR_CMP_IND_ECX_EAX 0x0139 // cmp [ecx], eax #define X86_INSTR_MOV_AL 0xB0 // mov al, imm8 #define X86_INSTR_JMP_REL8 0xEB // jmp short rel8 @@ -43,10 +40,6 @@ extern PCODE GetPreStubEntryPoint(); #define X86_INSTR_MOVUPS_RM_R 0x110F // movups xmm1/mem128, xmm2 #define X86_INSTR_XORPS 0x570F // xorps xmm1, xmm2/mem128 -#ifdef TARGET_AMD64 -#define X86_INSTR_MOV_R10_IMM64 0xBA49 // mov r10, imm64 -#endif - //---------------------------------------------------------------------- // Encodes X86 registers. The numbers are chosen to match Intel's opcode // encoding. 
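The constant change above (X86_INSTR_CMP_IND_ECX_IMM32 becomes X86_INSTR_CMP_IND_ECX_EAX, 0x0139, i.e. cmp [ecx],eax) follows from the new DispatchStub template, which loads the expected MethodTable from its data slot into eax instead of embedding it as an imm32. The matching "first word of the stub" signatures are added to virtualcallstubcpu.hpp further down and consumed by predictStubKind. A self-contained sketch of that classification, with the word values copied from this patch and a hypothetical function name:

    #include <cstdint>
    #include <cstring>

    // Values correspond to DISPATCH/RESOLVE/LOOKUP/VTABLECALL_STUB_FIRST_WORD in this patch.
    enum StubKindSketch { SK_UNKNOWN, SK_LOOKUP, SK_DISPATCH, SK_RESOLVE, SK_VTABLECALL };

    inline StubKindSketch ClassifyByFirstWord(const uint8_t* stubStart)
    {
        uint16_t firstWord;
        std::memcpy(&firstWord, stubStart, sizeof(firstWord)); // little-endian word at the entry point

        switch (firstWord)
        {
        case 0xa150: return SK_DISPATCH;   // 50 a1 ..    push eax; mov eax, [ExpectedMT slot]
        case 0x2d83: return SK_RESOLVE;    // 83 2d ..    sub dword ptr [Counter slot], imm8
        case 0xff50: return SK_LOOKUP;     // 50 ff 35 .. push eax; push dword ptr [DispatchToken slot]
        case 0x018b: return SK_VTABLECALL; // 8b 01       mov eax, [ecx]
        default:     return SK_UNKNOWN;
        }
    }

predictStubKind in the diff reads the WORD directly, which is fine on x86; the memcpy here only keeps the sketch strictly portable.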
@@ -463,237 +456,8 @@ BOOL rel32SetInterlocked(/*PINT32*/ PVOID pRel32, /*PINT32*/ PVOID pRel32RW, TAD // //------------------------------------------------------------------------ -EXTERN_C VOID STDCALL PrecodeFixupThunk(); - -#ifdef HOST_64BIT - -#define OFFSETOF_PRECODE_TYPE 0 -#define OFFSETOF_PRECODE_TYPE_CALL_OR_JMP 5 -#define OFFSETOF_PRECODE_TYPE_MOV_R10 10 - -#define SIZEOF_PRECODE_BASE 16 - -#else - -EXTERN_C VOID STDCALL PrecodeRemotingThunk(); - -#define OFFSETOF_PRECODE_TYPE 5 -#define OFFSETOF_PRECODE_TYPE_CALL_OR_JMP 5 -#define OFFSETOF_PRECODE_TYPE_MOV_RM_R 6 - -#define SIZEOF_PRECODE_BASE 8 - -#endif // HOST_64BIT - - #include -// Invalid precode type -struct InvalidPrecode { - // int3 - static const int Type = 0xCC; -}; - - -// Regular precode -struct StubPrecode { - -#ifdef HOST_64BIT - static const BYTE Type = 0xF8; - // mov r10,pMethodDesc - // clc - // jmp Stub -#else - static const BYTE Type = 0xED; - // mov eax,pMethodDesc - // mov ebp,ebp - // jmp Stub -#endif // HOST_64BIT - - IN_TARGET_64BIT(USHORT m_movR10;) - IN_TARGET_32BIT(BYTE m_movEAX;) - TADDR m_pMethodDesc; - IN_TARGET_32BIT(BYTE m_mov_rm_r;) - BYTE m_type; - BYTE m_jmp; - INT32 m_rel32; - - void Init(StubPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator = NULL, BYTE type = StubPrecode::Type, TADDR target = NULL); - - TADDR GetMethodDesc() - { - LIMITED_METHOD_DAC_CONTRACT; - - return m_pMethodDesc; - } - - PCODE GetTarget() - { - LIMITED_METHOD_DAC_CONTRACT; - - return rel32Decode(PTR_HOST_MEMBER_TADDR(StubPrecode, this, m_rel32)); - } -#ifndef DACCESS_COMPILE - void ResetTargetInterlocked() - { - CONTRACTL - { - THROWS; - GC_NOTRIGGER; - } - CONTRACTL_END; - - ExecutableWriterHolder rel32WriterHolder(&m_rel32, sizeof(INT32)); - rel32SetInterlocked(&m_rel32, rel32WriterHolder.GetRW(), GetPreStubEntryPoint(), (MethodDesc*)GetMethodDesc()); - } - - BOOL SetTargetInterlocked(TADDR target, TADDR expected) - { - CONTRACTL - { - THROWS; - GC_NOTRIGGER; - } - CONTRACTL_END; - - ExecutableWriterHolder rel32Holder(&m_rel32, 4); - return rel32SetInterlocked(&m_rel32, rel32Holder.GetRW(), target, expected, (MethodDesc*)GetMethodDesc()); - } -#endif // !DACCESS_COMPILE -}; -IN_TARGET_64BIT(static_assert_no_msg(offsetof(StubPrecode, m_movR10) == OFFSETOF_PRECODE_TYPE);) -IN_TARGET_64BIT(static_assert_no_msg(offsetof(StubPrecode, m_type) == OFFSETOF_PRECODE_TYPE_MOV_R10);) -IN_TARGET_32BIT(static_assert_no_msg(offsetof(StubPrecode, m_mov_rm_r) == OFFSETOF_PRECODE_TYPE);) -IN_TARGET_32BIT(static_assert_no_msg(offsetof(StubPrecode, m_type) == OFFSETOF_PRECODE_TYPE_MOV_RM_R);) -typedef DPTR(StubPrecode) PTR_StubPrecode; - - -#ifdef HAS_NDIRECT_IMPORT_PRECODE - -// NDirect import precode -// (This is fake precode. VTable slot does not point to it.) -struct NDirectImportPrecode : StubPrecode { - -#ifdef HOST_64BIT - static const int Type = 0xF9; - // mov r10,pMethodDesc - // stc - // jmp NDirectImportThunk -#else - static const int Type = 0xC0; - // mov eax,pMethodDesc - // mov eax,eax - // jmp NDirectImportThunk -#endif // HOST_64BIT - - void Init(NDirectImportPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator); - - LPVOID GetEntrypoint() - { - LIMITED_METHOD_CONTRACT; - return this; - } -}; -typedef DPTR(NDirectImportPrecode) PTR_NDirectImportPrecode; - -#endif // HAS_NDIRECT_IMPORT_PRECODE - - -#ifdef HAS_FIXUP_PRECODE - -// Fixup precode is used in ngen images when the prestub does just one time fixup. -// The fixup precode is simple jump once patched. 
It does not have the two instruction overhead of regular precode. -struct FixupPrecode { - - static const int TypePrestub = 0x5E; - // The entrypoint has to be 8-byte aligned so that the "call PrecodeFixupThunk" can be patched to "jmp NativeCode" atomically. - // call PrecodeFixupThunk - // db TypePrestub (pop esi) - // db MethodDescChunkIndex - // db PrecodeChunkIndex - - static const int Type = 0x5F; - // After it has been patched to point to native code - // jmp NativeCode - // db Type (pop edi) - - BYTE m_op; - INT32 m_rel32; - BYTE m_type; - BYTE m_MethodDescChunkIndex; - BYTE m_PrecodeChunkIndex; -#ifdef HAS_FIXUP_PRECODE_CHUNKS - // Fixup precode chunk is associated with MethodDescChunk. The layout of the fixup precode chunk is: - // - // FixupPrecode Entrypoint PrecodeChunkIndex = 2 - // FixupPrecode Entrypoint PrecodeChunkIndex = 1 - // FixupPrecode Entrypoint PrecodeChunkIndex = 0 - // TADDR Base of MethodDescChunk -#else - TADDR m_pMethodDesc; -#endif - - void Init(FixupPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator, int iMethodDescChunkIndex = 0, int iPrecodeChunkIndex = 0); - -#ifdef HAS_FIXUP_PRECODE_CHUNKS - TADDR GetBase() - { - LIMITED_METHOD_CONTRACT; - SUPPORTS_DAC; - - return dac_cast(this) + (m_PrecodeChunkIndex + 1) * sizeof(FixupPrecode); - } - - size_t GetSizeRW() - { - LIMITED_METHOD_CONTRACT; - - return GetBase() + sizeof(void*) - dac_cast(this); - } - - TADDR GetMethodDesc(); -#else // HAS_FIXUP_PRECODE_CHUNKS - TADDR GetMethodDesc() - { - LIMITED_METHOD_CONTRACT; - return m_pMethodDesc; - } -#endif // HAS_FIXUP_PRECODE_CHUNKS - -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - PCODE GetDynamicMethodPrecodeFixupJumpStub(); - PCODE GetDynamicMethodEntryJumpStub(); -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - - PCODE GetTarget() - { - LIMITED_METHOD_DAC_CONTRACT; - - return rel32Decode(PTR_HOST_MEMBER_TADDR(FixupPrecode, this, m_rel32)); - } - - void ResetTargetInterlocked(); - BOOL SetTargetInterlocked(TADDR target, TADDR expected); - - static BOOL IsFixupPrecodeByASM(TADDR addr) - { - LIMITED_METHOD_CONTRACT; - - return *dac_cast(addr) == X86_INSTR_JMP_REL32; - } - -#ifdef DACCESS_COMPILE - void EnumMemoryRegions(CLRDataEnumMemoryFlags flags); -#endif -}; -IN_TARGET_32BIT(static_assert_no_msg(offsetof(FixupPrecode, m_type) == OFFSETOF_PRECODE_TYPE)); -IN_TARGET_64BIT(static_assert_no_msg(offsetof(FixupPrecode, m_op) == OFFSETOF_PRECODE_TYPE);) -IN_TARGET_64BIT(static_assert_no_msg(offsetof(FixupPrecode, m_type) == OFFSETOF_PRECODE_TYPE_CALL_OR_JMP);) - -typedef DPTR(FixupPrecode) PTR_FixupPrecode; - -#endif // HAS_FIXUP_PRECODE - #ifdef HAS_THISPTR_RETBUF_PRECODE // Precode to stuffle this and retbuf for closed delegates over static methods with return buffer @@ -702,7 +466,7 @@ struct ThisPtrRetBufPrecode { #ifdef HOST_64BIT static const int Type = 0x90; #else - static const int Type = 0xC2; + static const int Type = 0x89; #endif // HOST_64BIT // mov regScratch,regArg0 @@ -738,7 +502,7 @@ struct ThisPtrRetBufPrecode { BOOL SetTargetInterlocked(TADDR target, TADDR expected); }; -IN_TARGET_32BIT(static_assert_no_msg(offsetof(ThisPtrRetBufPrecode, m_movArg1Scratch) + 1 == OFFSETOF_PRECODE_TYPE);) + typedef DPTR(ThisPtrRetBufPrecode) PTR_ThisPtrRetBufPrecode; #endif // HAS_THISPTR_RETBUF_PRECODE diff --git a/src/coreclr/vm/i386/thunktemplates.S b/src/coreclr/vm/i386/thunktemplates.S new file mode 100644 index 0000000000000..e9bd6db96beb4 --- /dev/null +++ b/src/coreclr/vm/i386/thunktemplates.S @@ -0,0 +1,124 
@@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +.intel_syntax noprefix +#include "unixasmmacros.inc" +#include "asmconstants.h" + +PAGE_SIZE = 4096 + +//#define DATA_SLOT(stub, field) stub##Code + PAGE_SIZE + stub##Data__##field +#define DATA_SLOT(stub, field) PAGE_SIZE + stub##Data__##field + +.macro INDJMP target + .att_syntax + jmp *\target + .intel_syntax noprefix +.endm + +.macro INDCALL target + .att_syntax + call *\target + .intel_syntax noprefix +.endm + +.macro SLOT_ADDRESS_PATCH_LABEL stub, field, offset=-4, index="" + C_FUNC(\stub\()Code_\field\()_Offset\index) = .\offset-\stub\()Code + .global C_FUNC(\stub\()Code_\field\()_Offset\index) +.endm + +LEAF_ENTRY StubPrecodeCode + mov eax, dword ptr [DATA_SLOT(StubPrecode, MethodDesc)] +SLOT_ADDRESS_PATCH_LABEL StubPrecode, MethodDesc + INDJMP DATA_SLOT(StubPrecode, Target) +SLOT_ADDRESS_PATCH_LABEL StubPrecode, Target + nop +LEAF_END_MARKED StubPrecodeCode + +LEAF_ENTRY FixupPrecodeCode + INDJMP DATA_SLOT(FixupPrecode, Target) +SLOT_ADDRESS_PATCH_LABEL FixupPrecode, Target + mov eax, dword ptr [DATA_SLOT(FixupPrecode, MethodDesc)] +SLOT_ADDRESS_PATCH_LABEL FixupPrecode, MethodDesc + INDJMP DATA_SLOT(FixupPrecode, PrecodeFixupThunk) +SLOT_ADDRESS_PATCH_LABEL FixupPrecode, PrecodeFixupThunk +LEAF_END_MARKED FixupPrecodeCode + +LEAF_ENTRY CallCountingStubCode + mov eax, dword ptr [DATA_SLOT(CallCountingStub, RemainingCallCountCell)] +SLOT_ADDRESS_PATCH_LABEL CallCountingStub, RemainingCallCountCell + dec WORD PTR [eax] + je LOCAL_LABEL(CountReachedZero) + INDJMP DATA_SLOT(CallCountingStub, TargetForMethod) +SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForMethod +LOCAL_LABEL(CountReachedZero): + INDCALL DATA_SLOT(CallCountingStub, TargetForThresholdReached) +SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForThresholdReached + int 3 +LEAF_END_MARKED CallCountingStubCode + +LEAF_ENTRY LookupStubCode + push eax + push dword ptr [DATA_SLOT(LookupStub, DispatchToken)] +SLOT_ADDRESS_PATCH_LABEL LookupStub, DispatchToken + INDJMP DATA_SLOT(LookupStub, ResolveWorkerTarget) +SLOT_ADDRESS_PATCH_LABEL LookupStub, ResolveWorkerTarget +LEAF_END_MARKED LookupStubCode + +LEAF_ENTRY DispatchStubCode + push eax + mov eax, dword ptr [DATA_SLOT(DispatchStub, ExpectedMT)] +SLOT_ADDRESS_PATCH_LABEL DispatchStub, ExpectedMT +PATCH_LABEL _DispatchStubCode_ThisDeref + cmp dword ptr [ecx],eax + pop eax + jne NoMatch + INDJMP DATA_SLOT(DispatchStub, ImplTarget) +SLOT_ADDRESS_PATCH_LABEL DispatchStub, ImplTarget +NoMatch: + INDJMP DATA_SLOT(DispatchStub, FailTarget) +SLOT_ADDRESS_PATCH_LABEL DispatchStub, FailTarget +LEAF_END_MARKED DispatchStubCode + +LEAF_ENTRY ResolveStubCode +PATCH_LABEL ResolveStubCode_FailEntry + sub dword ptr [DATA_SLOT(ResolveStub, Counter)], 1 +SLOT_ADDRESS_PATCH_LABEL ResolveStub, Counter, -5 + jl LOCAL_LABEL(Backpatcher) +PATCH_LABEL ResolveStubCode_ResolveEntry +LOCAL_LABEL(ResolveEntry): + push eax +PATCH_LABEL ResolveStubCode_ThisDeref + mov eax,dword ptr [ecx] + push edx + mov edx,eax + shr eax, 12 + add eax,edx + xor eax,dword ptr [DATA_SLOT(ResolveStub, HashedToken)] +SLOT_ADDRESS_PATCH_LABEL ResolveStub, HashedToken + and eax,CALL_STUB_CACHE_MASK_ASM * 4 + add eax,dword ptr [DATA_SLOT(ResolveStub, CacheAddress)] +SLOT_ADDRESS_PATCH_LABEL ResolveStub, CacheAddress + mov eax,dword ptr [eax] + cmp edx,dword ptr [eax] + jne LOCAL_LABEL(Miss) + mov edx,dword ptr [DATA_SLOT(ResolveStub, Token)] +SLOT_ADDRESS_PATCH_LABEL ResolveStub, 
Token,, 1 + cmp edx,dword ptr [eax + 4] + jne LOCAL_LABEL(Miss) + mov eax,dword ptr [eax + 8] + pop edx + add esp, 4 + jmp eax +LOCAL_LABEL(Miss): + pop edx + push dword ptr [DATA_SLOT(ResolveStub, Token)] +SLOT_ADDRESS_PATCH_LABEL ResolveStub, Token,, 2 + INDJMP DATA_SLOT(ResolveStub, ResolveWorkerTarget) // <<< resolveWorker == ResolveWorkerChainLookupAsmStub or ResolveWorkerAsmStub +SLOT_ADDRESS_PATCH_LABEL ResolveStub, ResolveWorkerTarget +LOCAL_LABEL(Backpatcher): + INDCALL DATA_SLOT(ResolveStub, PatcherTarget) // <<< backpatcherWorker == BackPatchWorkerAsmStub +SLOT_ADDRESS_PATCH_LABEL ResolveStub, PatcherTarget + jmp LOCAL_LABEL(ResolveEntry) +LEAF_END_MARKED ResolveStubCode + diff --git a/src/coreclr/vm/i386/thunktemplates.asm b/src/coreclr/vm/i386/thunktemplates.asm new file mode 100644 index 0000000000000..59ccdb18fe27e --- /dev/null +++ b/src/coreclr/vm/i386/thunktemplates.asm @@ -0,0 +1,127 @@ +; Licensed to the .NET Foundation under one or more agreements. +; The .NET Foundation licenses this file to you under the MIT license. + + .586 + .model flat + +include +include AsmConstants.inc + + option casemap:none + .code + +.686P +.XMM + +PAGE_SIZE EQU 4096 + +DATA_SLOT macro stub, field + exitm @CatStr(<_>, stub, , stub, , field) +endm + +SLOT_ADDRESS_PATCH_LABEL macro stub, field, offset:=<-4>, index:=<> + LOCAL labelName, labelValue +labelName TEXTEQU @CatStr(<_>, stub, , field, <_Offset>, index) +labelValue TEXTEQU @CatStr(<$>, offset, <-_>, stub, ) + %labelName EQU labelValue + PUBLIC labelName +endm + +LEAF_ENTRY _StubPrecodeCode@0 + mov eax, dword ptr DATA_SLOT(StubPrecode, MethodDesc) +SLOT_ADDRESS_PATCH_LABEL StubPrecode, MethodDesc + jmp dword ptr DATA_SLOT(StubPrecode, Target) +SLOT_ADDRESS_PATCH_LABEL StubPrecode, Target +LEAF_END_MARKED _StubPrecodeCode@0 + +EXTERN _ThePreStub@0:PROC + +LEAF_ENTRY _FixupPrecodeCode@0 + jmp dword ptr DATA_SLOT(FixupPrecode, Target) +SLOT_ADDRESS_PATCH_LABEL FixupPrecode, Target + mov eax, dword ptr DATA_SLOT(FixupPrecode, MethodDesc) +SLOT_ADDRESS_PATCH_LABEL FixupPrecode, MethodDesc + jmp dword ptr DATA_SLOT(FixupPrecode, PrecodeFixupThunk) +SLOT_ADDRESS_PATCH_LABEL FixupPrecode, PrecodeFixupThunk +LEAF_END_MARKED _FixupPrecodeCode@0 + +LEAF_ENTRY _CallCountingStubCode@0 + mov eax, dword ptr DATA_SLOT(CallCountingStub, RemainingCallCountCell) +SLOT_ADDRESS_PATCH_LABEL CallCountingStub, RemainingCallCountCell + dec WORD PTR [eax] + je CountReachedZero + jmp dword ptr DATA_SLOT(CallCountingStub, TargetForMethod) +SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForMethod +CountReachedZero: + call dword ptr DATA_SLOT(CallCountingStub, TargetForThresholdReached) +SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForThresholdReached + int 3 +LEAF_END_MARKED _CallCountingStubCode@0 + +LEAF_ENTRY _LookupStubCode@0 + push eax + push dword ptr DATA_SLOT(LookupStub, DispatchToken) +SLOT_ADDRESS_PATCH_LABEL LookupStub, DispatchToken + jmp dword ptr DATA_SLOT(LookupStub, ResolveWorkerTarget) +SLOT_ADDRESS_PATCH_LABEL LookupStub, ResolveWorkerTarget +LEAF_END_MARKED _LookupStubCode@0 + +LEAF_ENTRY _DispatchStubCode@0 + push eax + mov eax, dword ptr DATA_SLOT(DispatchStub, ExpectedMT) +SLOT_ADDRESS_PATCH_LABEL DispatchStub, ExpectedMT +PATCH_LABEL _DispatchStubCode_ThisDeref@0 + cmp dword ptr [ecx],eax + pop eax + jne NoMatch + jmp dword ptr DATA_SLOT(DispatchStub, ImplTarget) +SLOT_ADDRESS_PATCH_LABEL DispatchStub, ImplTarget +NoMatch: + jmp dword ptr DATA_SLOT(DispatchStub, FailTarget) +SLOT_ADDRESS_PATCH_LABEL DispatchStub, FailTarget 
+LEAF_END_MARKED _DispatchStubCode@0 + +LEAF_ENTRY _ResolveStubCode@0 +_ResolveStubCode_FailEntry@0: +PUBLIC _ResolveStubCode_FailEntry@0 + sub dword ptr DATA_SLOT(ResolveStub, Counter), 1 +SLOT_ADDRESS_PATCH_LABEL ResolveStub, Counter, -5 + jl Backpatcher +PATCH_LABEL _ResolveStubCode_ResolveEntry@0 + push eax +PATCH_LABEL _ResolveStubCode_ThisDeref@0 + mov eax,dword ptr [ecx] + push edx + mov edx,eax + shr eax, 12 + add eax,edx + xor eax,dword ptr DATA_SLOT(ResolveStub, HashedToken) +SLOT_ADDRESS_PATCH_LABEL ResolveStub, HashedToken + and eax,CALL_STUB_CACHE_MASK_ASM * 4 + add eax,dword ptr DATA_SLOT(ResolveStub, CacheAddress) +SLOT_ADDRESS_PATCH_LABEL ResolveStub, CacheAddress + mov eax,dword ptr [eax] + cmp edx,dword ptr [eax] + jne Miss + mov edx,dword ptr DATA_SLOT(ResolveStub, Token) +SLOT_ADDRESS_PATCH_LABEL ResolveStub, Token,, 1 + cmp edx,dword ptr [eax + 4] + jne Miss + mov eax,dword ptr [eax + 8] + pop edx + add esp, 4 + jmp eax +Miss: + pop edx +Slow: + push dword ptr DATA_SLOT(ResolveStub, Token) +SLOT_ADDRESS_PATCH_LABEL ResolveStub, Token,, 2 + jmp dword ptr DATA_SLOT(ResolveStub, ResolveWorkerTarget); <<< resolveWorker == ResolveWorkerChainLookupAsmStub or ResolveWorkerAsmStub +SLOT_ADDRESS_PATCH_LABEL ResolveStub, ResolveWorkerTarget +Backpatcher: + call dword ptr DATA_SLOT(ResolveStub, PatcherTarget); <<< backpatcherWorker == BackPatchWorkerAsmStub +SLOT_ADDRESS_PATCH_LABEL ResolveStub, PatcherTarget + jmp _ResolveStubCode_ResolveEntry@0 +LEAF_END_MARKED _ResolveStubCode@0 + + end \ No newline at end of file diff --git a/src/coreclr/vm/i386/virtualcallstubcpu.hpp b/src/coreclr/vm/i386/virtualcallstubcpu.hpp index 38a5a9baafe4b..9f648f6788524 100644 --- a/src/coreclr/vm/i386/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/i386/virtualcallstubcpu.hpp @@ -13,370 +13,17 @@ #ifndef _VIRTUAL_CALL_STUB_X86_H #define _VIRTUAL_CALL_STUB_X86_H +#define DISPATCH_STUB_FIRST_WORD 0xa150 +#define RESOLVE_STUB_FIRST_WORD 0x2d83 +#define LOOKUP_STUB_FIRST_WORD 0xff50 +#define VTABLECALL_STUB_FIRST_WORD 0x018b + #ifdef DECLARE_DATA #include "asmconstants.h" #endif #include // Since we are placing code, we want byte packing of the structs -#define USES_LOOKUP_STUBS 1 - -/********************************************************************************************* -Stubs that contain code are all part of larger structs called Holders. There is a -Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are -essentially an implementation trick that allowed rearranging the code sequences more -easily while trying out different alternatives, and for dealing with any alignment -issues in a way that was mostly immune to the actually code sequences. These Holders -should be revisited when the stub code sequences are fixed, since in many cases they -add extra space to a stub that is not really needed. - -Stubs are placed in cache and hash tables. Since unaligned access of data in memory -is very slow, the keys used in those tables should be aligned. The things used as keys -typically also occur in the generated code, e.g. a token as an immediate part of an instruction. -For now, to avoid alignment computations as different code strategies are tried out, the key -fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction -streams aligned so that the immediate fields fall on aligned boundaries. 
-*/ - -#if USES_LOOKUP_STUBS - -struct LookupStub; -struct LookupHolder; - -/*LookupStub************************************************************************************** -Virtual and interface call sites are initially setup to point at LookupStubs. -This is because the runtime type of the pointer is not yet known, -so the target cannot be resolved. Note: if the jit is able to determine the runtime type -of the pointer, it should be generating a direct call not a virtual or interface call. -This stub pushes a lookup token onto the stack to identify the sought after method, and then -jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and -transfer of control to the appropriate target method implementation, perhaps patching of the call site -along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs -get quickly changed to point to another kind of stub. -*/ -struct LookupStub -{ - inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } - inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } - -private: - friend struct LookupHolder; - - // DispatchStub:: _entryPoint expects: - // ecx: object (the "this" pointer) - // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call - BYTE _entryPoint [2]; // 50 push eax ;save siteAddrForRegisterIndirect - this may be an indirect call - // 68 push - size_t _token; // xx xx xx xx 32-bit constant -#ifdef STUB_LOGGING - BYTE cntr2[2]; // ff 05 inc - size_t* c_lookup; // xx xx xx xx [call_lookup_counter] -#endif //STUB_LOGGING - BYTE part2 [1]; // e9 jmp - DISPL _resolveWorkerDispl;// xx xx xx xx pc-rel displ -}; - -/* LookupHolders are the containers for LookupStubs, they provide for any alignment of -stubs as necessary. In the case of LookupStubs, alignment is necessary since -LookupStubs are placed in a hash table keyed by token. */ -struct LookupHolder -{ - static void InitializeStatic(); - - void Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken); - - LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static LookupHolder* FromLookupEntry(PCODE lookupEntry); - -private: - friend struct LookupStub; - - BYTE align[(sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*)))%sizeof(void*)]; - LookupStub _stub; - BYTE pad[sizeof(void*) - - ((sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*))) + - (sizeof(LookupStub)) - ) % sizeof(void*)]; //complete DWORD - - static_assert_no_msg((sizeof(void*) - - ((sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*))) + - (sizeof(LookupStub)) - ) % sizeof(void*)) != 0); -}; - -#endif // USES_LOOKUP_STUBS - -struct DispatchStub; -struct DispatchHolder; - -/*DispatchStub************************************************************************************** -Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. -A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). -If the calling frame does in fact have the type be of the expected type, then -control is transfered to the target address, the method implementation. If not, -then control is transfered to the fail address, a fail stub (see below) where a polymorphic -lookup is done to find the correct address to go to. 
- -implementation note: Order, choice of instructions, and branch directions -should be carefully tuned since it can have an inordinate effect on performance. Particular -attention needs to be paid to the effects on the BTB and branch prediction, both in the small -and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. -Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched -to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important -that the branch prediction staticly predict this, which means it must be a forward jump. The alternative -is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" -is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier -to control the placement of the stubs than control the placement of the jitted code and the stubs. */ -struct DispatchStub -{ - inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } - - inline size_t expectedMT() { LIMITED_METHOD_CONTRACT; return _expectedMT; } - inline PCODE implTarget() { LIMITED_METHOD_CONTRACT; return (PCODE) &_implDispl + sizeof(DISPL) + _implDispl; } - - inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const - { - LIMITED_METHOD_CONTRACT; - _ASSERTE(slotTypeRef != nullptr); - - *slotTypeRef = EntryPointSlots::SlotType_ExecutableRel32; - return (TADDR)&_implDispl; - } - - inline PCODE failTarget() { LIMITED_METHOD_CONTRACT; return (PCODE) &_failDispl + sizeof(DISPL) + _failDispl; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(DispatchStub); } - -private: - friend struct DispatchHolder; - - // DispatchStub:: _entryPoint expects: - // ecx: object (the "this" pointer) - // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call -#ifndef STUB_LOGGING - BYTE _entryPoint [2]; // 81 39 cmp [ecx], ; This is the place where we are going to fault on null this. - size_t _expectedMT; // xx xx xx xx expectedMT ; If you change it, change also AdjustContextForVirtualStub in excep.cpp!!! - BYTE jmpOp1[2]; // 0f 85 jne - DISPL _failDispl; // xx xx xx xx failEntry ;must be forward jmp for perf reasons - BYTE jmpOp2; // e9 jmp - DISPL _implDispl; // xx xx xx xx implTarget -#else //STUB_LOGGING - BYTE _entryPoint [2]; // ff 05 inc - size_t* d_call; // xx xx xx xx [call_mono_counter] - BYTE cmpOp [2]; // 81 39 cmp [ecx], - size_t _expectedMT; // xx xx xx xx expectedMT - BYTE jmpOp1[2]; // 0f 84 je - DISPL _implDispl; // xx xx xx xx implTarget ;during logging, perf is not so important - BYTE fail [2]; // ff 05 inc - size_t* d_miss; // xx xx xx xx [miss_mono_counter] - BYTE jmpFail; // e9 jmp - DISPL _failDispl; // xx xx xx xx failEntry -#endif //STUB_LOGGING -}; - -/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of -stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both -are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, -since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently -o(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify -alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. 
-While the token field can be logically gotten by following the failure target to the failEntryPoint -of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. -This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct -for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when -they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). -*/ - -/* @workaround for ee resolution - Since the EE does not currently have a resolver function that -does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are -using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable -is in fact written. Hence we have moved target out into the holder and aligned it so we can -atomically update it. When we get a resolver function that does what we want, we can drop this field, -and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ -struct DispatchHolder -{ - static void InitializeStatic(); - - void Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT); - - DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry); - -private: - // Force _implDispl to be aligned so that it is backpatchable for tiering - BYTE align[(sizeof(void*) - (offsetof(DispatchStub, _implDispl) % sizeof(void*))) % sizeof(void*)]; - DispatchStub _stub; - BYTE pad[(sizeof(void*) - (sizeof(DispatchStub) % sizeof(void*)) + offsetof(DispatchStub, _implDispl)) % sizeof(void*)]; //complete DWORD -}; - -struct ResolveStub; -struct ResolveHolder; - -/*ResolveStub************************************************************************************** -Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only -one resolver stub built for any given token, even though there may be many call sites that -use that token and many distinct types that are used in the calling call frames. A resolver stub -actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their -expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should -be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, -even though they are actually allocated as a single contiguous block of memory. These pieces are: - -A ResolveStub has two entry points: - -FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does -a check to see how often we are actually failing. If failures are frequent, control transfers to the -patch piece to cause the call site to be changed from a mostly monomorphic callsite -(calls dispatch stub) to a polymorphic callsize (calls resolve stub). If failures are rare, control -transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter -every time it is entered. The ee at various times will add a large chunk to the counter. - -ResolveEntry - does a lookup via in a cache by hashing the actual type of the calling frame s - and the token identifying the (contract,method) pair desired. If found, control is transfered -to the method implementation. 
If not found in the cache, the token is pushed and the ee is entered via -the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since -there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. -The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, -as well as its speed. It turns out it is very important to make the hash function sensitive to all -of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before -making any changes to the code sequences here, it is very important to measure and tune them as perf -can vary greatly, in unexpected ways, with seeming minor changes. - -Implementation note - Order, choice of instructions, and branch directions -should be carefully tuned since it can have an inordinate effect on performance. Particular -attention needs to be paid to the effects on the BTB and branch prediction, both in the small -and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. -Note that this stub is called in highly polymorphic cases, but the cache should have been sized -and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should -mostly be going down the cache hit route, and it is important that this be statically predicted as so. -Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically -gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries -is important. */ - -struct ResolveStub -{ - inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; } - inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint; } - inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; } - - inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } - inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } - inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } - inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } -#ifndef UNIX_X86_ABI - inline static size_t offsetOfThisDeref(){ LIMITED_METHOD_CONTRACT; return offsetof(ResolveStub, part1) - offsetof(ResolveStub, _resolveEntryPoint); } - inline size_t stackArgumentsSize() { LIMITED_METHOD_CONTRACT; return _stackArgumentsSize; } -#endif - -private: - friend struct ResolveHolder; - - // ResolveStub::_failEntryPoint expects: - // ecx: object (the "this" pointer) - // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call - BYTE _failEntryPoint [2]; // 83 2d sub - INT32* _pCounter; // xx xx xx xx [counter], - BYTE part0 [2]; // 01 01 - // 7c jl - BYTE toPatcher; // xx backpatcher ;must be forward jump, for perf reasons - // ;fall into the resolver stub - - // ResolveStub::_resolveEntryPoint expects: - // ecx: object (the "this" pointer) - // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call - BYTE _resolveEntryPoint; // 50 push eax ;save siteAddrForRegisterIndirect - this may be an indirect call - BYTE part1 [11]; // 8b 01 mov eax,[ecx] ;get the method table from the "this" pointer. This is the place - // ; where we are going to fault on null this. 
If you change it, - // ; change also AdjustContextForVirtualStub in excep.cpp!!! - // 52 push edx - // 8b d0 mov edx, eax - // c1 e8 0C shr eax,12 ;we are adding upper bits into lower bits of mt - // 03 c2 add eax,edx - // 35 xor eax, - UINT32 _hashedToken; // xx xx xx xx hashedToken ;along with pre-hashed token - BYTE part2 [1]; // 25 and eax, - size_t mask; // xx xx xx xx cache_mask - BYTE part3 [2]; // 8b 80 mov eax, [eax+ - size_t _cacheAddress; // xx xx xx xx lookupCache] -#ifdef STUB_LOGGING - BYTE cntr1[2]; // ff 05 inc - size_t* c_call; // xx xx xx xx [call_cache_counter] -#endif //STUB_LOGGING - BYTE part4 [2]; // 3b 10 cmp edx,[eax+ - // BYTE mtOffset; // ResolverCacheElem.pMT] - BYTE part5 [1]; // 75 jne - BYTE toMiss1; // xx miss ;must be forward jump, for perf reasons - BYTE part6 [2]; // 81 78 cmp [eax+ - BYTE tokenOffset; // xx ResolverCacheElem.token], - size_t _token; // xx xx xx xx token - BYTE part7 [1]; // 75 jne - BYTE toMiss2; // xx miss ;must be forward jump, for perf reasons - BYTE part8 [2]; // 8B 40 xx mov eax,[eax+ - BYTE targetOffset; // ResolverCacheElem.target] - BYTE part9 [6]; // 5a pop edx - // 83 c4 04 add esp,4 ;throw away siteAddrForRegisterIndirect - we don't need it now - // ff e0 jmp eax - // miss: - BYTE miss [1]; // 5a pop edx ; don't pop siteAddrForRegisterIndirect - leave it on the stack for use by ResolveWorkerChainLookupAsmStub and/or ResolveWorkerAsmStub - BYTE _slowEntryPoint[1]; // 68 push - size_t _tokenPush; // xx xx xx xx token -#ifdef STUB_LOGGING - BYTE cntr2[2]; // ff 05 inc - size_t* c_miss; // xx xx xx xx [miss_cache_counter] -#endif //STUB_LOGGING - BYTE part10 [1]; // e9 jmp - DISPL _resolveWorkerDispl; // xx xx xx xx resolveWorker == ResolveWorkerChainLookupAsmStub or ResolveWorkerAsmStub - BYTE patch[1]; // e8 call - DISPL _backpatcherDispl; // xx xx xx xx backpatcherWorker == BackPatchWorkerAsmStub - BYTE part11 [1]; // eb jmp - BYTE toResolveStub; // xx resolveStub, i.e. go back to _resolveEntryPoint -#ifndef UNIX_X86_ABI - size_t _stackArgumentsSize; // xx xx xx xx -#endif -}; - -/* ResolveHolders are the containers for ResolveStubs, They provide -for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by -the token for which they are built. Efficiency of access requires that this token be aligned. -For now, we have copied that field into the ResolveHolder itself, if the resolve stub is arranged such that -any of its inlined tokens (non-prehashed) is aligned, then the token field in the ResolveHolder -is not needed. */ -struct ResolveHolder -{ - static void InitializeStatic(); - - void Initialize(ResolveHolder* pResolveHolderRX, - PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32 * counterAddr -#ifndef UNIX_X86_ABI - , size_t stackArgumentsSize -#endif - ); - - ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } - - static ResolveHolder* FromFailEntry(PCODE failEntry); - static ResolveHolder* FromResolveEntry(PCODE resolveEntry); - -private: - //align _token in resolve stub - - BYTE align[(sizeof(void*)-((offsetof(ResolveStub,_token))%sizeof(void*)))%sizeof(void*) -#ifdef STUB_LOGGING // This turns out to be zero-sized in stub_logging case, and is an error. So round up. - +sizeof(void*) -#endif - ]; - - ResolveStub _stub; - -//#ifdef STUB_LOGGING // This turns out to be zero-sized in non stub_logging case, and is an error. 
So remove - BYTE pad[(sizeof(void*)-((sizeof(ResolveStub))%sizeof(void*))+offsetof(ResolveStub,_token))%sizeof(void*)]; //fill out DWORD -//#endif -}; - /*VTableCallStub************************************************************************************** These are jump stubs that perform a vtable-base virtual call. These stubs assume that an object is placed in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the @@ -421,6 +68,7 @@ struct VTableCallHolder VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } + size_t size() { return stub()->size(); } static size_t GetHolderSize(unsigned slot) { STATIC_CONTRACT_WRAPPER; @@ -708,283 +356,6 @@ PCODE StubCallSite::GetCallerAddress() #endif // UNIX_X86_ABI } -#ifdef STUB_LOGGING -extern size_t g_lookup_inline_counter; -extern size_t g_mono_call_counter; -extern size_t g_mono_miss_counter; -extern size_t g_poly_call_counter; -extern size_t g_poly_miss_counter; -#endif - -/* Template used to generate the stub. We generate a stub by allocating a block of - memory and copy the template over it and just update the specific fields that need - to be changed. -*/ -LookupStub lookupInit; - -void LookupHolder::InitializeStatic() -{ - static_assert_no_msg(((offsetof(LookupStub, _token)+offsetof(LookupHolder, _stub)) % sizeof(void*)) == 0); - static_assert_no_msg((sizeof(LookupHolder) % sizeof(void*)) == 0); - - lookupInit._entryPoint [0] = 0x50; - lookupInit._entryPoint [1] = 0x68; - static_assert_no_msg(sizeof(lookupInit._entryPoint) == 2); - lookupInit._token = 0xcccccccc; -#ifdef STUB_LOGGING - lookupInit.cntr2 [0] = 0xff; - lookupInit.cntr2 [1] = 0x05; - static_assert_no_msg(sizeof(lookupInit.cntr2) == 2); - lookupInit.c_lookup = &g_call_lookup_counter; -#endif //STUB_LOGGING - lookupInit.part2 [0] = 0xe9; - static_assert_no_msg(sizeof(lookupInit.part2) == 1); - lookupInit._resolveWorkerDispl = 0xcccccccc; -} - -void LookupHolder::Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken) -{ - _stub = lookupInit; - - //fill in the stub specific fields - //@TODO: Get rid of this duplication of data. - _stub._token = dispatchToken; - _stub._resolveWorkerDispl = resolveWorkerTarget - ((PCODE) &pLookupHolderRX->_stub._resolveWorkerDispl + sizeof(DISPL)); -} - -LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry) -{ - LIMITED_METHOD_CONTRACT; - LookupHolder* lookupHolder = (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); - // _ASSERTE(lookupHolder->_stub._entryPoint[0] == lookupInit._entryPoint[0]); - return lookupHolder; -} - - -/* Template used to generate the stub. We generate a stub by allocating a block of - memory and copy the template over it and just update the specific fields that need - to be changed. 
-*/ -DispatchStub dispatchInit; - -void DispatchHolder::InitializeStatic() -{ - // Check that _implDispl is aligned in the DispatchHolder for backpatching - static_assert_no_msg(((offsetof(DispatchHolder, _stub) + offsetof(DispatchStub, _implDispl)) % sizeof(void*)) == 0); - static_assert_no_msg((sizeof(DispatchHolder) % sizeof(void*)) == 0); - -#ifndef STUB_LOGGING - dispatchInit._entryPoint [0] = 0x81; - dispatchInit._entryPoint [1] = 0x39; - static_assert_no_msg(sizeof(dispatchInit._entryPoint) == 2); - - dispatchInit._expectedMT = 0xcccccccc; - dispatchInit.jmpOp1 [0] = 0x0f; - dispatchInit.jmpOp1 [1] = 0x85; - static_assert_no_msg(sizeof(dispatchInit.jmpOp1) == 2); - - dispatchInit._failDispl = 0xcccccccc; - dispatchInit.jmpOp2 = 0xe9; - dispatchInit._implDispl = 0xcccccccc; -#else //STUB_LOGGING - dispatchInit._entryPoint [0] = 0xff; - dispatchInit._entryPoint [1] = 0x05; - static_assert_no_msg(sizeof(dispatchInit._entryPoint) == 2); - - dispatchInit.d_call = &g_mono_call_counter; - dispatchInit.cmpOp [0] = 0x81; - dispatchInit.cmpOp [1] = 0x39; - static_assert_no_msg(sizeof(dispatchInit.cmpOp) == 2); - - dispatchInit._expectedMT = 0xcccccccc; - dispatchInit.jmpOp1 [0] = 0x0f; - dispatchInit.jmpOp1 [1] = 0x84; - static_assert_no_msg(sizeof(dispatchInit.jmpOp1) == 2); - - dispatchInit._implDispl = 0xcccccccc; - dispatchInit.fail [0] = 0xff; - dispatchInit.fail [1] = 0x05; - static_assert_no_msg(sizeof(dispatchInit.fail) == 2); - - dispatchInit.d_miss = &g_mono_miss_counter; - dispatchInit.jmpFail = 0xe9; - dispatchInit._failDispl = 0xcccccccc; -#endif //STUB_LOGGING -}; - -void DispatchHolder::Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT) -{ - _stub = dispatchInit; - - //fill in the stub specific fields - _stub._expectedMT = (size_t) expectedMT; - _stub._failDispl = failTarget - ((PCODE) &pDispatchHolderRX->_stub._failDispl + sizeof(DISPL)); - _stub._implDispl = implTarget - ((PCODE) &pDispatchHolderRX->_stub._implDispl + sizeof(DISPL)); -} - -DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry) -{ - LIMITED_METHOD_CONTRACT; - DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchHolder, _stub) - offsetof(DispatchStub, _entryPoint) ); - // _ASSERTE(dispatchHolder->_stub._entryPoint[0] == dispatchInit._entryPoint[0]); - return dispatchHolder; -} - - -/* Template used to generate the stub. We generate a stub by allocating a block of - memory and copy the template over it and just update the specific fields that need - to be changed. 
-*/ - -ResolveStub resolveInit; - -void ResolveHolder::InitializeStatic() -{ - //Check that _token is aligned in ResolveHolder - static_assert_no_msg(((offsetof(ResolveHolder, _stub) + offsetof(ResolveStub, _token)) % sizeof(void*)) == 0); - static_assert_no_msg((sizeof(ResolveHolder) % sizeof(void*)) == 0); - - resolveInit._failEntryPoint [0] = 0x83; - resolveInit._failEntryPoint [1] = 0x2d; - static_assert_no_msg(sizeof(resolveInit._failEntryPoint) == 2); - - resolveInit._pCounter = (INT32 *) (size_t) 0xcccccccc; - resolveInit.part0 [0] = 0x01; - resolveInit.part0 [1] = 0x7c; - static_assert_no_msg(sizeof(resolveInit.part0) == 2); - - resolveInit.toPatcher = (offsetof(ResolveStub, patch) - (offsetof(ResolveStub, toPatcher) + 1)) & 0xFF; - - resolveInit._resolveEntryPoint = 0x50; - resolveInit.part1 [0] = 0x8b; - resolveInit.part1 [1] = 0x01; - resolveInit.part1 [2] = 0x52; - resolveInit.part1 [3] = 0x8b; - resolveInit.part1 [4] = 0xd0; - resolveInit.part1 [5] = 0xc1; - resolveInit.part1 [6] = 0xe8; - resolveInit.part1 [7] = CALL_STUB_CACHE_NUM_BITS; - resolveInit.part1 [8] = 0x03; - resolveInit.part1 [9] = 0xc2; - resolveInit.part1 [10] = 0x35; - static_assert_no_msg(sizeof(resolveInit.part1) == 11); - - resolveInit._hashedToken = 0xcccccccc; - resolveInit.part2 [0] = 0x25; - static_assert_no_msg(sizeof(resolveInit.part2) == 1); - - resolveInit.mask = (CALL_STUB_CACHE_MASK << LOG2_PTRSIZE); - resolveInit.part3 [0] = 0x8b; - resolveInit.part3 [1] = 0x80;; - static_assert_no_msg(sizeof(resolveInit.part3) == 2); - - resolveInit._cacheAddress = 0xcccccccc; -#ifdef STUB_LOGGING - resolveInit.cntr1 [0] = 0xff; - resolveInit.cntr1 [1] = 0x05; - static_assert_no_msg(sizeof(resolveInit.cntr1) == 2); - - resolveInit.c_call = &g_poly_call_counter; -#endif //STUB_LOGGING - resolveInit.part4 [0] = 0x3b; - resolveInit.part4 [1] = 0x10; - static_assert_no_msg(sizeof(resolveInit.part4) == 2); - - // resolveInit.mtOffset = offsetof(ResolveCacheElem,pMT) & 0xFF; - static_assert_no_msg(offsetof(ResolveCacheElem,pMT) == 0); - - resolveInit.part5 [0] = 0x75; - static_assert_no_msg(sizeof(resolveInit.part5) == 1); - - resolveInit.toMiss1 = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss1)+1); - - resolveInit.part6 [0] = 0x81; - resolveInit.part6 [1] = 0x78; - static_assert_no_msg(sizeof(resolveInit.part6) == 2); - - resolveInit.tokenOffset = offsetof(ResolveCacheElem,token) & 0xFF; - - resolveInit._token = 0xcccccccc; - - resolveInit.part7 [0] = 0x75; - static_assert_no_msg(sizeof(resolveInit.part7) == 1); - - resolveInit.part8 [0] = 0x8b; - resolveInit.part8 [1] = 0x40; - static_assert_no_msg(sizeof(resolveInit.part8) == 2); - - resolveInit.targetOffset = offsetof(ResolveCacheElem,target) & 0xFF; - - resolveInit.toMiss2 = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss2)+1); - - resolveInit.part9 [0] = 0x5a; - resolveInit.part9 [1] = 0x83; - resolveInit.part9 [2] = 0xc4; - resolveInit.part9 [3] = 0x04; - resolveInit.part9 [4] = 0xff; - resolveInit.part9 [5] = 0xe0; - static_assert_no_msg(sizeof(resolveInit.part9) == 6); - - resolveInit.miss [0] = 0x5a; -// resolveInit.miss [1] = 0xb8; -// resolveInit._hashedTokenMov = 0xcccccccc; - resolveInit._slowEntryPoint [0] = 0x68; - resolveInit._tokenPush = 0xcccccccc; -#ifdef STUB_LOGGING - resolveInit.cntr2 [0] = 0xff; - resolveInit.cntr2 [1] = 0x05; - resolveInit.c_miss = &g_poly_miss_counter; -#endif //STUB_LOGGING - resolveInit.part10 [0] = 0xe9; - resolveInit._resolveWorkerDispl = 0xcccccccc; - - resolveInit.patch [0] = 0xe8; - 
resolveInit._backpatcherDispl = 0xcccccccc; - resolveInit.part11 [0] = 0xeb; - resolveInit.toResolveStub = (offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, toResolveStub) + 1)) & 0xFF; -}; - -void ResolveHolder::Initialize(ResolveHolder* pResolveHolderRX, - PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32 * counterAddr -#ifndef UNIX_X86_ABI - , size_t stackArgumentsSize -#endif - ) -{ - _stub = resolveInit; - - //fill in the stub specific fields - _stub._pCounter = counterAddr; - _stub._hashedToken = hashedToken << LOG2_PTRSIZE; - _stub._cacheAddress = (size_t) cacheAddr; - _stub._token = dispatchToken; -// _stub._hashedTokenMov = hashedToken; - _stub._tokenPush = dispatchToken; - _stub._resolveWorkerDispl = resolveWorkerTarget - ((PCODE) &pResolveHolderRX->_stub._resolveWorkerDispl + sizeof(DISPL)); - _stub._backpatcherDispl = patcherTarget - ((PCODE) &pResolveHolderRX->_stub._backpatcherDispl + sizeof(DISPL)); -#ifndef UNIX_X86_ABI - _stub._stackArgumentsSize = stackArgumentsSize; -#endif -} - -ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) -{ - LIMITED_METHOD_CONTRACT; - ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); - // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); - return resolveHolder; -} - -ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) -{ - LIMITED_METHOD_CONTRACT; - ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); - // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); - return resolveHolder; -} - void VTableCallHolder::Initialize(unsigned slot) { unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE; @@ -1049,22 +420,23 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s WORD firstWord = *((WORD*) stubStartAddress); #ifndef STUB_LOGGING - if (firstWord == 0x3981) + if (firstWord == DISPATCH_STUB_FIRST_WORD) #else //STUB_LOGGING +#error if (firstWord == 0x05ff) #endif { stubKind = SK_DISPATCH; } - else if (firstWord == 0x6850) + else if (firstWord == LOOKUP_STUB_FIRST_WORD) { stubKind = SK_LOOKUP; } - else if (firstWord == 0x8b50) + else if (firstWord == RESOLVE_STUB_FIRST_WORD) { stubKind = SK_RESOLVE; } - else if (firstWord == 0x018b) + else if (firstWord == VTABLECALL_STUB_FIRST_WORD) { stubKind = SK_VTABLECALL; } diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index 752f2b23bf53a..d0a519e4f0fd2 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -9006,7 +9006,15 @@ void CEEInfo::getFunctionEntryPoint(CORINFO_METHOD_HANDLE ftnHnd, // Resolve methodImpl. 
ftn = ftn->GetMethodTable()->MapMethodDeclToMethodImpl(ftn); - ret = (void *)ftn->TryGetMultiCallableAddrOfCode(accessFlags); + if (!ftn->IsFCall() && ftn->MayHavePrecode() && ftn->GetPrecodeType() == PRECODE_FIXUP) + { + ret = ((FixupPrecode*)ftn->GetOrCreatePrecode())->GetTargetSlot(); + accessType = IAT_PVALUE; + } + else + { + ret = (void *)ftn->TryGetMultiCallableAddrOfCode(accessFlags); + } // TryGetMultiCallableAddrOfCode returns NULL if indirect access is desired if (ret == NULL) diff --git a/src/coreclr/vm/loaderallocator.cpp b/src/coreclr/vm/loaderallocator.cpp index 657ff7b2b40ce..32c8a93c2894a 100644 --- a/src/coreclr/vm/loaderallocator.cpp +++ b/src/coreclr/vm/loaderallocator.cpp @@ -1049,7 +1049,7 @@ void LoaderAllocator::ActivateManagedTracking() #define COLLECTIBLE_HIGH_FREQUENCY_HEAP_SIZE (3 * GetOsPageSize()) #define COLLECTIBLE_STUB_HEAP_SIZE GetOsPageSize() #define COLLECTIBLE_CODEHEAP_SIZE (7 * GetOsPageSize()) -#define COLLECTIBLE_VIRTUALSTUBDISPATCH_HEAP_SPACE (5 * GetOsPageSize()) +#define COLLECTIBLE_VIRTUALSTUBDISPATCH_HEAP_SPACE (14 * GetOsPageSize()) void LoaderAllocator::Init(BaseDomain *pDomain, BYTE *pExecutableHeapMemory) { @@ -1118,7 +1118,8 @@ void LoaderAllocator::Init(BaseDomain *pDomain, BYTE *pExecutableHeapMemory) #if !defined(HOST_64BIT) // Make sure that we reserve as little as possible on 32-bit to save address space - _ASSERTE(dwTotalReserveMemSize <= VIRTUAL_ALLOC_RESERVE_GRANULARITY); + // We cannot reserve less than needed + //_ASSERTE(dwTotalReserveMemSize <= VIRTUAL_ALLOC_RESERVE_GRANULARITY); #endif BYTE * initReservedMem = (BYTE*)ExecutableAllocator::Instance()->Reserve(dwTotalReserveMemSize); @@ -1161,7 +1162,7 @@ void LoaderAllocator::Init(BaseDomain *pDomain, BYTE *pExecutableHeapMemory) initReservedMem, dwExecutableHeapReserveSize, NULL, - TRUE /* Make heap executable */ + UnlockedLoaderHeap::HeapKind::Executable ); initReservedMem += dwExecutableHeapReserveSize; } @@ -1184,7 +1185,7 @@ void LoaderAllocator::Init(BaseDomain *pDomain, BYTE *pExecutableHeapMemory) initReservedMem, dwStubHeapReserveSize, STUBMANAGER_RANGELIST(StubLinkStubManager), - TRUE /* Make heap executable */); + UnlockedLoaderHeap::HeapKind::Executable); initReservedMem += dwStubHeapReserveSize; @@ -1194,6 +1195,22 @@ void LoaderAllocator::Init(BaseDomain *pDomain, BYTE *pExecutableHeapMemory) m_pPrecodeHeap = new (&m_PrecodeHeapInstance) CodeFragmentHeap(this, STUB_CODE_BLOCK_PRECODE); + m_pNewStubPrecodeHeap = new (&m_NewStubPrecodeHeapInstance) LoaderHeap(2 * GetOsPageSize(), + 2 * GetOsPageSize(), + PrecodeStubManager::g_pManager->GetStubPrecodeRangeList(), + UnlockedLoaderHeap::HeapKind::Interleaved, + false /* fUnlocked */, + StubPrecode::GenerateCodePage, + StubPrecode::CodeSize); + + m_pFixupPrecodeHeap = new (&m_FixupPrecodeHeapInstance) LoaderHeap(2 * GetOsPageSize(), + 2 * GetOsPageSize(), + PrecodeStubManager::g_pManager->GetFixupPrecodeRangeList(), + UnlockedLoaderHeap::HeapKind::Interleaved, + false /* fUnlocked */, + FixupPrecode::GenerateCodePage, + FixupPrecode::CodeSize); + // Initialize the EE marshaling data to NULL. 
m_pMarshalingData = NULL; @@ -1376,6 +1393,18 @@ void LoaderAllocator::Terminate() m_pPrecodeHeap = NULL; } + if (m_pFixupPrecodeHeap != NULL) + { + m_pFixupPrecodeHeap->~LoaderHeap(); + m_pFixupPrecodeHeap = NULL; + } + + if (m_pNewStubPrecodeHeap != NULL) + { + m_pNewStubPrecodeHeap->~LoaderHeap(); + m_pNewStubPrecodeHeap = NULL; + } + #ifdef FEATURE_READYTORUN if (m_pDynamicHelpersHeap != NULL) { diff --git a/src/coreclr/vm/loaderallocator.hpp b/src/coreclr/vm/loaderallocator.hpp index 0907d98e266d5..846ec6346d418 100644 --- a/src/coreclr/vm/loaderallocator.hpp +++ b/src/coreclr/vm/loaderallocator.hpp @@ -160,6 +160,8 @@ class LoaderAllocator BYTE m_HighFreqHeapInstance[sizeof(LoaderHeap)]; BYTE m_StubHeapInstance[sizeof(LoaderHeap)]; BYTE m_PrecodeHeapInstance[sizeof(CodeFragmentHeap)]; + BYTE m_FixupPrecodeHeapInstance[sizeof(LoaderHeap)]; + BYTE m_NewStubPrecodeHeapInstance[sizeof(LoaderHeap)]; PTR_LoaderHeap m_pLowFrequencyHeap; PTR_LoaderHeap m_pHighFrequencyHeap; PTR_LoaderHeap m_pStubHeap; // stubs for PInvoke, remoting, etc @@ -168,6 +170,8 @@ class LoaderAllocator #ifdef FEATURE_READYTORUN PTR_CodeFragmentHeap m_pDynamicHelpersHeap; #endif + PTR_LoaderHeap m_pFixupPrecodeHeap; + PTR_LoaderHeap m_pNewStubPrecodeHeap; //**************************************************************************************** OBJECTHANDLE m_hLoaderAllocatorObjectHandle; FuncPtrStubs * m_pFuncPtrStubs; // for GetMultiCallableAddrOfCode() @@ -443,6 +447,12 @@ class LoaderAllocator return m_pPrecodeHeap; } + PTR_LoaderHeap GetNewStubPrecodeHeap() + { + LIMITED_METHOD_CONTRACT; + return m_pNewStubPrecodeHeap; + } + // The executable heap is intended to only be used by the global loader allocator. // It refers to executable memory that is not associated with a rangelist. PTR_LoaderHeap GetExecutableHeap() @@ -451,6 +461,12 @@ class LoaderAllocator return m_pExecutableHeap; } + PTR_LoaderHeap GetFixupPrecodeHeap() + { + LIMITED_METHOD_CONTRACT; + return m_pFixupPrecodeHeap; + } + PTR_CodeFragmentHeap GetDynamicHelpersHeap(); FuncPtrStubs * GetFuncPtrStubs(); diff --git a/src/coreclr/vm/method.cpp b/src/coreclr/vm/method.cpp index 4309978b13234..d8d447e7da597 100644 --- a/src/coreclr/vm/method.cpp +++ b/src/coreclr/vm/method.cpp @@ -534,7 +534,6 @@ PTR_PCODE MethodDesc::GetAddrOfSlot() CONTRACTL_END; // Keep implementations of MethodDesc::GetMethodEntryPoint and MethodDesc::GetAddrOfSlot in sync! - if (HasNonVtableSlot()) { SIZE_T size = GetBaseSize(); @@ -1732,6 +1731,13 @@ MethodDescChunk *MethodDescChunk::CreateChunk(LoaderHeap *pHeap, DWORD methodDes DWORD maxMethodDescsPerChunk = MethodDescChunk::MaxSizeOfMethodDescs / oneSize; + // Limit the maximum MethodDescs per chunk by the number of precodes that can fit to a single memory page, + // since we allocate consecutive temporary entry points for all MethodDescs in the whole chunk. 
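+ // For example, with 4 KB OS pages and the 24-byte precodes used on x64 and arm64, at most
+ // 4096 / 24 = 170 precodes fit on a single code page, so a chunk is capped at 170 MethodDescs there.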
+ DWORD maxPrecodesPerPage = Precode::GetMaxTemporaryEntryPointsCount(); + + if (maxPrecodesPerPage < maxMethodDescsPerChunk) + maxMethodDescsPerChunk = maxPrecodesPerPage; + if (methodDescCount == 0) methodDescCount = maxMethodDescsPerChunk; @@ -2124,7 +2130,20 @@ MethodDesc* NonVirtualEntry2MethodDesc(PCODE entryPoint) RangeSection* pRS = ExecutionManager::FindCodeRange(entryPoint, ExecutionManager::GetScanFlags()); if (pRS == NULL) + { + TADDR pInstr = PCODEToPINSTR(entryPoint); + if (PrecodeStubManager::g_pManager->GetStubPrecodeRangeList()->IsInRange(entryPoint)) + { + return (MethodDesc*)((StubPrecode*)pInstr)->GetMethodDesc(); + } + + if (PrecodeStubManager::g_pManager->GetFixupPrecodeRangeList()->IsInRange(entryPoint)) + { + return (MethodDesc*)((FixupPrecode*)pInstr)->GetMethodDesc(); + } + return NULL; + } MethodDesc* pMD; if (pRS->pjit->JitCodeToMethodInfo(pRS, entryPoint, &pMD, NULL)) @@ -2333,8 +2352,8 @@ BOOL MethodDesc::MayHaveNativeCode() { CONTRACTL { - THROWS; - GC_TRIGGERS; + NOTHROW; + GC_NOTRIGGER; MODE_ANY; } CONTRACTL_END @@ -2450,7 +2469,7 @@ MethodDesc* MethodDesc::GetMethodDescFromStubAddr(PCODE addr, BOOL fSpeculative // Otherwise this must be some kind of precode // - Precode* pPrecode = Precode::GetPrecodeFromEntryPoint(addr, fSpeculative); + PTR_Precode pPrecode = Precode::GetPrecodeFromEntryPoint(addr, fSpeculative); PREFIX_ASSUME(fSpeculative || (pPrecode != NULL)); if (pPrecode != NULL) { @@ -3015,7 +3034,6 @@ Precode* MethodDesc::GetOrCreatePrecode() AllocMemTracker amt; Precode* pPrecode = Precode::Allocate(requiredType, this, GetLoaderAllocator(), &amt); - if (FastInterlockCompareExchangePointer(pSlot, pPrecode->GetEntryPoint(), tempEntry) == tempEntry) amt.SuppressRelease(); } diff --git a/src/coreclr/vm/method.hpp b/src/coreclr/vm/method.hpp index 79d6fd612ff76..f1d47e81d348a 100644 --- a/src/coreclr/vm/method.hpp +++ b/src/coreclr/vm/method.hpp @@ -283,8 +283,8 @@ class MethodDesc { CONTRACTL { - THROWS; - GC_TRIGGERS; + NOTHROW; + GC_NOTRIGGER; MODE_ANY; } CONTRACTL_END @@ -2657,7 +2657,7 @@ class NDirectImportThunkGlue PVOID m_dummy; // Dummy field to make the alignment right public: - LPVOID GetEntrypoint() + LPVOID GetEntryPoint() { LIMITED_METHOD_CONTRACT; return NULL; diff --git a/src/coreclr/vm/peimage.cpp b/src/coreclr/vm/peimage.cpp index 946943cc3cd61..f9476846130ec 100644 --- a/src/coreclr/vm/peimage.cpp +++ b/src/coreclr/vm/peimage.cpp @@ -505,7 +505,7 @@ LoaderHeap *PEImage::IJWFixupData::GetThunkHeap() LoaderHeap *pNewHeap = new LoaderHeap(VIRTUAL_ALLOC_RESERVE_GRANULARITY, // DWORD dwReserveBlockSize 0, // DWORD dwCommitBlockSize ThunkHeapStubManager::g_pManager->GetRangeList(), - TRUE); // BOOL fMakeExecutable + UnlockedLoaderHeap::HeapKind::Executable); if (FastInterlockCompareExchangePointer((PVOID*)&m_DllThunkHeap, (VOID*)pNewHeap, (VOID*)0) != 0) { diff --git a/src/coreclr/vm/precode.cpp b/src/coreclr/vm/precode.cpp index ebccb7be43d47..dc2117db0e6da 100644 --- a/src/coreclr/vm/precode.cpp +++ b/src/coreclr/vm/precode.cpp @@ -169,19 +169,6 @@ BOOL Precode::IsCorrectMethodDesc(MethodDesc * pMD) if (pMDfromPrecode == pMD) return TRUE; -#ifdef HAS_FIXUP_PRECODE_CHUNKS - if (pMDfromPrecode == NULL) - { - PrecodeType precodeType = GetType(); - -#ifdef HAS_FIXUP_PRECODE_CHUNKS - // We do not keep track of the MethodDesc in every kind of fixup precode - if (precodeType == PRECODE_FIXUP) - return TRUE; -#endif - } -#endif // HAS_FIXUP_PRECODE_CHUNKS - return FALSE; } @@ -199,7 +186,7 @@ BOOL Precode::IsPointingToPrestub(PCODE target) 
return TRUE; #ifdef HAS_FIXUP_PRECODE - if (IsPointingTo(target, GetEEFuncEntryPoint(PrecodeFixupThunk))) + if (IsPointingTo(target, ((PCODE)this + FixupPrecode::FixupCodeOffset))) return TRUE; #endif @@ -223,46 +210,15 @@ Precode* Precode::GetPrecodeForTemporaryEntryPoint(TADDR temporaryEntryPoints, i { WRAPPER_NO_CONTRACT; PrecodeType t = PTR_Precode(temporaryEntryPoints)->GetType(); -#ifdef HAS_FIXUP_PRECODE_CHUNKS - if (t == PRECODE_FIXUP) - { - return PTR_Precode(temporaryEntryPoints + index * sizeof(FixupPrecode)); - } -#endif SIZE_T oneSize = SizeOfTemporaryEntryPoint(t); return PTR_Precode(temporaryEntryPoints + index * oneSize); } -SIZE_T Precode::SizeOfTemporaryEntryPoints(PrecodeType t, bool preallocateJumpStubs, int count) +SIZE_T Precode::SizeOfTemporaryEntryPoints(PrecodeType t, int count) { WRAPPER_NO_CONTRACT; SUPPORTS_DAC; -#ifdef HAS_FIXUP_PRECODE_CHUNKS - if (t == PRECODE_FIXUP) - { - SIZE_T size = count * sizeof(FixupPrecode) + sizeof(PTR_MethodDesc); - -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - if (preallocateJumpStubs) - { - // For dynamic methods, space for jump stubs is allocated along with the precodes as part of the temporary entry - // points block. The first jump stub begins immediately after the PTR_MethodDesc. Aside from a jump stub per - // precode, an additional shared precode fixup jump stub is also allocated (see - // GetDynamicMethodPrecodeFixupJumpStub()). - size += ((SIZE_T)count + 1) * BACK_TO_BACK_JUMP_ALLOCATE_SIZE; - } -#else // !FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - _ASSERTE(!preallocateJumpStubs); -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - - return size; - } - else - { - _ASSERTE(!preallocateJumpStubs); - } -#endif SIZE_T oneSize = SizeOfTemporaryEntryPoint(t); return count * oneSize; } @@ -273,14 +229,7 @@ SIZE_T Precode::SizeOfTemporaryEntryPoints(TADDR temporaryEntryPoints, int count SUPPORTS_DAC; PrecodeType precodeType = PTR_Precode(temporaryEntryPoints)->GetType(); -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - bool preallocateJumpStubs = - precodeType == PRECODE_FIXUP && - ((PTR_MethodDesc)((PTR_FixupPrecode)temporaryEntryPoints)->GetMethodDesc())->IsLCGMethod(); -#else // !FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - bool preallocateJumpStubs = false; -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - return SizeOfTemporaryEntryPoints(precodeType, preallocateJumpStubs, count); + return SizeOfTemporaryEntryPoints(precodeType, count); } #ifndef DACCESS_COMPILE @@ -297,24 +246,27 @@ Precode* Precode::Allocate(PrecodeType t, MethodDesc* pMD, } CONTRACTL_END; - SIZE_T size; + SIZE_T size = Precode::SizeOf(t); + Precode* pPrecode; -#ifdef HAS_FIXUP_PRECODE_CHUNKS if (t == PRECODE_FIXUP) { - size = sizeof(FixupPrecode) + sizeof(PTR_MethodDesc); + pPrecode = (Precode*)pamTracker->Track(pLoaderAllocator->GetFixupPrecodeHeap()->AllocAlignedMem(size, 1)); + pPrecode->Init(pPrecode, t, pMD, pLoaderAllocator); } - else -#endif + else if (t == PRECODE_STUB || t == PRECODE_NDIRECT_IMPORT) { - size = Precode::SizeOf(t); + pPrecode = (Precode*)pamTracker->Track(pLoaderAllocator->GetNewStubPrecodeHeap()->AllocAlignedMem(size, 1)); + pPrecode->Init(pPrecode, t, pMD, pLoaderAllocator); } + else + { + pPrecode = (Precode*)pamTracker->Track(pLoaderAllocator->GetPrecodeHeap()->AllocAlignedMem(size, AlignOf(t))); + ExecutableWriterHolder precodeWriterHolder(pPrecode, size); + precodeWriterHolder.GetRW()->Init(pPrecode, t, pMD, pLoaderAllocator); + 
ClrFlushInstructionCache(pPrecode, size); - Precode* pPrecode = (Precode*)pamTracker->Track(pLoaderAllocator->GetPrecodeHeap()->AllocAlignedMem(size, AlignOf(t))); - ExecutableWriterHolder precodeWriterHolder(pPrecode, size); - precodeWriterHolder.GetRW()->Init(pPrecode, t, pMD, pLoaderAllocator); - - ClrFlushInstructionCache(pPrecode, size); + } return pPrecode; } @@ -424,24 +376,20 @@ void Precode::Reset() WRAPPER_NO_CONTRACT; MethodDesc* pMD = GetMethodDesc(); - SIZE_T size; + PrecodeType t = GetType(); -#ifdef HAS_FIXUP_PRECODE_CHUNKS + SIZE_T size = Precode::SizeOf(t); + if (t == PRECODE_FIXUP) { - // The writeable size the Init method accesses is dynamic depending on - // the FixupPrecode members. - size = ((FixupPrecode*)this)->GetSizeRW(); + Init(this, t, pMD, pMD->GetLoaderAllocator()); } else -#endif { - size = Precode::SizeOf(t); + ExecutableWriterHolder precodeWriterHolder(this, size); + precodeWriterHolder.GetRW()->Init(this, t, pMD, pMD->GetLoaderAllocator()); + ClrFlushInstructionCache(this, SizeOf()); } - - ExecutableWriterHolder precodeWriterHolder(this, size); - precodeWriterHolder.GetRW()->Init(this, GetType(), pMD, pMD->GetLoaderAllocator()); - ClrFlushInstructionCache(this, SizeOf()); } /* static */ @@ -490,12 +438,6 @@ TADDR Precode::AllocateTemporaryEntryPoints(MethodDescChunk * pChunk, { t = PRECODE_FIXUP; -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - if (pFirstMD->IsLCGMethod()) - { - preallocateJumpStubs = true; - } -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS } else { @@ -503,7 +445,7 @@ TADDR Precode::AllocateTemporaryEntryPoints(MethodDescChunk * pChunk, } #endif // HAS_FIXUP_PRECODE - SIZE_T totalSize = SizeOfTemporaryEntryPoints(t, preallocateJumpStubs, count); + SIZE_T totalSize = SizeOfTemporaryEntryPoints(t, count); #ifdef HAS_COMPACT_ENTRYPOINTS // Note that these are just best guesses to save memory. If we guessed wrong, @@ -523,70 +465,52 @@ TADDR Precode::AllocateTemporaryEntryPoints(MethodDescChunk * pChunk, return NULL; #endif - TADDR temporaryEntryPoints = (TADDR)pamTracker->Track(pLoaderAllocator->GetPrecodeHeap()->AllocAlignedMem(totalSize, AlignOf(t))); - ExecutableWriterHolder entryPointsWriterHolder((void*)temporaryEntryPoints, totalSize); - -#ifdef HAS_FIXUP_PRECODE_CHUNKS - if (t == PRECODE_FIXUP) + TADDR temporaryEntryPoints; + SIZE_T oneSize = SizeOfTemporaryEntryPoint(t); + MethodDesc * pMD = pChunk->GetFirstMethodDesc(); + + if (t == PRECODE_FIXUP || t == PRECODE_STUB) { -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - PCODE precodeFixupJumpStubRW = NULL; - PCODE precodeFixupJumpStub = NULL; - if (preallocateJumpStubs) + LoaderHeap *pStubHeap; + if (t == PRECODE_FIXUP) + { + pStubHeap = pLoaderAllocator->GetFixupPrecodeHeap(); + } + else { - // Emit the jump for the precode fixup jump stub now. This jump stub immediately follows the MethodDesc (see - // GetDynamicMethodPrecodeFixupJumpStub()). - precodeFixupJumpStub = temporaryEntryPoints + count * sizeof(FixupPrecode) + sizeof(PTR_MethodDesc); - // TODO: how to get the size? 
- precodeFixupJumpStubRW = (TADDR)entryPointsWriterHolder.GetRW() + count * sizeof(FixupPrecode) + sizeof(PTR_MethodDesc); - emitBackToBackJump((BYTE*)precodeFixupJumpStub, (BYTE*)precodeFixupJumpStubRW, (LPVOID)GetEEFuncEntryPoint(PrecodeFixupThunk)); + pStubHeap = pLoaderAllocator->GetNewStubPrecodeHeap(); } -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS + temporaryEntryPoints = (TADDR)pamTracker->Track(pStubHeap->AllocAlignedMem(totalSize, 1)); TADDR entryPoint = temporaryEntryPoints; - TADDR entryPointRW = (TADDR)entryPointsWriterHolder.GetRW(); - - MethodDesc * pMD = pChunk->GetFirstMethodDesc(); for (int i = 0; i < count; i++) { - ((FixupPrecode *)entryPointRW)->Init((FixupPrecode*)entryPoint, pMD, pLoaderAllocator, pMD->GetMethodDescIndex(), (count - 1) - i); - -#ifdef FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS - _ASSERTE( - !preallocateJumpStubs || - !pMD->IsLCGMethod() || - ((FixupPrecode *)entryPoint)->GetDynamicMethodPrecodeFixupJumpStub() == precodeFixupJumpStub); -#endif // FIXUP_PRECODE_PREALLOCATE_DYNAMIC_METHOD_JUMP_STUBS + ((Precode *)entryPoint)->Init((Precode *)entryPoint, t, pMD, pLoaderAllocator); _ASSERTE((Precode *)entryPoint == GetPrecodeForTemporaryEntryPoint(temporaryEntryPoints, i)); - entryPoint += sizeof(FixupPrecode); - entryPointRW += sizeof(FixupPrecode); + entryPoint += oneSize; pMD = (MethodDesc *)(dac_cast(pMD) + pMD->SizeOf()); } - -#ifdef FEATURE_PERFMAP - PerfMap::LogStubs(__FUNCTION__, "PRECODE_FIXUP", (PCODE)temporaryEntryPoints, count * sizeof(FixupPrecode)); -#endif - ClrFlushInstructionCache((LPVOID)temporaryEntryPoints, count * sizeof(FixupPrecode)); - - return temporaryEntryPoints; } -#endif - - SIZE_T oneSize = SizeOfTemporaryEntryPoint(t); - TADDR entryPoint = temporaryEntryPoints; - TADDR entryPointRW = (TADDR)entryPointsWriterHolder.GetRW(); - MethodDesc * pMD = pChunk->GetFirstMethodDesc(); - for (int i = 0; i < count; i++) + else { - ((Precode *)entryPointRW)->Init((Precode *)entryPoint, t, pMD, pLoaderAllocator); + _ASSERTE(FALSE); + temporaryEntryPoints = (TADDR)pamTracker->Track(pLoaderAllocator->GetPrecodeHeap()->AllocAlignedMem(totalSize, AlignOf(t))); + ExecutableWriterHolder entryPointsWriterHolder((void*)temporaryEntryPoints, totalSize); - _ASSERTE((Precode *)entryPoint == GetPrecodeForTemporaryEntryPoint(temporaryEntryPoints, i)); - entryPoint += oneSize; - entryPointRW += oneSize; + TADDR entryPoint = temporaryEntryPoints; + TADDR entryPointRW = (TADDR)entryPointsWriterHolder.GetRW(); + for (int i = 0; i < count; i++) + { + ((Precode *)entryPointRW)->Init((Precode *)entryPoint, t, pMD, pLoaderAllocator); + + _ASSERTE((Precode *)entryPoint == GetPrecodeForTemporaryEntryPoint(temporaryEntryPoints, i)); + entryPoint += oneSize; + entryPointRW += oneSize; - pMD = (MethodDesc *)(dac_cast(pMD) + pMD->SizeOf()); + pMD = (MethodDesc *)(dac_cast(pMD) + pMD->SizeOf()); + } } #ifdef FEATURE_PERFMAP @@ -606,15 +530,281 @@ void Precode::EnumMemoryRegions(CLRDataEnumMemoryFlags flags) SUPPORTS_DAC; PrecodeType t = GetType(); -#ifdef HAS_FIXUP_PRECODE_CHUNKS - if (t == PRECODE_FIXUP) + DacEnumMemoryRegion(GetStart(), SizeOf(t)); +} +#endif + +#ifdef HAS_FIXUP_PRECODE + +#ifdef DACCESS_COMPILE +void FixupPrecode::EnumMemoryRegions(CLRDataEnumMemoryFlags flags) +{ + SUPPORTS_DAC; + DacEnumMemoryRegion(dac_cast(this), sizeof(FixupPrecode)); + DacEnumMemoryRegion(dac_cast(GetData()), sizeof(FixupPrecodeData)); +} +#endif // DACCESS_COMPILE + +#endif // HAS_FIXUP_PRECODE + +#ifndef DACCESS_COMPILE + +void 
StubPrecode::Init(StubPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator /* = NULL */, + BYTE type /* = StubPrecode::Type */, TADDR target /* = NULL */) +{ + WRAPPER_NO_CONTRACT; + + StubPrecodeData *pStubData = GetData(); + + if (pLoaderAllocator != NULL) { - AsFixupPrecode()->EnumMemoryRegions(flags); - return; + // Use pMD == NULL in all precode initialization methods to allocate the initial jump stub in non-dynamic heap + // that has the same lifetime like as the precode itself + if (target == NULL) + target = GetPreStubEntryPoint(); + pStubData->Target = target; } + + pStubData->MethodDesc = pMD; + pStubData->Type = type; +} + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + #define ENUM_PAGE_SIZE(size) \ + extern "C" void StubPrecodeCode##size(); \ + extern "C" void StubPrecodeCode##size##_End(); + ENUM_PAGE_SIZES + #undef ENUM_PAGE_SIZE +#else +extern "C" void StubPrecodeCode(); +extern "C" void StubPrecodeCode_End(); #endif - DacEnumMemoryRegion(GetStart(), SizeOf(t)); +#ifdef TARGET_X86 +extern "C" size_t StubPrecodeCode_MethodDesc_Offset; +extern "C" size_t StubPrecodeCode_Target_Offset; + +#define SYMBOL_VALUE(name) ((size_t)&name) + +#endif + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) +void (*StubPrecode::StubPrecodeCode)(); +void (*StubPrecode::StubPrecodeCode_End)(); +#endif + +void StubPrecode::StaticInitialize() +{ +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + #define ENUM_PAGE_SIZE(size) \ + case size: \ + StubPrecodeCode = StubPrecodeCode##size; \ + _ASSERTE(((BYTE*)StubPrecodeCode##size##_End - (BYTE*)StubPrecodeCode##size) <= StubPrecode::CodeSize); \ + break; + + int pageSize = GetOsPageSize(); + switch (pageSize) + { + ENUM_PAGE_SIZES + default: + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); + } + #undef ENUM_PAGE_SIZE +#else + _ASSERTE(((BYTE*)StubPrecodeCode_End - (BYTE*)StubPrecodeCode) <= StubPrecode::CodeSize); +#endif + _ASSERTE((*((BYTE*)PCODEToPINSTR((PCODE)StubPrecodeCode) + OFFSETOF_PRECODE_TYPE)) == StubPrecode::Type); +} + +void StubPrecode::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) +{ + int pageSize = GetOsPageSize(); + +#ifdef TARGET_X86 + int totalCodeSize = (pageSize / StubPrecode::CodeSize) * StubPrecode::CodeSize; + for (int i = 0; i < totalCodeSize; i += StubPrecode::CodeSize) + { + memcpy(pageBase + i, (const void*)StubPrecodeCode, (BYTE*)StubPrecodeCode_End - (BYTE*)StubPrecodeCode); + + BYTE* pTargetSlot = pageBaseRX + i + pageSize + offsetof(StubPrecodeData, Target); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(StubPrecodeCode_Target_Offset)) = pTargetSlot; + + BYTE* pMethodDescSlot = pageBaseRX + i + pageSize + offsetof(StubPrecodeData, MethodDesc); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(StubPrecodeCode_MethodDesc_Offset)) = pMethodDescSlot; + } +#else // TARGET_X86 + FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)StubPrecodeCode), StubPrecode::CodeSize, pageSize); +#endif // TARGET_X86 +} + +BOOL StubPrecode::IsStubPrecodeByASM(PCODE addr) +{ + BYTE *pInstr = (BYTE*)PCODEToPINSTR(addr); +#ifdef TARGET_X86 + return *pInstr == *(BYTE*)(StubPrecodeCode) && + *(DWORD*)(pInstr + SYMBOL_VALUE(StubPrecodeCode_MethodDesc_Offset)) == (DWORD)(pInstr + GetOsPageSize() + offsetof(StubPrecodeData, MethodDesc)) && + *(WORD*)(pInstr + 5) == *(WORD*)((BYTE*)StubPrecodeCode + 5) && + *(DWORD*)(pInstr + SYMBOL_VALUE(StubPrecodeCode_Target_Offset)) == (DWORD)(pInstr + GetOsPageSize() + offsetof(StubPrecodeData, Target)); +#else // TARGET_X86 + return 
memcmp(pInstr, (void*)PCODEToPINSTR((PCODE)StubPrecodeCode), (BYTE*)StubPrecodeCode_End - (BYTE*)StubPrecodeCode) == 0; +#endif // TARGET_X86 +} + +#ifdef HAS_NDIRECT_IMPORT_PRECODE + +void NDirectImportPrecode::Init(NDirectImportPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) +{ + WRAPPER_NO_CONTRACT; + StubPrecode::Init(pPrecodeRX, pMD, pLoaderAllocator, NDirectImportPrecode::Type, GetEEFuncEntryPoint(NDirectImportThunk)); +} + +#endif // HAS_NDIRECT_IMPORT_PRECODE + +#ifdef HAS_FIXUP_PRECODE +void FixupPrecode::Init(FixupPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator) +{ + WRAPPER_NO_CONTRACT; + + _ASSERTE(pPrecodeRX == this); + + FixupPrecodeData *pData = GetData(); + pData->MethodDesc = pMD; + + _ASSERTE(GetMethodDesc() == (TADDR)pMD); + + pData->Target = (PCODE)pPrecodeRX + FixupPrecode::FixupCodeOffset; + pData->PrecodeFixupThunk = GetPreStubEntryPoint(); } + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + #define ENUM_PAGE_SIZE(size) \ + extern "C" void FixupPrecodeCode##size(); \ + extern "C" void FixupPrecodeCode##size##_End(); + ENUM_PAGE_SIZES + #undef ENUM_PAGE_SIZE +#else +extern "C" void FixupPrecodeCode(); +extern "C" void FixupPrecodeCode_End(); #endif +#ifdef TARGET_X86 +extern "C" size_t FixupPrecodeCode_MethodDesc_Offset; +extern "C" size_t FixupPrecodeCode_Target_Offset; +extern "C" size_t FixupPrecodeCode_PrecodeFixupThunk_Offset; +#endif + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) +void (*FixupPrecode::FixupPrecodeCode)(); +void (*FixupPrecode::FixupPrecodeCode_End)(); +#endif + +void FixupPrecode::StaticInitialize() +{ +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + #define ENUM_PAGE_SIZE(size) \ + case size: \ + FixupPrecodeCode = FixupPrecodeCode##size; \ + FixupPrecodeCode_End = FixupPrecodeCode##size##_End; \ + _ASSERTE(((BYTE*)FixupPrecodeCode##size##_End - (BYTE*)FixupPrecodeCode##size) <= FixupPrecode::CodeSize); \ + break; + + int pageSize = GetOsPageSize(); + + switch (pageSize) + { + ENUM_PAGE_SIZES + default: + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); + } + #undef ENUM_PAGE_SIZE +#else + _ASSERTE((BYTE*)FixupPrecodeCode_End - (BYTE*)FixupPrecodeCode <= FixupPrecode::CodeSize); +#endif + _ASSERTE(*((BYTE*)PCODEToPINSTR((PCODE)FixupPrecodeCode) + OFFSETOF_PRECODE_TYPE) == FixupPrecode::Type); +} + +void FixupPrecode::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) +{ + int pageSize = GetOsPageSize(); + +#ifdef TARGET_X86 + int totalCodeSize = (pageSize / FixupPrecode::CodeSize) * FixupPrecode::CodeSize; + + for (int i = 0; i < totalCodeSize; i += FixupPrecode::CodeSize) + { + memcpy(pageBase + i, (const void*)FixupPrecodeCode, FixupPrecode::CodeSize); + BYTE* pTargetSlot = pageBaseRX + i + pageSize + offsetof(FixupPrecodeData, Target); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_Target_Offset)) = pTargetSlot; + + BYTE* pMethodDescSlot = pageBaseRX + i + pageSize + offsetof(FixupPrecodeData, MethodDesc); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_MethodDesc_Offset)) = pMethodDescSlot; + + BYTE* pPrecodeFixupThunkSlot = pageBaseRX + i + pageSize + offsetof(FixupPrecodeData, PrecodeFixupThunk); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_PrecodeFixupThunk_Offset)) = pPrecodeFixupThunkSlot; + } +#else // TARGET_X86 + FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)FixupPrecodeCode), FixupPrecode::CodeSize, pageSize); +#endif // TARGET_X86 +} + +BOOL 
FixupPrecode::IsFixupPrecodeByASM(PCODE addr) +{ + BYTE *pInstr = (BYTE*)PCODEToPINSTR(addr); +#ifdef TARGET_X86 + return + *(WORD*)(pInstr) == *(WORD*)(FixupPrecodeCode) && + *(DWORD*)(pInstr + SYMBOL_VALUE(FixupPrecodeCode_Target_Offset)) == (DWORD)(pInstr + GetOsPageSize() + offsetof(FixupPrecodeData, Target)) && + *(pInstr + 6) == *((BYTE*)FixupPrecodeCode + 6) && + *(DWORD*)(pInstr + SYMBOL_VALUE(FixupPrecodeCode_MethodDesc_Offset)) == (DWORD)(pInstr + GetOsPageSize() + offsetof(FixupPrecodeData, MethodDesc)) && + *(WORD*)(pInstr + 11) == *(WORD*)((BYTE*)FixupPrecodeCode + 11) && + *(DWORD*)(pInstr + SYMBOL_VALUE(FixupPrecodeCode_PrecodeFixupThunk_Offset)) == (DWORD)(pInstr + GetOsPageSize() + offsetof(FixupPrecodeData, PrecodeFixupThunk)); +#else // TARGET_X86 + return memcmp(pInstr, (void*)PCODEToPINSTR((PCODE)FixupPrecodeCode), (BYTE*)FixupPrecodeCode_End - (BYTE*)FixupPrecodeCode) == 0; +#endif // TARGET_X86 +} + +#endif // HAS_FIXUP_PRECODE + +BOOL DoesSlotCallPrestub(PCODE pCode) +{ + CONTRACTL { + NOTHROW; + GC_NOTRIGGER; + PRECONDITION(pCode != GetPreStubEntryPoint()); + } CONTRACTL_END; + + TADDR pInstr = dac_cast(PCODEToPINSTR(pCode)); + +#ifdef HAS_COMPACT_ENTRYPOINTS + if (MethodDescChunk::GetMethodDescFromCompactEntryPoint(pCode, TRUE) != NULL) + { + return TRUE; + } +#endif + + if (!IS_ALIGNED(pInstr, PRECODE_ALIGNMENT)) + { + return FALSE; + } + + //FixupPrecode +#if defined(HAS_FIXUP_PRECODE) + if (FixupPrecode::IsFixupPrecodeByASM(pCode)) + { + PCODE pTarget = dac_cast(pInstr)->GetTarget(); + + return pTarget == PCODEToPINSTR(pCode) + FixupPrecode::FixupCodeOffset; + } +#endif + + // StubPrecode + if (StubPrecode::IsStubPrecodeByASM(pCode)) + { + pCode = dac_cast(pInstr)->GetTarget(); + return pCode == GetPreStubEntryPoint(); + } + + return FALSE; +} + +#endif // !DACCESS_COMPILE diff --git a/src/coreclr/vm/precode.h b/src/coreclr/vm/precode.h index 494747175a942..da1dfd593dad7 100644 --- a/src/coreclr/vm/precode.h +++ b/src/coreclr/vm/precode.h @@ -9,12 +9,315 @@ #ifndef __PRECODE_H__ #define __PRECODE_H__ -typedef DPTR(class Precode) PTR_Precode; - -#ifndef PRECODE_ALIGNMENT #define PRECODE_ALIGNMENT sizeof(void*) + +#if defined(HOST_AMD64) + +#define OFFSETOF_PRECODE_TYPE 0 +#define OFFSETOF_PRECODE_TYPE_CALL_OR_JMP 5 +#define OFFSETOF_PRECODE_TYPE_MOV_R10 10 + +#define SIZEOF_PRECODE_BASE 16 + +#elif defined(HOST_X86) + +EXTERN_C VOID STDCALL PrecodeRemotingThunk(); + +#define OFFSETOF_PRECODE_TYPE 0 +#define OFFSETOF_PRECODE_TYPE_CALL_OR_JMP 5 +#define OFFSETOF_PRECODE_TYPE_MOV_RM_R 6 + +#define SIZEOF_PRECODE_BASE 8 + +#elif defined(HOST_ARM64) + +#define SIZEOF_PRECODE_BASE CODE_SIZE_ALIGN +#define OFFSETOF_PRECODE_TYPE 0 + +#elif defined(HOST_ARM) + +#define SIZEOF_PRECODE_BASE CODE_SIZE_ALIGN +#define OFFSETOF_PRECODE_TYPE 3 + +#endif // HOST_AMD64 + +#ifndef DACCESS_COMPILE +// Given an address in a slot, figure out if the prestub will be called +BOOL DoesSlotCallPrestub(PCODE pCode); +#endif + +#include + +// Invalid precode type +struct InvalidPrecode +{ +#if defined(HOST_AMD64) || defined(HOST_X86) + // int3 + static const int Type = 0xCC; +#elif defined(HOST_ARM64) || defined(HOST_ARM) + static const int Type = 0; +#endif +}; + +struct StubPrecodeData +{ + PTR_MethodDesc MethodDesc; + PCODE Target; + BYTE Type; +}; + +typedef DPTR(StubPrecodeData) PTR_StubPrecodeData; + +#if !(defined(TARGET_ARM64) && defined(TARGET_UNIX)) +extern "C" void StubPrecodeCode(); +extern "C" void StubPrecodeCode_End(); +#endif + +// Regular precode +struct StubPrecode +{ +#if 
defined(HOST_AMD64) + static const BYTE Type = 0x4C; + static const int CodeSize = 24; +#elif defined(HOST_X86) + static const BYTE Type = 0xA1; + static const int CodeSize = 24; +#elif defined(HOST_ARM64) + static const int Type = 0x4A; + static const int CodeSize = 24; +#elif defined(HOST_ARM) + static const int Type = 0xCF; + static const int CodeSize = 12; +#endif // HOST_AMD64 + + BYTE m_code[CodeSize]; + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + static void (*StubPrecodeCode)(); + static void (*StubPrecodeCode_End)(); +#endif + + void Init(StubPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator = NULL, BYTE type = StubPrecode::Type, TADDR target = NULL); + + static void StaticInitialize(); + + PTR_StubPrecodeData GetData() const + { + LIMITED_METHOD_CONTRACT; + return dac_cast(dac_cast(this) + GetOsPageSize()); + } + + TADDR GetMethodDesc() + { + LIMITED_METHOD_DAC_CONTRACT; + + return dac_cast(GetData()->MethodDesc); + } + + PCODE GetTarget() + { + LIMITED_METHOD_DAC_CONTRACT; + + return GetData()->Target; + } + + BYTE GetType() + { + return GetData()->Type; + } + +#ifndef DACCESS_COMPILE + static BOOL IsStubPrecodeByASM(PCODE addr); + + void ResetTargetInterlocked() + { + CONTRACTL + { + THROWS; + GC_NOTRIGGER; + } + CONTRACTL_END; + + StubPrecodeData *pData = GetData(); + InterlockedExchangeT(&pData->Target, GetPreStubEntryPoint()); + } + + BOOL SetTargetInterlocked(TADDR target, TADDR expected) + { + CONTRACTL + { + THROWS; + GC_NOTRIGGER; + } + CONTRACTL_END; + + StubPrecodeData *pData = GetData(); + return InterlockedCompareExchangeT(&pData->Target, (PCODE)target, (PCODE)expected) == expected; + } + + static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); + +#endif // !DACCESS_COMPILE +}; + +typedef DPTR(StubPrecode) PTR_StubPrecode; + + +#ifdef HAS_NDIRECT_IMPORT_PRECODE + +// NDirect import precode +// (This is fake precode. VTable slot does not point to it.) +struct NDirectImportPrecode : StubPrecode +{ + static const int Type = 0x01; + + void Init(NDirectImportPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator); + + LPVOID GetEntrypoint() + { + LIMITED_METHOD_CONTRACT; + return (LPVOID)PINSTRToPCODE(dac_cast(this)); + } +}; +typedef DPTR(NDirectImportPrecode) PTR_NDirectImportPrecode; + +#endif // HAS_NDIRECT_IMPORT_PRECODE + + +#ifdef HAS_FIXUP_PRECODE + +struct FixupPrecodeData +{ + PCODE Target; + MethodDesc *MethodDesc; + PCODE PrecodeFixupThunk; +}; + +typedef DPTR(FixupPrecodeData) PTR_FixupPrecodeData; + +#if !(defined(TARGET_ARM64) && defined(TARGET_UNIX)) +extern "C" void FixupPrecodeCode(); +extern "C" void FixupPrecodeCode_End(); +#endif + +// Fixup precode is used in ngen images when the prestub does just one time fixup. +// The fixup precode is simple jump once patched. It does not have the two instruction overhead of regular precode. 
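+// The struct below holds only the code bytes. The mutable data it reads (Target, MethodDesc and the
+// PrecodeFixupThunk address in FixupPrecodeData) sits at the same offset on the data page that follows
+// the code page, which is why GetData() computes it as 'this + GetOsPageSize()' and retargeting only
+// ever writes to the data page, never to the code page.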
+struct FixupPrecode +{ +#if defined(HOST_AMD64) + static const int Type = 0xFF; + static const int CodeSize = 24; + static const int FixupCodeOffset = 6; +#elif defined(HOST_X86) + static const int Type = 0xFF; + static const int CodeSize = 24; + static const int FixupCodeOffset = 6; +#elif defined(HOST_ARM64) + static const int Type = 0x0B; + static const int CodeSize = 24; + static const int FixupCodeOffset = 8; +#elif defined(HOST_ARM) + static const int Type = 0xFF; + static const int CodeSize = 12; + static const int FixupCodeOffset = 4 + THUMB_CODE; +#endif // HOST_AMD64 + + BYTE m_code[CodeSize]; + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + static void (*FixupPrecodeCode)(); + static void (*FixupPrecodeCode_End)(); #endif + void Init(FixupPrecode* pPrecodeRX, MethodDesc* pMD, LoaderAllocator *pLoaderAllocator); + + static void StaticInitialize(); + + static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); + + PTR_FixupPrecodeData GetData() const + { + LIMITED_METHOD_CONTRACT; + return dac_cast(dac_cast(this) + GetOsPageSize()); + } + + TADDR GetMethodDesc() + { + LIMITED_METHOD_CONTRACT; + return (TADDR)GetData()->MethodDesc; + } + + PCODE GetTarget() + { + LIMITED_METHOD_DAC_CONTRACT; + return GetData()->Target; + } + + PCODE *GetTargetSlot() + { + LIMITED_METHOD_CONTRACT; + return &GetData()->Target; + } + +#ifndef DACCESS_COMPILE + static BOOL IsFixupPrecodeByASM(PCODE addr); + + void ResetTargetInterlocked() + { + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + } + CONTRACTL_END; + + PCODE target = (PCODE)this + FixupCodeOffset; + + _ASSERTE(IS_ALIGNED(&GetData()->Target, sizeof(SIZE_T))); + InterlockedExchangeT(&GetData()->Target, target); + } + + BOOL SetTargetInterlocked(TADDR target, TADDR expected) + { + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + } + CONTRACTL_END; + + MethodDesc * pMD = (MethodDesc*)GetMethodDesc(); + g_IBCLogger.LogMethodPrecodeWriteAccess(pMD); + + PCODE oldTarget = (PCODE)GetData()->Target; + if (oldTarget != ((PCODE)this + FixupCodeOffset)) + { +#ifdef FEATURE_CODE_VERSIONING + // No change needed, jmp is already in place +#else + // Setting the target more than once is unexpected + return FALSE; +#endif + } + + _ASSERTE(IS_ALIGNED(&GetData()->Target, sizeof(SIZE_T))); + return InterlockedCompareExchangeT(&GetData()->Target, (PCODE)target, (PCODE)oldTarget) == (PCODE)oldTarget; + } +#endif + +#ifdef DACCESS_COMPILE + void EnumMemoryRegions(CLRDataEnumMemoryFlags flags); +#endif +}; + +typedef DPTR(FixupPrecode) PTR_FixupPrecode; + +#endif // HAS_FIXUP_PRECODE + +#include + +typedef DPTR(class Precode) PTR_Precode; + enum PrecodeType { PRECODE_INVALID = InvalidPrecode::Type, PRECODE_STUB = StubPrecode::Type, @@ -57,7 +360,7 @@ class Precode { #endif // HAS_NDIRECT_IMPORT_PRECODE #ifdef HAS_FIXUP_PRECODE - FixupPrecode* AsFixupPrecode() + PTR_FixupPrecode AsFixupPrecode() { LIMITED_METHOD_CONTRACT; SUPPORTS_DAC; @@ -109,29 +412,12 @@ class Precode { #ifdef OFFSETOF_PRECODE_TYPE BYTE type = m_data[OFFSETOF_PRECODE_TYPE]; -#ifdef TARGET_X86 - if (type == X86_INSTR_MOV_RM_R) - type = m_data[OFFSETOF_PRECODE_TYPE_MOV_RM_R]; -#endif // TARGET_X86 - -#ifdef TARGET_AMD64 - if (type == (X86_INSTR_MOV_R10_IMM64 & 0xFF)) - type = m_data[OFFSETOF_PRECODE_TYPE_MOV_R10]; - else if ((type == (X86_INSTR_CALL_REL32 & 0xFF)) || (type == (X86_INSTR_JMP_REL32 & 0xFF))) - type = m_data[OFFSETOF_PRECODE_TYPE_CALL_OR_JMP]; -#endif // _AMD64 - -#if defined(HAS_FIXUP_PRECODE) && (defined(TARGET_X86) || defined(TARGET_AMD64)) - if (type == 
FixupPrecode::TypePrestub) - type = FixupPrecode::Type; -#endif - -#ifdef TARGET_ARM - static_assert_no_msg(offsetof(StubPrecode, m_pTarget) == offsetof(NDirectImportPrecode, m_pMethodDesc)); - // If the precode does not have thumb bit on target, it must be NDirectImportPrecode. - if (type == StubPrecode::Type && ((AsStubPrecode()->m_pTarget & THUMB_CODE) == 0)) - type = NDirectImportPrecode::Type; -#endif + if (type == StubPrecode::Type) + { + // StubPrecode code is used for both StubPrecode and NDirectImportPrecode, + // so we need to get the real type + type = AsStubPrecode()->GetType(); + } return (PrecodeType)type; @@ -147,12 +433,6 @@ class Precode { SUPPORTS_DAC; unsigned int align = PRECODE_ALIGNMENT; -#if defined(TARGET_X86) && defined(HAS_FIXUP_PRECODE) - // Fixup precodes has to be aligned to allow atomic patching - if (t == PRECODE_FIXUP) - align = 8; -#endif // TARGET_X86 && HAS_FIXUP_PRECODE - #if defined(TARGET_ARM) && defined(HAS_COMPACT_ENTRYPOINTS) // Precodes have to be aligned to allow fast compact entry points check _ASSERTE (align >= sizeof(void*)); @@ -211,19 +491,11 @@ class Precode { PCODE GetEntryPoint() { LIMITED_METHOD_CONTRACT; - return dac_cast(this) + GetEntryPointOffset(); - } - - static SIZE_T GetEntryPointOffset() - { - LIMITED_METHOD_CONTRACT; -#ifdef TARGET_ARM - return THUMB_CODE; -#else - return 0; -#endif + return PINSTRToPCODE(dac_cast(this)); } + PTR_PCODE GetTargetSlot(); + MethodDesc * GetMethodDesc(BOOL fSpeculative = FALSE); BOOL IsCorrectMethodDesc(MethodDesc * pMD); @@ -239,7 +511,7 @@ class Precode { void Reset(); #endif // DACCESS_COMPILE - static Precode* GetPrecodeFromEntryPoint(PCODE addr, BOOL fSpeculative = FALSE) + static PTR_Precode GetPrecodeFromEntryPoint(PCODE addr, BOOL fSpeculative = FALSE) { LIMITED_METHOD_DAC_CONTRACT; @@ -260,7 +532,7 @@ class Precode { } } - Precode* pPrecode = PTR_Precode(pInstr); + PTR_Precode pPrecode = PTR_Precode(pInstr); if (!fSpeculative) { @@ -280,38 +552,35 @@ class Precode { static SIZE_T SizeOfTemporaryEntryPoint(PrecodeType t) { LIMITED_METHOD_DAC_CONTRACT; -#ifdef HAS_FIXUP_PRECODE_CHUNKS - _ASSERTE(t != PRECODE_FIXUP); -#endif + return ALIGN_UP(SizeOf(t), AlignOf(t)); } static Precode * GetPrecodeForTemporaryEntryPoint(TADDR temporaryEntryPoints, int index); - static SIZE_T SizeOfTemporaryEntryPoints(PrecodeType t, bool preallocateJumpStubs, int count); + static SIZE_T SizeOfTemporaryEntryPoints(PrecodeType t, int count); static SIZE_T SizeOfTemporaryEntryPoints(TADDR temporaryEntryPoints, int count); static TADDR AllocateTemporaryEntryPoints(MethodDescChunk* pChunk, LoaderAllocator *pLoaderAllocator, AllocMemTracker *pamTracker); -#ifdef DACCESS_COMPILE - void EnumMemoryRegions(CLRDataEnumMemoryFlags flags); -#endif - -#ifdef HAS_FIXUP_PRECODE_CHUNKS - static DWORD GetOffsetOfBase(PrecodeType t, DWORD count) + static SIZE_T GetMaxTemporaryEntryPointsCount() { - assert(t == PRECODE_FIXUP); - return (DWORD)(count * sizeof(FixupPrecode)); + SIZE_T maxPrecodeCodeSize = Max(FixupPrecode::CodeSize, StubPrecode::CodeSize); + return GetOsPageSize() / maxPrecodeCodeSize; } - static DWORD GetOffset(PrecodeType t, DWORD index, DWORD count) - { - assert(t == PRECODE_FIXUP); - assert(index < count); - return (DWORD)((count - index - 1)* sizeof(FixupPrecode)); - } +#ifdef DACCESS_COMPILE + void EnumMemoryRegions(CLRDataEnumMemoryFlags flags); #endif }; +// Verify that the type for each precode is different +static_assert_no_msg(StubPrecode::Type != NDirectImportPrecode::Type); 
+static_assert_no_msg(StubPrecode::Type != FixupPrecode::Type); +static_assert_no_msg(StubPrecode::Type != ThisPtrRetBufPrecode::Type); +static_assert_no_msg(FixupPrecode::Type != NDirectImportPrecode::Type); +static_assert_no_msg(FixupPrecode::Type != ThisPtrRetBufPrecode::Type); +static_assert_no_msg(NDirectImportPrecode::Type != ThisPtrRetBufPrecode::Type); + #endif // __PRECODE_H__ diff --git a/src/coreclr/vm/stublink.cpp b/src/coreclr/vm/stublink.cpp index a2070a558403f..51884d7cd8dbb 100644 --- a/src/coreclr/vm/stublink.cpp +++ b/src/coreclr/vm/stublink.cpp @@ -2133,7 +2133,7 @@ Stub* Stub::NewStub(PTR_VOID pCode, DWORD flags) // Make sure that the payload of the stub is aligned Stub* pStubRX = (Stub*)(pBlock + stubPayloadOffset); Stub* pStubRW; - ExecutableWriterHolder stubWriterHolder; + ExecutableWriterHolderNoLog stubWriterHolder; if (pHeap == NULL) { @@ -2141,7 +2141,7 @@ Stub* Stub::NewStub(PTR_VOID pCode, DWORD flags) } else { - stubWriterHolder = ExecutableWriterHolder(pStubRX, sizeof(Stub)); + stubWriterHolder.AssignExecutableWriterHolder(pStubRX, sizeof(Stub)); pStubRW = stubWriterHolder.GetRW(); } pStubRW->SetupStub( diff --git a/src/coreclr/vm/stubmgr.cpp b/src/coreclr/vm/stubmgr.cpp index d08ded97c933b..83a3b2bce4094 100644 --- a/src/coreclr/vm/stubmgr.cpp +++ b/src/coreclr/vm/stubmgr.cpp @@ -1004,8 +1004,7 @@ BOOL PrecodeStubManager::CheckIsStub_Internal(PCODE stubStartAddress) } CONTRACTL_END; - // Forwarded to from RangeSectionStubManager - return FALSE; + return GetStubPrecodeRangeList()->IsInRange(stubStartAddress) || GetFixupPrecodeRangeList()->IsInRange(stubStartAddress); } BOOL PrecodeStubManager::DoTraceStub(PCODE stubStartAddress, @@ -1033,7 +1032,14 @@ BOOL PrecodeStubManager::DoTraceStub(PCODE stubStartAddress, else #endif // HAS_COMPACT_ENTRYPOINTS { - Precode* pPrecode = Precode::GetPrecodeFromEntryPoint(stubStartAddress); + // When the target slot points to the fixup part of the fixup precode, we need to compensate + // for that to get the actual stub address + Precode* pPrecode = Precode::GetPrecodeFromEntryPoint(stubStartAddress - FixupPrecode::FixupCodeOffset, TRUE /* speculative */); + if ((pPrecode == NULL) || (pPrecode->GetType() != PRECODE_FIXUP)) + { + pPrecode = Precode::GetPrecodeFromEntryPoint(stubStartAddress); + } + PREFIX_ASSUME(pPrecode != NULL); switch (pPrecode->GetType()) @@ -1459,21 +1465,6 @@ BOOL RangeSectionStubManager::TraceManager(Thread *thread, } #endif -PCODE RangeSectionStubManager::GetMethodThunkTarget(PCODE stubStartAddress) -{ - WRAPPER_NO_CONTRACT; - -#if defined(TARGET_X86) || defined(TARGET_AMD64) - return rel32Decode(stubStartAddress+1); -#elif defined(TARGET_ARM) - TADDR pInstr = PCODEToPINSTR(stubStartAddress); - return *dac_cast(pInstr + 2 * sizeof(DWORD)); -#else - PORTABILITY_ASSERT("RangeSectionStubManager::GetMethodThunkTarget"); - return NULL; -#endif -} - #ifdef DACCESS_COMPILE LPCWSTR RangeSectionStubManager::GetStubManagerName(PCODE addr) { @@ -2339,6 +2330,8 @@ PrecodeStubManager::DoEnumMemoryRegions(CLRDataEnumMemoryFlags flags) WRAPPER_NO_CONTRACT; DAC_ENUM_VTHIS(); EMEM_OUT(("MEM: %p PrecodeStubManager\n", dac_cast(this))); + GetStubPrecodeRangeList()->EnumMemoryRegions(flags); + GetFixupPrecodeRangeList()->EnumMemoryRegions(flags); } void diff --git a/src/coreclr/vm/stubmgr.h b/src/coreclr/vm/stubmgr.h index 2bb4b10eda150..d3cca675bce99 100644 --- a/src/coreclr/vm/stubmgr.h +++ b/src/coreclr/vm/stubmgr.h @@ -399,6 +399,28 @@ class PrecodeStubManager : public StubManager ~PrecodeStubManager() 
{WRAPPER_NO_CONTRACT;} #endif + protected: + LockedRangeList m_stubPrecodeRangeList; + LockedRangeList m_fixupPrecodeRangeList; + + public: + // Get dac-ized pointer to rangelist. + PTR_RangeList GetStubPrecodeRangeList() + { + SUPPORTS_DAC; + + TADDR addr = PTR_HOST_MEMBER_TADDR(PrecodeStubManager, this, m_stubPrecodeRangeList); + return PTR_RangeList(addr); + } + + PTR_RangeList GetFixupPrecodeRangeList() + { + SUPPORTS_DAC; + + TADDR addr = PTR_HOST_MEMBER_TADDR(PrecodeStubManager, this, m_fixupPrecodeRangeList); + return PTR_RangeList(addr); + } + public: virtual BOOL CheckIsStub_Internal(PCODE stubStartAddress); @@ -591,8 +613,6 @@ class RangeSectionStubManager : public StubManager static StubCodeBlockKind GetStubKind(PCODE stubStartAddress); - static PCODE GetMethodThunkTarget(PCODE stubStartAddress); - public: #ifdef _DEBUG virtual const char * DbgGetName() { LIMITED_METHOD_CONTRACT; return "RangeSectionStubManager"; } diff --git a/src/coreclr/vm/virtualcallstub.cpp b/src/coreclr/vm/virtualcallstub.cpp index fd85071676e94..544a3bc58fc00 100644 --- a/src/coreclr/vm/virtualcallstub.cpp +++ b/src/coreclr/vm/virtualcallstub.cpp @@ -560,19 +560,16 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA cache_entry_heap_reserve_size *= sizeof(ResolveCacheElem); cache_entry_heap_commit_size *= sizeof(ResolveCacheElem); - lookup_heap_reserve_size *= sizeof(LookupHolder); - lookup_heap_commit_size *= sizeof(LookupHolder); + lookup_heap_reserve_size *= sizeof(LookupStub); + lookup_heap_commit_size *= sizeof(LookupStub); - DWORD dispatchHolderSize = sizeof(DispatchHolder); -#ifdef TARGET_AMD64 - dispatchHolderSize = static_cast(DispatchHolder::GetHolderSize(DispatchStub::e_TYPE_SHORT)); -#endif + DWORD dispatchStubSize = sizeof(DispatchStub); - dispatch_heap_reserve_size *= dispatchHolderSize; - dispatch_heap_commit_size *= dispatchHolderSize; + dispatch_heap_reserve_size *= dispatchStubSize; + dispatch_heap_commit_size *= dispatchStubSize; - resolve_heap_reserve_size *= sizeof(ResolveHolder); - resolve_heap_commit_size *= sizeof(ResolveHolder); + resolve_heap_reserve_size *= sizeof(ResolveStub); + resolve_heap_commit_size *= sizeof(ResolveStub); vtable_heap_reserve_size *= static_cast(VTableCallHolder::GetHolderSize(0)); vtable_heap_commit_size *= static_cast(VTableCallHolder::GetHolderSize(0)); @@ -586,14 +583,14 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA cache_entry_heap_reserve_size = (DWORD) ALIGN_UP(cache_entry_heap_reserve_size, GetOsPageSize()); cache_entry_heap_commit_size = (DWORD) ALIGN_UP(cache_entry_heap_commit_size, GetOsPageSize()); - lookup_heap_reserve_size = (DWORD) ALIGN_UP(lookup_heap_reserve_size, GetOsPageSize()); - lookup_heap_commit_size = (DWORD) ALIGN_UP(lookup_heap_commit_size, GetOsPageSize()); + lookup_heap_reserve_size = (DWORD) ALIGN_UP(lookup_heap_reserve_size, 2 * GetOsPageSize()); + lookup_heap_commit_size = (DWORD) ALIGN_UP(lookup_heap_commit_size, 2 * GetOsPageSize()); - dispatch_heap_reserve_size = (DWORD) ALIGN_UP(dispatch_heap_reserve_size, GetOsPageSize()); - dispatch_heap_commit_size = (DWORD) ALIGN_UP(dispatch_heap_commit_size, GetOsPageSize()); + dispatch_heap_reserve_size = (DWORD) ALIGN_UP(dispatch_heap_reserve_size, 2 * GetOsPageSize()); + dispatch_heap_commit_size = (DWORD) ALIGN_UP(dispatch_heap_commit_size, 2 * GetOsPageSize()); - resolve_heap_reserve_size = (DWORD) ALIGN_UP(resolve_heap_reserve_size, GetOsPageSize()); - resolve_heap_commit_size = (DWORD) 
ALIGN_UP(resolve_heap_commit_size, GetOsPageSize()); + resolve_heap_reserve_size = (DWORD) ALIGN_UP(resolve_heap_reserve_size, 2 * GetOsPageSize()); + resolve_heap_commit_size = (DWORD) ALIGN_UP(resolve_heap_commit_size, 2 * GetOsPageSize()); vtable_heap_reserve_size = (DWORD) ALIGN_UP(vtable_heap_reserve_size, GetOsPageSize()); vtable_heap_commit_size = (DWORD) ALIGN_UP(vtable_heap_commit_size, GetOsPageSize()); @@ -617,16 +614,17 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA if (dwWastedReserveMemSize != 0) { DWORD cWastedPages = dwWastedReserveMemSize / GetOsPageSize(); - DWORD cPagesPerHeap = cWastedPages / 6; - DWORD cPagesRemainder = cWastedPages % 6; // We'll throw this at the resolve heap + DWORD cPagesPerHeap = cWastedPages / 9; + DWORD cPagesRemainder = cWastedPages % 9; // We'll throw this at the resolve heap indcell_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); cache_entry_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); - lookup_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); - dispatch_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); + lookup_heap_reserve_size += 2 * cPagesPerHeap * GetOsPageSize(); + dispatch_heap_reserve_size += 2 * cPagesPerHeap * GetOsPageSize(); vtable_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); - resolve_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); - resolve_heap_reserve_size += cPagesRemainder * GetOsPageSize(); + resolve_heap_reserve_size += 2 * cPagesPerHeap * GetOsPageSize(); + resolve_heap_reserve_size += (cPagesRemainder & 0xFFFFFFFE) * GetOsPageSize(); + indcell_heap_reserve_size += (cPagesRemainder & 1) * GetOsPageSize(); } CONSISTENCY_CHECK((indcell_heap_reserve_size + @@ -653,14 +651,14 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA cache_entry_heap_reserve_size = GetOsPageSize(); cache_entry_heap_commit_size = GetOsPageSize(); - lookup_heap_reserve_size = GetOsPageSize(); - lookup_heap_commit_size = GetOsPageSize(); + lookup_heap_reserve_size = 4 * GetOsPageSize(); + lookup_heap_commit_size = 2 * GetOsPageSize(); - dispatch_heap_reserve_size = GetOsPageSize(); - dispatch_heap_commit_size = GetOsPageSize(); + dispatch_heap_reserve_size = 4 * GetOsPageSize(); + dispatch_heap_commit_size = 2 * GetOsPageSize(); - resolve_heap_reserve_size = GetOsPageSize(); - resolve_heap_commit_size = GetOsPageSize(); + resolve_heap_reserve_size = 4 * GetOsPageSize(); + resolve_heap_commit_size = 2 * GetOsPageSize(); // Heap for the collectible case is carefully tuned to sum up to 16 pages. 
Today, we only use the // vtable jump stubs in the R2R scenario, which is unlikely to be loaded in the collectible context, @@ -693,7 +691,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder indcell_heap_holder( new LoaderHeap(indcell_heap_reserve_size, indcell_heap_commit_size, initReservedMem, indcell_heap_reserve_size, - NULL, FALSE)); + NULL, UnlockedLoaderHeap::HeapKind::Data, FALSE)); initReservedMem += indcell_heap_reserve_size; @@ -701,7 +699,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder cache_entry_heap_holder( new LoaderHeap(cache_entry_heap_reserve_size, cache_entry_heap_commit_size, initReservedMem, cache_entry_heap_reserve_size, - &cache_entry_rangeList, FALSE)); + &cache_entry_rangeList, UnlockedLoaderHeap::HeapKind::Data, FALSE)); initReservedMem += cache_entry_heap_reserve_size; @@ -709,7 +707,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder lookup_heap_holder( new LoaderHeap(lookup_heap_reserve_size, lookup_heap_commit_size, initReservedMem, lookup_heap_reserve_size, - &lookup_rangeList, TRUE)); + &lookup_rangeList, UnlockedLoaderHeap::HeapKind::Interleaved, FALSE, LookupStub::GenerateCodePage, LookupStub::CodeSize)); initReservedMem += lookup_heap_reserve_size; @@ -717,7 +715,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder dispatch_heap_holder( new LoaderHeap(dispatch_heap_reserve_size, dispatch_heap_commit_size, initReservedMem, dispatch_heap_reserve_size, - &dispatch_rangeList, TRUE)); + &dispatch_rangeList, UnlockedLoaderHeap::HeapKind::Interleaved, FALSE, DispatchStub::GenerateCodePage, DispatchStub::CodeSize)); initReservedMem += dispatch_heap_reserve_size; @@ -725,7 +723,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder resolve_heap_holder( new LoaderHeap(resolve_heap_reserve_size, resolve_heap_commit_size, initReservedMem, resolve_heap_reserve_size, - &resolve_rangeList, TRUE)); + &resolve_rangeList, UnlockedLoaderHeap::HeapKind::Interleaved, FALSE, ResolveStub::GenerateCodePage, ResolveStub::CodeSize)); initReservedMem += resolve_heap_reserve_size; @@ -733,13 +731,10 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder vtable_heap_holder( new LoaderHeap(vtable_heap_reserve_size, vtable_heap_commit_size, initReservedMem, vtable_heap_reserve_size, - &vtable_rangeList, TRUE)); + &vtable_rangeList, UnlockedLoaderHeap::HeapKind::Executable)); initReservedMem += vtable_heap_reserve_size; - // Allocate the initial counter block - NewHolder m_counters_holder(new counter_block); - // // On success of every allocation, assign the objects and suppress the release // @@ -757,16 +752,6 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA vtableCallers = vtableCallers_holder; vtableCallers_holder.SuppressRelease(); cache_entries = cache_entries_holder; cache_entries_holder.SuppressRelease(); - m_counters = m_counters_holder; m_counters_holder.SuppressRelease(); - - // Create the initial failure counter block - m_counters->next = NULL; - m_counters->used = 0; - m_cur_counter_block = m_counters; - - m_cur_counter_block_for_reclaim = m_counters; - m_cur_counter_block_for_reclaim_index = 0; - // Keep track of all of our managers VirtualCallStubManagerManager::GlobalManager()->AddStubManager(this); } @@ -823,14 +808,6 @@ VirtualCallStubManager::~VirtualCallStubManager() if 
(vtableCallers) { delete vtableCallers; vtableCallers = NULL;} if (cache_entries) { delete cache_entries; cache_entries = NULL;} - // Now get rid of the memory taken by the counter_blocks - while (m_counters != NULL) - { - counter_block *del = m_counters; - m_counters = m_counters->next; - delete del; - } - // This was the block reserved by Init for the heaps. // For the collectible case, the VSD logic does not allocate the memory. if (m_initialReservedMemForHeaps && !m_loaderAllocator->IsCollectible()) @@ -856,18 +833,15 @@ void VirtualCallStubManager::InitStatic() g_resetCacheIncr = (INT32) CLRConfig::GetConfigValue(CLRConfig::INTERNAL_VirtualCallStubResetCacheIncr); #endif // STUB_LOGGING -#ifndef STUB_DISPATCH_PORTABLE - DispatchHolder::InitializeStatic(); - ResolveHolder::InitializeStatic(); -#endif // !STUB_DISPATCH_PORTABLE - LookupHolder::InitializeStatic(); - g_resolveCache = new DispatchCache(); if(CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_VirtualCallStubLogging)) StartupLogging(); VirtualCallStubManagerManager::InitStatic(); + LookupStub::InitStatic(); + DispatchStub::InitStatic(); + ResolveStub::InitStatic(); } // Static shutdown code. @@ -929,29 +903,6 @@ i.e. need to be serialized and non-concurrant. */ void VirtualCallStubManager::Reclaim() { LIMITED_METHOD_CONTRACT; - - UINT32 limit = min(counter_block::MAX_COUNTER_ENTRIES, - m_cur_counter_block_for_reclaim->used); - limit = min(m_cur_counter_block_for_reclaim_index + 16, limit); - - for (UINT32 i = m_cur_counter_block_for_reclaim_index; i < limit; i++) - { - m_cur_counter_block_for_reclaim->block[i] += (STUB_MISS_COUNT_VALUE/10)+1; - } - - // Increment the index by the number we processed - m_cur_counter_block_for_reclaim_index = limit; - - // If we ran to the end of the block, go to the next - if (m_cur_counter_block_for_reclaim_index == m_cur_counter_block->used) - { - m_cur_counter_block_for_reclaim = m_cur_counter_block_for_reclaim->next; - m_cur_counter_block_for_reclaim_index = 0; - - // If this was the last block in the chain, go back to the beginning - if (m_cur_counter_block_for_reclaim == NULL) - m_cur_counter_block_for_reclaim = m_counters; - } } #endif // !DACCESS_COMPILE @@ -1126,8 +1077,8 @@ PCODE VirtualCallStubManager::GetCallStub(TypeHandle ownerType, DWORD slot) { if ((stub = (PCODE)(lookups->Find(&probeL))) == CALL_STUB_EMPTY_ENTRY) { - LookupHolder *pLookupHolder = GenerateLookupStub(addrOfResolver, token.To_SIZE_T()); - stub = (PCODE) (lookups->Add((size_t)(pLookupHolder->stub()->entryPoint()), &probeL)); + LookupStub *pLookupStub = GenerateLookupStub(addrOfResolver, token.To_SIZE_T()); + stub = (PCODE) (lookups->Add((size_t)(pLookupStub->entryPoint()), &probeL)); } } @@ -1337,22 +1288,22 @@ size_t VirtualCallStubManager::GetTokenFromStubQuick(VirtualCallStubManager * pM if (kind == SK_DISPATCH) { _ASSERTE(pMgr->isDispatchingStub(stub)); - DispatchStub * dispatchStub = (DispatchStub *) PCODEToPINSTR(stub); - ResolveHolder * resolveHolder = ResolveHolder::FromFailEntry(dispatchStub->failTarget()); - _ASSERTE(pMgr->isResolvingStub(resolveHolder->stub()->resolveEntryPoint())); - return resolveHolder->stub()->token(); + DispatchStub * pDispatchStub = (DispatchStub *) PCODEToPINSTR(stub); + ResolveStub * pResolveStub = ResolveStub::FromFailEntry(pDispatchStub->failTarget()); + _ASSERTE(pMgr->isResolvingStub(pResolveStub->resolveEntryPoint())); + return pResolveStub->token(); } else if (kind == SK_RESOLVE) { _ASSERTE(pMgr->isResolvingStub(stub)); - ResolveHolder * resolveHolder = 
ResolveHolder::FromResolveEntry(stub); - return resolveHolder->stub()->token(); + ResolveStub * pResolveStub = ResolveStub::FromResolveEntry(stub); + return pResolveStub->token(); } else if (kind == SK_LOOKUP) { _ASSERTE(pMgr->isLookupStub(stub)); - LookupHolder * lookupHolder = LookupHolder::FromLookupEntry(stub); - return lookupHolder->stub()->token(); + LookupStub * pLookupStub = LookupStub::FromLookupEntry(stub); + return pLookupStub->token(); } else if (kind == SK_VTABLECALL) { @@ -1747,7 +1698,7 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, { //we have a target but not the dispatcher stub, lets build it //First we need a failure target (the resolver stub) - ResolveHolder *pResolveHolder = NULL; + ResolveStub *pResolveStub = NULL; ResolveEntry entryR; Prober probeR(&entryR); PCODE pBackPatchFcn; @@ -1783,7 +1734,7 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, } #endif // TARGET_X86 && !UNIX_X86_ABI - pResolveHolder = GenerateResolveStub(pResolverFcn, + pResolveStub = GenerateResolveStub(pResolverFcn, pBackPatchFcn, token.To_SIZE_T() #if defined(TARGET_X86) && !defined(UNIX_X86_ABI) @@ -1793,14 +1744,14 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, // Add the resolve entrypoint into the cache. //@TODO: Can we store a pointer to the holder rather than the entrypoint? - resolvers->Add((size_t)(pResolveHolder->stub()->resolveEntryPoint()), &probeR); + resolvers->Add((size_t)(pResolveStub->resolveEntryPoint()), &probeR); } else { - pResolveHolder = ResolveHolder::FromResolveEntry(addrOfResolver); + pResolveStub = ResolveStub::FromResolveEntry(addrOfResolver); } - CONSISTENCY_CHECK(CheckPointer(pResolveHolder)); - stub = pResolveHolder->stub()->resolveEntryPoint(); + CONSISTENCY_CHECK(CheckPointer(pResolveStub)); + stub = pResolveStub->resolveEntryPoint(); CONSISTENCY_CHECK(stub != NULL); } @@ -1810,7 +1761,7 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, // 3. The call site is currently wired to a lookup stub. If the call site is wired // to anything else, then we're never going to use the dispatch stub so there's // no use in creating it. 
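A note on the From*Entry conversions used here: since every stub instance now shares one fixed code template, an entry-point PCODE maps back to its stub base by subtracting the offset of the corresponding label in the template. A minimal sketch that mirrors the ResolveStub::FromFailEntry helper defined later in this change (the function name is illustrative only):

ResolveStub* StubFromFailEntry(PCODE failEntry)
{
    // the fail entry sits at the same fixed offset inside every stub's code slot,
    // so subtracting that offset recovers the start of the stub
    size_t failOffset = (size_t)((BYTE*)ResolveStubCode_FailEntry - (BYTE*)ResolveStubCode);
    return (ResolveStub*)(PCODEToPINSTR(failEntry) - failOffset);
}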
- if (pResolveHolder != NULL && stubKind == SK_LOOKUP) + if (pResolveStub != NULL && stubKind == SK_LOOKUP) { DispatchEntry entryD; Prober probeD(&entryD); @@ -1819,13 +1770,13 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, { // We are allowed to create a reusable dispatch stub for all assemblies // this allows us to optimize the call interception case the same way - DispatchHolder *pDispatchHolder = NULL; + DispatchStub *pDispatchStub = NULL; PCODE addrOfDispatch = (PCODE)(dispatchers->Find(&probeD)); if (addrOfDispatch == CALL_STUB_EMPTY_ENTRY) { - PCODE addrOfFail = pResolveHolder->stub()->failEntryPoint(); + PCODE addrOfFail = pResolveStub->failEntryPoint(); bool reenteredCooperativeGCMode = false; - pDispatchHolder = GenerateDispatchStub( + pDispatchStub = GenerateDispatchStub( target, addrOfFail, objectType, token.To_SIZE_T(), &reenteredCooperativeGCMode); if (reenteredCooperativeGCMode) { @@ -1833,16 +1784,16 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, BOOL success = dispatchers->SetUpProber(token.To_SIZE_T(), (size_t)objectType, &probeD); _ASSERTE(success); } - dispatchers->Add((size_t)(pDispatchHolder->stub()->entryPoint()), &probeD); + dispatchers->Add((size_t)(pDispatchStub->entryPoint()), &probeD); } else { - pDispatchHolder = DispatchHolder::FromDispatchEntry(addrOfDispatch); + pDispatchStub = DispatchStub::FromDispatchEntry(addrOfDispatch); } // Now assign the entrypoint to stub - CONSISTENCY_CHECK(CheckPointer(pDispatchHolder)); - stub = pDispatchHolder->stub()->entryPoint(); + CONSISTENCY_CHECK(CheckPointer(pDispatchStub)); + stub = pDispatchStub->entryPoint(); CONSISTENCY_CHECK(stub != NULL); } else @@ -1940,16 +1891,16 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, Prober probeD(&entryD); if (dispatchers->SetUpProber(token.To_SIZE_T(), (size_t) objectType, &probeD)) { - DispatchHolder *pDispatchHolder = NULL; + DispatchStub *pDispatchStub = NULL; PCODE addrOfDispatch = (PCODE)(dispatchers->Find(&probeD)); if (addrOfDispatch == CALL_STUB_EMPTY_ENTRY) { // It is possible that we never created this monomorphic dispatch stub // so we may have to create it now - ResolveHolder* pResolveHolder = ResolveHolder::FromResolveEntry(pCallSite->GetSiteTarget()); - PCODE addrOfFail = pResolveHolder->stub()->failEntryPoint(); + ResolveStub* pResolveStub = ResolveStub::FromResolveEntry(pCallSite->GetSiteTarget()); + PCODE addrOfFail = pResolveStub->stub()->failEntryPoint(); bool reenteredCooperativeGCMode = false; - pDispatchHolder = GenerateDispatchStub( + pDispatchStub = GenerateDispatchStub( target, addrOfFail, objectType, token.To_SIZE_T(), &reenteredCooperativeGCMode); if (reenteredCooperativeGCMode) { @@ -1957,19 +1908,19 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, BOOL success = dispatchers->SetUpProber(token.To_SIZE_T(), (size_t)objectType, &probeD); _ASSERTE(success); } - dispatchers->Add((size_t)(pDispatchHolder->stub()->entryPoint()), &probeD); + dispatchers->Add((size_t)(pDispatchStub->entryPoint()), &probeD); } else { - pDispatchHolder = DispatchHolder::FromDispatchEntry(addrOfDispatch); + pDispatchStub = DispatchStub::FromDispatchEntry(addrOfDispatch); } // increment the of times we changed a cache collision into a mono stub stats.worker_collide_to_mono++; // Now assign the entrypoint to stub - CONSISTENCY_CHECK(pDispatchHolder != NULL); - stub = pDispatchHolder->stub()->entryPoint(); + CONSISTENCY_CHECK(pDispatchStub != NULL); + stub = pDispatchStub->entryPoint(); 
CONSISTENCY_CHECK(stub != NULL); } } @@ -2475,20 +2426,19 @@ void VirtualCallStubManager::BackPatchWorker(StubCallSite* pCallSite) if (isDispatchingStub(callSiteTarget)) { - DispatchHolder * dispatchHolder = DispatchHolder::FromDispatchEntry(callSiteTarget); - DispatchStub * dispatchStub = dispatchHolder->stub(); + DispatchStub * dispatchStub = DispatchStub::FromDispatchEntry(callSiteTarget); //yes, patch it to point to the resolve stub //We can ignore the races now since we now know that the call site does go thru our //stub mechanisms, hence no matter who wins the race, we are correct. //We find the correct resolve stub by following the failure path in the dispatcher stub itself - PCODE failEntry = dispatchStub->failTarget(); - ResolveStub* resolveStub = ResolveHolder::FromFailEntry(failEntry)->stub(); + PCODE failEntry = dispatchStub->failTarget(); + ResolveStub* resolveStub = ResolveStub::FromFailEntry(failEntry); PCODE resolveEntry = resolveStub->resolveEntryPoint(); BackPatchSite(pCallSite, resolveEntry); LOG((LF_STUBS, LL_INFO10000, "BackPatchWorker call-site" FMT_ADDR "dispatchStub" FMT_ADDR "\n", - DBG_ADDR(pCallSite->GetReturnAddress()), DBG_ADDR(dispatchHolder->stub()))); + DBG_ADDR(pCallSite->GetReturnAddress()), DBG_ADDR(dispatchStub))); //Add back the default miss count to the counter being used by this resolve stub //Since resolve stub are shared among many dispatch stubs each dispatch stub @@ -2563,13 +2513,13 @@ void StubCallSite::SetSiteTarget(PCODE newTarget) /* Generate a dispatcher stub, pMTExpected is the method table to burn in the stub, and the two addrOf's are the addresses the stub is to transfer to depending on the test with pMTExpected */ -DispatchHolder *VirtualCallStubManager::GenerateDispatchStub(PCODE addrOfCode, +DispatchStub *VirtualCallStubManager::GenerateDispatchStub(PCODE addrOfCode, PCODE addrOfFail, void * pMTExpected, size_t dispatchToken, bool * pMayHaveReenteredCooperativeGCMode) { - CONTRACT (DispatchHolder*) { + CONTRACT (DispatchStub*) { THROWS; GC_TRIGGERS; INJECT_FAULT(COMPlusThrowOM();); @@ -2581,113 +2531,20 @@ DispatchHolder *VirtualCallStubManager::GenerateDispatchStub(PCODE ad POSTCONDITION(CheckPointer(RETVAL)); } CONTRACT_END; - size_t dispatchHolderSize = sizeof(DispatchHolder); - -#ifdef TARGET_AMD64 - // See comment around m_fShouldAllocateLongJumpDispatchStubs for explanation. - if (m_fShouldAllocateLongJumpDispatchStubs - INDEBUG(|| g_pConfig->ShouldGenerateLongJumpDispatchStub())) - { - RETURN GenerateDispatchStubLong(addrOfCode, - addrOfFail, - pMTExpected, - dispatchToken, - pMayHaveReenteredCooperativeGCMode); - } - - dispatchHolderSize = DispatchHolder::GetHolderSize(DispatchStub::e_TYPE_SHORT); -#endif - - //allocate from the requisite heap and copy the template over it. 
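Before the stub-generation changes below, a layout reminder (an illustration, not the LoaderHeap implementation): an interleaved heap pre-fills each code page with copies of the stub template and pairs it with an RW data page exactly one OS page later, so "copying the template" per stub is no longer needed and Initialize() only has to write the data slots. The helper name below is illustrative; GetOsPageSize() is the real VM function.

// code page: [stub 0][stub 1]...[stub N]   (RX, written once by GenerateCodePage)
// data page: [data 0][data 1]...[data N]   (RW, written by each stub's Initialize)
template <typename TStubData>
TStubData* StubDataFromCode(void* pStubCode)
{
    return (TStubData*)((BYTE*)pStubCode + GetOsPageSize());
}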
- DispatchHolder * holder = (DispatchHolder*) (void*) - dispatch_heap->AllocAlignedMem(dispatchHolderSize, CODE_SIZE_ALIGN); - -#ifdef TARGET_AMD64 - if (!DispatchHolder::CanShortJumpDispatchStubReachFailTarget(addrOfFail, (LPCBYTE)holder)) - { - m_fShouldAllocateLongJumpDispatchStubs = TRUE; - RETURN GenerateDispatchStub(addrOfCode, addrOfFail, pMTExpected, dispatchToken, pMayHaveReenteredCooperativeGCMode); - } -#endif - - ExecutableWriterHolder dispatchWriterHolder(holder, dispatchHolderSize); - dispatchWriterHolder.GetRW()->Initialize(holder, addrOfCode, - addrOfFail, - (size_t)pMTExpected -#ifdef TARGET_AMD64 - , DispatchStub::e_TYPE_SHORT -#endif - ); - -#ifdef FEATURE_CODE_VERSIONING - MethodDesc *pMD = MethodTable::GetMethodDescForSlotAddress(addrOfCode); - if (pMD->IsVersionableWithVtableSlotBackpatch()) - { - EntryPointSlots::SlotType slotType; - TADDR slot = holder->stub()->implTargetSlot(&slotType); - pMD->RecordAndBackpatchEntryPointSlot(m_loaderAllocator, slot, slotType); - - // RecordAndBackpatchEntryPointSlot() may exit and reenter cooperative GC mode - *pMayHaveReenteredCooperativeGCMode = true; - } -#endif - - ClrFlushInstructionCache(holder->stub(), holder->stub()->size()); - - AddToCollectibleVSDRangeList(holder); - - //incr our counters - stats.stub_mono_counter++; - stats.stub_space += (UINT32)dispatchHolderSize; - LOG((LF_STUBS, LL_INFO10000, "GenerateDispatchStub for token" FMT_ADDR "and pMT" FMT_ADDR "at" FMT_ADDR "\n", - DBG_ADDR(dispatchToken), DBG_ADDR(pMTExpected), DBG_ADDR(holder->stub()))); - -#ifdef FEATURE_PERFMAP - PerfMap::LogStubs(__FUNCTION__, "GenerateDispatchStub", (PCODE)holder->stub(), holder->stub()->size()); -#endif - - RETURN (holder); -} - -#ifdef TARGET_AMD64 -//---------------------------------------------------------------------------- -/* Generate a dispatcher stub, pMTExpected is the method table to burn in the stub, and the two addrOf's -are the addresses the stub is to transfer to depending on the test with pMTExpected -*/ -DispatchHolder *VirtualCallStubManager::GenerateDispatchStubLong(PCODE addrOfCode, - PCODE addrOfFail, - void * pMTExpected, - size_t dispatchToken, - bool * pMayHaveReenteredCooperativeGCMode) -{ - CONTRACT (DispatchHolder*) { - THROWS; - GC_TRIGGERS; - INJECT_FAULT(COMPlusThrowOM();); - PRECONDITION(addrOfCode != NULL); - PRECONDITION(addrOfFail != NULL); - PRECONDITION(CheckPointer(pMTExpected)); - PRECONDITION(pMayHaveReenteredCooperativeGCMode != nullptr); - PRECONDITION(!*pMayHaveReenteredCooperativeGCMode); - POSTCONDITION(CheckPointer(RETVAL)); - } CONTRACT_END; + size_t dispatchStubSize = DispatchStub::size(); //allocate from the requisite heap and copy the template over it. 
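For orientation while reading GenerateDispatchStub: the fixed DispatchStub template boils down to the following C-style sketch (the function is illustrative pseudocode for a few instructions of asm; the fields are the DispatchStubData members introduced by this change):

PCODE DispatchStubLogic(void* pThis, DispatchStubData* pData)
{
    // the first instruction dereferences 'this'; an AV there is converted to a NullReferenceException
    if (*(size_t*)pThis == pData->ExpectedMT)
        return pData->ImplTarget;   // monomorphic hit: jump straight to the method body
    return pData->FailTarget;       // miss: continue at the resolve stub's fail entry
}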
- size_t dispatchHolderSize = DispatchHolder::GetHolderSize(DispatchStub::e_TYPE_LONG); - DispatchHolder * holder = (DispatchHolder*) (void*)dispatch_heap->AllocAlignedMem(dispatchHolderSize, CODE_SIZE_ALIGN); - ExecutableWriterHolder dispatchWriterHolder(holder, dispatchHolderSize); + DispatchStub * pStub = (DispatchStub*) (void*) + dispatch_heap->AllocAlignedMem(dispatchStubSize, 1);// CODE_SIZE_ALIGN); - dispatchWriterHolder.GetRW()->Initialize(holder, addrOfCode, - addrOfFail, - (size_t)pMTExpected, - DispatchStub::e_TYPE_LONG); + pStub->Initialize(addrOfCode, addrOfFail, (size_t)pMTExpected); #ifdef FEATURE_CODE_VERSIONING MethodDesc *pMD = MethodTable::GetMethodDescForSlotAddress(addrOfCode); if (pMD->IsVersionableWithVtableSlotBackpatch()) { EntryPointSlots::SlotType slotType; - TADDR slot = holder->stub()->implTargetSlot(&slotType); + TADDR slot = pStub->implTargetSlot(&slotType); pMD->RecordAndBackpatchEntryPointSlot(m_loaderAllocator, slot, slotType); // RecordAndBackpatchEntryPointSlot() may exit and reenter cooperative GC mode @@ -2695,38 +2552,35 @@ DispatchHolder *VirtualCallStubManager::GenerateDispatchStubLong(PCODE } #endif - ClrFlushInstructionCache(holder->stub(), holder->stub()->size()); - - AddToCollectibleVSDRangeList(holder); + AddToCollectibleVSDRangeList(pStub); //incr our counters stats.stub_mono_counter++; - stats.stub_space += static_cast(DispatchHolder::GetHolderSize(DispatchStub::e_TYPE_LONG)); + stats.stub_space += (UINT32)dispatchStubSize; LOG((LF_STUBS, LL_INFO10000, "GenerateDispatchStub for token" FMT_ADDR "and pMT" FMT_ADDR "at" FMT_ADDR "\n", - DBG_ADDR(dispatchToken), DBG_ADDR(pMTExpected), DBG_ADDR(holder->stub()))); + DBG_ADDR(dispatchToken), DBG_ADDR(pMTExpected), DBG_ADDR(pStub))); #ifdef FEATURE_PERFMAP - PerfMap::LogStubs(__FUNCTION__, "GenerateDispatchStub", (PCODE)holder->stub(), holder->stub()->size()); + PerfMap::LogStubs(__FUNCTION__, "GenerateDispatchStub", (PCODE)pStub, pStub->size()); #endif - RETURN (holder); + RETURN (pStub); } -#endif //---------------------------------------------------------------------------- /* Generate a resolve stub for the given dispatchToken. 
addrOfResolver is where to go if the inline cache check misses addrOfPatcher is who to call if the fail piece is being called too often by dispacher stubs */ -ResolveHolder *VirtualCallStubManager::GenerateResolveStub(PCODE addrOfResolver, - PCODE addrOfPatcher, - size_t dispatchToken +ResolveStub *VirtualCallStubManager::GenerateResolveStub(PCODE addrOfResolver, + PCODE addrOfPatcher, + size_t dispatchToken #if defined(TARGET_X86) && !defined(UNIX_X86_ABI) - , size_t stackArgumentsSize + , size_t stackArgumentsSize #endif - ) + ) { - CONTRACT (ResolveHolder*) { + CONTRACT (ResolveStub*) { THROWS; GC_TRIGGERS; INJECT_FAULT(COMPlusThrowOM();); @@ -2739,85 +2593,40 @@ ResolveHolder *VirtualCallStubManager::GenerateResolveStub(PCODE addr _ASSERTE(addrOfResolver); - //get a counter for the fail piece - - UINT32 counter_index = counter_block::MAX_COUNTER_ENTRIES; - counter_block *cur_block = NULL; - - while (true) - { - cur_block = VolatileLoad(&m_cur_counter_block); - - if ((cur_block != NULL) && (cur_block->used < counter_block::MAX_COUNTER_ENTRIES)) - { - counter_index = FastInterlockIncrement((LONG*)&cur_block->used) - 1; - if (counter_index < counter_block::MAX_COUNTER_ENTRIES) - { - // Typical case we allocate the next free counter in the block - break; - } - } - - // Otherwise we have to create a new counter_block to serve as the head of m_cur_counter_block list - - // Create the new block in the main heap - counter_block *pNew = new counter_block; - - // Initialize the new block - pNew->next = cur_block; - pNew->used = 0; - - // Try to link in the new block - if (InterlockedCompareExchangeT(&m_cur_counter_block, pNew, cur_block) != cur_block) - { - // Lost a race to add pNew as new head - delete pNew; - } - } - - CONSISTENCY_CHECK(counter_index < counter_block::MAX_COUNTER_ENTRIES); - CONSISTENCY_CHECK(CheckPointer(cur_block)); - - // Initialize the default miss counter for this resolve stub - INT32* counterAddr = &(cur_block->block[counter_index]); - *counterAddr = STUB_MISS_COUNT_VALUE; - //allocate from the requisite heap and copy the templates for each piece over it. 
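With the counter_block machinery gone, each resolve stub now owns its miss counter in ResolveStubData, seeded with STUB_MISS_COUNT_VALUE below. The fail-entry policy reduces to the following sketch (illustrative name; the real fail path additionally flags the resolve worker to back patch the call site, and the back-patching path credits the counter back because one resolve stub can serve many dispatch stubs):

bool ShouldPromoteToResolveStub(ResolveStubData* pData)
{
    // every dispatch-stub miss pays one unit; running negative means the call
    // site is polymorphic enough to be rewired to the resolve entry point
    return --pData->Counter < 0;
}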
- ResolveHolder * holder = (ResolveHolder*) (void*) - resolve_heap->AllocAlignedMem(sizeof(ResolveHolder), CODE_SIZE_ALIGN); - ExecutableWriterHolder resolveWriterHolder(holder, sizeof(ResolveHolder)); + ResolveStub * pResolveStub = (ResolveStub*) (void*) + resolve_heap->AllocAlignedMem(sizeof(ResolveStub), 1);// CODE_SIZE_ALIGN); - resolveWriterHolder.GetRW()->Initialize(holder, + pResolveStub->Initialize( addrOfResolver, addrOfPatcher, dispatchToken, DispatchCache::HashToken(dispatchToken), - g_resolveCache->GetCacheBaseAddr(), counterAddr + g_resolveCache->GetCacheBaseAddr(), STUB_MISS_COUNT_VALUE #if defined(TARGET_X86) && !defined(UNIX_X86_ABI) , stackArgumentsSize #endif ); - ClrFlushInstructionCache(holder->stub(), holder->stub()->size()); - AddToCollectibleVSDRangeList(holder); + AddToCollectibleVSDRangeList(pResolveStub); //incr our counters stats.stub_poly_counter++; - stats.stub_space += sizeof(ResolveHolder)+sizeof(size_t); + stats.stub_space += sizeof(ResolveStub)+sizeof(size_t); LOG((LF_STUBS, LL_INFO10000, "GenerateResolveStub for token" FMT_ADDR "at" FMT_ADDR "\n", - DBG_ADDR(dispatchToken), DBG_ADDR(holder->stub()))); + DBG_ADDR(dispatchToken), DBG_ADDR(pResolveStub))); #ifdef FEATURE_PERFMAP - PerfMap::LogStubs(__FUNCTION__, "GenerateResolveStub", (PCODE)holder->stub(), holder->stub()->size()); + PerfMap::LogStubs(__FUNCTION__, "GenerateResolveStub", (PCODE)pResolveStub, pResolveStub->size()); #endif - RETURN (holder); + RETURN (pResolveStub); } //---------------------------------------------------------------------------- /* Generate a lookup stub for the given dispatchToken. addrOfResolver is where the stub always transfers control */ -LookupHolder *VirtualCallStubManager::GenerateLookupStub(PCODE addrOfResolver, size_t dispatchToken) +LookupStub *VirtualCallStubManager::GenerateLookupStub(PCODE addrOfResolver, size_t dispatchToken) { - CONTRACT (LookupHolder*) { + CONTRACT (LookupStub*) { THROWS; GC_TRIGGERS; INJECT_FAULT(COMPlusThrowOM();); @@ -2826,25 +2635,23 @@ LookupHolder *VirtualCallStubManager::GenerateLookupStub(PCODE addrOfResolver, s } CONTRACT_END; //allocate from the requisite heap and copy the template over it. 
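The lookup stub itself stays trivial; conceptually the fixed template does no more than this sketch (illustrative struct and function names, standing in for a register/stack hand-off and a tail jump):

struct LookupAction { size_t token; PCODE target; };

LookupAction LookupStubLogic(const LookupStubData* pData)
{
    // the dispatch token identifying the sought method travels to the resolve
    // worker, which the stub tail-jumps to
    return { pData->DispatchToken, pData->ResolveWorkerTarget };
}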
- LookupHolder * holder = (LookupHolder*) (void*) lookup_heap->AllocAlignedMem(sizeof(LookupHolder), CODE_SIZE_ALIGN); - ExecutableWriterHolder lookupWriterHolder(holder, sizeof(LookupHolder)); - - lookupWriterHolder.GetRW()->Initialize(holder, addrOfResolver, dispatchToken); - ClrFlushInstructionCache(holder->stub(), holder->stub()->size()); + LookupStub* pStub = (LookupStub*)(void*)lookup_heap->AllocAlignedMem(sizeof(LookupStub), 1); // CODE_SIZE_ALIGN); + pStub->Initialize(addrOfResolver, dispatchToken); + ClrFlushInstructionCache(pStub, pStub->size()); - AddToCollectibleVSDRangeList(holder); + AddToCollectibleVSDRangeList(pStub); //incr our counters stats.stub_lookup_counter++; - stats.stub_space += sizeof(LookupHolder); + stats.stub_space += sizeof(LookupStub); LOG((LF_STUBS, LL_INFO10000, "GenerateLookupStub for token" FMT_ADDR "at" FMT_ADDR "\n", - DBG_ADDR(dispatchToken), DBG_ADDR(holder->stub()))); + DBG_ADDR(dispatchToken), DBG_ADDR(pStub))); #ifdef FEATURE_PERFMAP - PerfMap::LogStubs(__FUNCTION__, "GenerateLookupStub", (PCODE)holder->stub(), holder->stub()->size()); + PerfMap::LogStubs(__FUNCTION__, "GenerateLookupStub", (PCODE)pStub, pStub->size()); #endif - RETURN (holder); + RETURN (pStub); } //---------------------------------------------------------------------------- @@ -4047,3 +3854,322 @@ BOOL VirtualCallStubManagerManager::TraceManager( // Forward the call to the appropriate manager. return pMgr->TraceManager(thread, trace, pContext, pRetAddr); } + + +#ifndef DACCESS_COMPILE + +//#include "asmconstants.h" + +#ifdef STUB_LOGGING +extern size_t g_lookup_inline_counter; +extern size_t g_call_inline_counter; +extern size_t g_miss_inline_counter; +extern size_t g_call_cache_counter; +extern size_t g_miss_cache_counter; +#endif + +void LookupStub::Initialize(PCODE resolveWorkerTarget, size_t dispatchToken) +{ + LookupStubData *pData = GetData(); + pData->DispatchToken = dispatchToken; + pData->ResolveWorkerTarget = resolveWorkerTarget; +} + +void DispatchStub::Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT) +{ + DispatchStubData *pData = GetData(); + pData->ExpectedMT = expectedMT; + pData->ImplTarget = implTarget; + pData->FailTarget = failTarget; +} + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + #define ENUM_PAGE_SIZE(size) \ + extern "C" void LookupStubCode##size(); \ + extern "C" void LookupStubCode##size##_End(); + + ENUM_PAGE_SIZES + #undef ENUM_PAGE_SIZE +#else +extern "C" void LookupStubCode(); +extern "C" void LookupStubCode_End(); +#endif + +#ifdef TARGET_X86 +extern "C" size_t LookupStubCode_DispatchToken_Offset; +extern "C" size_t LookupStubCode_ResolveWorkerTarget_Offset; + +#define SYMBOL_VALUE(name) ((size_t)&name) + +#endif + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) +void (*LookupStub::LookupStubCode)(); +#endif + +void LookupStub::InitStatic() +{ +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + int pageSize = GetOsPageSize(); + #define ENUM_PAGE_SIZE(size) \ + case size: \ + LookupStubCode = LookupStubCode##size; \ + _ASSERTE(((BYTE*)LookupStubCode##size##_End - (BYTE*)LookupStubCode##size) <= LookupStub::CodeSize); \ + break; + + switch (pageSize) + { + ENUM_PAGE_SIZES + default: + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); + } + #undef ENUM_PAGE_SIZE +#else + _ASSERTE(((BYTE*)LookupStubCode_End - (BYTE*)LookupStubCode) <= LookupStub::CodeSize); +#endif + _ASSERTE(VirtualCallStubManager::predictStubKind((PCODE)(void*)LookupStubCode) == 
VirtualCallStubManager::SK_LOOKUP); +} + +void LookupStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) +{ + int pageSize = GetOsPageSize(); + +#ifdef TARGET_X86 + int totalCodeSize = (pageSize / LookupStub::CodeSize) * LookupStub::CodeSize; + + for (int i = 0; i < pageSize; i += LookupStub::CodeSize) + { + memcpy(pageBase + i, (const void*)LookupStubCode, LookupStub::CodeSize); + + BYTE* pDispatchTokenSlot = pageBaseRX + i + pageSize + offsetof(LookupStubData, DispatchToken); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(LookupStubCode_DispatchToken_Offset)) = pDispatchTokenSlot; + + BYTE* pResolveWorkerTargetSlot = pageBaseRX + i + pageSize + offsetof(LookupStubData, ResolveWorkerTarget); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(LookupStubCode_ResolveWorkerTarget_Offset)) = pResolveWorkerTargetSlot; + } +#else // TARGET_X86 + FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)LookupStubCode), LookupStub::CodeSize, pageSize); +#endif // TARGET_X86 +} + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + #define ENUM_PAGE_SIZE(size) \ + extern "C" void DispatchStubCode##size(); \ + extern "C" void DispatchStubCode##size##_End(); \ + extern "C" void DispatchStubCode_ThisDeref##size(); + + ENUM_PAGE_SIZES + #undef ENUM_PAGE_SIZE +#else +extern "C" void DispatchStubCode(); +extern "C" void DispatchStubCode_End(); +#endif + +#ifdef TARGET_X86 +extern "C" size_t DispatchStubCode_ExpectedMT_Offset; +extern "C" size_t DispatchStubCode_ImplTarget_Offset; +extern "C" size_t DispatchStubCode_FailTarget_Offset; +#endif + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) +void (*DispatchStub::DispatchStubCode)(); +void (*DispatchStub::DispatchStubCode_ThisDeref)(); +#endif // TARGET_ARM64 && TARGET_UNIX + +void DispatchStub::InitStatic() +{ +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + int pageSize = GetOsPageSize(); + #define ENUM_PAGE_SIZE(size) \ + case size: \ + DispatchStubCode = DispatchStubCode##size; \ + DispatchStubCode_ThisDeref = DispatchStubCode_ThisDeref##size; \ + _ASSERTE(((BYTE*)DispatchStubCode##size##_End - (BYTE*)DispatchStubCode##size) <= DispatchStub::CodeSize); \ + break; + + switch (pageSize) + { + ENUM_PAGE_SIZES + default: + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); + } + #undef ENUM_PAGE_SIZE +#else + _ASSERTE(((BYTE*)DispatchStubCode_End - (BYTE*)DispatchStubCode) <= DispatchStub::CodeSize); +#endif + _ASSERTE(VirtualCallStubManager::predictStubKind((PCODE)(void*)DispatchStubCode) == VirtualCallStubManager::SK_DISPATCH); +} + +void DispatchStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) +{ + int pageSize = GetOsPageSize(); + +#ifdef TARGET_X86 + int totalCodeSize = (pageSize / DispatchStub::CodeSize) * DispatchStub::CodeSize; + for (int i = 0; i <= pageSize - DispatchStub::CodeSize; i += DispatchStub::CodeSize) + { + memcpy(pageBase + i, (const void*)DispatchStubCode, DispatchStub::CodeSize); + + BYTE* pExpectedMTSlot = pageBaseRX + i + pageSize + offsetof(DispatchStubData, ExpectedMT); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(DispatchStubCode_ExpectedMT_Offset)) = pExpectedMTSlot; + + BYTE* pImplTargetSlot = pageBaseRX + i + pageSize + offsetof(DispatchStubData, ImplTarget); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(DispatchStubCode_ImplTarget_Offset)) = pImplTargetSlot; + + BYTE* pFailTargetSlot = pageBaseRX + i + pageSize + offsetof(DispatchStubData, FailTarget); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(DispatchStubCode_FailTarget_Offset)) = pFailTargetSlot; + } +#else // TARGET_X86 + 
FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)DispatchStubCode), DispatchStub::CodeSize, pageSize); +#endif // TARGET_X86 +} + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + #define ENUM_PAGE_SIZE(size) \ + extern "C" void ResolveStubCode##size(); \ + extern "C" void ResolveStubCode##size##_End(); \ + extern "C" void ResolveStubCode_ResolveEntry##size(); \ + extern "C" void ResolveStubCode_FailEntry##size(); \ + extern "C" void ResolveStubCode_SlowEntry##size(); \ + extern "C" void ResolveStubCode_ThisDeref##size(); + ENUM_PAGE_SIZES + #undef ENUM_PAGE_SIZE +#else +extern "C" void ResolveStubCode(); +extern "C" void ResolveStubCode_End(); +extern "C" void ResolveStubCode_ResolveEntry(); +extern "C" void ResolveStubCode_FailEntry(); +extern "C" void ResolveStubCode_SlowEntry(); +extern "C" void ResolveStubCode_ThisDeref(); +#endif + +#ifdef TARGET_X86 +extern "C" size_t ResolveStubCode_Counter_Offset; +extern "C" size_t ResolveStubCode_HashedToken_Offset; +extern "C" size_t ResolveStubCode_CacheAddress_Offset; +extern "C" size_t ResolveStubCode_Token_Offset1; +extern "C" size_t ResolveStubCode_Token_Offset2; +extern "C" size_t ResolveStubCode_ResolveWorkerTarget_Offset; +extern "C" size_t ResolveStubCode_PatcherTarget_Offset; +#endif + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) +void (*ResolveStub::ResolveStubCode)(); +void (*ResolveStub::ResolveStubCode_FailEntry)(); +void (*ResolveStub::ResolveStubCode_SlowEntry)(); +void (*ResolveStub::ResolveStubCode_ResolveEntry)(); +void (*ResolveStub::ResolveStubCode_ThisDeref)(); +#endif // TARGET_ARM64 && TARGET_UNIX + +void ResolveStub::InitStatic() +{ +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + int pageSize = GetOsPageSize(); + #define ENUM_PAGE_SIZE(size) \ + case size: ResolveStubCode = ResolveStubCode##size; \ + ResolveStubCode_FailEntry = ResolveStubCode_FailEntry##size; \ + ResolveStubCode_SlowEntry = ResolveStubCode_SlowEntry##size; \ + ResolveStubCode_ResolveEntry = ResolveStubCode_ResolveEntry##size; \ + _ASSERTE(((BYTE*)ResolveStubCode##size##_End - (BYTE*)ResolveStubCode##size) <= ResolveStub::CodeSize); \ + break; + + switch (pageSize) + { + ENUM_PAGE_SIZES + default: + EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); + } + #undef ENUM_PAGE_SIZE +#else + _ASSERTE(((BYTE*)ResolveStubCode_End - (BYTE*)ResolveStubCode) <= ResolveStub::CodeSize); +#endif + _ASSERTE(VirtualCallStubManager::predictStubKind((PCODE)(void*)ResolveStubCode) == VirtualCallStubManager::SK_RESOLVE); +} + +void ResolveStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) +{ + int pageSize = GetOsPageSize(); + +#ifdef TARGET_X86 + int totalCodeSize = (pageSize / ResolveStub::CodeSize) * ResolveStub::CodeSize; + for (int i = 0; i <= pageSize - ResolveStub::CodeSize; i += ResolveStub::CodeSize) + { + memcpy(pageBase + i, (const void*)ResolveStubCode, ResolveStub::CodeSize); + + BYTE* pCounterSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, Counter); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_Counter_Offset)) = pCounterSlot; + + BYTE* pHashedTokenSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, HashedToken); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_HashedToken_Offset)) = pHashedTokenSlot; + + BYTE* pLookupCacheSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, CacheAddress); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_CacheAddress_Offset)) = pLookupCacheSlot; + + BYTE* pTokenSlot = pageBaseRX + i + pageSize + 
offsetof(ResolveStubData, Token); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_Token_Offset1)) = pTokenSlot; + + *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_Token_Offset2)) = pTokenSlot; + + BYTE* pResolveWorkerSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, ResolveWorkerTarget); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_ResolveWorkerTarget_Offset)) = pResolveWorkerSlot; + + BYTE* pBackpatcherSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, PatcherTarget); + *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_PatcherTarget_Offset)) = pBackpatcherSlot; + } +#else // TARGET_X86 + FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)ResolveStubCode), ResolveStub::CodeSize, pageSize); +#endif // TARGET_X86 +} + +void ResolveStub::Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 counterValue +#if defined(TARGET_X86) && !defined(UNIX_X86_ABI) + , size_t stackArgumentsSize +#endif + ) +{ + ResolveStubData *pData = GetData(); + + pData->CacheAddress = (size_t)cacheAddr; + pData->HashedToken = hashedToken << LOG2_PTRSIZE; + pData->Token = dispatchToken; + pData->Counter = counterValue; + pData->ResolveWorkerTarget = resolveWorkerTarget; +#ifdef TARGET_X86 + pData->PatcherTarget = patcherTarget; +#ifndef UNIX_X86_ABI + pData->StackArgumentsSize = stackArgumentsSize; +#endif +#endif +} + +LookupStub* LookupStub::FromLookupEntry(PCODE lookupEntry) +{ + LIMITED_METHOD_CONTRACT; + LookupStub* pLookupStub = (LookupStub*)PCODEToPINSTR(lookupEntry); + return pLookupStub; +} + +DispatchStub* DispatchStub::FromDispatchEntry(PCODE dispatchEntry) +{ + LIMITED_METHOD_CONTRACT; + DispatchStub* pDispatchStub = (DispatchStub*)PCODEToPINSTR(dispatchEntry); + return pDispatchStub; +} + +ResolveStub* ResolveStub::FromFailEntry(PCODE failEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveStub* pResolveStub = (ResolveStub*) (PCODEToPINSTR(failEntry) - ((BYTE*)ResolveStubCode_FailEntry - (BYTE*)ResolveStubCode)); + return pResolveStub; +} + +ResolveStub* ResolveStub::FromResolveEntry(PCODE resolveEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveStub* pResolveStub = (ResolveStub*) (PCODEToPINSTR(resolveEntry) - ((BYTE*)ResolveStubCode_ResolveEntry - (BYTE*)ResolveStubCode)); + return pResolveStub; +} + +#endif // DACCESS_COMPILE diff --git a/src/coreclr/vm/virtualcallstub.h b/src/coreclr/vm/virtualcallstub.h index ec1c99877e129..e9a9be18bdb76 100644 --- a/src/coreclr/vm/virtualcallstub.h +++ b/src/coreclr/vm/virtualcallstub.h @@ -33,9 +33,9 @@ class Entry; class Prober; class VirtualCallStubManager; class VirtualCallStubManagerManager; -struct LookupHolder; -struct DispatchHolder; -struct ResolveHolder; +struct LookupStub; +struct DispatchStub; +struct ResolveStub; struct VTableCallHolder; ///////////////////////////////////////////////////////////////////////////////////// @@ -159,7 +159,6 @@ extern "C" void BackPatchWorkerStaticStub(PCODE returnAddr, TADDR siteAddrForReg #endif // TARGET_UNIX #endif // TARGET_X86 - typedef VPTR(class VirtualCallStubManager) PTR_VirtualCallStubManager; // VirtualCallStubManager is the heart of the stub dispatch logic. 
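To recap the GenerateCodePage pattern shared by all three stubs above: targets with PC-relative addressing simply tile the page with byte-identical copies of the precompiled template, each copy picking up its own slots on the following data page, while x86 additionally patches the absolute data-slot addresses into every copy. A sketch of the tiling step, under the assumption that this is essentially what the shared FillStubCodePage helper does (the name TileCodePage is illustrative):

static void TileCodePage(BYTE* pageBase, const void* codeTemplate, int codeSize, int pageSize)
{
    for (int i = 0; i <= pageSize - codeSize; i += codeSize)
    {
        // identical copies; RIP/PC-relative loads make each one address its own
        // data slots at (copy address + pageSize) without per-copy fixups
        memcpy(pageBase + i, codeTemplate, codeSize);
    }
}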
See the book of the runtime entry @@ -280,17 +279,10 @@ class VirtualCallStubManager : public StubManager lookup_heap(NULL), dispatch_heap(NULL), resolve_heap(NULL), -#ifdef TARGET_AMD64 - m_fShouldAllocateLongJumpDispatchStubs(FALSE), -#endif lookups(NULL), cache_entries(NULL), dispatchers(NULL), resolvers(NULL), - m_counters(NULL), - m_cur_counter_block(NULL), - m_cur_counter_block_for_reclaim(NULL), - m_cur_counter_block_for_reclaim_index(NULL), m_pNext(NULL) { LIMITED_METHOD_CONTRACT; @@ -311,7 +303,7 @@ class VirtualCallStubManager : public StubManager }; // peek at the assembly code and predict which kind of a stub we have - StubKind predictStubKind(PCODE stubStartAddress); + static StubKind predictStubKind(PCODE stubStartAddress); /* know thine own stubs. It is possible that when multiple virtualcallstub managers are built that these may need to become @@ -483,7 +475,7 @@ class VirtualCallStubManager : public StubManager private: //allocate and initialize a stub of the desired kind - DispatchHolder *GenerateDispatchStub(PCODE addrOfCode, + DispatchStub *GenerateDispatchStub(PCODE addrOfCode, PCODE addrOfFail, void *pMTExpected, size_t dispatchToken, @@ -492,33 +484,33 @@ class VirtualCallStubManager : public StubManager #ifdef TARGET_AMD64 // Used to allocate a long jump dispatch stub. See comment around // m_fShouldAllocateLongJumpDispatchStubs for explaination. - DispatchHolder *GenerateDispatchStubLong(PCODE addrOfCode, + DispatchStub *GenerateDispatchStubLong(PCODE addrOfCode, PCODE addrOfFail, void *pMTExpected, size_t dispatchToken, bool *pMayHaveReenteredCooperativeGCMode); #endif - ResolveHolder *GenerateResolveStub(PCODE addrOfResolver, - PCODE addrOfPatcher, - size_t dispatchToken + ResolveStub *GenerateResolveStub(PCODE addrOfResolver, + PCODE addrOfPatcher, + size_t dispatchToken #if defined(TARGET_X86) && !defined(UNIX_X86_ABI) - , size_t stackArgumentsSize + , size_t stackArgumentsSize #endif - ); + ); - LookupHolder *GenerateLookupStub(PCODE addrOfResolver, + LookupStub *GenerateLookupStub(PCODE addrOfResolver, size_t dispatchToken); VTableCallHolder* GenerateVTableCallStub(DWORD slot); - template - void AddToCollectibleVSDRangeList(STUB_HOLDER *holder) + template + void AddToCollectibleVSDRangeList(STUB *pStub) { if (m_loaderAllocator->IsCollectible()) { - parentDomain->GetCollectibleVSDRanges()->AddRange(reinterpret_cast(holder->stub()), - reinterpret_cast(holder->stub()) + holder->stub()->size(), + parentDomain->GetCollectibleVSDRanges()->AddRange(reinterpret_cast(pStub), + reinterpret_cast(pStub) + pStub->size(), this); } } @@ -729,45 +721,12 @@ class VirtualCallStubManager : public StubManager PTR_LoaderHeap resolve_heap; // resolve stubs go here PTR_LoaderHeap vtable_heap; // vtable-based jump stubs go here -#ifdef TARGET_AMD64 - // When we layout the stub heaps, we put them close together in a sequential order - // so that we maximize performance with respect to branch predictions. On AMD64, - // dispatch stubs use a rel32 jump on failure to the resolve stub. This works for - // a while because of the ordering, but as soon as we have to start allocating more - // memory for either the dispatch or resolve heaps we have a chance that we'll be - // further away than a rel32 jump can reach, because we're in a 64-bit address - // space. 
As such, this flag will indicate when we allocate the first dispatch stub - // that cannot reach a resolve stub, and when this happens we'll switch over to - // allocating the larger version of the dispatch stub which contains an abs64 jump. - //@TODO: This is a bit of a workaround, but the limitations of LoaderHeap require that we - //@TODO: take this approach. Hopefully in Orcas we'll have a chance to rewrite LoaderHeap. - BOOL m_fShouldAllocateLongJumpDispatchStubs; // Defaults to FALSE. -#endif - BucketTable * lookups; // hash table of lookups keyed by tokens BucketTable * cache_entries; // hash table of dispatch token/target structs for dispatch cache BucketTable * dispatchers; // hash table of dispatching stubs keyed by tokens/actualtype BucketTable * resolvers; // hash table of resolvers keyed by tokens/resolverstub BucketTable * vtableCallers; // hash table of vtable call stubs keyed by slot values - // This structure is used to keep track of the fail counters. - // We only need one fail counter per ResolveStub, - // and most programs use less than 250 ResolveStubs - // We allocate these on the main heap using "new counter block" - struct counter_block - { - static const UINT32 MAX_COUNTER_ENTRIES = 256-2; // 254 counters should be enough for most cases. - - counter_block * next; // the next block - UINT32 used; // the index of the next free entry - INT32 block[MAX_COUNTER_ENTRIES]; // the counters - }; - - counter_block *m_counters; // linked list of counter blocks of failure counters - counter_block *m_cur_counter_block; // current block for updating counts - counter_block *m_cur_counter_block_for_reclaim; // current block for updating - UINT32 m_cur_counter_block_for_reclaim_index; // index into the current block for updating - // Used to keep track of all the VCSManager objects in the system. PTR_VirtualCallStubManager m_pNext; // Linked list pointer @@ -1059,6 +1018,358 @@ class Entry #include +//#ifdef TARGET_AMD64 +#pragma pack(push, 1) +// since we are placing code, we want byte packing of the structs + +// Codes of the instruction in the stub where the instruction access violation +// is converted to NullReferenceException at the caller site. +#ifdef UNIX_AMD64_ABI +#define X64_INSTR_CMP_IND_THIS_REG_RAX 0x073948 +#define X64_INSTR_MOV_RAX_IND_THIS_REG 0x078b48 +#else // UNIX_AMD64_ABI +#define X64_INSTR_CMP_IND_THIS_REG_RAX 0x013948 +#define X64_INSTR_MOV_RAX_IND_THIS_REG 0x018b48 +#endif // UNIX_AMD64_ABI + +#define USES_LOOKUP_STUBS 1 + +/********************************************************************************************* +Stubs that contain code are all part of larger structs called Holders. There is a +Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are +essentially an implementation trick that allowed rearranging the code sequences more +easily while trying out different alternatives, and for dealing with any alignment +issues in a way that was mostly immune to the actually code sequences. These Holders +should be revisited when the stub code sequences are fixed, since in many cases they +add extra space to a stub that is not really needed. + +Stubs are placed in cache and hash tables. Since unaligned access of data in memory +is very slow, the keys used in those tables should be aligned. The things used as keys +typically also occur in the generated code, e.g. a token as an immediate part of an instruction. 
+For now, to avoid alignment computations as different code strategies are tried out, the key +fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction +streams aligned so that the immediate fields fall on aligned boundaries. +*/ + +#if USES_LOOKUP_STUBS + +/*LookupStub************************************************************************************** +Virtual and interface call sites are initially setup to point at LookupStubs. +This is because the runtime type of the pointer is not yet known, +so the target cannot be resolved. Note: if the jit is able to determine the runtime type +of the pointer, it should be generating a direct call not a virtual or interface call. +This stub pushes a lookup token onto the stack to identify the sought after method, and then +jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and +transfer of control to the appropriate target method implementation, perhaps patching of the call site +along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs +get quickly changed to point to another kind of stub. +*/ +struct LookupStubData +{ + size_t DispatchToken; + PCODE ResolveWorkerTarget; +}; + +typedef DPTR(LookupStubData) PTR_LookupStubData; + +struct LookupStub +{ +#if defined(HOST_AMD64) + static const int CodeSize = 16; +#elif defined(HOST_X86) + static const int CodeSize = 16; +#elif defined(HOST_ARM64) + static const int CodeSize = 16; +#elif defined(HOST_ARM) + static const int CodeSize = 12; +#endif // HOST_AMD64 + + void Initialize(PCODE resolveWorkerTarget, size_t dispatchToken); + + static void InitStatic(); + + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this); } + inline size_t token() { LIMITED_METHOD_CONTRACT; return GetData()->DispatchToken; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } + + static LookupStub* FromLookupEntry(PCODE lookupEntry); + + static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); + +private: + PTR_LookupStubData GetData() const + { + return dac_cast(dac_cast(this) + GetOsPageSize()); + } + // The lookup entry point starts with a nop in order to allow us to quickly see + // if the stub is lookup stub or a dispatch stub. We can read thye first byte + // of a stub to find out what kind of a stub we have. + + BYTE _code[CodeSize]; + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + static void (*LookupStubCode)(); +#endif // TARGET_ARM64 && TARGET_UNIX) +}; + +#endif // USES_LOOKUP_STUBS + +/*DispatchStub************************************************************************************** +The structure of a full dispatch stub in memory is a DispatchStub followed contiguously in memory +by either a DispatchStubShort of a DispatchStubLong. DispatchStubShort is used when the resolve +stub (failTarget()) is reachable by a rel32 (DISPL) jump. We make a pretty good effort to make sure +that the stub heaps are set up so that this is the case. If we allocate enough stubs that the heap +end up allocating in a new block that is further away than a DISPL jump can go, then we end up using +a DispatchStubLong which is bigger but is a full 64-bit jump. */ + +/*DispatchStub************************************************************************************** +Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. +A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). 
+If the calling frame does in fact have the type be of the expected type, then +control is transfered to the target address, the method implementation. If not, +then control is transfered to the fail address, a fail stub (see below) where a polymorphic +lookup is done to find the correct address to go to. + +implementation note: Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. +Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched +to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important +that the branch prediction staticly predict this, which means it must be a forward jump. The alternative +is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" +is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier +to control the placement of the stubs than control the placement of the jitted code and the stubs. */ +struct DispatchStubData +{ + size_t ExpectedMT; + PCODE ImplTarget; + PCODE FailTarget; +}; + +typedef DPTR(DispatchStubData) PTR_DispatchStubData; + +/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of +stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both +are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, +since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently +(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify +alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. +While the token field can be logically gotten by following the failure target to the failEntryPoint +of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. +This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct +for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when +they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). +*/ + +/* @workaround for ee resolution - Since the EE does not currently have a resolver function that +does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are +using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable +is in fact written. Hence we have moved target out into the holder and aligned it so we can +atomically update it. 
When we get a resolver function that does what we want, we can drop this field, +and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ + +#if !(defined(TARGET_ARM64) && defined(TARGET_UNIX)) +extern "C" void DispatchStubCode(); +extern "C" void DispatchStubCode_ThisDeref(); +#endif // !(TARGET_ARM64 && TARGET_UNIX) + + +struct DispatchStub +{ +#if defined(HOST_AMD64) + static const int CodeSize = 24; +#elif defined(HOST_X86) + static const int CodeSize = 24; +#elif defined(HOST_ARM64) + static const int CodeSize = 32; +#elif defined(HOST_ARM) + static const int CodeSize = 24; +#endif // HOST_AMD64 + + void Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT); + + static void InitStatic(); + + static DispatchStub* FromDispatchEntry(PCODE dispatchEntry); + static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); + + inline PCODE entryPoint() const { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this); } + inline size_t expectedMT() const { LIMITED_METHOD_CONTRACT; return GetData()->ExpectedMT; } + inline static size_t size() { WRAPPER_NO_CONTRACT; return sizeof(DispatchStub); } + + inline static size_t offsetOfThisDeref() + { + LIMITED_METHOD_CONTRACT; + return (BYTE*)DispatchStubCode_ThisDeref - (BYTE*)DispatchStubCode; + } + + inline PCODE implTarget() const + { + LIMITED_METHOD_CONTRACT; + return GetData()->ImplTarget; + } + + inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(slotTypeRef != nullptr); + + *slotTypeRef = EntryPointSlots::SlotType_Normal; + return (TADDR)&GetData()->ImplTarget; + } + + inline PCODE failTarget() const + { + return GetData()->FailTarget; + } + +private: + BYTE code[CodeSize]; + + PTR_DispatchStubData GetData() const + { + return dac_cast(dac_cast(this) + GetOsPageSize()); + } + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + static void (*DispatchStubCode)(); + static void (*DispatchStubCode_ThisDeref)(); +#endif +}; + +/*ResolveStub************************************************************************************** +Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only +one resolver stub built for any given token, even though there may be many call sites that +use that token and many distinct types that are used in the calling call frames. A resolver stub +actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their +expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should +be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, +even though they are actually allocated as a single contiguous block of memory. These pieces are: + +A ResolveStub has two entry points: + +FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does +a check to see how often we are actually failing. If failures are frequent, control transfers to the +patch piece to cause the call site to be changed from a mostly monomorphic callsite +(calls dispatch stub) to a polymorphic callsize (calls resolve stub). If failures are rare, control +transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter +every time it is entered. The ee at various times will add a large chunk to the counter. 
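Stepping back to the DispatchStub definition just above: the ThisDeref label exposed through offsetOfThisDeref() marks the single instruction that dereferences 'this', which is presumably what lets the fault-handling code keep converting an access violation at exactly that instruction into a NullReferenceException. A hedged sketch of such a check (illustrative function; faultingIP and its caller are assumptions, the real logic lives in the EH code):

bool LooksLikeNullThisInDispatchStub(PCODE faultingIP, PCODE dispatchEntry)
{
    // a fault anywhere else in the stub is not the null-receiver case
    return faultingIP == dispatchEntry + DispatchStub::offsetOfThisDeref();
}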
+ +ResolveEntry - does a lookup via in a cache by hashing the actual type of the calling frame s + and the token identifying the (contract,method) pair desired. If found, control is transfered +to the method implementation. If not found in the cache, the token is pushed and the ee is entered via +the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since +there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. +The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, +as well as its speed. It turns out it is very important to make the hash function sensitive to all +of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before +making any changes to the code sequences here, it is very important to measure and tune them as perf +can vary greatly, in unexpected ways, with seeming minor changes. + +Implementation note - Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. +Note that this stub is called in highly polymorphic cases, but the cache should have been sized +and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should +mostly be going down the cache hit route, and it is important that this be statically predicted as so. +Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically +gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries +is important. 
*/ + +struct ResolveStubData +{ + size_t CacheAddress; + UINT32 HashedToken; + INT32 Counter; + size_t Token; + PCODE ResolveWorkerTarget; +#ifdef TARGET_X86 + PCODE PatcherTarget; +#ifndef UNIX_X86_ABI + size_t StackArgumentsSize; +#endif // UNIX_X86_ABI +#endif // TARGET_X86 +}; + +typedef DPTR(ResolveStubData) PTR_ResolveStubData; + +#if !(defined(TARGET_ARM64) && defined(TARGET_UNIX)) +extern "C" void ResolveStubCode(); +extern "C" void ResolveStubCode_FailEntry(); +extern "C" void ResolveStubCode_SlowEntry(); +extern "C" void ResolveStubCode_ResolveEntry(); +extern "C" void ResolveStubCode_ThisDeref(); +#endif // !(TARGET_ARM64 && TARGET_UNIX) + +struct ResolveStub +{ +#if defined(HOST_AMD64) + static const int CodeSize = 88; +#elif defined(HOST_X86) + static const int CodeSize = 88; +#elif defined(HOST_ARM64) + static const int CodeSize = 128; +#elif defined(HOST_ARM) + static const int CodeSize = 108; +#endif // HOST_AMD64 + + void Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 counterValue +#if defined(TARGET_X86) && !defined(UNIX_X86_ABI) + , size_t stackArgumentsSize +#endif + ); + + static void InitStatic(); + + static ResolveStub* FromFailEntry(PCODE resolveEntry); + static ResolveStub* FromResolveEntry(PCODE resolveEntry); + + static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); + + inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this + ((BYTE*)ResolveStubCode_FailEntry - (BYTE*)ResolveStubCode)); } + inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this + ((BYTE*)ResolveStubCode_ResolveEntry - (BYTE*)ResolveStubCode)); } + inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this + ((BYTE*)ResolveStubCode_SlowEntry - (BYTE*)ResolveStubCode)); } + + inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return &GetData()->Counter; } + inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return GetData()->HashedToken >> LOG2_PTRSIZE; } + inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return GetData()->CacheAddress; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return GetData()->Token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } + + inline static size_t offsetOfThisDeref() + { + LIMITED_METHOD_CONTRACT; + return (BYTE*)ResolveStubCode_ThisDeref - (BYTE*)ResolveStubCode_ResolveEntry; + } + +#if defined(TARGET_X86) && !defined(UNIX_X86_ABI) + inline size_t stackArgumentsSize() { LIMITED_METHOD_CONTRACT; return GetData()->StackArgumentsSize; } +#endif + +private: + PTR_ResolveStubData GetData() const + { + return dac_cast(dac_cast(this) + GetOsPageSize()); + } + + BYTE code[CodeSize]; + +#if defined(TARGET_ARM64) && defined(TARGET_UNIX) + static void (*ResolveStubCode)(); + static void (*ResolveStubCode_FailEntry)(); + static void (*ResolveStubCode_SlowEntry)(); + static void (*ResolveStubCode_ResolveEntry)(); + static void (*ResolveStubCode_ThisDeref)(); +#endif // TARGET_ARM64 && TARGET_UNIX +}; + +//#endif // TARGET_AMD64 +#pragma pack(pop) + #if USES_LOOKUP_STUBS /********************************************************************************************** LookupEntry wraps LookupStubs and provide the concrete implementation of the abstract class Entry. 
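Tying the ResolveStub fields above together: Initialize() stores HashedToken pre-shifted by LOG2_PTRSIZE, so the stub can mix it straight into a byte offset into the lookup cache without spending an extra shift on the hot path (the hashedToken() accessor undoes the scaling for callers). A simplified sketch of the probe, where ProbeResolveCache, hashOfMT (the hash the stub computes from the caller's MethodTable) and cacheMaskBytes (the cache-size mask already scaled to bytes) are assumptions for illustration:

void** ProbeResolveCache(ResolveStubData* pData, size_t hashOfMT, size_t cacheMaskBytes)
{
    // HashedToken already carries the << LOG2_PTRSIZE scaling applied by Initialize()
    size_t byteOffset = (hashOfMT ^ pData->HashedToken) & cacheMaskBytes;
    return (void**)(pData->CacheAddress + byteOffset);   // candidate cache element slot
}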
@@ -1092,7 +1403,7 @@ class LookupEntry : public Entry { LIMITED_METHOD_CONTRACT; _ASSERTE(VirtualCallStubManager::isLookupStubStatic((PCODE)contents)); - stub = LookupHolder::FromLookupEntry((PCODE)contents)->stub(); + stub = LookupStub::FromLookupEntry((PCODE)contents); } //extract the token of the underlying lookup stub @@ -1214,7 +1525,7 @@ class ResolveEntry : public Entry { LIMITED_METHOD_CONTRACT; _ASSERTE(VirtualCallStubManager::isResolvingStubStatic((PCODE)contents)); - stub = ResolveHolder::FromResolveEntry((PCODE)contents)->stub(); + stub = ResolveStub::FromResolveEntry((PCODE)contents); } //extract the token of the underlying resolve stub inline size_t Token() { WRAPPER_NO_CONTRACT; return stub ? (size_t)(stub->token()) : 0; } @@ -1252,7 +1563,7 @@ class DispatchEntry : public Entry { LIMITED_METHOD_CONTRACT; _ASSERTE(VirtualCallStubManager::isDispatchingStubStatic((PCODE)contents)); - stub = DispatchHolder::FromDispatchEntry((PCODE)contents)->stub(); + stub = DispatchStub::FromDispatchEntry((PCODE)contents); } //extract the fields of the underlying dispatch stub @@ -1264,8 +1575,8 @@ class DispatchEntry : public Entry WRAPPER_NO_CONTRACT; if (stub) { - ResolveHolder * resolveHolder = ResolveHolder::FromFailEntry(stub->failTarget()); - size_t token = resolveHolder->stub()->token(); + ResolveStub * pResolveStub = ResolveStub::FromFailEntry(stub->failTarget()); + size_t token = pResolveStub->token(); _ASSERTE(token == VirtualCallStubManager::GetTokenFromStub((PCODE)stub)); return token; } From 9114ae8fa6fcef750798a0edba95d8e5ee50c420 Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Tue, 22 Feb 2022 22:21:44 +0100 Subject: [PATCH 02/10] Disable executable allocator statistics by default --- src/coreclr/inc/executableallocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/inc/executableallocator.h b/src/coreclr/inc/executableallocator.h index eeb572837a82a..c229f5546aa7f 100644 --- a/src/coreclr/inc/executableallocator.h +++ b/src/coreclr/inc/executableallocator.h @@ -15,7 +15,7 @@ #ifndef DACCESS_COMPILE -#define LOG_EXECUTABLE_ALLOCATOR_STATISTICS +//#define LOG_EXECUTABLE_ALLOCATOR_STATISTICS // This class is responsible for allocation of all the executable memory in the runtime. 
class ExecutableAllocator From 9e5211ac0382042e8a6f7493556d8625bf35b3a6 Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Tue, 22 Feb 2022 23:05:04 +0100 Subject: [PATCH 03/10] Fix build with executable allocator logging disabled --- src/coreclr/inc/holder.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/inc/holder.h b/src/coreclr/inc/holder.h index a0ff213fd6031..88b7993a5cf92 100644 --- a/src/coreclr/inc/holder.h +++ b/src/coreclr/inc/holder.h @@ -945,12 +945,13 @@ FORCEINLINE void StubRelease(TYPE* value) { if (value) { +#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS #ifdef TARGET_UNIX LOGGER::LogUsage(__FILE__, __LINE__, __PRETTY_FUNCTION__); #else LOGGER::LogUsage(__FILE__, __LINE__, __FUNCTION__); #endif - +#endif // LOG_EXECUTABLE_ALLOCATOR_STATISTICS ExecutableWriterHolderNoLog stubWriterHolder(value, sizeof(TYPE)); stubWriterHolder.GetRW()->DecRef(); } From 5e0d202de05544050a394c6a9ed66871fabe0578 Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Wed, 23 Feb 2022 02:00:10 +0100 Subject: [PATCH 04/10] Fix Windows ARM/ARM64 and macOS x64 builds --- src/coreclr/vm/amd64/thunktemplates.S | 4 ++-- src/coreclr/vm/arm/thunktemplates.asm | 2 -- src/coreclr/vm/arm64/thunktemplates.asm | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/coreclr/vm/amd64/thunktemplates.S b/src/coreclr/vm/amd64/thunktemplates.S index d2b1cea9108c2..f1efc137aad49 100644 --- a/src/coreclr/vm/amd64/thunktemplates.S +++ b/src/coreclr/vm/amd64/thunktemplates.S @@ -7,7 +7,7 @@ PAGE_SIZE = 4096 -#define DATA_SLOT(stub, field) (stub##Code + PAGE_SIZE + stub##Data__##field) +#define DATA_SLOT(stub, field) C_FUNC(stub##Code) + PAGE_SIZE + stub##Data__##field LEAF_ENTRY StubPrecodeCode, _TEXT mov r10, [rip + DATA_SLOT(StubPrecode, MethodDesc)] @@ -67,7 +67,7 @@ PATCH_LABEL ResolveStubCode_ThisDeref jmp QWORD PTR [rax+0x10] PATCH_LABEL ResolveStubCode_FailEntry add DWORD PTR [rip + DATA_SLOT(ResolveStub, Counter)], -1 - jge Resolve + jge LOCAL_LABEL(Resolve) or r11, 1 // SDF_ResolveBackPatch PATCH_LABEL ResolveStubCode_SlowEntry push rdx diff --git a/src/coreclr/vm/arm/thunktemplates.asm b/src/coreclr/vm/arm/thunktemplates.asm index f8ae8043903db..9a8d655a1e305 100644 --- a/src/coreclr/vm/arm/thunktemplates.asm +++ b/src/coreclr/vm/arm/thunktemplates.asm @@ -10,8 +10,6 @@ ALIGN 4 - PAGE_SIZE = 4096 - #define DATA_SLOT(stub, field) stub##Code + PAGE_SIZE + stub##Data__##field LEAF_ENTRY StubPrecodeCode diff --git a/src/coreclr/vm/arm64/thunktemplates.asm b/src/coreclr/vm/arm64/thunktemplates.asm index 812791d6e0a38..5c1aaaabc0406 100644 --- a/src/coreclr/vm/arm64/thunktemplates.asm +++ b/src/coreclr/vm/arm64/thunktemplates.asm @@ -5,8 +5,6 @@ #include "asmconstants.h" #include "asmmacros.h" -PAGE_SIZE = 4096 - #define DATA_SLOT(stub, field) (stub##Code + PAGE_SIZE + stub##Data__##field) LEAF_ENTRY StubPrecodeCode From ad97eed2ff2554e80d87c4605ab8d0cc1c965eca Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Thu, 24 Feb 2022 14:40:52 +0100 Subject: [PATCH 05/10] Reflect PR feedback and few fixes * Change the CallCountingStub to not to use return address of an unbalanced call as a stub identifying token. The counter address is used instead on all targets. * Fix some tabs instead of spaces * Fix getTargetMethodDesc - in some cases, we get address of the start of the FixupPrecode too. 
* Remove a leftover comment --- src/coreclr/utilcode/executableallocator.cpp | 110 +++++++++---------- src/coreclr/vm/amd64/AsmHelpers.asm | 10 +- src/coreclr/vm/amd64/thunktemplates.S | 2 +- src/coreclr/vm/amd64/thunktemplates.asm | 2 +- src/coreclr/vm/amd64/unixasmhelpers.S | 10 +- src/coreclr/vm/arm/thunktemplates.S | 1 - src/coreclr/vm/arm/thunktemplates.asm | 1 - src/coreclr/vm/arm64/asmhelpers.S | 2 +- src/coreclr/vm/arm64/asmhelpers.asm | 2 +- src/coreclr/vm/arm64/thunktemplates.S | 5 +- src/coreclr/vm/arm64/thunktemplates.asm | 5 +- src/coreclr/vm/callcounting.h | 65 +++++------ src/coreclr/vm/gccover.cpp | 15 ++- src/coreclr/vm/i386/asmhelpers.S | 10 +- src/coreclr/vm/i386/asmhelpers.asm | 8 +- src/coreclr/vm/i386/thunktemplates.S | 3 +- src/coreclr/vm/i386/thunktemplates.asm | 3 +- src/coreclr/vm/loaderallocator.cpp | 6 - 18 files changed, 114 insertions(+), 146 deletions(-) diff --git a/src/coreclr/utilcode/executableallocator.cpp b/src/coreclr/utilcode/executableallocator.cpp index 2f094739618f0..197ce6e8e6929 100644 --- a/src/coreclr/utilcode/executableallocator.cpp +++ b/src/coreclr/utilcode/executableallocator.cpp @@ -426,43 +426,43 @@ void ExecutableAllocator::Release(void* pRX) if (IsDoubleMappingEnabled()) { - CRITSEC_Holder csh(m_CriticalSection); - - // Locate the RX block corresponding to the pRX and remove it from the linked list - BlockRX* pBlock; - BlockRX* pPrevBlock = NULL; - - for (pBlock = m_pFirstBlockRX; pBlock != NULL; pBlock = pBlock->next) - { - if (pRX == pBlock->baseRX) - { - if (pPrevBlock == NULL) - { - m_pFirstBlockRX = pBlock->next; - } - else - { - pPrevBlock->next = pBlock->next; - } - - break; - } - pPrevBlock = pBlock; - } - - if (pBlock != NULL) - { + CRITSEC_Holder csh(m_CriticalSection); + + // Locate the RX block corresponding to the pRX and remove it from the linked list + BlockRX* pBlock; + BlockRX* pPrevBlock = NULL; + + for (pBlock = m_pFirstBlockRX; pBlock != NULL; pBlock = pBlock->next) + { + if (pRX == pBlock->baseRX) + { + if (pPrevBlock == NULL) + { + m_pFirstBlockRX = pBlock->next; + } + else + { + pPrevBlock->next = pBlock->next; + } + + break; + } + pPrevBlock = pBlock; + } + + if (pBlock != NULL) + { VMToOSInterface::ReleaseDoubleMappedMemory(m_doubleMemoryMapperHandle, pRX, pBlock->offset, pBlock->size); // Put the released block into the free block list - pBlock->baseRX = NULL; - pBlock->next = m_pFirstFreeBlockRX; - m_pFirstFreeBlockRX = pBlock; - } - else - { - // The block was not found, which should never happen. - g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("The RX block to release was not found")); - } + pBlock->baseRX = NULL; + pBlock->next = m_pFirstFreeBlockRX; + m_pFirstFreeBlockRX = pBlock; + } + else + { + // The block was not found, which should never happen. 
+ g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("The RX block to release was not found")); + } } else { @@ -573,9 +573,9 @@ void* ExecutableAllocator::ReserveWithinRange(size_t size, const void* loAddress bool isFreeBlock; BlockRX* block = AllocateBlock(size, &isFreeBlock); if (block == NULL) - { - return NULL; - } + { + return NULL; + } void *result = VMToOSInterface::ReserveDoubleMappedMemory(m_doubleMemoryMapperHandle, block->offset, size, loAddress, hiAddress); @@ -660,14 +660,14 @@ void* ExecutableAllocator::Reserve(size_t size) { if (IsDoubleMappingEnabled()) { - CRITSEC_Holder csh(m_CriticalSection); + CRITSEC_Holder csh(m_CriticalSection); - bool isFreeBlock; + bool isFreeBlock; BlockRX* block = AllocateBlock(size, &isFreeBlock); - if (block == NULL) - { - return NULL; - } + if (block == NULL) + { + return NULL; + } result = (BYTE*)VMToOSInterface::ReserveDoubleMappedMemory(m_doubleMemoryMapperHandle, block->offset, size, 0, 0); @@ -753,7 +753,7 @@ void* ExecutableAllocator::MapRW(void* pRX, size_t size) } #ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS - StopWatch swAll(&g_mapTimeWithLockSum); + StopWatch swAll(&g_mapTimeWithLockSum); #endif CRITSEC_Holder csh(m_CriticalSection); @@ -762,13 +762,13 @@ void* ExecutableAllocator::MapRW(void* pRX, size_t size) StopWatch sw(&g_mapTimeSum); #endif - void* result = FindRWBlock(pRX, size); - if (result != NULL) - { - return result; - } + void* result = FindRWBlock(pRX, size); + if (result != NULL) + { + return result; + } #ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS - StopWatch sw2(&g_mapFindRXTimeSum); + StopWatch sw2(&g_mapFindRXTimeSum); #endif for (BlockRX* pBlock = m_pFirstBlockRX; pBlock != NULL; pBlock = pBlock->next) @@ -836,10 +836,10 @@ void ExecutableAllocator::UnmapRW(void* pRW) void* unmapAddress = NULL; size_t unmapSize; - if (!RemoveRWBlock(pRW, &unmapAddress, &unmapSize)) - { - g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("The RW block to unmap was not found")); - } + if (!RemoveRWBlock(pRW, &unmapAddress, &unmapSize)) + { + g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("The RW block to unmap was not found")); + } if (unmapAddress && !VMToOSInterface::ReleaseRWMapping(unmapAddress, unmapSize)) { diff --git a/src/coreclr/vm/amd64/AsmHelpers.asm b/src/coreclr/vm/amd64/AsmHelpers.asm index 273f8173415c6..90b3dc62faefa 100644 --- a/src/coreclr/vm/amd64/AsmHelpers.asm +++ b/src/coreclr/vm/amd64/AsmHelpers.asm @@ -697,13 +697,7 @@ ifdef FEATURE_TIERED_COMPILATION extern OnCallCountThresholdReached:proc -LEAF_ENTRY OnCallCountThresholdReachedStub, _TEXT - ; Pop the return address (the stub-identifying token) into a non-argument volatile register that can be trashed - pop rax - jmp OnCallCountThresholdReachedStub2 -LEAF_END OnCallCountThresholdReachedStub, _TEXT - -NESTED_ENTRY OnCallCountThresholdReachedStub2, _TEXT +NESTED_ENTRY OnCallCountThresholdReachedStub, _TEXT PROLOG_WITH_TRANSITION_BLOCK lea rcx, [rsp + __PWTB_TransitionBlock] ; TransitionBlock * @@ -712,7 +706,7 @@ NESTED_ENTRY OnCallCountThresholdReachedStub2, _TEXT EPILOG_WITH_TRANSITION_BLOCK_TAILCALL TAILJMP_RAX -NESTED_END OnCallCountThresholdReachedStub2, _TEXT +NESTED_END OnCallCountThresholdReachedStub, _TEXT endif ; FEATURE_TIERED_COMPILATION diff --git a/src/coreclr/vm/amd64/thunktemplates.S b/src/coreclr/vm/amd64/thunktemplates.S index f1efc137aad49..cf248410b417b 100644 --- a/src/coreclr/vm/amd64/thunktemplates.S +++ b/src/coreclr/vm/amd64/thunktemplates.S @@ -27,7 +27,7 @@ LEAF_ENTRY CallCountingStubCode, _TEXT je LOCAL_LABEL(CountReachedZero) jmp QWORD PTR [rip + 
DATA_SLOT(CallCountingStub, TargetForMethod)] LOCAL_LABEL(CountReachedZero): - call QWORD PTR [rip + DATA_SLOT(CallCountingStub, TargetForThresholdReached)] + jmp QWORD PTR [rip + DATA_SLOT(CallCountingStub, TargetForThresholdReached)] LEAF_END_MARKED CallCountingStubCode, _TEXT LEAF_ENTRY LookupStubCode, _TEXT diff --git a/src/coreclr/vm/amd64/thunktemplates.asm b/src/coreclr/vm/amd64/thunktemplates.asm index 076e957f2b552..d003f5682098a 100644 --- a/src/coreclr/vm/amd64/thunktemplates.asm +++ b/src/coreclr/vm/amd64/thunktemplates.asm @@ -28,7 +28,7 @@ LEAF_ENTRY CallCountingStubCode, _TEXT je CountReachedZero jmp QWORD PTR [DATA_SLOT(CallCountingStub, TargetForMethod)] CountReachedZero: - call QWORD PTR [DATA_SLOT(CallCountingStub, TargetForThresholdReached)] + jmp QWORD PTR [DATA_SLOT(CallCountingStub, TargetForThresholdReached)] LEAF_END_MARKED CallCountingStubCode, _TEXT LEAF_ENTRY LookupStubCode, _TEXT diff --git a/src/coreclr/vm/amd64/unixasmhelpers.S b/src/coreclr/vm/amd64/unixasmhelpers.S index 7848d068a82cf..4711ee9857f2c 100644 --- a/src/coreclr/vm/amd64/unixasmhelpers.S +++ b/src/coreclr/vm/amd64/unixasmhelpers.S @@ -207,13 +207,7 @@ LEAF_END SinglecastDelegateInvokeStub, _TEXT #ifdef FEATURE_TIERED_COMPILATION -LEAF_ENTRY OnCallCountThresholdReachedStub, _TEXT - // Pop the return address (the stub-identifying token) into a non-argument volatile register that can be trashed - pop rax - jmp C_FUNC(OnCallCountThresholdReachedStub2) -LEAF_END OnCallCountThresholdReachedStub, _TEXT - -NESTED_ENTRY OnCallCountThresholdReachedStub2, _TEXT, NoHandler +NESTED_ENTRY OnCallCountThresholdReachedStub, _TEXT, NoHandler PROLOG_WITH_TRANSITION_BLOCK lea rdi, [rsp + __PWTB_TransitionBlock] // TransitionBlock * @@ -222,6 +216,6 @@ NESTED_ENTRY OnCallCountThresholdReachedStub2, _TEXT, NoHandler EPILOG_WITH_TRANSITION_BLOCK_TAILCALL TAILJMP_RAX -NESTED_END OnCallCountThresholdReachedStub2, _TEXT +NESTED_END OnCallCountThresholdReachedStub, _TEXT #endif // FEATURE_TIERED_COMPILATION diff --git a/src/coreclr/vm/arm/thunktemplates.S b/src/coreclr/vm/arm/thunktemplates.S index e2b4dda34c0ea..a50365eebff0c 100644 --- a/src/coreclr/vm/arm/thunktemplates.S +++ b/src/coreclr/vm/arm/thunktemplates.S @@ -38,7 +38,6 @@ PAGE_SIZE = 4096 beq LOCAL_LABEL(CountReachedZero) ldr pc, DATA_SLOT(CallCountingStub, TargetForMethod) LOCAL_LABEL(CountReachedZero): - adr r12, CallCountingStubCode ldr pc, DATA_SLOT(CallCountingStub, TargetForThresholdReached) LEAF_END_MARKED CallCountingStubCode diff --git a/src/coreclr/vm/arm/thunktemplates.asm b/src/coreclr/vm/arm/thunktemplates.asm index 9a8d655a1e305..37f0c54c36470 100644 --- a/src/coreclr/vm/arm/thunktemplates.asm +++ b/src/coreclr/vm/arm/thunktemplates.asm @@ -37,7 +37,6 @@ beq CountReachedZero ldr pc, DATA_SLOT(CallCountingStub, TargetForMethod) CountReachedZero - adr r12, CallCountingStubCode ldr pc, DATA_SLOT(CallCountingStub, TargetForThresholdReached) LEAF_END_MARKED CallCountingStubCode diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index 972529a44beca..6baca17be8bb6 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -957,7 +957,7 @@ NESTED_ENTRY OnCallCountThresholdReachedStub, _TEXT, NoHandler PROLOG_WITH_TRANSITION_BLOCK add x0, sp, #__PWTB_TransitionBlock // TransitionBlock * - mov x1, x10 // stub-identifying token + mov x1, x9 // stub-identifying token bl C_FUNC(OnCallCountThresholdReached) mov x9, x0 diff --git a/src/coreclr/vm/arm64/asmhelpers.asm 
b/src/coreclr/vm/arm64/asmhelpers.asm index 398ff0091fa29..8c9523916d379 100644 --- a/src/coreclr/vm/arm64/asmhelpers.asm +++ b/src/coreclr/vm/arm64/asmhelpers.asm @@ -1370,7 +1370,7 @@ __HelperNakedFuncName SETS "$helper":CC:"Naked" PROLOG_WITH_TRANSITION_BLOCK add x0, sp, #__PWTB_TransitionBlock ; TransitionBlock * - mov x1, x10 ; stub-identifying token + mov x1, x9 ; stub-identifying token bl OnCallCountThresholdReached mov x9, x0 diff --git a/src/coreclr/vm/arm64/thunktemplates.S b/src/coreclr/vm/arm64/thunktemplates.S index a047c9949d197..91d36ed080f52 100644 --- a/src/coreclr/vm/arm64/thunktemplates.S +++ b/src/coreclr/vm/arm64/thunktemplates.S @@ -32,9 +32,8 @@ LOCAL_LABEL(StubStart\PAGE_SIZE): ldr x9, DATA_SLOT(CallCountingStub, TargetForMethod) br x9 LOCAL_LABEL(CountReachedZero\PAGE_SIZE): - adr x10, LOCAL_LABEL(StubStart\PAGE_SIZE) - ldr x9, DATA_SLOT(CallCountingStub, TargetForThresholdReached) - br x9 + ldr x10, DATA_SLOT(CallCountingStub, TargetForThresholdReached) + br x10 LEAF_END_MARKED CallCountingStubCode\PAGE_SIZE diff --git a/src/coreclr/vm/arm64/thunktemplates.asm b/src/coreclr/vm/arm64/thunktemplates.asm index 5c1aaaabc0406..b1c31c7ca3232 100644 --- a/src/coreclr/vm/arm64/thunktemplates.asm +++ b/src/coreclr/vm/arm64/thunktemplates.asm @@ -30,9 +30,8 @@ ldr x9, DATA_SLOT(CallCountingStub, TargetForMethod) br x9 CountReachedZero - adr x10, CallCountingStubCode - ldr x9, DATA_SLOT(CallCountingStub, TargetForThresholdReached) - br x9 + ldr x10, DATA_SLOT(CallCountingStub, TargetForThresholdReached) + br x10 LEAF_END_MARKED CallCountingStubCode diff --git a/src/coreclr/vm/callcounting.h b/src/coreclr/vm/callcounting.h index 1e28c3448e47b..089702e066cb1 100644 --- a/src/coreclr/vm/callcounting.h +++ b/src/coreclr/vm/callcounting.h @@ -91,16 +91,12 @@ class CallCountingStub public: #if defined(TARGET_AMD64) static const int CodeSize = 24; - static const int StubIdentifyingTokenOffset = 24; #elif defined(TARGET_X86) static const int CodeSize = 24; - static const int StubIdentifyingTokenOffset = 22; #elif defined(TARGET_ARM64) static const int CodeSize = 40; - static const int StubIdentifyingTokenOffset = 0; #elif defined(TARGET_ARM) static const int CodeSize = 32; - static const int StubIdentifyingTokenOffset = 0; #endif private: @@ -158,34 +154,6 @@ class CallCountingStub DISABLE_COPY(CallCountingStub); }; -//////////////////////////////////////////////////////////////// -// CallCountingStub definitions - -#ifndef DACCESS_COMPILE -inline const CallCountingStub *CallCountingStub::From(TADDR stubIdentifyingToken) -{ - WRAPPER_NO_CONTRACT; - _ASSERTE(stubIdentifyingToken != NULL); - - const CallCountingStub *stub = (const CallCountingStub *)(stubIdentifyingToken - StubIdentifyingTokenOffset); - - _ASSERTE(IS_ALIGNED(stub, Alignment)); - return stub; -} -#endif - -inline PTR_CallCount CallCountingStub::GetRemainingCallCountCell() const -{ - WRAPPER_NO_CONTRACT; - return GetData()->RemainingCallCountCell; -} - -inline PCODE CallCountingStub::GetTargetForMethod() const -{ - WRAPPER_NO_CONTRACT; - return GetData()->TargetForMethod; -} - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // CallCountingManager @@ -398,6 +366,10 @@ class CallCountingManager public: static void StopAndDeleteAllCallCountingStubs(); + static const CallCountingStub* GetCallCountingStub(CallCount *pCallCount) + { + return CallCountingInfo::From(pCallCount)->GetCallCountingStub(); + } private: static void 
StopAllCallCounting(TieredCompilationManager *tieredCompilationManager); static void DeleteAllCallCountingStubs(); @@ -414,6 +386,35 @@ class CallCountingManager DISABLE_COPY(CallCountingManager); }; +//////////////////////////////////////////////////////////////// +// CallCountingStub definitions + +#ifndef DACCESS_COMPILE +inline const CallCountingStub *CallCountingStub::From(TADDR stubIdentifyingToken) +{ + WRAPPER_NO_CONTRACT; + _ASSERTE(stubIdentifyingToken != NULL); + + // The stubIdentifyingToken is the pointer to the CallCount + const CallCountingStub *stub = CallCountingManager::GetCallCountingStub((CallCount*)stubIdentifyingToken); + + _ASSERTE(IS_ALIGNED(stub, Alignment)); + return stub; +} +#endif + +inline PTR_CallCount CallCountingStub::GetRemainingCallCountCell() const +{ + WRAPPER_NO_CONTRACT; + return GetData()->RemainingCallCountCell; +} + +inline PCODE CallCountingStub::GetTargetForMethod() const +{ + WRAPPER_NO_CONTRACT; + return GetData()->TargetForMethod; +} + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // CallCountingManager::CallCountingStubManager diff --git a/src/coreclr/vm/gccover.cpp b/src/coreclr/vm/gccover.cpp index 9499b3efcd65b..b0e6aa953b4e4 100644 --- a/src/coreclr/vm/gccover.cpp +++ b/src/coreclr/vm/gccover.cpp @@ -74,14 +74,17 @@ static MethodDesc* getTargetMethodDesc(PCODE target) if (PrecodeStubManager::g_pManager->GetFixupPrecodeRangeList()->IsInRange(target)) { - // If the target slot points to the fixup part of the stub, the actual - // stub starts FixupPrecode::FixupCodeOffset bytes below the target, - // so we need to compensate for it. - target -= FixupPrecode::FixupCodeOffset; if (!FixupPrecode::IsFixupPrecodeByASM(target)) { - _ASSERTE(FALSE); // We should never get other precode type here - return nullptr; + // If the target slot points to the fixup part of the stub, the actual + // stub starts FixupPrecode::FixupCodeOffset bytes below the target, + // so we need to compensate for it. + target -= FixupPrecode::FixupCodeOffset; + if (!FixupPrecode::IsFixupPrecodeByASM(target)) + { + _ASSERTE(!"Invalid FixupPrecode address"); // We should never get other precode type here + return nullptr; + } } return (MethodDesc*)((FixupPrecode*)PCODEToPINSTR(target))->GetMethodDesc(); diff --git a/src/coreclr/vm/i386/asmhelpers.S b/src/coreclr/vm/i386/asmhelpers.S index ffaa33831a85e..7a620dd0c1f7a 100644 --- a/src/coreclr/vm/i386/asmhelpers.S +++ b/src/coreclr/vm/i386/asmhelpers.S @@ -1197,13 +1197,7 @@ NESTED_END JIT_ProfilerEnterLeaveTailcallStub, _TEXT #ifdef FEATURE_TIERED_COMPILATION -LEAF_ENTRY OnCallCountThresholdReachedStub, _TEXT - // Pop the return address (the stub-identifying token) into a non-argument volatile register that can be trashed - pop eax - jmp C_FUNC(OnCallCountThresholdReachedStub2) -LEAF_END OnCallCountThresholdReachedStub, _TEXT - -NESTED_ENTRY OnCallCountThresholdReachedStub2, _TEXT, NoHandler +NESTED_ENTRY OnCallCountThresholdReachedStub, _TEXT, NoHandler STUB_PROLOG mov esi, esp @@ -1226,6 +1220,6 @@ NESTED_ENTRY OnCallCountThresholdReachedStub2, _TEXT, NoHandler // This will never be executed. It is just to help out stack-walking logic // which disassembles the epilog to unwind the stack. 
ret -NESTED_END OnCallCountThresholdReachedStub2, _TEXT +NESTED_END OnCallCountThresholdReachedStub, _TEXT #endif // FEATURE_TIERED_COMPILATION diff --git a/src/coreclr/vm/i386/asmhelpers.asm b/src/coreclr/vm/i386/asmhelpers.asm index c73a6d8da9051..20cfa31a7556f 100644 --- a/src/coreclr/vm/i386/asmhelpers.asm +++ b/src/coreclr/vm/i386/asmhelpers.asm @@ -1504,12 +1504,6 @@ ifdef FEATURE_TIERED_COMPILATION EXTERN _OnCallCountThresholdReached@8:proc _OnCallCountThresholdReachedStub@0 proc public - ; Pop the return address (the stub-identifying token) into a non-argument volatile register that can be trashed - pop eax - jmp _OnCallCountThresholdReachedStub2@0 -_OnCallCountThresholdReachedStub@0 endp - -_OnCallCountThresholdReachedStub2@0 proc public STUB_PROLOG mov esi, esp @@ -1524,7 +1518,7 @@ _OnCallCountThresholdReachedStub2@0 proc public ; This will never be executed. It is just to help out stack-walking logic ; which disassembles the epilog to unwind the stack. ret -_OnCallCountThresholdReachedStub2@0 endp +_OnCallCountThresholdReachedStub@0 endp endif ; FEATURE_TIERED_COMPILATION diff --git a/src/coreclr/vm/i386/thunktemplates.S b/src/coreclr/vm/i386/thunktemplates.S index e9bd6db96beb4..5ca0d1767fc31 100644 --- a/src/coreclr/vm/i386/thunktemplates.S +++ b/src/coreclr/vm/i386/thunktemplates.S @@ -52,9 +52,8 @@ SLOT_ADDRESS_PATCH_LABEL CallCountingStub, RemainingCallCountCell INDJMP DATA_SLOT(CallCountingStub, TargetForMethod) SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForMethod LOCAL_LABEL(CountReachedZero): - INDCALL DATA_SLOT(CallCountingStub, TargetForThresholdReached) + INDJMP DATA_SLOT(CallCountingStub, TargetForThresholdReached) SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForThresholdReached - int 3 LEAF_END_MARKED CallCountingStubCode LEAF_ENTRY LookupStubCode diff --git a/src/coreclr/vm/i386/thunktemplates.asm b/src/coreclr/vm/i386/thunktemplates.asm index 59ccdb18fe27e..e133865abc59c 100644 --- a/src/coreclr/vm/i386/thunktemplates.asm +++ b/src/coreclr/vm/i386/thunktemplates.asm @@ -53,9 +53,8 @@ SLOT_ADDRESS_PATCH_LABEL CallCountingStub, RemainingCallCountCell jmp dword ptr DATA_SLOT(CallCountingStub, TargetForMethod) SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForMethod CountReachedZero: - call dword ptr DATA_SLOT(CallCountingStub, TargetForThresholdReached) + jmp dword ptr DATA_SLOT(CallCountingStub, TargetForThresholdReached) SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForThresholdReached - int 3 LEAF_END_MARKED _CallCountingStubCode@0 LEAF_ENTRY _LookupStubCode@0 diff --git a/src/coreclr/vm/loaderallocator.cpp b/src/coreclr/vm/loaderallocator.cpp index 32c8a93c2894a..7795d47a362bf 100644 --- a/src/coreclr/vm/loaderallocator.cpp +++ b/src/coreclr/vm/loaderallocator.cpp @@ -1116,12 +1116,6 @@ void LoaderAllocator::Init(BaseDomain *pDomain, BYTE *pExecutableHeapMemory) dwTotalReserveMemSize = (DWORD) ALIGN_UP(dwTotalReserveMemSize, VIRTUAL_ALLOC_RESERVE_GRANULARITY); -#if !defined(HOST_64BIT) - // Make sure that we reserve as little as possible on 32-bit to save address space - // We cannot reserve less than needed - //_ASSERTE(dwTotalReserveMemSize <= VIRTUAL_ALLOC_RESERVE_GRANULARITY); -#endif - BYTE * initReservedMem = (BYTE*)ExecutableAllocator::Instance()->Reserve(dwTotalReserveMemSize); m_InitialReservedMemForLoaderHeaps = initReservedMem; From c769d0f43de79112418762a39e30eae97089b304 Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Fri, 25 Feb 2022 16:29:55 +0100 Subject: [PATCH 06/10] Fix macOS x64 The assembler was generating 32 bit conditional 
relative jumps instead of ones with an 8 bit displacement. I've found that the presence of a global label between the jump site and the destination makes the assembler do that. Changing the PATCH_LABEL macro fixed it. --- src/coreclr/pal/inc/unixasmmacrosamd64.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 9a656ddf1bec2..2aca375faa838 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -27,7 +27,7 @@ .macro PATCH_LABEL Name .global C_FUNC(\Name) -C_FUNC(\Name): + C_FUNC(\Name) = . .endm .macro LEAF_ENTRY Name, Section From 4dd122bca03724bc7784ac069c5655702475cd40 Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Fri, 25 Feb 2022 22:21:59 +0100 Subject: [PATCH 07/10] Fix ARM64 StubPrecodeCode_End extraction --- src/coreclr/vm/precode.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/vm/precode.cpp b/src/coreclr/vm/precode.cpp index dc2117db0e6da..ad95a538e9528 100644 --- a/src/coreclr/vm/precode.cpp +++ b/src/coreclr/vm/precode.cpp @@ -599,6 +599,7 @@ void StubPrecode::StaticInitialize() #define ENUM_PAGE_SIZE(size) \ case size: \ StubPrecodeCode = StubPrecodeCode##size; \ + StubPrecodeCode_End = StubPrecodeCode##size##_End; \ _ASSERTE(((BYTE*)StubPrecodeCode##size##_End - (BYTE*)StubPrecodeCode##size) <= StubPrecode::CodeSize); \ break; From 118b427ff677cb6354e458ef284109b303ea43fe Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Thu, 3 Mar 2022 02:27:14 +0100 Subject: [PATCH 08/10] Reflect feedback and improve some JIT helpers perf --- src/coreclr/vm/jitinterface.cpp | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index d0a519e4f0fd2..ffb88585192d5 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -9006,7 +9006,7 @@ void CEEInfo::getFunctionEntryPoint(CORINFO_METHOD_HANDLE ftnHnd, // Resolve methodImpl.
ftn = ftn->GetMethodTable()->MapMethodDeclToMethodImpl(ftn); - if (!ftn->IsFCall() && ftn->MayHavePrecode() && ftn->GetPrecodeType() == PRECODE_FIXUP) + if (!ftn->IsFCall() && ftn->IsVersionableWithPrecode() && (ftn->GetPrecodeType() == PRECODE_FIXUP) && !ftn->IsPointingToStableNativeCode()) { ret = ((FixupPrecode*)ftn->GetOrCreatePrecode())->GetTargetSlot(); accessType = IAT_PVALUE; @@ -9014,20 +9014,19 @@ void CEEInfo::getFunctionEntryPoint(CORINFO_METHOD_HANDLE ftnHnd, else { ret = (void *)ftn->TryGetMultiCallableAddrOfCode(accessFlags); - } - // TryGetMultiCallableAddrOfCode returns NULL if indirect access is desired - if (ret == NULL) - { - // should never get here for EnC methods or if interception via remoting stub is required - _ASSERTE(!ftn->IsEnCMethod()); + // TryGetMultiCallableAddrOfCode returns NULL if indirect access is desired + if (ret == NULL) + { + // should never get here for EnC methods or if interception via remoting stub is required + _ASSERTE(!ftn->IsEnCMethod()); - ret = (void *)ftn->GetAddrOfSlot(); + ret = (void *)ftn->GetAddrOfSlot(); - accessType = IAT_PVALUE; + accessType = IAT_PVALUE; + } } - #if defined(FEATURE_GDBJIT) CalledMethod * pCM = new CalledMethod(orig_ftn, ret, m_pCalledMethods); m_pCalledMethods = pCM; @@ -11134,6 +11133,23 @@ void* CEEJitInfo::getHelperFtn(CorInfoHelpFunc ftnNum, /* IN */ } #endif + if (dynamicFtnNum == DYNAMIC_CORINFO_HELP_ISINSTANCEOFINTERFACE || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_ISINSTANCEOFANY || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_ISINSTANCEOFARRAY || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_ISINSTANCEOFCLASS || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_CHKCASTANY || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_CHKCASTARRAY || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_CHKCASTINTERFACE || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_CHKCASTCLASS || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_CHKCASTCLASS_SPECIAL || + dynamicFtnNum == DYNAMIC_CORINFO_HELP_UNBOX) + { + Precode* pPrecode = Precode::GetPrecodeFromEntryPoint((PCODE)hlpDynamicFuncTable[dynamicFtnNum].pfnHelper); + _ASSERTE(pPrecode->GetType() == PRECODE_FIXUP); + *ppIndirection = ((FixupPrecode*)pPrecode)->GetTargetSlot(); + return NULL; + } + pfnHelper = hlpDynamicFuncTable[dynamicFtnNum].pfnHelper; #ifdef _PREFAST_ From 346af352af803da4224d87c7742d3d093275fdf1 Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Wed, 16 Mar 2022 20:21:32 +0100 Subject: [PATCH 09/10] Revert VSD stubs changes Extensive benchmarking has shown that the new model for VSD stubs causes a steady state performance degradation for code using a lot of virtual calls. It wasn't showing up in the plaintext benchmark I had used as the driver of the change. This change loses part of the startup perf improvements, but I am planning to add back the Lookup stubs in a follow-up change. Those should not be perf critical, and they are likely the ones the startup improvements come from.
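For context, a rough sketch of the two amd64 dispatch sequences in question, distilled from the thunk templates removed below and from the stub layout this commit restores. This is only an illustration: THIS_REG stands for rcx or rdi depending on the ABI, the immediates are placeholders, and the exact cause of the regression was not isolated, although the extra data-slot loads and the indirect branch on the hot path are the obvious structural difference.

    ; interleaved code/data page form (being removed) - operands come from RIP-relative data slots
    mov  rax, QWORD PTR [rip + DATA_SLOT(DispatchStub, ExpectedMT)]
    cmp  QWORD PTR [THIS_REG], rax
    jne  Fail
    jmp  QWORD PTR [rip + DATA_SLOT(DispatchStub, ImplTarget)]    ; memory-indirect jump to the method
Fail:
    jmp  QWORD PTR [rip + DATA_SLOT(DispatchStub, FailTarget)]    ; memory-indirect jump to the resolve stub

    ; classic generated stub (being restored) - operands are immediates baked into the stub body
    mov  rax, <expectedMT imm64>
    cmp  QWORD PTR [THIS_REG], rax
    mov  rax, <implTarget imm64>
    jne  <resolve stub fail entry>                                ; direct rel32, forward jump
    jmp  rax

The restored form keeps the expected MethodTable and the target in the instruction stream and takes a direct branch on the failure path, which is presumably what the steady state numbers favor.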
--- src/coreclr/vm/amd64/asmconstants.h | 33 - src/coreclr/vm/amd64/thunktemplates.S | 47 -- src/coreclr/vm/amd64/thunktemplates.asm | 46 -- src/coreclr/vm/amd64/virtualcallstubcpu.hpp | 751 +++++++++++++++++++- src/coreclr/vm/arm/asmconstants.h | 33 - src/coreclr/vm/arm/stubs.cpp | 358 +++++++++- src/coreclr/vm/arm/thunktemplates.S | 71 -- src/coreclr/vm/arm/thunktemplates.asm | 69 -- src/coreclr/vm/arm/virtualcallstubcpu.hpp | 323 ++++++++- src/coreclr/vm/arm64/asmconstants.h | 33 - src/coreclr/vm/arm64/thunktemplates.S | 55 -- src/coreclr/vm/arm64/thunktemplates.asm | 53 -- src/coreclr/vm/arm64/virtualcallstubcpu.hpp | 421 ++++++++++- src/coreclr/vm/i386/asmconstants.h | 36 - src/coreclr/vm/i386/excepx86.cpp | 15 +- src/coreclr/vm/i386/stublinkerx86.h | 2 +- src/coreclr/vm/i386/thunktemplates.S | 66 -- src/coreclr/vm/i386/thunktemplates.asm | 66 -- src/coreclr/vm/i386/virtualcallstubcpu.hpp | 650 ++++++++++++++++- src/coreclr/vm/loaderallocator.cpp | 2 +- src/coreclr/vm/virtualcallstub.cpp | 726 ++++++++----------- src/coreclr/vm/virtualcallstub.h | 435 ++---------- 22 files changed, 2831 insertions(+), 1460 deletions(-) diff --git a/src/coreclr/vm/amd64/asmconstants.h b/src/coreclr/vm/amd64/asmconstants.h index 5bbdff5576ea6..2afddae98a4d3 100644 --- a/src/coreclr/vm/amd64/asmconstants.h +++ b/src/coreclr/vm/amd64/asmconstants.h @@ -590,39 +590,6 @@ ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForMethod == offsetof(CallCoun #define CallCountingStubData__TargetForThresholdReached 0x10 ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForThresholdReached == offsetof(CallCountingStubData, TargetForThresholdReached)) -#define LookupStubData__DispatchToken 0x00 -ASMCONSTANTS_C_ASSERT(LookupStubData__DispatchToken == offsetof(LookupStubData, DispatchToken)) - -#define LookupStubData__ResolveWorkerTarget 0x08 -ASMCONSTANTS_C_ASSERT(LookupStubData__ResolveWorkerTarget == offsetof(LookupStubData, ResolveWorkerTarget)) - -#define DispatchStubData__ExpectedMT 0x00 -ASMCONSTANTS_C_ASSERT(DispatchStubData__ExpectedMT == offsetof(DispatchStubData, ExpectedMT)) - -#define DispatchStubData__ImplTarget 0x08 -ASMCONSTANTS_C_ASSERT(DispatchStubData__ImplTarget == offsetof(DispatchStubData, ImplTarget)) - -#define DispatchStubData__FailTarget 0x10 -ASMCONSTANTS_C_ASSERT(DispatchStubData__FailTarget == offsetof(DispatchStubData, FailTarget)) - -#define ResolveStubData__HashedToken 0x08 -ASMCONSTANTS_C_ASSERT(ResolveStubData__HashedToken == offsetof(ResolveStubData, HashedToken)) - -#define ResolveStubData__CacheAddress 0x00 -ASMCONSTANTS_C_ASSERT(ResolveStubData__CacheAddress == offsetof(ResolveStubData, CacheAddress)) - -#define ResolveStubData__Token 0x10 -ASMCONSTANTS_C_ASSERT(ResolveStubData__Token == offsetof(ResolveStubData, Token)) - -#define ResolveStubData__Counter 0x0c -ASMCONSTANTS_C_ASSERT(ResolveStubData__Counter == offsetof(ResolveStubData, Counter)) - -#define ResolveStubData__ResolveWorkerTarget 0x18 -ASMCONSTANTS_C_ASSERT(ResolveStubData__ResolveWorkerTarget == offsetof(ResolveStubData, ResolveWorkerTarget)) - -#define CALL_STUB_CACHE_MASK_ASM 0xfff -ASMCONSTANTS_C_ASSERT(CALL_STUB_CACHE_MASK_ASM == CALL_STUB_CACHE_MASK) - #undef ASMCONSTANTS_RUNTIME_ASSERT #undef ASMCONSTANTS_C_ASSERT #ifndef UNIX_AMD64_ABI diff --git a/src/coreclr/vm/amd64/thunktemplates.S b/src/coreclr/vm/amd64/thunktemplates.S index cf248410b417b..11d417cb3b971 100644 --- a/src/coreclr/vm/amd64/thunktemplates.S +++ b/src/coreclr/vm/amd64/thunktemplates.S @@ -29,50 +29,3 @@ LEAF_ENTRY CallCountingStubCode, _TEXT 
LOCAL_LABEL(CountReachedZero): jmp QWORD PTR [rip + DATA_SLOT(CallCountingStub, TargetForThresholdReached)] LEAF_END_MARKED CallCountingStubCode, _TEXT - -LEAF_ENTRY LookupStubCode, _TEXT - push QWORD PTR [rip + DATA_SLOT(LookupStub, DispatchToken)] - jmp QWORD PTR [rip + DATA_SLOT(LookupStub, ResolveWorkerTarget)] -LEAF_END_MARKED LookupStubCode, _TEXT - -LEAF_ENTRY DispatchStubCode, _TEXT - mov rax,QWORD PTR [rip + DATA_SLOT(DispatchStub, ExpectedMT)] -PATCH_LABEL DispatchStubCode_ThisDeref - cmp QWORD PTR [rdi],rax; - jne LOCAL_LABEL(Fail) - jmp QWORD PTR [rip + DATA_SLOT(DispatchStub, ImplTarget)] - LOCAL_LABEL(Fail): - jmp QWORD PTR [rip + DATA_SLOT(DispatchStub, FailTarget)] -LEAF_END_MARKED DispatchStubCode, _TEXT - -LEAF_ENTRY ResolveStubCode, _TEXT -PATCH_LABEL ResolveStubCode_ResolveEntry - LOCAL_LABEL(Resolve): - push rdx - mov r10,QWORD PTR [rip + DATA_SLOT(ResolveStub, CacheAddress)] -PATCH_LABEL ResolveStubCode_ThisDeref - mov rax,QWORD PTR [rdi] - mov rdx,rax - shr rax,12 - add rax,rdx - xor eax,DWORD PTR [rip + DATA_SLOT(ResolveStub, HashedToken)] - and eax, CALL_STUB_CACHE_MASK_ASM * 8 - mov rax,QWORD PTR [r10+rax*1] - mov r10,QWORD PTR [rip + DATA_SLOT(ResolveStub, Token)] - cmp rdx,QWORD PTR [rax] - jne LOCAL_LABEL(Miss) - cmp r10,QWORD PTR [rax+8] - jne LOCAL_LABEL(Miss) - pop rdx - jmp QWORD PTR [rax+0x10] -PATCH_LABEL ResolveStubCode_FailEntry - add DWORD PTR [rip + DATA_SLOT(ResolveStub, Counter)], -1 - jge LOCAL_LABEL(Resolve) - or r11, 1 // SDF_ResolveBackPatch -PATCH_LABEL ResolveStubCode_SlowEntry - push rdx - mov r10,QWORD PTR [rip + DATA_SLOT(ResolveStub, Token)] - LOCAL_LABEL(Miss): - push rax - jmp QWORD PTR [rip + DATA_SLOT(ResolveStub, ResolveWorkerTarget)] -LEAF_END_MARKED ResolveStubCode, _TEXT diff --git a/src/coreclr/vm/amd64/thunktemplates.asm b/src/coreclr/vm/amd64/thunktemplates.asm index d003f5682098a..af3d03135619e 100644 --- a/src/coreclr/vm/amd64/thunktemplates.asm +++ b/src/coreclr/vm/amd64/thunktemplates.asm @@ -31,50 +31,4 @@ LEAF_ENTRY CallCountingStubCode, _TEXT jmp QWORD PTR [DATA_SLOT(CallCountingStub, TargetForThresholdReached)] LEAF_END_MARKED CallCountingStubCode, _TEXT -LEAF_ENTRY LookupStubCode, _TEXT - push QWORD PTR [DATA_SLOT(LookupStub, DispatchToken)] - jmp QWORD PTR [DATA_SLOT(LookupStub, ResolveWorkerTarget)] -LEAF_END_MARKED LookupStubCode, _TEXT - -LEAF_ENTRY DispatchStubCode, _TEXT - mov rax,QWORD PTR [DATA_SLOT(DispatchStub, ExpectedMT)] -PATCH_LABEL DispatchStubCode_ThisDeref - cmp QWORD PTR [rcx],rax; - jne Fail - jmp QWORD PTR [DATA_SLOT(DispatchStub, ImplTarget)] - Fail: - jmp QWORD PTR [DATA_SLOT(DispatchStub, FailTarget)] -LEAF_END_MARKED DispatchStubCode, _TEXT - -LEAF_ENTRY ResolveStubCode, _TEXT -PATCH_LABEL ResolveStubCode_ResolveEntry - push rdx - mov r10,QWORD PTR [DATA_SLOT(ResolveStub, CacheAddress)] -PATCH_LABEL ResolveStubCode_ThisDeref - mov rax,QWORD PTR [rcx] - mov rdx,rax - shr rax,12 - add rax,rdx - xor eax,DWORD PTR [DATA_SLOT(ResolveStub, HashedToken)] - and eax, CALL_STUB_CACHE_MASK_ASM * 8 - mov rax,QWORD PTR [r10+rax*1] - mov r10,QWORD PTR [DATA_SLOT(ResolveStub, Token)] - cmp rdx,QWORD PTR [rax] - jne Miss - cmp r10,QWORD PTR [rax+8] - jne Miss - pop rdx - jmp QWORD PTR [rax+10h] -PATCH_LABEL ResolveStubCode_FailEntry - add DWORD PTR [DATA_SLOT(ResolveStub, Counter)], -1 - jge ResolveStubCode - or r11, 1; SDF_ResolveBackPatch -PATCH_LABEL ResolveStubCode_SlowEntry - push rdx - mov r10,QWORD PTR [DATA_SLOT(ResolveStub, Token)] -Miss: - push rax - jmp QWORD PTR [DATA_SLOT(ResolveStub, 
ResolveWorkerTarget)] -LEAF_END_MARKED ResolveStubCode, _TEXT - end diff --git a/src/coreclr/vm/amd64/virtualcallstubcpu.hpp b/src/coreclr/vm/amd64/virtualcallstubcpu.hpp index 12490953e8c41..d579633e527c5 100644 --- a/src/coreclr/vm/amd64/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/amd64/virtualcallstubcpu.hpp @@ -15,16 +15,457 @@ #ifndef _VIRTUAL_CALL_STUB_AMD64_H #define _VIRTUAL_CALL_STUB_AMD64_H -#define DISPATCH_STUB_FIRST_WORD 0x8B48 -#define DISPATCH_STUB_THIRD_BYTE 0x05 -#define RESOLVE_STUB_FIRST_WORD 0x4C52 -#define LOOKUP_STUB_FIRST_WORD 0x35FF -#define VTABLECALL_STUB_FIRST_WORD 0x8B48 - #include "dbginterface.h" //#define STUB_LOGGING + #pragma pack(push, 1) +// since we are placing code, we want byte packing of the structs + +// Codes of the instruction in the stub where the instruction access violation +// is converted to NullReferenceException at the caller site. +#ifdef UNIX_AMD64_ABI +#define X64_INSTR_CMP_IND_THIS_REG_RAX 0x073948 +#define X64_INSTR_MOV_RAX_IND_THIS_REG 0x078b48 +#else // UNIX_AMD64_ABI +#define X64_INSTR_CMP_IND_THIS_REG_RAX 0x013948 +#define X64_INSTR_MOV_RAX_IND_THIS_REG 0x018b48 +#endif // UNIX_AMD64_ABI + +#define USES_LOOKUP_STUBS 1 + +/********************************************************************************************* +Stubs that contain code are all part of larger structs called Holders. There is a +Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are +essentially an implementation trick that allowed rearranging the code sequences more +easily while trying out different alternatives, and for dealing with any alignment +issues in a way that was mostly immune to the actually code sequences. These Holders +should be revisited when the stub code sequences are fixed, since in many cases they +add extra space to a stub that is not really needed. + +Stubs are placed in cache and hash tables. Since unaligned access of data in memory +is very slow, the keys used in those tables should be aligned. The things used as keys +typically also occur in the generated code, e.g. a token as an immediate part of an instruction. +For now, to avoid alignment computations as different code strategies are tried out, the key +fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction +streams aligned so that the immediate fields fall on aligned boundaries. +*/ + +#if USES_LOOKUP_STUBS + +struct LookupStub; +struct LookupHolder; + +/*LookupStub************************************************************************************** +Virtual and interface call sites are initially setup to point at LookupStubs. +This is because the runtime type of the pointer is not yet known, +so the target cannot be resolved. Note: if the jit is able to determine the runtime type +of the pointer, it should be generating a direct call not a virtual or interface call. +This stub pushes a lookup token onto the stack to identify the sought after method, and then +jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and +transfer of control to the appropriate target method implementation, perhaps patching of the call site +along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs +get quickly changed to point to another kind of stub. 
+*/ +struct LookupStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } + + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } + +private: + friend struct LookupHolder; + + // The lookup entry point starts with a nop in order to allow us to quickly see + // if the stub is lookup stub or a dispatch stub. We can read thye first byte + // of a stub to find out what kind of a stub we have. + + BYTE _entryPoint [3]; // 90 nop + // 48 B8 mov rax, + size_t _token; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part2 [3]; // 50 push rax + // 48 B8 mov rax, + size_t _resolveWorkerAddr; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part3 [2]; // FF E0 jmp rax +}; + +/* LookupHolders are the containers for LookupStubs, they provide for any alignment of +stubs as necessary. In the case of LookupStubs, alignment is necessary since +LookupStubs are placed in a hash table keyed by token. */ +struct LookupHolder +{ + static void InitializeStatic(); + + void Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken); + + LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static LookupHolder* FromLookupEntry(PCODE lookupEntry); + +private: + friend struct LookupStub; + + LookupStub _stub; +}; + +#endif // USES_LOOKUP_STUBS + +struct DispatchStub; +struct DispatchStubShort; +struct DispatchStubLong; +struct DispatchHolder; + +/*DispatchStub************************************************************************************** +The structure of a full dispatch stub in memory is a DispatchStub followed contiguously in memory +by either a DispatchStubShort of a DispatchStubLong. DispatchStubShort is used when the resolve +stub (failTarget()) is reachable by a rel32 (DISPL) jump. We make a pretty good effort to make sure +that the stub heaps are set up so that this is the case. If we allocate enough stubs that the heap +end up allocating in a new block that is further away than a DISPL jump can go, then we end up using +a DispatchStubLong which is bigger but is a full 64-bit jump. */ + +/*DispatchStubShort********************************************************************************* +This is the logical continuation of DispatchStub for the case when the failure target is within +a rel32 jump (DISPL). 
*/ +struct DispatchStubShort +{ + friend struct DispatchHolder; + friend struct DispatchStub; + + static BOOL isShortStub(LPCBYTE pCode); + inline PCODE implTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _implTarget; } + + inline TADDR implTargetSlot() const + { + LIMITED_METHOD_CONTRACT; + return (TADDR)&_implTarget; + } + + inline PCODE failTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) &_failDispl + sizeof(DISPL) + _failDispl; } + +private: + BYTE part1 [2]; // 48 B8 mov rax, + size_t _implTarget; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part2[2]; // 0f 85 jne + DISPL _failDispl; // xx xx xx xx failEntry ;must be forward jmp for perf reasons + BYTE part3 [2]; // FF E0 jmp rax +}; + +#define DispatchStubShort_offsetof_failDisplBase (offsetof(DispatchStubLong, _failDispl) + sizeof(DISPL)) + +inline BOOL DispatchStubShort::isShortStub(LPCBYTE pCode) +{ + LIMITED_METHOD_CONTRACT; + return reinterpret_cast(pCode)->part2[0] == 0x0f; +} + + +/*DispatchStubLong********************************************************************************** +This is the logical continuation of DispatchStub for the case when the failure target is not +reachable by a rel32 jump (DISPL). */ +struct DispatchStubLong +{ + friend struct DispatchHolder; + friend struct DispatchStub; + + static inline BOOL isLongStub(LPCBYTE pCode); + inline PCODE implTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _implTarget; } + + inline TADDR implTargetSlot() const + { + LIMITED_METHOD_CONTRACT; + return (TADDR)&_implTarget; + } + + inline PCODE failTarget() const { LIMITED_METHOD_CONTRACT; return (PCODE) _failTarget; } + +private: + BYTE part1[2]; // 48 B8 mov rax, + size_t _implTarget; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part2 [1]; // 75 jne + BYTE _failDispl; // xx failLabel + BYTE part3 [2]; // FF E0 jmp rax + // failLabel: + BYTE part4 [2]; // 48 B8 mov rax, + size_t _failTarget; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part5 [2]; // FF E0 jmp rax +}; + +#define DispatchStubLong_offsetof_failDisplBase (offsetof(DispatchStubLong, _failDispl) + sizeof(BYTE)) +#define DispatchStubLong_offsetof_failLabel (offsetof(DispatchStubLong, part4[0])) + +inline BOOL DispatchStubLong::isLongStub(LPCBYTE pCode) +{ + LIMITED_METHOD_CONTRACT; + return reinterpret_cast(pCode)->part2[0] == 0x75; +} + +/*DispatchStub************************************************************************************** +Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. +A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). +If the calling frame does in fact have the type be of the expected type, then +control is transfered to the target address, the method implementation. If not, +then control is transfered to the fail address, a fail stub (see below) where a polymorphic +lookup is done to find the correct address to go to. + +implementation note: Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. 
+Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched +to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important +that the branch prediction staticly predict this, which means it must be a forward jump. The alternative +is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" +is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier +to control the placement of the stubs than control the placement of the jitted code and the stubs. */ +struct DispatchStub +{ + friend struct DispatchHolder; + + enum DispatchStubType + { + e_TYPE_SHORT, + e_TYPE_LONG, + }; + + inline DispatchStubType type() const + { + LIMITED_METHOD_CONTRACT; + CONSISTENCY_CHECK(DispatchStubShort::isShortStub(reinterpret_cast(this + 1)) + || DispatchStubLong::isLongStub(reinterpret_cast(this + 1))); + return DispatchStubShort::isShortStub((BYTE *)(this + 1)) ? e_TYPE_SHORT : e_TYPE_LONG; + } + + inline static size_t size(DispatchStubType type) + { + STATIC_CONTRACT_LEAF; + return sizeof(DispatchStub) + + ((type == e_TYPE_SHORT) ? sizeof(DispatchStubShort) : sizeof(DispatchStubLong)); + } + + inline PCODE entryPoint() const { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } + inline size_t expectedMT() const { LIMITED_METHOD_CONTRACT; return _expectedMT; } + inline size_t size() const { WRAPPER_NO_CONTRACT; return size(type()); } + + inline PCODE implTarget() const + { + LIMITED_METHOD_CONTRACT; + if (type() == e_TYPE_SHORT) + return getShortStub()->implTarget(); + else + return getLongStub()->implTarget(); + } + + inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(slotTypeRef != nullptr); + + *slotTypeRef = EntryPointSlots::SlotType_Executable; + if (type() == e_TYPE_SHORT) + return getShortStub()->implTargetSlot(); + else + return getLongStub()->implTargetSlot(); + } + + inline PCODE failTarget() const + { + if (type() == e_TYPE_SHORT) + return getShortStub()->failTarget(); + else + return getLongStub()->failTarget(); + } + +private: + inline DispatchStubShort const *getShortStub() const + { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this + 1); } + + inline DispatchStubLong const *getLongStub() const + { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this + 1); } + + BYTE _entryPoint [2]; // 48 B8 mov rax, + size_t _expectedMT; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part1 [3]; // 48 39 XX cmp [THIS_REG], rax + BYTE nopOp; // 90 nop ; 1-byte nop to align _implTarget + + // Followed by either DispatchStubShort or DispatchStubLong, depending + // on whether we were able to make a rel32 or had to make an abs64 jump + // to the resolve stub on failure. + +}; + +/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of +stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both +are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, +since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently +(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify +alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. 
+While the token field can be logically gotten by following the failure target to the failEntryPoint +of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. +This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct +for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when +they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). +*/ + +/* @workaround for ee resolution - Since the EE does not currently have a resolver function that +does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are +using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable +is in fact written. Hence we have moved target out into the holder and aligned it so we can +atomically update it. When we get a resolver function that does what we want, we can drop this field, +and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ +struct DispatchHolder +{ + static void InitializeStatic(); + + void Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT, + DispatchStub::DispatchStubType type); + + static size_t GetHolderSize(DispatchStub::DispatchStubType type) + { STATIC_CONTRACT_WRAPPER; return DispatchStub::size(type); } + + static BOOL CanShortJumpDispatchStubReachFailTarget(PCODE failTarget, LPCBYTE stubMemory) + { + STATIC_CONTRACT_WRAPPER; + LPCBYTE pFrom = stubMemory + sizeof(DispatchStub) + DispatchStubShort_offsetof_failDisplBase; + size_t cbRelJump = failTarget - (PCODE)pFrom; + return FitsInI4(cbRelJump); + } + + DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } + + static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry); + +private: + // DispatchStub follows here. It is dynamically sized on allocation + // because it could be a DispatchStubLong or a DispatchStubShort +}; + +struct ResolveStub; +struct ResolveHolder; + +/*ResolveStub************************************************************************************** +Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only +one resolver stub built for any given token, even though there may be many call sites that +use that token and many distinct types that are used in the calling call frames. A resolver stub +actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their +expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should +be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, +even though they are actually allocated as a single contiguous block of memory. These pieces are: + +A ResolveStub has two entry points: + +FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does +a check to see how often we are actually failing. If failures are frequent, control transfers to the +patch piece to cause the call site to be changed from a mostly monomorphic callsite +(calls dispatch stub) to a polymorphic callsize (calls resolve stub). If failures are rare, control +transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter +every time it is entered. The ee at various times will add a large chunk to the counter. 
+ +ResolveEntry - does a lookup via in a cache by hashing the actual type of the calling frame s + and the token identifying the (contract,method) pair desired. If found, control is transfered +to the method implementation. If not found in the cache, the token is pushed and the ee is entered via +the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since +there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. +The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, +as well as its speed. It turns out it is very important to make the hash function sensitive to all +of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before +making any changes to the code sequences here, it is very important to measure and tune them as perf +can vary greatly, in unexpected ways, with seeming minor changes. + +Implementation note - Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. +Note that this stub is called in highly polymorphic cases, but the cache should have been sized +and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should +mostly be going down the cache hit route, and it is important that this be statically predicted as so. +Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically +gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries +is important. */ + +struct ResolveStub +{ + inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; } + inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint[0]; } + inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; } + + inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } + inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } + inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } + +private: + friend struct ResolveHolder; + + BYTE _resolveEntryPoint[3];// resolveStub: + // 52 push rdx + // 49 BA mov r10, + size_t _cacheAddress; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part1 [15]; // 48 8B XX mov rax, [THIS_REG] ; Compute hash = ((MT + MT>>12) ^ prehash) + // 48 8B D0 mov rdx, rax ; rdx <- current MethodTable + // 48 C1 E8 0C shr rax, 12 + // 48 03 C2 add rax, rdx + // 48 35 xor rax, + UINT32 _hashedToken; // xx xx xx xx hashedtoken ; xor with pre-hashed token + BYTE part2 [2]; // 48 25 and rax, + UINT32 mask; // xx xx xx xx cache_mask ; and with cache mask + BYTE part3 [6]; // 4A 8B 04 10 mov rax, [r10 + rax] ; get cache entry address + // 49 BA mov r10, + size_t _token; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part4 [3]; // 48 3B 50 cmp rdx, [rax+ ; compare our MT vs. 
cache MT + BYTE mtOffset; // xx ResolverCacheElem.pMT] + BYTE part5 [1]; // 75 jne + BYTE toMiss1; // xx miss ; must be forward jump, for perf reasons + BYTE part6 [3]; // 4C 3B 50 cmp r10, [rax+ ; compare our token vs. cache token + BYTE tokenOffset; // xx ResolverCacheElem.token] + BYTE part7 [1]; // 75 jne + BYTE toMiss2; // xx miss ; must be forward jump, for perf reasons + BYTE part8 [3]; // 48 8B 40 mov rax, [rax+ ; setup rax with method impl address + BYTE targetOffset; // xx ResolverCacheElem.target] + BYTE part9 [3]; // 5A pop rdx + // FF E0 jmp rax + // failStub: + BYTE _failEntryPoint [2]; // 48 B8 mov rax, + INT32* _pCounter; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part11 [4]; // 83 00 FF add dword ptr [rax], -1 + // 7d jnl + BYTE toResolveStub1; // xx resolveStub + BYTE part12 [4]; // 49 83 CB 01 or r11, 1 + BYTE _slowEntryPoint [3]; // 52 slow: push rdx + // 49 BA mov r10, + size_t _tokenSlow; // xx xx xx xx xx xx xx xx 64-bit address +// BYTE miss [5]; // 5A miss: pop rdx ; don't pop rdx +// // 41 52 push r10 ; don't push r10 leave it setup with token + BYTE miss [3]; // 50 push rax ; push ptr to cache elem + // 48 B8 mov rax, + size_t _resolveWorker; // xx xx xx xx xx xx xx xx 64-bit address + BYTE part10 [2]; // FF E0 jmp rax +}; + +/* ResolveHolders are the containers for ResolveStubs, They provide +for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by +the token for which they are built. Efficiency of access requires that this token be aligned. +For now, we have copied that field into the ResolveHolder itself, if the resolve stub is arranged such that +any of its inlined tokens (non-prehashed) is aligned, then the token field in the ResolveHolder +is not needed. */ +struct ResolveHolder +{ + static void InitializeStatic(); + + void Initialize(ResolveHolder* pResolveHolderRX, + PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32* counterAddr); + + ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static ResolveHolder* FromFailEntry(PCODE resolveEntry); + static ResolveHolder* FromResolveEntry(PCODE resolveEntry); + +private: + ResolveStub _stub; +}; /*VTableCallStub************************************************************************************** These are jump stubs that perform a vtable-base virtual call. These stubs assume that an object is placed @@ -70,7 +511,6 @@ struct VTableCallHolder VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } - size_t size() { return stub()->size(); } static size_t GetHolderSize(unsigned slot) { STATIC_CONTRACT_WRAPPER; @@ -89,6 +529,12 @@ struct VTableCallHolder #ifdef DECLARE_DATA +LookupStub lookupInit; +DispatchStub dispatchInit; +DispatchStubShort dispatchShortInit; +DispatchStubLong dispatchLongInit; +ResolveStub resolveInit; + #define INSTR_INT3 0xcc #define INSTR_NOP 0x90 @@ -96,8 +542,287 @@ struct VTableCallHolder #include "asmconstants.h" +#ifdef STUB_LOGGING +extern size_t g_lookup_inline_counter; +extern size_t g_call_inline_counter; +extern size_t g_miss_inline_counter; +extern size_t g_call_cache_counter; +extern size_t g_miss_cache_counter; +#endif + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. 
+*/ + +void LookupHolder::InitializeStatic() +{ + static_assert_no_msg((sizeof(LookupHolder) % sizeof(void*)) == 0); + + // The first instruction of a LookupStub is nop + // and we use it in order to differentiate the first two bytes + // of a LookupStub and a ResolveStub + lookupInit._entryPoint [0] = INSTR_NOP; + lookupInit._entryPoint [1] = 0x48; + lookupInit._entryPoint [2] = 0xB8; + lookupInit._token = 0xcccccccccccccccc; + lookupInit.part2 [0] = 0x50; + lookupInit.part2 [1] = 0x48; + lookupInit.part2 [2] = 0xB8; + lookupInit._resolveWorkerAddr = 0xcccccccccccccccc; + lookupInit.part3 [0] = 0xFF; + lookupInit.part3 [1] = 0xE0; +} + +void LookupHolder::Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken) +{ + _stub = lookupInit; + + //fill in the stub specific fields + _stub._token = dispatchToken; + _stub._resolveWorkerAddr = (size_t) resolveWorkerTarget; +} + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. +*/ + +void DispatchHolder::InitializeStatic() +{ + // Check that _implTarget is aligned in the DispatchStub for backpatching + static_assert_no_msg(((sizeof(DispatchStub) + offsetof(DispatchStubShort, _implTarget)) % sizeof(void *)) == 0); + static_assert_no_msg(((sizeof(DispatchStub) + offsetof(DispatchStubLong, _implTarget)) % sizeof(void *)) == 0); + + static_assert_no_msg(((sizeof(DispatchStub) + sizeof(DispatchStubShort)) % sizeof(void*)) == 0); + static_assert_no_msg(((sizeof(DispatchStub) + sizeof(DispatchStubLong)) % sizeof(void*)) == 0); + static_assert_no_msg((DispatchStubLong_offsetof_failLabel - DispatchStubLong_offsetof_failDisplBase) < INT8_MAX); + + // Common dispatch stub initialization + dispatchInit._entryPoint [0] = 0x48; + dispatchInit._entryPoint [1] = 0xB8; + dispatchInit._expectedMT = 0xcccccccccccccccc; + dispatchInit.part1 [0] = X64_INSTR_CMP_IND_THIS_REG_RAX & 0xff; + dispatchInit.part1 [1] = (X64_INSTR_CMP_IND_THIS_REG_RAX >> 8) & 0xff; + dispatchInit.part1 [2] = (X64_INSTR_CMP_IND_THIS_REG_RAX >> 16) & 0xff; + dispatchInit.nopOp = 0x90; + + // Short dispatch stub initialization + dispatchShortInit.part1 [0] = 0x48; + dispatchShortInit.part1 [1] = 0xb8; + dispatchShortInit._implTarget = 0xcccccccccccccccc; + dispatchShortInit.part2 [0] = 0x0F; + dispatchShortInit.part2 [1] = 0x85; + dispatchShortInit._failDispl = 0xcccccccc; + dispatchShortInit.part3 [0] = 0xFF; + dispatchShortInit.part3 [1] = 0xE0; + + // Long dispatch stub initialization + dispatchLongInit.part1 [0] = 0x48; + dispatchLongInit.part1 [1] = 0xb8; + dispatchLongInit._implTarget = 0xcccccccccccccccc; + dispatchLongInit.part2 [0] = 0x75; + dispatchLongInit._failDispl = BYTE(DispatchStubLong_offsetof_failLabel - DispatchStubLong_offsetof_failDisplBase); + dispatchLongInit.part3 [0] = 0xFF; + dispatchLongInit.part3 [1] = 0xE0; + // failLabel: + dispatchLongInit.part4 [0] = 0x48; + dispatchLongInit.part4 [1] = 0xb8; + dispatchLongInit._failTarget = 0xcccccccccccccccc; + dispatchLongInit.part5 [0] = 0xFF; + dispatchLongInit.part5 [1] = 0xE0; +}; + +void DispatchHolder::Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT, + DispatchStub::DispatchStubType type) +{ + // + // Initialize the common area + // + + // initialize the static data + *stub() = dispatchInit; + + // fill in the dynamic data + stub()->_expectedMT = expectedMT; + + // + // Initialize the short/long areas + 
// + if (type == DispatchStub::e_TYPE_SHORT) + { + DispatchStubShort *shortStubRW = const_cast(stub()->getShortStub()); + DispatchStubShort *shortStubRX = const_cast(pDispatchHolderRX->stub()->getShortStub()); + + // initialize the static data + *shortStubRW = dispatchShortInit; + + // fill in the dynamic data + size_t displ = (failTarget - ((PCODE) &shortStubRX->_failDispl + sizeof(DISPL))); + CONSISTENCY_CHECK(FitsInI4(displ)); + shortStubRW->_failDispl = (DISPL) displ; + shortStubRW->_implTarget = (size_t) implTarget; + CONSISTENCY_CHECK((PCODE)&shortStubRX->_failDispl + sizeof(DISPL) + shortStubRX->_failDispl == failTarget); + } + else + { + CONSISTENCY_CHECK(type == DispatchStub::e_TYPE_LONG); + DispatchStubLong *longStub = const_cast(stub()->getLongStub()); + + // initialize the static data + *longStub = dispatchLongInit; + + // fill in the dynamic data + longStub->_implTarget = implTarget; + longStub->_failTarget = failTarget; + } +} + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. +*/ + +void ResolveHolder::InitializeStatic() +{ + static_assert_no_msg((sizeof(ResolveHolder) % sizeof(void*)) == 0); + + resolveInit._resolveEntryPoint [0] = 0x52; + resolveInit._resolveEntryPoint [1] = 0x49; + resolveInit._resolveEntryPoint [2] = 0xBA; + resolveInit._cacheAddress = 0xcccccccccccccccc; + resolveInit.part1 [ 0] = X64_INSTR_MOV_RAX_IND_THIS_REG & 0xff; + resolveInit.part1 [ 1] = (X64_INSTR_MOV_RAX_IND_THIS_REG >> 8) & 0xff; + resolveInit.part1 [ 2] = (X64_INSTR_MOV_RAX_IND_THIS_REG >> 16) & 0xff; + resolveInit.part1 [ 3] = 0x48; + resolveInit.part1 [ 4] = 0x8B; + resolveInit.part1 [ 5] = 0xD0; + resolveInit.part1 [ 6] = 0x48; + resolveInit.part1 [ 7] = 0xC1; + resolveInit.part1 [ 8] = 0xE8; + resolveInit.part1 [ 9] = CALL_STUB_CACHE_NUM_BITS; + resolveInit.part1 [10] = 0x48; + resolveInit.part1 [11] = 0x03; + resolveInit.part1 [12] = 0xC2; + resolveInit.part1 [13] = 0x48; + resolveInit.part1 [14] = 0x35; +// Review truncation from unsigned __int64 to UINT32 of a constant value. 
+#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable:4305 4309) +#endif // defined(_MSC_VER) + + resolveInit._hashedToken = 0xcccccccc; + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif // defined(_MSC_VER) + + resolveInit.part2 [ 0] = 0x48; + resolveInit.part2 [ 1] = 0x25; + resolveInit.mask = CALL_STUB_CACHE_MASK*sizeof(void *); + resolveInit.part3 [0] = 0x4A; + resolveInit.part3 [1] = 0x8B; + resolveInit.part3 [2] = 0x04; + resolveInit.part3 [3] = 0x10; + resolveInit.part3 [4] = 0x49; + resolveInit.part3 [5] = 0xBA; + resolveInit._token = 0xcccccccccccccccc; + resolveInit.part4 [0] = 0x48; + resolveInit.part4 [1] = 0x3B; + resolveInit.part4 [2] = 0x50; + resolveInit.mtOffset = offsetof(ResolveCacheElem,pMT) & 0xFF; + resolveInit.part5 [0] = 0x75; + resolveInit.toMiss1 = (offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss1)+1)) & 0xFF; + resolveInit.part6 [0] = 0x4C; + resolveInit.part6 [1] = 0x3B; + resolveInit.part6 [2] = 0x50; + resolveInit.tokenOffset = offsetof(ResolveCacheElem,token) & 0xFF; + resolveInit.part7 [0] = 0x75; + resolveInit.toMiss2 = (offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss2)+1)) & 0xFF; + resolveInit.part8 [0] = 0x48; + resolveInit.part8 [1] = 0x8B; + resolveInit.part8 [2] = 0x40; + resolveInit.targetOffset = offsetof(ResolveCacheElem,target) & 0xFF; + resolveInit.part9 [0] = 0x5A; + resolveInit.part9 [1] = 0xFF; + resolveInit.part9 [2] = 0xE0; + resolveInit._failEntryPoint [0] = 0x48; + resolveInit._failEntryPoint [1] = 0xB8; + resolveInit._pCounter = (INT32*) (size_t) 0xcccccccccccccccc; + resolveInit.part11 [0] = 0x83; + resolveInit.part11 [1] = 0x00; + resolveInit.part11 [2] = 0xFF; + resolveInit.part11 [3] = 0x7D; + resolveInit.toResolveStub1 = (offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, toResolveStub1)+1)) & 0xFF; + resolveInit.part12 [0] = 0x49; + resolveInit.part12 [1] = 0x83; + resolveInit.part12 [2] = 0xCB; + resolveInit.part12 [3] = 0x01; + resolveInit._slowEntryPoint [0] = 0x52; + resolveInit._slowEntryPoint [1] = 0x49; + resolveInit._slowEntryPoint [2] = 0xBA; + resolveInit._tokenSlow = 0xcccccccccccccccc; + resolveInit.miss [0] = 0x50; + resolveInit.miss [1] = 0x48; + resolveInit.miss [2] = 0xB8; + resolveInit._resolveWorker = 0xcccccccccccccccc; + resolveInit.part10 [0] = 0xFF; + resolveInit.part10 [1] = 0xE0; +}; + +void ResolveHolder::Initialize(ResolveHolder* pResolveHolderRX, + PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32* counterAddr) +{ + _stub = resolveInit; + + //fill in the stub specific fields + _stub._cacheAddress = (size_t) cacheAddr; + _stub._hashedToken = hashedToken << LOG2_PTRSIZE; + _stub._token = dispatchToken; + _stub._tokenSlow = dispatchToken; + _stub._resolveWorker = (size_t) resolveWorkerTarget; + _stub._pCounter = counterAddr; +} + +ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); + _ASSERTE(resolveHolder->_stub._resolveEntryPoint[1] == resolveInit._resolveEntryPoint[1]); + return resolveHolder; +} + #endif // DACCESS_COMPILE +LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry) +{ + LIMITED_METHOD_CONTRACT; + LookupHolder* lookupHolder = (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); + _ASSERTE(lookupHolder->_stub._entryPoint[2] == 
lookupInit._entryPoint[2]); + return lookupHolder; +} + + +DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry) +{ + LIMITED_METHOD_CONTRACT; + DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchStub, _entryPoint) ); + _ASSERTE(dispatchHolder->stub()->_entryPoint[1] == dispatchInit._entryPoint[1]); + return dispatchHolder; +} + + +ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); + _ASSERTE(resolveHolder->_stub._resolveEntryPoint[1] == resolveInit._resolveEntryPoint[1]); + return resolveHolder; +} + void VTableCallHolder::Initialize(unsigned slot) { unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE; @@ -160,19 +885,23 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s WORD firstWord = *((WORD*) stubStartAddress); - if (firstWord == DISPATCH_STUB_FIRST_WORD && *((BYTE*)stubStartAddress + 2) == DISPATCH_STUB_THIRD_BYTE) + if (firstWord == 0xB848) { stubKind = SK_DISPATCH; } - else if (firstWord == LOOKUP_STUB_FIRST_WORD) + else if (firstWord == 0x4890) { stubKind = SK_LOOKUP; } - else if (firstWord == RESOLVE_STUB_FIRST_WORD) + else if (firstWord == 0x4952) { stubKind = SK_RESOLVE; } - else if (firstWord == VTABLECALL_STUB_FIRST_WORD) + else if (firstWord == 0x48F8) + { + stubKind = SK_LOOKUP; + } + else if (firstWord == 0x8B48) { stubKind = SK_VTABLECALL; } diff --git a/src/coreclr/vm/arm/asmconstants.h b/src/coreclr/vm/arm/asmconstants.h index 06da653d85fa1..8d8a1f4f0ea0e 100644 --- a/src/coreclr/vm/arm/asmconstants.h +++ b/src/coreclr/vm/arm/asmconstants.h @@ -247,38 +247,5 @@ ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForMethod == offsetof(CallCoun #define CallCountingStubData__TargetForThresholdReached 0x08 ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForThresholdReached == offsetof(CallCountingStubData, TargetForThresholdReached)) -#define LookupStubData__DispatchToken 0x00 -ASMCONSTANTS_C_ASSERT(LookupStubData__DispatchToken == offsetof(LookupStubData, DispatchToken)) - -#define LookupStubData__ResolveWorkerTarget 0x04 -ASMCONSTANTS_C_ASSERT(LookupStubData__ResolveWorkerTarget == offsetof(LookupStubData, ResolveWorkerTarget)) - -#define DispatchStubData__ExpectedMT 0x00 -ASMCONSTANTS_C_ASSERT(DispatchStubData__ExpectedMT == offsetof(DispatchStubData, ExpectedMT)) - -#define DispatchStubData__ImplTarget 0x04 -ASMCONSTANTS_C_ASSERT(DispatchStubData__ImplTarget == offsetof(DispatchStubData, ImplTarget)) - -#define DispatchStubData__FailTarget 0x08 -ASMCONSTANTS_C_ASSERT(DispatchStubData__FailTarget == offsetof(DispatchStubData, FailTarget)) - -#define ResolveStubData__HashedToken 0x04 -ASMCONSTANTS_C_ASSERT(ResolveStubData__HashedToken == offsetof(ResolveStubData, HashedToken)) - -#define ResolveStubData__CacheAddress 0x00 -ASMCONSTANTS_C_ASSERT(ResolveStubData__CacheAddress == offsetof(ResolveStubData, CacheAddress)) - -#define ResolveStubData__Token 0x0c -ASMCONSTANTS_C_ASSERT(ResolveStubData__Token == offsetof(ResolveStubData, Token)) - -#define ResolveStubData__Counter 0x08 -ASMCONSTANTS_C_ASSERT(ResolveStubData__Counter == offsetof(ResolveStubData, Counter)) - -#define ResolveStubData__ResolveWorkerTarget 0x10 -ASMCONSTANTS_C_ASSERT(ResolveStubData__ResolveWorkerTarget == offsetof(ResolveStubData, 
ResolveWorkerTarget)) - -#define CALL_STUB_CACHE_MASK_ASM 0xfff -ASMCONSTANTS_C_ASSERT(CALL_STUB_CACHE_MASK_ASM == CALL_STUB_CACHE_MASK) - #undef ASMCONSTANTS_RUNTIME_ASSERT #undef ASMCONSTANTS_C_ASSERT diff --git a/src/coreclr/vm/arm/stubs.cpp b/src/coreclr/vm/arm/stubs.cpp index 75922effad94f..bb733cb6b36f6 100644 --- a/src/coreclr/vm/arm/stubs.cpp +++ b/src/coreclr/vm/arm/stubs.cpp @@ -752,20 +752,20 @@ Rough pseudo-code of interface dispatching: // jitted code calls *indirectionCell switch (*indirectionCell) { - case LookupStub: + case LookupHolder._stub: // ResolveWorkerAsmStub: - *indirectionCell = DispatchStub; + *indirectionCell = DispatchHolder._stub; call ResolveWorkerStatic, jump to target method; - case DispatchStub: + case DispatchHolder._stub: if (r0.methodTable == expectedMethodTable) jump to target method; - // ResolveStub._failEntryPoint: - jump to case ResolveStub._resolveEntryPoint; - case ResolveStub._resolveEntryPoint: + // ResolveHolder._stub._failEntryPoint: + jump to case ResolveHolder._stub._resolveEntryPoint; + case ResolveHolder._stub._resolveEntryPoint: if (r0.methodTable in hashTable) jump to target method; - // ResolveStub._slowEntryPoint: + // ResolveHolder._stub._slowEntryPoint: // ResolveWorkerChainLookupAsmStub: // ResolveWorkerAsmStub: - if (_failEntryPoint called too many times) *indirectionCell = ResolveStub._resolveEntryPoint; + if (_failEntryPoint called too many times) *indirectionCell = ResolveHolder._stub._resolveEntryPoint; call ResolveWorkerStatic, jump to target method; } @@ -773,6 +773,348 @@ Note that ResolveWorkerChainLookupAsmStub currently points directly to ResolveWorkerAsmStub; in the future, this could be separate. */ +void LookupHolder::Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken) +{ + // Called directly by JITTED code + // See ResolveWorkerAsmStub + + // ldr r12, [pc + 8] ; #_token + _stub._entryPoint[0] = 0xf8df; + _stub._entryPoint[1] = 0xc008; + // ldr pc, [pc] ; #_resolveWorkerTarget + _stub._entryPoint[2] = 0xf8df; + _stub._entryPoint[3] = 0xf000; + + _stub._resolveWorkerTarget = resolveWorkerTarget; + _stub._token = dispatchToken; + _ASSERTE(4 == LookupStub::entryPointLen); +} + +void DispatchHolder::Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT) +{ + // Called directly by JITTED code + // DispatchHolder._stub._entryPoint(r0:object, r1, r2, r3, r4:IndirectionCell) + // { + // if (r0.methodTable == this._expectedMT) (this._implTarget)(r0, r1, r2, r3); + // else (this._failTarget)(r0, r1, r2, r3, r4); + // } + + int n = 0; + WORD offset; + + // We rely on the stub entry-point being DWORD aligned (so we can tell whether any subsequent WORD is + // DWORD-aligned or not, which matters in the calculation of PC-relative offsets). + _ASSERTE(((UINT_PTR)_stub._entryPoint & 0x3) == 0); + +// Compute a PC-relative offset for use in an instruction encoding. Must call this prior to emitting the +// instruction halfword to which it applies. For thumb-2 encodings the offset must be computed before emitting +// the first of the halfwords. +#undef PC_REL_OFFSET +#define PC_REL_OFFSET(_field) (WORD)(offsetof(DispatchStub, _field) - ((offsetof(DispatchStub, _entryPoint) + sizeof(*DispatchStub::_entryPoint) * (n + 2)) & 0xfffffffc)) + + // r0 : object. It can be null as well. + // when it is null the code causes an AV. This AV is seen by the VM's personality routine + // and it converts it into nullRef. 
We want the AV to happen before modifying the stack so that we can get the + // call stack in windbg at the point of AV. So therefore "ldr r12, [r0]" should be the first instruction. + + // ldr r12, [r0 + #Object.m_pMethTab] + _stub._entryPoint[n++] = DISPATCH_STUB_FIRST_WORD; + _stub._entryPoint[n++] = 0xc000; + + // push {r5} + _stub._entryPoint[n++] = 0xb420; + + // ldr r5, [pc + #_expectedMT] + offset = PC_REL_OFFSET(_expectedMT); + _ASSERTE((offset & 0x3) == 0); + _stub._entryPoint[n++] = 0x4d00 | (offset >> 2); + + // cmp r5, r12 + _stub._entryPoint[n++] = 0x4565; + + // pop {r5} + _stub._entryPoint[n++] = 0xbc20; + + // bne failTarget + _stub._entryPoint[n++] = 0xd101; + + // ldr pc, [pc + #_implTarget] + offset = PC_REL_OFFSET(_implTarget); + _stub._entryPoint[n++] = 0xf8df; + _stub._entryPoint[n++] = 0xf000 | offset; + + // failTarget: + // ldr pc, [pc + #_failTarget] + offset = PC_REL_OFFSET(_failTarget); + _stub._entryPoint[n++] = 0xf8df; + _stub._entryPoint[n++] = 0xf000 | offset; + + // nop - insert padding + _stub._entryPoint[n++] = 0xbf00; + + _ASSERTE(n == DispatchStub::entryPointLen); + + // Make sure that the data members below are aligned + _ASSERTE((n & 1) == 0); + + _stub._expectedMT = DWORD(expectedMT); + _stub._failTarget = failTarget; + _stub._implTarget = implTarget; +} + +void ResolveHolder::Initialize(ResolveHolder* pResolveHolderRX, + PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 * counterAddr) +{ + // Called directly by JITTED code + // ResolveStub._resolveEntryPoint(r0:Object*, r1, r2, r3, r4:IndirectionCellAndFlags) + // { + // MethodTable mt = r0.m_pMethTab; + // int i = ((mt + mt >> 12) ^ this._hashedToken) & this._cacheMask + // ResolveCacheElem e = this._cacheAddress + i + // do + // { + // if (mt == e.pMT && this._token == e.token) (e.target)(r0, r1, r2, r3); + // e = e.pNext; + // } while (e != null) + // (this._slowEntryPoint)(r0, r1, r2, r3, r4); + // } + // + + int n = 0; + WORD offset; + + // We rely on the stub entry-point being DWORD aligned (so we can tell whether any subsequent WORD is + // DWORD-aligned or not, which matters in the calculation of PC-relative offsets). + _ASSERTE(((UINT_PTR)_stub._resolveEntryPoint & 0x3) == 0); + +// Compute a PC-relative offset for use in an instruction encoding. Must call this prior to emitting the +// instruction halfword to which it applies. For thumb-2 encodings the offset must be computed before emitting +// the first of the halfwords. 
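// To spell out the arithmetic (assuming the usual Thumb-2 rules): an instruction emitted at byte
// offset sizeof(WORD) * n from the start of the entry-point array executes with PC reading as that
// address plus 4, which is where the "(n + 2)" term comes from, and PC-relative loads address their
// literal relative to Align(PC, 4), which is what the "& 0xfffffffc" models. PC_REL_OFFSET is
// therefore "offset of the data field within the stub, minus the aligned PC of the load", i.e. the
// byte offset the PC-relative loads below encode (scaled where a particular encoding requires it).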
+#undef PC_REL_OFFSET +#define PC_REL_OFFSET(_field) (WORD)(offsetof(ResolveStub, _field) - ((offsetof(ResolveStub, _resolveEntryPoint) + sizeof(*ResolveStub::_resolveEntryPoint) * (n + 2)) & 0xfffffffc)) + + // ldr r12, [r0 + #Object.m_pMethTab] + _stub._resolveEntryPoint[n++] = RESOLVE_STUB_FIRST_WORD; + _stub._resolveEntryPoint[n++] = 0xc000; + + // ;; We need two scratch registers, r5 and r6 + // push {r5,r6} + _stub._resolveEntryPoint[n++] = 0xb460; + + // ;; Compute i = ((mt + mt >> 12) ^ this._hashedToken) & this._cacheMask + + // add r6, r12, r12 lsr #12 + _stub._resolveEntryPoint[n++] = 0xeb0c; + _stub._resolveEntryPoint[n++] = 0x361c; + + // ldr r5, [pc + #_hashedToken] + offset = PC_REL_OFFSET(_hashedToken); + _ASSERTE((offset & 0x3) == 0); + _stub._resolveEntryPoint[n++] = 0x4d00 | (offset >> 2); + + // eor r6, r6, r5 + _stub._resolveEntryPoint[n++] = 0xea86; + _stub._resolveEntryPoint[n++] = 0x0605; + + // ldr r5, [pc + #_cacheMask] + offset = PC_REL_OFFSET(_cacheMask); + _ASSERTE((offset & 0x3) == 0); + _stub._resolveEntryPoint[n++] = 0x4d00 | (offset >> 2); + + // and r6, r6, r5 + _stub._resolveEntryPoint[n++] = 0xea06; + _stub._resolveEntryPoint[n++] = 0x0605; + + // ;; ResolveCacheElem e = this._cacheAddress + i + // ldr r5, [pc + #_cacheAddress] + offset = PC_REL_OFFSET(_cacheAddress); + _ASSERTE((offset & 0x3) == 0); + _stub._resolveEntryPoint[n++] = 0x4d00 | (offset >> 2); + + // ldr r6, [r5 + r6] ;; r6 = e = this._cacheAddress + i + _stub._resolveEntryPoint[n++] = 0x59ae; + + // ;; do { + int loop = n; + + // ;; Check mt == e.pMT + // ldr r5, [r6 + #ResolveCacheElem.pMT] + offset = offsetof(ResolveCacheElem, pMT); + _ASSERTE(offset <= 124 && (offset & 0x3) == 0); + _stub._resolveEntryPoint[n++] = 0x6835 | (offset<< 4); + + // cmp r12, r5 + _stub._resolveEntryPoint[n++] = 0x45ac; + + // bne nextEntry + _stub._resolveEntryPoint[n++] = 0xd108; + + // ;; Check this._token == e.token + // ldr r5, [pc + #_token] + offset = PC_REL_OFFSET(_token); + _ASSERTE((offset & 0x3) == 0); + _stub._resolveEntryPoint[n++] = 0x4d00 | (offset>>2); + + // ldr r12, [r6 + #ResolveCacheElem.token] + offset = offsetof(ResolveCacheElem, token); + _stub._resolveEntryPoint[n++] = 0xf8d6; + _stub._resolveEntryPoint[n++] = 0xc000 | offset; + + // cmp r12, r5 + _stub._resolveEntryPoint[n++] = 0x45ac; + + // bne nextEntry + _stub._resolveEntryPoint[n++] = 0xd103; + + // ldr r12, [r6 + #ResolveCacheElem.target] ;; r12 : e.target + offset = offsetof(ResolveCacheElem, target); + _stub._resolveEntryPoint[n++] = 0xf8d6; + _stub._resolveEntryPoint[n++] = 0xc000 | offset; + + // ;; Restore r5 and r6 + // pop {r5,r6} + _stub._resolveEntryPoint[n++] = 0xbc60; + + // ;; Branch to e.target + // bx r12 ;; (e.target)(r0,r1,r2,r3) + _stub._resolveEntryPoint[n++] = 0x4760; + + // nextEntry: + // ;; e = e.pNext; + // ldr r6, [r6 + #ResolveCacheElem.pNext] + offset = offsetof(ResolveCacheElem, pNext); + _ASSERTE(offset <=124 && (offset & 0x3) == 0); + _stub._resolveEntryPoint[n++] = 0x6836 | (offset << 4); + + // ;; } while(e != null); + // cbz r6, slowEntryPoint + _stub._resolveEntryPoint[n++] = 0xb116; + + // ldr r12, [r0 + #Object.m_pMethTab] + _stub._resolveEntryPoint[n++] = 0xf8d0; + _stub._resolveEntryPoint[n++] = 0xc000; + + // b loop + offset = (WORD)((loop - (n + 2)) * sizeof(WORD)); + offset = (offset >> 1) & 0x07ff; + _stub._resolveEntryPoint[n++] = 0xe000 | offset; + + // slowEntryPoint: + // pop {r5,r6} + _stub._resolveEntryPoint[n++] = 0xbc60; + + // nop for alignment + _stub._resolveEntryPoint[n++] = 
0xbf00; + + // the slow entry point be DWORD-aligned (see _ASSERTE below) insert nops if necessary . + + // ARMSTUB TODO: promotion + + // fall through to slow case + _ASSERTE(_stub._resolveEntryPoint + n == _stub._slowEntryPoint); + _ASSERTE(n == ResolveStub::resolveEntryPointLen); + + // ResolveStub._slowEntryPoint(r0:MethodToken, r1, r2, r3, r4:IndirectionCellAndFlags) + // { + // r12 = this._tokenSlow; + // this._resolveWorkerTarget(r0, r1, r2, r3, r4, r12); + // } + + // The following macro relies on this entry point being DWORD-aligned. We've already asserted that the + // overall stub is aligned above, just need to check that the preceding stubs occupy an even number of + // WORD slots. + _ASSERTE((n & 1) == 0); + +#undef PC_REL_OFFSET +#define PC_REL_OFFSET(_field) (WORD)(offsetof(ResolveStub, _field) - ((offsetof(ResolveStub, _slowEntryPoint) + sizeof(*ResolveStub::_slowEntryPoint) * (n + 2)) & 0xfffffffc)) + + n = 0; + + // ldr r12, [pc + #_tokenSlow] + offset = PC_REL_OFFSET(_tokenSlow); + _stub._slowEntryPoint[n++] = 0xf8df; + _stub._slowEntryPoint[n++] = 0xc000 | offset; + + // ldr pc, [pc + #_resolveWorkerTarget] + offset = PC_REL_OFFSET(_resolveWorkerTarget); + _stub._slowEntryPoint[n++] = 0xf8df; + _stub._slowEntryPoint[n++] = 0xf000 | offset; + + _ASSERTE(n == ResolveStub::slowEntryPointLen); + + // ResolveStub._failEntryPoint(r0:MethodToken, r1, r2, r3, r4:IndirectionCellAndFlags) + // { + // if(--*(this._pCounter) < 0) r4 = r4 | SDF_ResolveBackPatch; + // this._resolveEntryPoint(r0, r1, r2, r3, r4); + // } + + // The following macro relies on this entry point being DWORD-aligned. We've already asserted that the + // overall stub is aligned above, just need to check that the preceding stubs occupy an even number of + // WORD slots. 
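    // Tying the pseudo-code above to the instructions emitted below: the counter at *_pCounter is
    // decremented in memory through r5/r12, "bge" sends the still-non-negative case straight back to
    // _resolveEntryPoint, and only the negative case ORs SDF_ResolveBackPatch into r4 before taking
    // the same branch; per the dispatch pseudo-code earlier in this file, that flag is what later
    // causes the resolve worker to repatch the indirection cell to point at _resolveEntryPoint.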
+ _ASSERTE((n & 1) == 0); + +#undef PC_REL_OFFSET +#define PC_REL_OFFSET(_field) (WORD)(offsetof(ResolveStub, _field) - ((offsetof(ResolveStub, _failEntryPoint) + sizeof(*ResolveStub::_failEntryPoint) * (n + 2)) & 0xfffffffc)) + + n = 0; + + // push {r5} + _stub._failEntryPoint[n++] = 0xb420; + + // ldr r5, [pc + #_pCounter] + offset = PC_REL_OFFSET(_pCounter); + _ASSERTE((offset & 0x3) == 0); + _stub._failEntryPoint[n++] = 0x4d00 | (offset >>2); + + // ldr r12, [r5] + _stub._failEntryPoint[n++] = 0xf8d5; + _stub._failEntryPoint[n++] = 0xc000; + + // subs r12, r12, #1 + _stub._failEntryPoint[n++] = 0xf1bc; + _stub._failEntryPoint[n++] = 0x0c01; + + // str r12, [r5] + _stub._failEntryPoint[n++] = 0xf8c5; + _stub._failEntryPoint[n++] = 0xc000; + + // pop {r5} + _stub._failEntryPoint[n++] = 0xbc20; + + // bge resolveEntryPoint + _stub._failEntryPoint[n++] = 0xda01; + + // or r4, r4, SDF_ResolveBackPatch + _ASSERTE(SDF_ResolveBackPatch < 256); + _stub._failEntryPoint[n++] = 0xf044; + _stub._failEntryPoint[n++] = 0x0400 | SDF_ResolveBackPatch; + + // resolveEntryPoint: + // b _resolveEntryPoint + offset = (WORD)(offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, _failEntryPoint) + sizeof(*ResolveStub::_failEntryPoint) * (n + 2))); + _ASSERTE((offset & 1) == 0); + offset = (offset >> 1) & 0x07ff; + _stub._failEntryPoint[n++] = 0xe000 | offset; + + // nop for alignment + _stub._failEntryPoint[n++] = 0xbf00; + + _ASSERTE(n == ResolveStub::failEntryPointLen); + + _stub._pCounter = counterAddr; + _stub._hashedToken = hashedToken << LOG2_PTRSIZE; + _stub._cacheAddress = (size_t) cacheAddr; + _stub._token = dispatchToken; + _stub._tokenSlow = dispatchToken; + _stub._resolveWorkerTarget = resolveWorkerTarget; + _stub._cacheMask = CALL_STUB_CACHE_MASK * sizeof(void*); + + _ASSERTE(resolveWorkerTarget == (PCODE)ResolveWorkerChainLookupAsmStub); + _ASSERTE(patcherTarget == NULL); +} + Stub *GenerateInitPInvokeFrameHelper() { CONTRACT(Stub*) diff --git a/src/coreclr/vm/arm/thunktemplates.S b/src/coreclr/vm/arm/thunktemplates.S index a50365eebff0c..0686bb2ed4b73 100644 --- a/src/coreclr/vm/arm/thunktemplates.S +++ b/src/coreclr/vm/arm/thunktemplates.S @@ -40,74 +40,3 @@ PAGE_SIZE = 4096 LOCAL_LABEL(CountReachedZero): ldr pc, DATA_SLOT(CallCountingStub, TargetForThresholdReached) LEAF_END_MARKED CallCountingStubCode - - .align 4 - - LEAF_ENTRY LookupStubCode - ldr r12, DATA_SLOT(LookupStub, DispatchToken) - ldr pc, DATA_SLOT(LookupStub, ResolveWorkerTarget) - LEAF_END_MARKED LookupStubCode - - .align 4 - - LEAF_ENTRY DispatchStubCode - PATCH_LABEL DispatchStubCode_ThisDeref - ldr r12, [r0] - push {r5} - ldr r5, DATA_SLOT(DispatchStub, ExpectedMT) - cmp r5, r12 - pop {r5} - bne LOCAL_LABEL(FailTarget) - ldr pc, DATA_SLOT(DispatchStub, ImplTarget) -LOCAL_LABEL(FailTarget): - ldr pc, DATA_SLOT(DispatchStub, FailTarget) - LEAF_END_MARKED DispatchStubCode - - .align 4 - - LEAF_ENTRY ResolveStubCode - PATCH_LABEL ResolveStubCode_ResolveEntry - PATCH_LABEL ResolveStubCode_ThisDeref - ldr r12, [r0] - push {r5, r6} - add r6, r12, r12, lsr #12 - ldr r5, DATA_SLOT(ResolveStub, HashedToken) - eor r6, r6, r5 - mov r5, #CALL_STUB_CACHE_MASK_ASM * 4 - and r6, r6, r5 - ldr r5, DATA_SLOT(ResolveStub, CacheAddress) - ldr r6, [r5, r6] -LOCAL_LABEL(Loop): - ldr r5, [r6] - cmp r12, r5 - bne LOCAL_LABEL(NextEntry) - ldr r5, DATA_SLOT(ResolveStub, Token) - ldr r12, [r6, #4] - cmp r12, r5 - bne LOCAL_LABEL(NextEntry) - ldr r12, [r6, #8] - pop {r5, r6} - bx r12 -LOCAL_LABEL(NextEntry): - ldr r6, [r6, #12] - cbz r6, 
LOCAL_LABEL(Slow) - ldr r12, [r0] - b LOCAL_LABEL(Loop) -LOCAL_LABEL(Slow): - pop {r5, r6} - nop - ldr r12, DATA_SLOT(ResolveStub, Token) - ldr pc, DATA_SLOT(ResolveStub, ResolveWorkerTarget) - PATCH_LABEL ResolveStubCode_FailEntry - push {r5} - adr r5, DATA_SLOT(ResolveStub, Counter) - ldr r12, [r5] - subs r12, r12, #1 - str r12, [r5] - pop {r5} - bge ResolveStubCode - orr r4, r4, #1 // SDF_ResolveBackPatch - b ResolveStubCode - LEAF_END_MARKED ResolveStubCode - - diff --git a/src/coreclr/vm/arm/thunktemplates.asm b/src/coreclr/vm/arm/thunktemplates.asm index 37f0c54c36470..6562be72146c2 100644 --- a/src/coreclr/vm/arm/thunktemplates.asm +++ b/src/coreclr/vm/arm/thunktemplates.asm @@ -40,73 +40,4 @@ CountReachedZero ldr pc, DATA_SLOT(CallCountingStub, TargetForThresholdReached) LEAF_END_MARKED CallCountingStubCode - ALIGN 4 - - LEAF_ENTRY LookupStubCode - ldr r12, DATA_SLOT(LookupStub, DispatchToken) - ldr pc, DATA_SLOT(LookupStub, ResolveWorkerTarget) - LEAF_END_MARKED LookupStubCode - - ALIGN 4 - - LEAF_ENTRY DispatchStubCode - PATCH_LABEL DispatchStubCode_ThisDeref - ldr r12, [r0] - push {r5} - ldr r5, DATA_SLOT(DispatchStub, ExpectedMT) - cmp r5, r12 - pop {r5} - bne FailTarget - ldr pc, DATA_SLOT(DispatchStub, ImplTarget) -FailTarget - ldr pc, DATA_SLOT(DispatchStub, FailTarget) - LEAF_END_MARKED DispatchStubCode - - ALIGN 4 - - LEAF_ENTRY ResolveStubCode - PATCH_LABEL ResolveStubCode_ResolveEntry - PATCH_LABEL ResolveStubCode_ThisDeref - ldr r12, [r0] - push {r5, r6} - add r6, r12, r12 lsr #12 - ldr r5, DATA_SLOT(ResolveStub, HashedToken) - eor r6, r6, r5 - mov r5, #CALL_STUB_CACHE_MASK_ASM * 4 - and r6, r6, r5 - ldr r5, DATA_SLOT(ResolveStub, CacheAddress) - ldr r6, [r5, r6] -Loop - ldr r5, [r6] - cmp r12, r5 - bne NextEntry - ldr r5, DATA_SLOT(ResolveStub, Token) - ldr r12, [r6, #4] - cmp r12, r5 - bne NextEntry - ldr r12, [r6, #8] - pop {r5, r6} - bx r12 -NextEntry - ldr r6, [r6, #12] - cbz r6, Slow - ldr r12, [r0] - b Loop -Slow - pop {r5, r6} - nop - ldr r12, DATA_SLOT(ResolveStub, Token) - ldr pc, DATA_SLOT(ResolveStub, ResolveWorkerTarget) - PATCH_LABEL ResolveStubCode_FailEntry - push {r5} - adr r5, DATA_SLOT(ResolveStub, Counter) - ldr r12, [r5] - subs r12, r12, #1 - str r12, [r5] - pop {r5} - bge ResolveStubCode - orr r4, r4, #1; SDF_ResolveBackPatch - b ResolveStubCode - LEAF_END_MARKED ResolveStubCode - END diff --git a/src/coreclr/vm/arm/virtualcallstubcpu.hpp b/src/coreclr/vm/arm/virtualcallstubcpu.hpp index 7e8f6b4baf022..041c8267d1812 100644 --- a/src/coreclr/vm/arm/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/arm/virtualcallstubcpu.hpp @@ -14,6 +14,265 @@ #include // Since we are placing code, we want byte packing of the structs +#define USES_LOOKUP_STUBS 1 + +/********************************************************************************************* +Stubs that contain code are all part of larger structs called Holders. There is a +Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are +essentially an implementation trick that allowed rearranging the code sequences more +easily while trying out different alternatives, and for dealing with any alignment +issues in a way that was mostly immune to the actually code sequences. These Holders +should be revisited when the stub code sequences are fixed, since in many cases they +add extra space to a stub that is not really needed. + +Stubs are placed in cache and hash tables. Since unaligned access of data in memory +is very slow, the keys used in those tables should be aligned. 
The things used as keys +typically also occur in the generated code, e.g. a token as an immediate part of an instruction. +For now, to avoid alignment computations as different code strategies are tried out, the key +fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction +streams aligned so that the immediate fields fall on aligned boundaries. +*/ + +#if USES_LOOKUP_STUBS + +struct LookupStub; +struct LookupHolder; + +/*LookupStub************************************************************************************** +Virtual and interface call sites are initially setup to point at LookupStubs. +This is because the runtime type of the pointer is not yet known, +so the target cannot be resolved. Note: if the jit is able to determine the runtime type +of the pointer, it should be generating a direct call not a virtual or interface call. +This stub pushes a lookup token onto the stack to identify the sought after method, and then +jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and +transfer of control to the appropriate target method implementation, perhaps patching of the call site +along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs +get quickly changed to point to another kind of stub. +*/ +struct LookupStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0] + THUMB_CODE; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } + +private: + friend struct LookupHolder; + const static int entryPointLen = 4; + + WORD _entryPoint[entryPointLen]; + PCODE _resolveWorkerTarget; // xx xx xx xx target address + size_t _token; // xx xx xx xx 32-bit constant +}; + +/* LookupHolders are the containers for LookupStubs, they provide for any alignment of +stubs as necessary. In the case of LookupStubs, alignment is necessary since +LookupStubs are placed in a hash table keyed by token. */ +struct LookupHolder +{ + static void InitializeStatic() { LIMITED_METHOD_CONTRACT; } + + void Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken); + + LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static LookupHolder* FromLookupEntry(PCODE lookupEntry); + +private: + friend struct LookupStub; + + LookupStub _stub; +}; + + +#endif // USES_LOOKUP_STUBS + +struct DispatchStub; +struct DispatchHolder; + +/*DispatchStub************************************************************************************** +Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. +A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). +If the calling frame does in fact have the type be of the expected type, then +control is transfered to the target address, the method implementation. If not, +then control is transfered to the fail address, a fail stub (see below) where a polymorphic +lookup is done to find the correct address to go to. + +implementation note: Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. 
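As a sketch (written as if the stub returned the address it jumps to; the real stub keeps these
values as inline data and registers):

    if (*(size_t*)obj != _expectedMT)   // first word of the object is its MethodTable
        return _failTarget;             // forward, rarely-taken branch into the ResolveStub's fail entry
    return _implTarget;                 // fall through and jump straight to the method body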
+Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched +to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important +that the branch prediction staticly predict this, which means it must be a forward jump. The alternative +is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" +is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier +to control the placement of the stubs than control the placement of the jitted code and the stubs. */ +struct DispatchStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)(&_entryPoint[0]) + THUMB_CODE; } + + inline size_t expectedMT() { LIMITED_METHOD_CONTRACT; return _expectedMT; } + inline PCODE implTarget() { LIMITED_METHOD_CONTRACT; return _implTarget; } + + inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(slotTypeRef != nullptr); + + *slotTypeRef = EntryPointSlots::SlotType_Executable; + return (TADDR)&_implTarget; + } + + inline PCODE failTarget() { LIMITED_METHOD_CONTRACT; return _failTarget; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(DispatchStub); } + +private: + friend struct DispatchHolder; + const static int entryPointLen = 12; + + WORD _entryPoint[entryPointLen]; + size_t _expectedMT; + PCODE _failTarget; + PCODE _implTarget; +}; + +/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of +stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both +are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, +since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently +o(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify +alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. +While the token field can be logically gotten by following the failure target to the failEntryPoint +of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. +This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct +for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when +they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). +*/ + +/* @workaround for ee resolution - Since the EE does not currently have a resolver function that +does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are +using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable +is in fact written. Hence we have moved target out into the holder and aligned it so we can +atomically update it. 
When we get a resolver function that does what we want, we can drop this field, +and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ +struct DispatchHolder +{ + static void InitializeStatic() + { + LIMITED_METHOD_CONTRACT; + + // Check that _implTarget is aligned in the DispatchHolder for backpatching + static_assert_no_msg(((offsetof(DispatchHolder, _stub) + offsetof(DispatchStub, _implTarget)) % sizeof(void *)) == 0); + } + + void Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT); + + DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry); + +private: + //force expectedMT to be aligned since used as key in hash tables. + DispatchStub _stub; +}; + +struct ResolveStub; +struct ResolveHolder; + +/*ResolveStub************************************************************************************** +Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only +one resolver stub built for any given token, even though there may be many call sites that +use that token and many distinct types that are used in the calling call frames. A resolver stub +actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their +expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should +be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, +even though they are actually allocated as a single contiguous block of memory. These pieces are: + +A ResolveStub has two entry points: + +FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does +a check to see how often we are actually failing. If failures are frequent, control transfers to the +patch piece to cause the call site to be changed from a mostly monomorphic callsite +(calls dispatch stub) to a polymorphic callsize (calls resolve stub). If failures are rare, control +transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter +every time it is entered. The ee at various times will add a large chunk to the counter. + +ResolveEntry - does a lookup via in a cache by hashing the actual type of the calling frame s + and the token identifying the (contract,method) pair desired. If found, control is transfered +to the method implementation. If not found in the cache, the token is pushed and the ee is entered via +the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since +there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. +The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, +as well as its speed. It turns out it is very important to make the hash function sensitive to all +of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before +making any changes to the code sequences here, it is very important to measure and tune them as perf +can vary greatly, in unexpected ways, with seeming minor changes. + +Implementation note - Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. 
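Concretely, the probe described above amounts to the following C sketch (field names as in the
ResolveStub below; _hashedToken is stored pre-shifted by LOG2_PTRSIZE and _cacheMask is
CALL_STUB_CACHE_MASK * sizeof(void*), so i is already a byte offset into the cache table):

    size_t mt = *(size_t*)obj;                                    // object's MethodTable
    size_t i  = ((mt + (mt >> 12)) ^ _hashedToken) & _cacheMask;  // hash of type and (pre-hashed) token
    ResolveCacheElem* e = *(ResolveCacheElem**)(_cacheAddress + i);
    while (e != NULL)
    {
        if (e->pMT == mt && e->token == _token)
            break;                       // hit: the stub tail-jumps to e->target
        e = e->pNext;
    }
    // e == NULL is a miss: fall through to _slowEntryPoint, which jumps to _resolveWorkerTarget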
Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. +Note that this stub is called in highly polymorphic cases, but the cache should have been sized +and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should +mostly be going down the cache hit route, and it is important that this be statically predicted as so. +Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically +gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries +is important. */ + +struct ResolveStub +{ + inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)(&_failEntryPoint[0]) + THUMB_CODE; } + inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)(&_resolveEntryPoint[0]) + THUMB_CODE; } + inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)(&_slowEntryPoint[0]) + THUMB_CODE; } + + inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } + inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } + inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } + +private: + friend struct ResolveHolder; + const static int resolveEntryPointLen = 32; + const static int slowEntryPointLen = 4; + const static int failEntryPointLen = 14; + + WORD _resolveEntryPoint[resolveEntryPointLen]; + WORD _slowEntryPoint[slowEntryPointLen]; + WORD _failEntryPoint[failEntryPointLen]; + INT32* _pCounter; + UINT32 _hashedToken; + size_t _cacheAddress; // lookupCache + size_t _token; + size_t _tokenSlow; + PCODE _resolveWorkerTarget; + UINT32 _cacheMask; +}; + +/* ResolveHolders are the containers for ResolveStubs, They provide +for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by +the token for which they are built. Efficiency of access requires that this token be aligned. +For now, we have copied that field into the ResolveHolder itself, if the resolve stub is arranged such that +any of its inlined tokens (non-prehashed) is aligned, then the token field in the ResolveHolder +is not needed. */ +struct ResolveHolder +{ + static void InitializeStatic() { LIMITED_METHOD_CONTRACT; } + + void Initialize(ResolveHolder* pResolveHolderRX, + PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 * counterAddr); + + ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static ResolveHolder* FromFailEntry(PCODE failEntry); + static ResolveHolder* FromResolveEntry(PCODE resolveEntry); + +private: + ResolveStub _stub; +}; + /*VTableCallStub************************************************************************************** These are jump stubs that perform a vtable-base virtual call. These stubs assume that an object is placed in the first argument register (this pointer). 
From there, the stub extracts the MethodTable pointer, followed by the @@ -70,7 +329,6 @@ struct VTableCallHolder VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } - size_t size() { return stub()->size(); } static size_t GetHolderSize(unsigned slot) { STATIC_CONTRACT_WRAPPER; @@ -102,8 +360,64 @@ struct VTableCallHolder #ifndef DACCESS_COMPILE +#ifdef STUB_LOGGING +extern size_t g_lookup_inline_counter; +extern size_t g_mono_call_counter; +extern size_t g_mono_miss_counter; +extern size_t g_poly_call_counter; +extern size_t g_poly_miss_counter; +#endif + TADDR StubDispatchFrame_MethodFrameVPtr; +LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry) +{ + lookupEntry = lookupEntry & ~THUMB_CODE; + return (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); +} + + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. +*/ +DispatchStub dispatchInit; + +DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry) +{ + LIMITED_METHOD_CONTRACT; + dispatchEntry = dispatchEntry & ~THUMB_CODE; + DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchHolder, _stub) - offsetof(DispatchStub, _entryPoint) ); + // _ASSERTE(dispatchHolder->_stub._entryPoint[0] == dispatchInit._entryPoint[0]); + return dispatchHolder; +} + + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. +*/ + +ResolveStub resolveInit; + +ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) +{ + LIMITED_METHOD_CONTRACT; + failEntry = failEntry & ~THUMB_CODE; + ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); + // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); + return resolveHolder; +} + +ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) +{ + LIMITED_METHOD_CONTRACT; + resolveEntry = resolveEntry & ~THUMB_CODE; + ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); + // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); + return resolveHolder; +} + void MovRegImm(BYTE* p, int reg, TADDR imm); void VTableCallHolder::Initialize(unsigned slot) @@ -201,20 +515,19 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s if (stubKind == SK_UNKNOWN) { //Assuming that RESOLVE_STUB_FIRST_WORD & DISPATCH_STUB_FIRST_WORD have same values - _ASSERTE(RESOLVE_STUB_FIRST_WORD == DISPATCH_STUB_FIRST_WORD); if (firstWord == DISPATCH_STUB_FIRST_WORD) { WORD thirdWord = ((WORD*)pInstr)[2]; - if (thirdWord == DISPATCH_STUB_THIRD_WORD) + if (thirdWord == 0xf84d) { stubKind = SK_DISPATCH; } - else if (thirdWord == RESOLVE_STUB_THIRD_WORD) + else if (thirdWord == 0xb460) { stubKind = SK_RESOLVE; } } - else if (firstWord == LOOKUP_STUB_FIRST_WORD) + else if (firstWord == 0xf8df) { stubKind = SK_LOOKUP; } diff --git a/src/coreclr/vm/arm64/asmconstants.h b/src/coreclr/vm/arm64/asmconstants.h index 4945b7d462ef5..fadd6be2ded14 100644 --- a/src/coreclr/vm/arm64/asmconstants.h +++ b/src/coreclr/vm/arm64/asmconstants.h @@ -233,38 +233,5 @@ 
ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForMethod == offsetof(CallCoun #define CallCountingStubData__TargetForThresholdReached 0x10 ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForThresholdReached == offsetof(CallCountingStubData, TargetForThresholdReached)) -#define LookupStubData__DispatchToken 0x00 -ASMCONSTANTS_C_ASSERT(LookupStubData__DispatchToken == offsetof(LookupStubData, DispatchToken)) - -#define LookupStubData__ResolveWorkerTarget 0x08 -ASMCONSTANTS_C_ASSERT(LookupStubData__ResolveWorkerTarget == offsetof(LookupStubData, ResolveWorkerTarget)) - -#define DispatchStubData__ExpectedMT 0x00 -ASMCONSTANTS_C_ASSERT(DispatchStubData__ExpectedMT == offsetof(DispatchStubData, ExpectedMT)) - -#define DispatchStubData__ImplTarget 0x08 -ASMCONSTANTS_C_ASSERT(DispatchStubData__ImplTarget == offsetof(DispatchStubData, ImplTarget)) - -#define DispatchStubData__FailTarget 0x10 -ASMCONSTANTS_C_ASSERT(DispatchStubData__FailTarget == offsetof(DispatchStubData, FailTarget)) - -#define ResolveStubData__HashedToken 0x08 -ASMCONSTANTS_C_ASSERT(ResolveStubData__HashedToken == offsetof(ResolveStubData, HashedToken)) - -#define ResolveStubData__CacheAddress 0x00 -ASMCONSTANTS_C_ASSERT(ResolveStubData__CacheAddress == offsetof(ResolveStubData, CacheAddress)) - -#define ResolveStubData__Token 0x10 -ASMCONSTANTS_C_ASSERT(ResolveStubData__Token == offsetof(ResolveStubData, Token)) - -#define ResolveStubData__Counter 0x0c -ASMCONSTANTS_C_ASSERT(ResolveStubData__Counter == offsetof(ResolveStubData, Counter)) - -#define ResolveStubData__ResolveWorkerTarget 0x18 -ASMCONSTANTS_C_ASSERT(ResolveStubData__ResolveWorkerTarget == offsetof(ResolveStubData, ResolveWorkerTarget)) - -#define CALL_STUB_CACHE_MASK_ASM 0xfff -ASMCONSTANTS_C_ASSERT(CALL_STUB_CACHE_MASK_ASM == CALL_STUB_CACHE_MASK) - #undef ASMCONSTANTS_RUNTIME_ASSERT #undef ASMCONSTANTS_C_ASSERT diff --git a/src/coreclr/vm/arm64/thunktemplates.S b/src/coreclr/vm/arm64/thunktemplates.S index 91d36ed080f52..4645ba17be59c 100644 --- a/src/coreclr/vm/arm64/thunktemplates.S +++ b/src/coreclr/vm/arm64/thunktemplates.S @@ -36,59 +36,4 @@ LOCAL_LABEL(CountReachedZero\PAGE_SIZE): br x10 LEAF_END_MARKED CallCountingStubCode\PAGE_SIZE - - LEAF_ENTRY LookupStubCode\PAGE_SIZE - ldr x12, DATA_SLOT(LookupStub, DispatchToken) - ldr x10, DATA_SLOT(LookupStub, ResolveWorkerTarget) - br x10 - LEAF_END_MARKED LookupStubCode\PAGE_SIZE - - LEAF_ENTRY DispatchStubCode\PAGE_SIZE - PATCH_LABEL DispatchStubCode_ThisDeref\PAGE_SIZE - ldr x13, [x0] // methodTable from object in x0 - adr x9, DATA_SLOT(DispatchStub, ExpectedMT) - ldp x10, x12, [x9] // x10 = ExpectedMT & x12 = ImplTarget - cmp x13, x10 - bne LOCAL_LABEL(Fail\PAGE_SIZE) - br x12 -LOCAL_LABEL(Fail\PAGE_SIZE): - ldr x9, DATA_SLOT(DispatchStub, FailTarget) - br x9 - LEAF_END_MARKED DispatchStubCode\PAGE_SIZE - - LEAF_ENTRY ResolveStubCode\PAGE_SIZE - PATCH_LABEL ResolveStubCode_ResolveEntry\PAGE_SIZE - PATCH_LABEL ResolveStubCode_ThisDeref\PAGE_SIZE -LOCAL_LABEL(Resolve\PAGE_SIZE): - ldr x12, [x0] - add x9, x12, x12, lsr #12 - ldr w13, DATA_SLOT(ResolveStub, HashedToken) - eor x9, x9, x13 - and x9, x9, #CALL_STUB_CACHE_MASK_ASM * 8 - ldr x13, DATA_SLOT(ResolveStub, CacheAddress) - ldr x9, [x13, x9] - ldr x15, DATA_SLOT(ResolveStub, Token) - ldr x13, [x9, #ResolveCacheElem__pMT] - cmp x12, x13 - bne LOCAL_LABEL(SlowEntry\PAGE_SIZE) - ldr x13, [x9, #ResolveCacheElem__token] - cmp x15, x13 - bne LOCAL_LABEL(SlowEntry\PAGE_SIZE) - ldr x12, [x9, ResolveCacheElem__target] - br x12 - PATCH_LABEL 
ResolveStubCode_SlowEntry\PAGE_SIZE -LOCAL_LABEL(SlowEntry\PAGE_SIZE): - ldr x12, DATA_SLOT(ResolveStub, Token) - ldr x13, DATA_SLOT(ResolveStub, ResolveWorkerTarget) - br x13 - PATCH_LABEL ResolveStubCode_FailEntry\PAGE_SIZE - adr x10, DATA_SLOT(ResolveStub, Counter) - ldr w9, [x10] - subs w9, w9, #1 - str w9, [x10] - bge LOCAL_LABEL(Resolve\PAGE_SIZE) - orr x11, x11, #1 // SDF_ResolveBackPatch - b LOCAL_LABEL(Resolve\PAGE_SIZE) - LEAF_END_MARKED ResolveStubCode\PAGE_SIZE - .endr diff --git a/src/coreclr/vm/arm64/thunktemplates.asm b/src/coreclr/vm/arm64/thunktemplates.asm index b1c31c7ca3232..958ddb029a6ee 100644 --- a/src/coreclr/vm/arm64/thunktemplates.asm +++ b/src/coreclr/vm/arm64/thunktemplates.asm @@ -34,57 +34,4 @@ CountReachedZero br x10 LEAF_END_MARKED CallCountingStubCode - - LEAF_ENTRY LookupStubCode - ldr x12, DATA_SLOT(LookupStub, DispatchToken) - ldr x10, DATA_SLOT(LookupStub, ResolveWorkerTarget) - br x10 - LEAF_END_MARKED LookupStubCode - - LEAF_ENTRY DispatchStubCode - PATCH_LABEL DispatchStubCode_ThisDeref - ldr x13, [x0] ; methodTable from object in x0 - adr x9, DATA_SLOT(DispatchStub, ExpectedMT) - ldp x10, x12, [x9] ; x10 = ExpectedMT & x12 = ImplTarget - cmp x13, x10 - bne Fail - br x12 -Fail - ldr x9, DATA_SLOT(DispatchStub, FailTarget) - br x9 - LEAF_END_MARKED DispatchStubCode - - LEAF_ENTRY ResolveStubCode - PATCH_LABEL ResolveStubCode_ResolveEntry - PATCH_LABEL ResolveStubCode_ThisDeref - ldr x12, [x0] - add x9, x12, x12, lsr #12 - ldr w13, DATA_SLOT(ResolveStub, HashedToken) - eor x9, x9, x13 - and x9, x9, #CALL_STUB_CACHE_MASK_ASM * 8 - ldr x13, DATA_SLOT(ResolveStub, CacheAddress) - ldr x9, [x13, x9] - ldr x15, DATA_SLOT(ResolveStub, Token) - ldr x13, [x9, #ResolveCacheElem__pMT] - cmp x12, x13 - bne ResolveStubCode_SlowEntry - ldr x13, [x9, #ResolveCacheElem__token] - cmp x15, x13 - bne ResolveStubCode_SlowEntry - ldr x12, [x9, ResolveCacheElem__target] - br x12 - PATCH_LABEL ResolveStubCode_SlowEntry - ldr x12, DATA_SLOT(ResolveStub, Token) - ldr x13, DATA_SLOT(ResolveStub, ResolveWorkerTarget) - br x13 - PATCH_LABEL ResolveStubCode_FailEntry - adr x10, DATA_SLOT(ResolveStub, Counter) - ldr w9, [x10] - subs w9, w9, #1 - str w9, [x10] - bge ResolveStubCode - orr x11, x11, #1; SDF_ResolveBackPatch - b ResolveStubCode - LEAF_END_MARKED ResolveStubCode - END diff --git a/src/coreclr/vm/arm64/virtualcallstubcpu.hpp b/src/coreclr/vm/arm64/virtualcallstubcpu.hpp index 92e145ace49dd..4944b198e5212 100644 --- a/src/coreclr/vm/arm64/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/arm64/virtualcallstubcpu.hpp @@ -8,10 +8,405 @@ #define DISPATCH_STUB_FIRST_DWORD 0xf940000d #define RESOLVE_STUB_FIRST_DWORD 0xF940000C -#define LOOKUP_STUB_FIRST_DWORD 0x5800000C -#define LOOKUP_STUB_FIRST_DWORD_MASK 0xFFF07FFF #define VTABLECALL_STUB_FIRST_DWORD 0xF9400009 +struct ARM64EncodeHelpers +{ + inline static DWORD ADR_PATCH(DWORD offset) + { + DWORD immLO = (offset & 0x03)<<29 ; + + if (immLO ==0 ) + return (offset<<3); + else + return immLO<<29 | (offset -immLO)<<3; + } + +}; + +#define USES_LOOKUP_STUBS 1 + +struct LookupStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } +private : + friend struct LookupHolder; + + DWORD _entryPoint[4]; + PCODE _resolveWorkerTarget; + size_t _token; +}; + +struct LookupHolder +{ +private: + LookupStub _stub; +public: + static void InitializeStatic() { } + + 
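    // Note on the code emitted by Initialize below: the single "ldp x10, x12, [x9]" assumes that
    // _resolveWorkerTarget and _token sit back to back, in that order, immediately after the code
    // words (the fourth _entryPoint slot exists only to keep that pair 8-byte aligned), so the stub
    // reaches the resolve worker via "br x10" with the dispatch token already loaded into x12.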
void Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken) + { + // adr x9, _resolveWorkerTarget + // ldp x10, x12, [x9] + // br x10 + // _resolveWorkerTarget + // _token + _stub._entryPoint[0] = 0x10000089; + _stub._entryPoint[1] = 0xa940312a; + _stub._entryPoint[2] = 0xd61f0140; + //4th element of _entryPoint array is padding for 8byte alignment + _stub._resolveWorkerTarget = resolveWorkerTarget; + _stub._token = dispatchToken; + } + + LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + static LookupHolder* FromLookupEntry(PCODE lookupEntry) + { + return (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); + } +}; + +struct DispatchStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } + + inline size_t expectedMT() { LIMITED_METHOD_CONTRACT; return _expectedMT; } + inline PCODE implTarget() { LIMITED_METHOD_CONTRACT; return _implTarget; } + + inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(slotTypeRef != nullptr); + + *slotTypeRef = EntryPointSlots::SlotType_Executable; + return (TADDR)&_implTarget; + } + + inline PCODE failTarget() { LIMITED_METHOD_CONTRACT; return _failTarget; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(DispatchStub); } + +private: + friend struct DispatchHolder; + + DWORD _entryPoint[8]; + size_t _expectedMT; + PCODE _implTarget; + PCODE _failTarget; +}; + +struct DispatchHolder +{ + static void InitializeStatic() + { + LIMITED_METHOD_CONTRACT; + + // Check that _implTarget is aligned in the DispatchHolder for backpatching + static_assert_no_msg(((offsetof(DispatchHolder, _stub) + offsetof(DispatchStub, _implTarget)) % sizeof(void *)) == 0); + } + + void Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT) + { + // ldr x13, [x0] ; methodTable from object in x0 + // adr x9, _expectedMT ; _expectedMT is at offset 28 from pc + // ldp x10, x12, [x9] ; x10 = _expectedMT & x12 = _implTarget + // cmp x13, x10 + // bne failLabel + // br x12 + // failLabel + // ldr x9, _failTarget ; _failTarget is at offset 24 from pc + // br x9 + // _expectedMT + // _implTarget + // _failTarget + + _stub._entryPoint[0] = DISPATCH_STUB_FIRST_DWORD; // 0xf940000d + _stub._entryPoint[1] = 0x100000e9; + _stub._entryPoint[2] = 0xa940312a; + _stub._entryPoint[3] = 0xeb0a01bf; + _stub._entryPoint[4] = 0x54000041; + _stub._entryPoint[5] = 0xd61f0180; + _stub._entryPoint[6] = 0x580000c9; + _stub._entryPoint[7] = 0xd61f0120; + + _stub._expectedMT = expectedMT; + _stub._implTarget = implTarget; + _stub._failTarget = failTarget; + } + + DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry) + { + LIMITED_METHOD_CONTRACT; + DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchHolder, _stub) - offsetof(DispatchStub, _entryPoint) ); + return dispatchHolder; + } + +private: + DispatchStub _stub; +}; + +struct ResolveStub +{ + inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; } + inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint[0]; } + inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline INT32* pCounter() { 
LIMITED_METHOD_CONTRACT; return _pCounter; } + + inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } + inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } + +private: + friend struct ResolveHolder; + const static int resolveEntryPointLen = 17; + const static int slowEntryPointLen = 4; + const static int failEntryPointLen = 8; + + DWORD _resolveEntryPoint[resolveEntryPointLen]; + DWORD _slowEntryPoint[slowEntryPointLen]; + DWORD _failEntryPoint[failEntryPointLen]; + INT32* _pCounter; //Base of the Data Region + size_t _cacheAddress; // lookupCache + size_t _token; + PCODE _resolveWorkerTarget; + UINT32 _hashedToken; +}; + +struct ResolveHolder +{ + static void InitializeStatic() { } + + void Initialize(ResolveHolder* pResolveHolderRX, + PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 * counterAddr) + { + int n=0; + DWORD offset; + int br_nextEntry[2]; +/******** Rough Convention of used in this routine + ;;x9 hash scratch / current ResolveCacheElem + ;;x10 base address of the data region + ;;x11 indirection cell + ;;x12 MethodTable (from object ref in x0), out: this._token + ;;X13 temp + ;;X15 temp, this._token + ;;cachemask => [CALL_STUB_CACHE_MASK * sizeof(void*)] +*********/ + // Called directly by JITTED code + // ResolveStub._resolveEntryPoint(x0:Object*, x1 ...,r7, x11:IndirectionCellAndFlags) + // { + // MethodTable mt = x0.m_pMethTab; + // int i = ((mt + mt >> 12) ^ this._hashedToken) & _cacheMask + // ResolveCacheElem e = this._cacheAddress + i + // x9 = e = this._cacheAddress + i + // if (mt == e.pMT && this._token == e.token) + // { + // (e.target)(x0, [x1,...,x7 and x8]); + // } + // else + // { + // x12 = this._token; + // (this._slowEntryPoint)(x0, [x1,.., x7 and x8], x9, x11, x12); + // } + // } + // + +#define Dataregionbase _pCounter +#define DATA_OFFSET(_fieldHigh) (DWORD)((offsetof(ResolveStub, _fieldHigh ) - offsetof(ResolveStub, Dataregionbase)) & 0xffffffff) +#define PC_REL_OFFSET(_field) (DWORD)((offsetof(ResolveStub, _field) - (offsetof(ResolveStub, _resolveEntryPoint) + sizeof(*ResolveStub::_resolveEntryPoint) * n)) & 0xffffffff) + + //ldr x12, [x0,#Object.m_pMethTab ] ; methodTable from object in x0 + _stub._resolveEntryPoint[n++] = RESOLVE_STUB_FIRST_DWORD; //0xF940000C + + // ;; Compute i = ((mt + mt >> 12) ^ this._hashedToken) & _cacheMask + + //add x9, x12, x12 lsr #12 + _stub._resolveEntryPoint[n++] = 0x8B4C3189; + + //;;adr x10, #Dataregionbase of ResolveStub + _stub._resolveEntryPoint[n] = 0x1000000A | ARM64EncodeHelpers::ADR_PATCH(PC_REL_OFFSET(Dataregionbase)); + n++; + + //w13- this._hashedToken + //ldr w13, [x10 + DATA_OFFSET(_hashedToken)] + offset = DATA_OFFSET(_hashedToken); + _ASSERTE(offset >=0 && offset%4 == 0); + _stub._resolveEntryPoint[n++] = 0xB940014D | offset<<8; + + //eor x9,x9,x13 + _stub._resolveEntryPoint[n++] = 0xCA0D0129; + + _ASSERTE(CALL_STUB_CACHE_MASK * sizeof(void*) == 0x7FF8); + //x9-i + //and x9,x9,#cachemask + _stub._resolveEntryPoint[n++] = 0x927D2D29; + + //;; ResolveCacheElem e = this._cacheAddress + i + // + //ldr x13, [x10 + DATA_OFFSET(_cacheAddress)] + offset=DATA_OFFSET(_cacheAddress); + _ASSERTE(offset >=0 && offset%8 == 0); + _stub._resolveEntryPoint[n++] = 0xF940014D | offset<<7; + + //ldr x9, [x13, x9] ;; x9 = e = this._cacheAddress + i + _stub._resolveEntryPoint[n++] = 0xF86969A9 ; + + //ldr x15, 
[x10 + DATA_OFFSET(_token)] + offset = DATA_OFFSET(_token); + _ASSERTE(offset >=0 && offset%8 == 0); + _stub._resolveEntryPoint[n++] = 0xF940014F | offset<<7; + + //;; Check mt == e.pMT + // + // + //ldr x13, [x9, #offsetof(ResolveCacheElem, pMT) ] + offset = offsetof(ResolveCacheElem, pMT) & 0x000001ff; + _ASSERTE(offset >=0 && offset%8 == 0); + _stub._resolveEntryPoint[n++] = 0xF940012D | offset<<7; + + //cmp x12, x13 + _stub._resolveEntryPoint[n++] = 0xEB0D019F; + + //;; bne nextEntry + //place holder for the above instruction + br_nextEntry[0]=n++; + + //;; Check this._token == e.token + //x15: this._token + // + //ldr x13, [x9, #offsetof(ResolveCacheElem, token) ] + offset = offsetof(ResolveCacheElem, token) & 0xffffffff; + _ASSERTE(offset >=0 && offset%8 == 0); + _stub._resolveEntryPoint[n++] = 0xF940012D | offset<<7; + + //cmp x15, x13 + _stub._resolveEntryPoint[n++] = 0xEB0D01FF; + + //;; bne nextEntry + //place holder for the above instruction + br_nextEntry[1]=n++; + + //ldr x12, [x9, #offsetof(ResolveCacheElem, target) ] + offset = offsetof(ResolveCacheElem, target) & 0xffffffff; + _ASSERTE(offset >=0 && offset%8 == 0); + _stub._resolveEntryPoint[n++] = 0xF940012C | offset<<7; + + // ;; Branch to e.target + // br x12 + _stub._resolveEntryPoint[n++] = 0xD61F0180; + + //;;nextEntry: + //back patching the call sites as now we know the offset to nextEntry + //bne #offset + for(auto i: br_nextEntry) + { + _stub._resolveEntryPoint[i] = 0x54000001 | ((((n-i)*sizeof(DWORD))<<3) & 0x3FFFFFF); + } + + _ASSERTE(n == ResolveStub::resolveEntryPointLen); + _ASSERTE(_stub._resolveEntryPoint + n == _stub._slowEntryPoint); + + // ResolveStub._slowEntryPoint(x0:MethodToken, [x1..x7 and x8], x11:IndirectionCellAndFlags) + // { + // x12 = this._token; + // this._resolveWorkerTarget(x0, [x1..x7 and x8], x9, x11, x12); + // } + +#undef PC_REL_OFFSET +#define PC_REL_OFFSET(_field) (DWORD)((offsetof(ResolveStub, _field) - (offsetof(ResolveStub, _slowEntryPoint) + sizeof(*ResolveStub::_slowEntryPoint) * n)) & 0xffffffff ) + n = 0; + // ;;slowEntryPoint: + // ;;fall through to the slow case + + //;;adr x10, #Dataregionbase + _stub._slowEntryPoint[n] = 0x1000000A | ARM64EncodeHelpers::ADR_PATCH(PC_REL_OFFSET(Dataregionbase)); + n++; + + //ldr x12, [x10 , DATA_OFFSET(_token)] + offset=DATA_OFFSET(_token); + _ASSERTE(offset >=0 && offset%8 == 0); + _stub._slowEntryPoint[n++] = 0xF940014C | (offset<<7); + + // + //ldr x13, [x10 , DATA_OFFSET(_resolveWorkerTarget)] + offset=DATA_OFFSET(_resolveWorkerTarget); + _ASSERTE(offset >=0 && offset%8 == 0); + _stub._slowEntryPoint[n++] = 0xF940014d | (offset<<7); + + // br x13 + _stub._slowEntryPoint[n++] = 0xD61F01A0; + + _ASSERTE(n == ResolveStub::slowEntryPointLen); + // ResolveStub._failEntryPoint(x0:MethodToken, x1,.., x7 and x8, x11:IndirectionCellAndFlags) + // { + // if(--*(this._pCounter) < 0) x11 = x11 | SDF_ResolveBackPatch; + // this._resolveEntryPoint(x0, [x1..x7 and x8]); + // } + +#undef PC_REL_OFFSET //NOTE Offset can be negative +#define PC_REL_OFFSET(_field) (DWORD)((offsetof(ResolveStub, _field) - (offsetof(ResolveStub, _failEntryPoint) + sizeof(*ResolveStub::_failEntryPoint) * n)) & 0xffffffff) + n = 0; + + //;;failEntryPoint + //;;adr x10, #Dataregionbase + _stub._failEntryPoint[n] = 0x1000000A | ARM64EncodeHelpers::ADR_PATCH(PC_REL_OFFSET(Dataregionbase)); + n++; + + // + //ldr x13, [x10] + offset=DATA_OFFSET(_pCounter); + _ASSERTE(offset >=0 && offset%8 == 0); + _stub._failEntryPoint[n++] = 0xF940014D | offset<<7; + + //ldr w9, [x13] + 
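        // Illustrative aside, not part of the patch: in C-like terms the fast path emitted above is
        //
        //     size_t i = ((mt + (mt >> 12)) ^ _hashedToken) & (CALL_STUB_CACHE_MASK * sizeof(void*));
        //     ResolveCacheElem* e = *(ResolveCacheElem**)(_cacheAddress + i);
        //     if (e->pMT == mt && e->token == _token)
        //         jump to e->target;                         // cache hit
        //     // otherwise fall through to _slowEntryPoint, which loads _token into x12
        //     // and tail-calls _resolveWorkerTarget
        //
        // where _hashedToken is stored pre-shifted by LOG2_PTRSIZE, so the masked result is
        // already scaled to pointer-sized cache slots.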
_stub._failEntryPoint[n++] = 0xB94001A9; + //subs w9,w9,#1 + _stub._failEntryPoint[n++] = 0x71000529; + //str w9, [x13] + _stub._failEntryPoint[n++] = 0xB90001A9; + + //;;bge resolveEntryPoint + offset = PC_REL_OFFSET(_resolveEntryPoint); + _stub._failEntryPoint[n++] = 0x5400000A | ((offset <<3)& 0x00FFFFF0) ; + + // ;; orr x11, x11, SDF_ResolveBackPatch + // orr x11, x11, #1 + _ASSERTE(SDF_ResolveBackPatch == 0x1); + _stub._failEntryPoint[n++] = 0xB240016B; + + //;;b resolveEntryPoint: + offset = PC_REL_OFFSET(_resolveEntryPoint); + _stub._failEntryPoint[n++] = 0x14000000 | ((offset>>2) & 0x3FFFFFF); + + _ASSERTE(n == ResolveStub::failEntryPointLen); + _stub._pCounter = counterAddr; + _stub._hashedToken = hashedToken << LOG2_PTRSIZE; + _stub._cacheAddress = (size_t) cacheAddr; + _stub._token = dispatchToken; + _stub._resolveWorkerTarget = resolveWorkerTarget; + + _ASSERTE(resolveWorkerTarget == (PCODE)ResolveWorkerChainLookupAsmStub); + _ASSERTE(patcherTarget == NULL); + +#undef DATA_OFFSET +#undef PC_REL_OFFSET +#undef Dataregionbase + } + + ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static ResolveHolder* FromFailEntry(PCODE failEntry); + static ResolveHolder* FromResolveEntry(PCODE resolveEntry); +private: + ResolveStub _stub; +}; + + /*VTableCallStub************************************************************************************** These are jump stubs that perform a vtable-base virtual call. These stubs assume that an object is placed in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the @@ -74,7 +469,6 @@ struct VTableCallHolder VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } - size_t size() { return stub()->size(); } static size_t GetHolderSize(unsigned slot) { STATIC_CONTRACT_WRAPPER; @@ -96,6 +490,19 @@ struct VTableCallHolder #ifdef DECLARE_DATA #ifndef DACCESS_COMPILE +ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); + return resolveHolder; +} + +ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); + return resolveHolder; +} void VTableCallHolder::Initialize(unsigned slot) { @@ -192,19 +599,19 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s DWORD firstDword = *((DWORD*) pInstr); - if (firstDword == DISPATCH_STUB_FIRST_DWORD) + if (firstDword == DISPATCH_STUB_FIRST_DWORD) // assembly of first instruction of DispatchStub : ldr x13, [x0] { stubKind = SK_DISPATCH; } - else if (firstDword == RESOLVE_STUB_FIRST_DWORD) + else if (firstDword == RESOLVE_STUB_FIRST_DWORD) // assembly of first instruction of ResolveStub : ldr x12, [x0,#Object.m_pMethTab ] { stubKind = SK_RESOLVE; } - else if (firstDword == VTABLECALL_STUB_FIRST_DWORD) + else if (firstDword == VTABLECALL_STUB_FIRST_DWORD) // assembly of first instruction of VTableCallStub : ldr x9, [x0] { stubKind = SK_VTABLECALL; } - else if ((firstDword & LOOKUP_STUB_FIRST_DWORD_MASK) == LOOKUP_STUB_FIRST_DWORD) // The instruction depends on page size, so we mask out the dependent part + else if (firstDword == 0x10000089) // assembly of first instruction of LookupStub : adr x9, _resolveWorkerTarget { stubKind = 
SK_LOOKUP; } diff --git a/src/coreclr/vm/i386/asmconstants.h b/src/coreclr/vm/i386/asmconstants.h index fc01b0cf139a7..fa9f2b79657a7 100644 --- a/src/coreclr/vm/i386/asmconstants.h +++ b/src/coreclr/vm/i386/asmconstants.h @@ -351,42 +351,6 @@ ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForMethod == offsetof(CallCoun #define CallCountingStubData__TargetForThresholdReached 0x08 ASMCONSTANTS_C_ASSERT(CallCountingStubData__TargetForThresholdReached == offsetof(CallCountingStubData, TargetForThresholdReached)) -#define LookupStubData__DispatchToken 0x00 -ASMCONSTANTS_C_ASSERT(LookupStubData__DispatchToken == offsetof(LookupStubData, DispatchToken)) - -#define LookupStubData__ResolveWorkerTarget 0x04 -ASMCONSTANTS_C_ASSERT(LookupStubData__ResolveWorkerTarget == offsetof(LookupStubData, ResolveWorkerTarget)) - -#define DispatchStubData__ExpectedMT 0x00 -ASMCONSTANTS_C_ASSERT(DispatchStubData__ExpectedMT == offsetof(DispatchStubData, ExpectedMT)) - -#define DispatchStubData__ImplTarget 0x04 -ASMCONSTANTS_C_ASSERT(DispatchStubData__ImplTarget == offsetof(DispatchStubData, ImplTarget)) - -#define DispatchStubData__FailTarget 0x08 -ASMCONSTANTS_C_ASSERT(DispatchStubData__FailTarget == offsetof(DispatchStubData, FailTarget)) - -#define ResolveStubData__HashedToken 0x04 -ASMCONSTANTS_C_ASSERT(ResolveStubData__HashedToken == offsetof(ResolveStubData, HashedToken)) - -#define ResolveStubData__CacheAddress 0x00 -ASMCONSTANTS_C_ASSERT(ResolveStubData__CacheAddress == offsetof(ResolveStubData, CacheAddress)) - -#define ResolveStubData__Token 0x0c -ASMCONSTANTS_C_ASSERT(ResolveStubData__Token == offsetof(ResolveStubData, Token)) - -#define ResolveStubData__Counter 0x08 -ASMCONSTANTS_C_ASSERT(ResolveStubData__Counter == offsetof(ResolveStubData, Counter)) - -#define ResolveStubData__ResolveWorkerTarget 0x10 -ASMCONSTANTS_C_ASSERT(ResolveStubData__ResolveWorkerTarget == offsetof(ResolveStubData, ResolveWorkerTarget)) - -#define ResolveStubData__PatcherTarget 0x14 -ASMCONSTANTS_C_ASSERT(ResolveStubData__PatcherTarget == offsetof(ResolveStubData, PatcherTarget)) - -#define CALL_STUB_CACHE_MASK_ASM 0xfff -ASMCONSTANTS_C_ASSERT(CALL_STUB_CACHE_MASK_ASM == CALL_STUB_CACHE_MASK) - #undef ASMCONSTANTS_C_ASSERT #undef ASMCONSTANTS_RUNTIME_ASSERT diff --git a/src/coreclr/vm/i386/excepx86.cpp b/src/coreclr/vm/i386/excepx86.cpp index 418bf7090a775..15dd0667dd6c4 100644 --- a/src/coreclr/vm/i386/excepx86.cpp +++ b/src/coreclr/vm/i386/excepx86.cpp @@ -3455,13 +3455,11 @@ AdjustContextForVirtualStub( if (sk == VirtualCallStubManager::SK_DISPATCH) { - if (*PTR_WORD(f_IP) != X86_INSTR_CMP_IND_ECX_EAX) + if (*PTR_WORD(f_IP) != X86_INSTR_CMP_IND_ECX_IMM32) { _ASSERTE(!"AV in DispatchStub at unknown instruction"); return FALSE; } - - SetSP(pContext, dac_cast(dac_cast(GetSP(pContext)) + sizeof(void*))); // rollback push eax } else if (sk == VirtualCallStubManager::SK_RESOLVE) @@ -3504,18 +3502,17 @@ AdjustContextForVirtualStub( { ENABLE_FORBID_GC_LOADER_USE_IN_THIS_SCOPE(); - PCODE dispatchEntry = f_IP - DispatchStub::offsetOfThisDeref(); - DispatchStub *pStub = DispatchStub::FromDispatchEntry(dispatchEntry); - MethodTable *pMT = (MethodTable*)pStub->expectedMT(); - DispatchToken token(VirtualCallStubManager::GetTokenFromStubQuick(pMgr, dispatchEntry, sk)); + DispatchHolder *holder = DispatchHolder::FromDispatchEntry(f_IP); + MethodTable *pMT = (MethodTable*)holder->stub()->expectedMT(); + DispatchToken token(VirtualCallStubManager::GetTokenFromStubQuick(pMgr, f_IP, sk)); MethodDesc* pMD = 
VirtualCallStubManager::GetRepresentativeMethodDescFromToken(token, pMT); stackArgumentsSize = pMD->SizeOfArgStack(); } else { // Compute the stub entry address from the address of failure (location of dereferencing of "this" pointer) - ResolveStub *pResolveStub = ResolveStub::FromResolveEntry(f_IP - ResolveStub::offsetOfThisDeref()); - stackArgumentsSize = pResolveStub->stackArgumentsSize(); + ResolveHolder *holder = ResolveHolder::FromResolveEntry(f_IP - ResolveStub::offsetOfThisDeref()); + stackArgumentsSize = holder->stub()->stackArgumentsSize(); } sp += stackArgumentsSize; diff --git a/src/coreclr/vm/i386/stublinkerx86.h b/src/coreclr/vm/i386/stublinkerx86.h index b1322f3d0dbdc..c41441314d982 100644 --- a/src/coreclr/vm/i386/stublinkerx86.h +++ b/src/coreclr/vm/i386/stublinkerx86.h @@ -21,7 +21,7 @@ extern PCODE GetPreStubEntryPoint(); #define X86_INSTR_JMP_EAX 0xE0FF // jmp eax #define X86_INSTR_MOV_EAX_IMM32 0xB8 // mov eax, imm32 #define X86_INSTR_MOV_EAX_ECX_IND 0x018b // mov eax, [ecx] -#define X86_INSTR_CMP_IND_ECX_EAX 0x0139 // cmp [ecx], eax +#define X86_INSTR_CMP_IND_ECX_IMM32 0x3981 // cmp [ecx], imm32 #define X86_INSTR_MOV_AL 0xB0 // mov al, imm8 #define X86_INSTR_JMP_REL8 0xEB // jmp short rel8 diff --git a/src/coreclr/vm/i386/thunktemplates.S b/src/coreclr/vm/i386/thunktemplates.S index 5ca0d1767fc31..eedd6ac1dbe2e 100644 --- a/src/coreclr/vm/i386/thunktemplates.S +++ b/src/coreclr/vm/i386/thunktemplates.S @@ -55,69 +55,3 @@ LOCAL_LABEL(CountReachedZero): INDJMP DATA_SLOT(CallCountingStub, TargetForThresholdReached) SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForThresholdReached LEAF_END_MARKED CallCountingStubCode - -LEAF_ENTRY LookupStubCode - push eax - push dword ptr [DATA_SLOT(LookupStub, DispatchToken)] -SLOT_ADDRESS_PATCH_LABEL LookupStub, DispatchToken - INDJMP DATA_SLOT(LookupStub, ResolveWorkerTarget) -SLOT_ADDRESS_PATCH_LABEL LookupStub, ResolveWorkerTarget -LEAF_END_MARKED LookupStubCode - -LEAF_ENTRY DispatchStubCode - push eax - mov eax, dword ptr [DATA_SLOT(DispatchStub, ExpectedMT)] -SLOT_ADDRESS_PATCH_LABEL DispatchStub, ExpectedMT -PATCH_LABEL _DispatchStubCode_ThisDeref - cmp dword ptr [ecx],eax - pop eax - jne NoMatch - INDJMP DATA_SLOT(DispatchStub, ImplTarget) -SLOT_ADDRESS_PATCH_LABEL DispatchStub, ImplTarget -NoMatch: - INDJMP DATA_SLOT(DispatchStub, FailTarget) -SLOT_ADDRESS_PATCH_LABEL DispatchStub, FailTarget -LEAF_END_MARKED DispatchStubCode - -LEAF_ENTRY ResolveStubCode -PATCH_LABEL ResolveStubCode_FailEntry - sub dword ptr [DATA_SLOT(ResolveStub, Counter)], 1 -SLOT_ADDRESS_PATCH_LABEL ResolveStub, Counter, -5 - jl LOCAL_LABEL(Backpatcher) -PATCH_LABEL ResolveStubCode_ResolveEntry -LOCAL_LABEL(ResolveEntry): - push eax -PATCH_LABEL ResolveStubCode_ThisDeref - mov eax,dword ptr [ecx] - push edx - mov edx,eax - shr eax, 12 - add eax,edx - xor eax,dword ptr [DATA_SLOT(ResolveStub, HashedToken)] -SLOT_ADDRESS_PATCH_LABEL ResolveStub, HashedToken - and eax,CALL_STUB_CACHE_MASK_ASM * 4 - add eax,dword ptr [DATA_SLOT(ResolveStub, CacheAddress)] -SLOT_ADDRESS_PATCH_LABEL ResolveStub, CacheAddress - mov eax,dword ptr [eax] - cmp edx,dword ptr [eax] - jne LOCAL_LABEL(Miss) - mov edx,dword ptr [DATA_SLOT(ResolveStub, Token)] -SLOT_ADDRESS_PATCH_LABEL ResolveStub, Token,, 1 - cmp edx,dword ptr [eax + 4] - jne LOCAL_LABEL(Miss) - mov eax,dword ptr [eax + 8] - pop edx - add esp, 4 - jmp eax -LOCAL_LABEL(Miss): - pop edx - push dword ptr [DATA_SLOT(ResolveStub, Token)] -SLOT_ADDRESS_PATCH_LABEL ResolveStub, Token,, 2 - INDJMP DATA_SLOT(ResolveStub, 
ResolveWorkerTarget) // <<< resolveWorker == ResolveWorkerChainLookupAsmStub or ResolveWorkerAsmStub -SLOT_ADDRESS_PATCH_LABEL ResolveStub, ResolveWorkerTarget -LOCAL_LABEL(Backpatcher): - INDCALL DATA_SLOT(ResolveStub, PatcherTarget) // <<< backpatcherWorker == BackPatchWorkerAsmStub -SLOT_ADDRESS_PATCH_LABEL ResolveStub, PatcherTarget - jmp LOCAL_LABEL(ResolveEntry) -LEAF_END_MARKED ResolveStubCode - diff --git a/src/coreclr/vm/i386/thunktemplates.asm b/src/coreclr/vm/i386/thunktemplates.asm index e133865abc59c..dfb1a4285c022 100644 --- a/src/coreclr/vm/i386/thunktemplates.asm +++ b/src/coreclr/vm/i386/thunktemplates.asm @@ -57,70 +57,4 @@ CountReachedZero: SLOT_ADDRESS_PATCH_LABEL CallCountingStub, TargetForThresholdReached LEAF_END_MARKED _CallCountingStubCode@0 -LEAF_ENTRY _LookupStubCode@0 - push eax - push dword ptr DATA_SLOT(LookupStub, DispatchToken) -SLOT_ADDRESS_PATCH_LABEL LookupStub, DispatchToken - jmp dword ptr DATA_SLOT(LookupStub, ResolveWorkerTarget) -SLOT_ADDRESS_PATCH_LABEL LookupStub, ResolveWorkerTarget -LEAF_END_MARKED _LookupStubCode@0 - -LEAF_ENTRY _DispatchStubCode@0 - push eax - mov eax, dword ptr DATA_SLOT(DispatchStub, ExpectedMT) -SLOT_ADDRESS_PATCH_LABEL DispatchStub, ExpectedMT -PATCH_LABEL _DispatchStubCode_ThisDeref@0 - cmp dword ptr [ecx],eax - pop eax - jne NoMatch - jmp dword ptr DATA_SLOT(DispatchStub, ImplTarget) -SLOT_ADDRESS_PATCH_LABEL DispatchStub, ImplTarget -NoMatch: - jmp dword ptr DATA_SLOT(DispatchStub, FailTarget) -SLOT_ADDRESS_PATCH_LABEL DispatchStub, FailTarget -LEAF_END_MARKED _DispatchStubCode@0 - -LEAF_ENTRY _ResolveStubCode@0 -_ResolveStubCode_FailEntry@0: -PUBLIC _ResolveStubCode_FailEntry@0 - sub dword ptr DATA_SLOT(ResolveStub, Counter), 1 -SLOT_ADDRESS_PATCH_LABEL ResolveStub, Counter, -5 - jl Backpatcher -PATCH_LABEL _ResolveStubCode_ResolveEntry@0 - push eax -PATCH_LABEL _ResolveStubCode_ThisDeref@0 - mov eax,dword ptr [ecx] - push edx - mov edx,eax - shr eax, 12 - add eax,edx - xor eax,dword ptr DATA_SLOT(ResolveStub, HashedToken) -SLOT_ADDRESS_PATCH_LABEL ResolveStub, HashedToken - and eax,CALL_STUB_CACHE_MASK_ASM * 4 - add eax,dword ptr DATA_SLOT(ResolveStub, CacheAddress) -SLOT_ADDRESS_PATCH_LABEL ResolveStub, CacheAddress - mov eax,dword ptr [eax] - cmp edx,dword ptr [eax] - jne Miss - mov edx,dword ptr DATA_SLOT(ResolveStub, Token) -SLOT_ADDRESS_PATCH_LABEL ResolveStub, Token,, 1 - cmp edx,dword ptr [eax + 4] - jne Miss - mov eax,dword ptr [eax + 8] - pop edx - add esp, 4 - jmp eax -Miss: - pop edx -Slow: - push dword ptr DATA_SLOT(ResolveStub, Token) -SLOT_ADDRESS_PATCH_LABEL ResolveStub, Token,, 2 - jmp dword ptr DATA_SLOT(ResolveStub, ResolveWorkerTarget); <<< resolveWorker == ResolveWorkerChainLookupAsmStub or ResolveWorkerAsmStub -SLOT_ADDRESS_PATCH_LABEL ResolveStub, ResolveWorkerTarget -Backpatcher: - call dword ptr DATA_SLOT(ResolveStub, PatcherTarget); <<< backpatcherWorker == BackPatchWorkerAsmStub -SLOT_ADDRESS_PATCH_LABEL ResolveStub, PatcherTarget - jmp _ResolveStubCode_ResolveEntry@0 -LEAF_END_MARKED _ResolveStubCode@0 - end \ No newline at end of file diff --git a/src/coreclr/vm/i386/virtualcallstubcpu.hpp b/src/coreclr/vm/i386/virtualcallstubcpu.hpp index 9f648f6788524..38a5a9baafe4b 100644 --- a/src/coreclr/vm/i386/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/i386/virtualcallstubcpu.hpp @@ -13,17 +13,370 @@ #ifndef _VIRTUAL_CALL_STUB_X86_H #define _VIRTUAL_CALL_STUB_X86_H -#define DISPATCH_STUB_FIRST_WORD 0xa150 -#define RESOLVE_STUB_FIRST_WORD 0x2d83 -#define LOOKUP_STUB_FIRST_WORD 0xff50 -#define 
VTABLECALL_STUB_FIRST_WORD 0x018b - #ifdef DECLARE_DATA #include "asmconstants.h" #endif #include // Since we are placing code, we want byte packing of the structs +#define USES_LOOKUP_STUBS 1 + +/********************************************************************************************* +Stubs that contain code are all part of larger structs called Holders. There is a +Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are +essentially an implementation trick that allowed rearranging the code sequences more +easily while trying out different alternatives, and for dealing with any alignment +issues in a way that was mostly immune to the actually code sequences. These Holders +should be revisited when the stub code sequences are fixed, since in many cases they +add extra space to a stub that is not really needed. + +Stubs are placed in cache and hash tables. Since unaligned access of data in memory +is very slow, the keys used in those tables should be aligned. The things used as keys +typically also occur in the generated code, e.g. a token as an immediate part of an instruction. +For now, to avoid alignment computations as different code strategies are tried out, the key +fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction +streams aligned so that the immediate fields fall on aligned boundaries. +*/ + +#if USES_LOOKUP_STUBS + +struct LookupStub; +struct LookupHolder; + +/*LookupStub************************************************************************************** +Virtual and interface call sites are initially setup to point at LookupStubs. +This is because the runtime type of the pointer is not yet known, +so the target cannot be resolved. Note: if the jit is able to determine the runtime type +of the pointer, it should be generating a direct call not a virtual or interface call. +This stub pushes a lookup token onto the stack to identify the sought after method, and then +jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and +transfer of control to the appropriate target method implementation, perhaps patching of the call site +along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs +get quickly changed to point to another kind of stub. +*/ +struct LookupStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } + +private: + friend struct LookupHolder; + + // DispatchStub:: _entryPoint expects: + // ecx: object (the "this" pointer) + // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call + BYTE _entryPoint [2]; // 50 push eax ;save siteAddrForRegisterIndirect - this may be an indirect call + // 68 push + size_t _token; // xx xx xx xx 32-bit constant +#ifdef STUB_LOGGING + BYTE cntr2[2]; // ff 05 inc + size_t* c_lookup; // xx xx xx xx [call_lookup_counter] +#endif //STUB_LOGGING + BYTE part2 [1]; // e9 jmp + DISPL _resolveWorkerDispl;// xx xx xx xx pc-rel displ +}; + +/* LookupHolders are the containers for LookupStubs, they provide for any alignment of +stubs as necessary. In the case of LookupStubs, alignment is necessary since +LookupStubs are placed in a hash table keyed by token. 
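As a worked example (illustrative, assuming the 32-bit byte-packed layout below with
STUB_LOGGING off): sizeof(LookupStub) is 2 + 4 + 1 + 4 = 11 bytes and
offsetof(LookupStub, _token) is 2, so the holder's leading align[] array is
(4 - 2 % 4) % 4 = 2 bytes and its trailing pad[] array is 4 - ((2 + 11) % 4) = 3 bytes;
_token then lands at offset 4 inside the holder and sizeof(LookupHolder) comes out to
16 bytes, consistent with the static_asserts in LookupHolder::InitializeStatic.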
*/ +struct LookupHolder +{ + static void InitializeStatic(); + + void Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken); + + LookupStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static LookupHolder* FromLookupEntry(PCODE lookupEntry); + +private: + friend struct LookupStub; + + BYTE align[(sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*)))%sizeof(void*)]; + LookupStub _stub; + BYTE pad[sizeof(void*) - + ((sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*))) + + (sizeof(LookupStub)) + ) % sizeof(void*)]; //complete DWORD + + static_assert_no_msg((sizeof(void*) - + ((sizeof(void*)-(offsetof(LookupStub,_token)%sizeof(void*))) + + (sizeof(LookupStub)) + ) % sizeof(void*)) != 0); +}; + +#endif // USES_LOOKUP_STUBS + +struct DispatchStub; +struct DispatchHolder; + +/*DispatchStub************************************************************************************** +Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. +A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). +If the calling frame does in fact have the type be of the expected type, then +control is transfered to the target address, the method implementation. If not, +then control is transfered to the fail address, a fail stub (see below) where a polymorphic +lookup is done to find the correct address to go to. + +implementation note: Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. +Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched +to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important +that the branch prediction staticly predict this, which means it must be a forward jump. The alternative +is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" +is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier +to control the placement of the stubs than control the placement of the jitted code and the stubs. */ +struct DispatchStub +{ + inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_entryPoint[0]; } + + inline size_t expectedMT() { LIMITED_METHOD_CONTRACT; return _expectedMT; } + inline PCODE implTarget() { LIMITED_METHOD_CONTRACT; return (PCODE) &_implDispl + sizeof(DISPL) + _implDispl; } + + inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const + { + LIMITED_METHOD_CONTRACT; + _ASSERTE(slotTypeRef != nullptr); + + *slotTypeRef = EntryPointSlots::SlotType_ExecutableRel32; + return (TADDR)&_implDispl; + } + + inline PCODE failTarget() { LIMITED_METHOD_CONTRACT; return (PCODE) &_failDispl + sizeof(DISPL) + _failDispl; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(DispatchStub); } + +private: + friend struct DispatchHolder; + + // DispatchStub:: _entryPoint expects: + // ecx: object (the "this" pointer) + // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call +#ifndef STUB_LOGGING + BYTE _entryPoint [2]; // 81 39 cmp [ecx], ; This is the place where we are going to fault on null this. 
+ size_t _expectedMT; // xx xx xx xx expectedMT ; If you change it, change also AdjustContextForVirtualStub in excep.cpp!!! + BYTE jmpOp1[2]; // 0f 85 jne + DISPL _failDispl; // xx xx xx xx failEntry ;must be forward jmp for perf reasons + BYTE jmpOp2; // e9 jmp + DISPL _implDispl; // xx xx xx xx implTarget +#else //STUB_LOGGING + BYTE _entryPoint [2]; // ff 05 inc + size_t* d_call; // xx xx xx xx [call_mono_counter] + BYTE cmpOp [2]; // 81 39 cmp [ecx], + size_t _expectedMT; // xx xx xx xx expectedMT + BYTE jmpOp1[2]; // 0f 84 je + DISPL _implDispl; // xx xx xx xx implTarget ;during logging, perf is not so important + BYTE fail [2]; // ff 05 inc + size_t* d_miss; // xx xx xx xx [miss_mono_counter] + BYTE jmpFail; // e9 jmp + DISPL _failDispl; // xx xx xx xx failEntry +#endif //STUB_LOGGING +}; + +/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of +stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both +are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, +since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently +o(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify +alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. +While the token field can be logically gotten by following the failure target to the failEntryPoint +of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. +This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct +for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when +they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). +*/ + +/* @workaround for ee resolution - Since the EE does not currently have a resolver function that +does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are +using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable +is in fact written. Hence we have moved target out into the holder and aligned it so we can +atomically update it. When we get a resolver function that does what we want, we can drop this field, +and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ +struct DispatchHolder +{ + static void InitializeStatic(); + + void Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT); + + DispatchStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static DispatchHolder* FromDispatchEntry(PCODE dispatchEntry); + +private: + // Force _implDispl to be aligned so that it is backpatchable for tiering + BYTE align[(sizeof(void*) - (offsetof(DispatchStub, _implDispl) % sizeof(void*))) % sizeof(void*)]; + DispatchStub _stub; + BYTE pad[(sizeof(void*) - (sizeof(DispatchStub) % sizeof(void*)) + offsetof(DispatchStub, _implDispl)) % sizeof(void*)]; //complete DWORD +}; + +struct ResolveStub; +struct ResolveHolder; + +/*ResolveStub************************************************************************************** +Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. 
There is only +one resolver stub built for any given token, even though there may be many call sites that +use that token and many distinct types that are used in the calling call frames. A resolver stub +actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their +expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should +be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, +even though they are actually allocated as a single contiguous block of memory. These pieces are: + +A ResolveStub has two entry points: + +FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does +a check to see how often we are actually failing. If failures are frequent, control transfers to the +patch piece to cause the call site to be changed from a mostly monomorphic callsite +(calls dispatch stub) to a polymorphic call site (calls resolve stub). If failures are rare, control +transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter +every time it is entered. The ee at various times will add a large chunk to the counter. + +ResolveEntry - does a lookup in a cache by hashing the actual type of the calling frame + and the token identifying the (contract,method) pair desired. If found, control is transferred +to the method implementation. If not found in the cache, the token is pushed and the ee is entered via +the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since +there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. +The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, +as well as its speed. It turns out it is very important to make the hash function sensitive to all +of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before +making any changes to the code sequences here, it is very important to measure and tune them as perf +can vary greatly, in unexpected ways, with seemingly minor changes. + +Implementation note - Order, choice of instructions, and branch directions +should be carefully tuned since it can have an inordinate effect on performance. Particular +attention needs to be paid to the effects on the BTB and branch prediction, both in the small +and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. +Note that this stub is called in highly polymorphic cases, but the cache should have been sized +and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should +mostly be going down the cache hit route, and it is important that this be statically predicted as such. +Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically +gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries +is important.
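In outline (an illustrative sketch only; register saves and the exact byte sequences are in
the struct and InitializeStatic below):

    failEntryPoint:    if (--*_pCounter < 0) goto backpatch;           ; else fall through
    resolveEntryPoint: t = object->m_pMethTab;
                       i = (((t >> CALL_STUB_CACHE_NUM_BITS) + t) ^ _hashedToken)
                             & (CALL_STUB_CACHE_MASK << LOG2_PTRSIZE);
                       e = *(ResolveCacheElem**)(i + _cacheAddress);
                       if (e->pMT == t && e->token == _token) jump to e->target;   ; cache hit
                       ; cache miss falls through
    slowEntryPoint:    push _token; jump to the resolve worker (via _resolveWorkerDispl);
    backpatch:         call the backpatch worker (via _backpatcherDispl); goto resolveEntryPoint;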
*/ + +struct ResolveStub +{ + inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_failEntryPoint[0]; } + inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_resolveEntryPoint; } + inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return (PCODE)&_slowEntryPoint[0]; } + + inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return _pCounter; } + inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return _hashedToken >> LOG2_PTRSIZE; } + inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return _cacheAddress; } + inline size_t token() { LIMITED_METHOD_CONTRACT; return _token; } + inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } +#ifndef UNIX_X86_ABI + inline static size_t offsetOfThisDeref(){ LIMITED_METHOD_CONTRACT; return offsetof(ResolveStub, part1) - offsetof(ResolveStub, _resolveEntryPoint); } + inline size_t stackArgumentsSize() { LIMITED_METHOD_CONTRACT; return _stackArgumentsSize; } +#endif + +private: + friend struct ResolveHolder; + + // ResolveStub::_failEntryPoint expects: + // ecx: object (the "this" pointer) + // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call + BYTE _failEntryPoint [2]; // 83 2d sub + INT32* _pCounter; // xx xx xx xx [counter], + BYTE part0 [2]; // 01 01 + // 7c jl + BYTE toPatcher; // xx backpatcher ;must be forward jump, for perf reasons + // ;fall into the resolver stub + + // ResolveStub::_resolveEntryPoint expects: + // ecx: object (the "this" pointer) + // eax: siteAddrForRegisterIndirect if this is a RegisterIndirect dispatch call + BYTE _resolveEntryPoint; // 50 push eax ;save siteAddrForRegisterIndirect - this may be an indirect call + BYTE part1 [11]; // 8b 01 mov eax,[ecx] ;get the method table from the "this" pointer. This is the place + // ; where we are going to fault on null this. If you change it, + // ; change also AdjustContextForVirtualStub in excep.cpp!!! 
+ // 52 push edx + // 8b d0 mov edx, eax + // c1 e8 0C shr eax,12 ;we are adding upper bits into lower bits of mt + // 03 c2 add eax,edx + // 35 xor eax, + UINT32 _hashedToken; // xx xx xx xx hashedToken ;along with pre-hashed token + BYTE part2 [1]; // 25 and eax, + size_t mask; // xx xx xx xx cache_mask + BYTE part3 [2]; // 8b 80 mov eax, [eax+ + size_t _cacheAddress; // xx xx xx xx lookupCache] +#ifdef STUB_LOGGING + BYTE cntr1[2]; // ff 05 inc + size_t* c_call; // xx xx xx xx [call_cache_counter] +#endif //STUB_LOGGING + BYTE part4 [2]; // 3b 10 cmp edx,[eax+ + // BYTE mtOffset; // ResolverCacheElem.pMT] + BYTE part5 [1]; // 75 jne + BYTE toMiss1; // xx miss ;must be forward jump, for perf reasons + BYTE part6 [2]; // 81 78 cmp [eax+ + BYTE tokenOffset; // xx ResolverCacheElem.token], + size_t _token; // xx xx xx xx token + BYTE part7 [1]; // 75 jne + BYTE toMiss2; // xx miss ;must be forward jump, for perf reasons + BYTE part8 [2]; // 8B 40 xx mov eax,[eax+ + BYTE targetOffset; // ResolverCacheElem.target] + BYTE part9 [6]; // 5a pop edx + // 83 c4 04 add esp,4 ;throw away siteAddrForRegisterIndirect - we don't need it now + // ff e0 jmp eax + // miss: + BYTE miss [1]; // 5a pop edx ; don't pop siteAddrForRegisterIndirect - leave it on the stack for use by ResolveWorkerChainLookupAsmStub and/or ResolveWorkerAsmStub + BYTE _slowEntryPoint[1]; // 68 push + size_t _tokenPush; // xx xx xx xx token +#ifdef STUB_LOGGING + BYTE cntr2[2]; // ff 05 inc + size_t* c_miss; // xx xx xx xx [miss_cache_counter] +#endif //STUB_LOGGING + BYTE part10 [1]; // e9 jmp + DISPL _resolveWorkerDispl; // xx xx xx xx resolveWorker == ResolveWorkerChainLookupAsmStub or ResolveWorkerAsmStub + BYTE patch[1]; // e8 call + DISPL _backpatcherDispl; // xx xx xx xx backpatcherWorker == BackPatchWorkerAsmStub + BYTE part11 [1]; // eb jmp + BYTE toResolveStub; // xx resolveStub, i.e. go back to _resolveEntryPoint +#ifndef UNIX_X86_ABI + size_t _stackArgumentsSize; // xx xx xx xx +#endif +}; + +/* ResolveHolders are the containers for ResolveStubs, They provide +for any alignment of the stubs as necessary. The stubs are placed in a hash table keyed by +the token for which they are built. Efficiency of access requires that this token be aligned. +For now, we have copied that field into the ResolveHolder itself, if the resolve stub is arranged such that +any of its inlined tokens (non-prehashed) is aligned, then the token field in the ResolveHolder +is not needed. */ +struct ResolveHolder +{ + static void InitializeStatic(); + + void Initialize(ResolveHolder* pResolveHolderRX, + PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 * counterAddr +#ifndef UNIX_X86_ABI + , size_t stackArgumentsSize +#endif + ); + + ResolveStub* stub() { LIMITED_METHOD_CONTRACT; return &_stub; } + + static ResolveHolder* FromFailEntry(PCODE failEntry); + static ResolveHolder* FromResolveEntry(PCODE resolveEntry); + +private: + //align _token in resolve stub + + BYTE align[(sizeof(void*)-((offsetof(ResolveStub,_token))%sizeof(void*)))%sizeof(void*) +#ifdef STUB_LOGGING // This turns out to be zero-sized in stub_logging case, and is an error. So round up. + +sizeof(void*) +#endif + ]; + + ResolveStub _stub; + +//#ifdef STUB_LOGGING // This turns out to be zero-sized in non stub_logging case, and is an error. 
So remove + BYTE pad[(sizeof(void*)-((sizeof(ResolveStub))%sizeof(void*))+offsetof(ResolveStub,_token))%sizeof(void*)]; //fill out DWORD +//#endif +}; + /*VTableCallStub************************************************************************************** These are jump stubs that perform a vtable-base virtual call. These stubs assume that an object is placed in the first argument register (this pointer). From there, the stub extracts the MethodTable pointer, followed by the @@ -68,7 +421,6 @@ struct VTableCallHolder VTableCallStub* stub() { LIMITED_METHOD_CONTRACT; return reinterpret_cast(this); } - size_t size() { return stub()->size(); } static size_t GetHolderSize(unsigned slot) { STATIC_CONTRACT_WRAPPER; @@ -356,6 +708,283 @@ PCODE StubCallSite::GetCallerAddress() #endif // UNIX_X86_ABI } +#ifdef STUB_LOGGING +extern size_t g_lookup_inline_counter; +extern size_t g_mono_call_counter; +extern size_t g_mono_miss_counter; +extern size_t g_poly_call_counter; +extern size_t g_poly_miss_counter; +#endif + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. +*/ +LookupStub lookupInit; + +void LookupHolder::InitializeStatic() +{ + static_assert_no_msg(((offsetof(LookupStub, _token)+offsetof(LookupHolder, _stub)) % sizeof(void*)) == 0); + static_assert_no_msg((sizeof(LookupHolder) % sizeof(void*)) == 0); + + lookupInit._entryPoint [0] = 0x50; + lookupInit._entryPoint [1] = 0x68; + static_assert_no_msg(sizeof(lookupInit._entryPoint) == 2); + lookupInit._token = 0xcccccccc; +#ifdef STUB_LOGGING + lookupInit.cntr2 [0] = 0xff; + lookupInit.cntr2 [1] = 0x05; + static_assert_no_msg(sizeof(lookupInit.cntr2) == 2); + lookupInit.c_lookup = &g_call_lookup_counter; +#endif //STUB_LOGGING + lookupInit.part2 [0] = 0xe9; + static_assert_no_msg(sizeof(lookupInit.part2) == 1); + lookupInit._resolveWorkerDispl = 0xcccccccc; +} + +void LookupHolder::Initialize(LookupHolder* pLookupHolderRX, PCODE resolveWorkerTarget, size_t dispatchToken) +{ + _stub = lookupInit; + + //fill in the stub specific fields + //@TODO: Get rid of this duplication of data. + _stub._token = dispatchToken; + _stub._resolveWorkerDispl = resolveWorkerTarget - ((PCODE) &pLookupHolderRX->_stub._resolveWorkerDispl + sizeof(DISPL)); +} + +LookupHolder* LookupHolder::FromLookupEntry(PCODE lookupEntry) +{ + LIMITED_METHOD_CONTRACT; + LookupHolder* lookupHolder = (LookupHolder*) ( lookupEntry - offsetof(LookupHolder, _stub) - offsetof(LookupStub, _entryPoint) ); + // _ASSERTE(lookupHolder->_stub._entryPoint[0] == lookupInit._entryPoint[0]); + return lookupHolder; +} + + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. 
+*/ +DispatchStub dispatchInit; + +void DispatchHolder::InitializeStatic() +{ + // Check that _implDispl is aligned in the DispatchHolder for backpatching + static_assert_no_msg(((offsetof(DispatchHolder, _stub) + offsetof(DispatchStub, _implDispl)) % sizeof(void*)) == 0); + static_assert_no_msg((sizeof(DispatchHolder) % sizeof(void*)) == 0); + +#ifndef STUB_LOGGING + dispatchInit._entryPoint [0] = 0x81; + dispatchInit._entryPoint [1] = 0x39; + static_assert_no_msg(sizeof(dispatchInit._entryPoint) == 2); + + dispatchInit._expectedMT = 0xcccccccc; + dispatchInit.jmpOp1 [0] = 0x0f; + dispatchInit.jmpOp1 [1] = 0x85; + static_assert_no_msg(sizeof(dispatchInit.jmpOp1) == 2); + + dispatchInit._failDispl = 0xcccccccc; + dispatchInit.jmpOp2 = 0xe9; + dispatchInit._implDispl = 0xcccccccc; +#else //STUB_LOGGING + dispatchInit._entryPoint [0] = 0xff; + dispatchInit._entryPoint [1] = 0x05; + static_assert_no_msg(sizeof(dispatchInit._entryPoint) == 2); + + dispatchInit.d_call = &g_mono_call_counter; + dispatchInit.cmpOp [0] = 0x81; + dispatchInit.cmpOp [1] = 0x39; + static_assert_no_msg(sizeof(dispatchInit.cmpOp) == 2); + + dispatchInit._expectedMT = 0xcccccccc; + dispatchInit.jmpOp1 [0] = 0x0f; + dispatchInit.jmpOp1 [1] = 0x84; + static_assert_no_msg(sizeof(dispatchInit.jmpOp1) == 2); + + dispatchInit._implDispl = 0xcccccccc; + dispatchInit.fail [0] = 0xff; + dispatchInit.fail [1] = 0x05; + static_assert_no_msg(sizeof(dispatchInit.fail) == 2); + + dispatchInit.d_miss = &g_mono_miss_counter; + dispatchInit.jmpFail = 0xe9; + dispatchInit._failDispl = 0xcccccccc; +#endif //STUB_LOGGING +}; + +void DispatchHolder::Initialize(DispatchHolder* pDispatchHolderRX, PCODE implTarget, PCODE failTarget, size_t expectedMT) +{ + _stub = dispatchInit; + + //fill in the stub specific fields + _stub._expectedMT = (size_t) expectedMT; + _stub._failDispl = failTarget - ((PCODE) &pDispatchHolderRX->_stub._failDispl + sizeof(DISPL)); + _stub._implDispl = implTarget - ((PCODE) &pDispatchHolderRX->_stub._implDispl + sizeof(DISPL)); +} + +DispatchHolder* DispatchHolder::FromDispatchEntry(PCODE dispatchEntry) +{ + LIMITED_METHOD_CONTRACT; + DispatchHolder* dispatchHolder = (DispatchHolder*) ( dispatchEntry - offsetof(DispatchHolder, _stub) - offsetof(DispatchStub, _entryPoint) ); + // _ASSERTE(dispatchHolder->_stub._entryPoint[0] == dispatchInit._entryPoint[0]); + return dispatchHolder; +} + + +/* Template used to generate the stub. We generate a stub by allocating a block of + memory and copy the template over it and just update the specific fields that need + to be changed. 
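(Illustrative note, not part of the patch: the DISPL fields are rel32 displacements measured
from the end of the instruction, and the Initialize methods above compute them against the
holder's executable address (the pLookupHolderRX / pDispatchHolderRX parameters), even though
the store itself may go through a separate writable mapping; for instance DispatchHolder::Initialize sets

    _failDispl = failTarget - ((PCODE)&pDispatchHolderRX->_stub._failDispl + sizeof(DISPL))

so that &_failDispl + sizeof(DISPL) + _failDispl equals failTarget when the stub executes
from its RX mapping, matching the failTarget() accessor.)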
+*/ + +ResolveStub resolveInit; + +void ResolveHolder::InitializeStatic() +{ + //Check that _token is aligned in ResolveHolder + static_assert_no_msg(((offsetof(ResolveHolder, _stub) + offsetof(ResolveStub, _token)) % sizeof(void*)) == 0); + static_assert_no_msg((sizeof(ResolveHolder) % sizeof(void*)) == 0); + + resolveInit._failEntryPoint [0] = 0x83; + resolveInit._failEntryPoint [1] = 0x2d; + static_assert_no_msg(sizeof(resolveInit._failEntryPoint) == 2); + + resolveInit._pCounter = (INT32 *) (size_t) 0xcccccccc; + resolveInit.part0 [0] = 0x01; + resolveInit.part0 [1] = 0x7c; + static_assert_no_msg(sizeof(resolveInit.part0) == 2); + + resolveInit.toPatcher = (offsetof(ResolveStub, patch) - (offsetof(ResolveStub, toPatcher) + 1)) & 0xFF; + + resolveInit._resolveEntryPoint = 0x50; + resolveInit.part1 [0] = 0x8b; + resolveInit.part1 [1] = 0x01; + resolveInit.part1 [2] = 0x52; + resolveInit.part1 [3] = 0x8b; + resolveInit.part1 [4] = 0xd0; + resolveInit.part1 [5] = 0xc1; + resolveInit.part1 [6] = 0xe8; + resolveInit.part1 [7] = CALL_STUB_CACHE_NUM_BITS; + resolveInit.part1 [8] = 0x03; + resolveInit.part1 [9] = 0xc2; + resolveInit.part1 [10] = 0x35; + static_assert_no_msg(sizeof(resolveInit.part1) == 11); + + resolveInit._hashedToken = 0xcccccccc; + resolveInit.part2 [0] = 0x25; + static_assert_no_msg(sizeof(resolveInit.part2) == 1); + + resolveInit.mask = (CALL_STUB_CACHE_MASK << LOG2_PTRSIZE); + resolveInit.part3 [0] = 0x8b; + resolveInit.part3 [1] = 0x80;; + static_assert_no_msg(sizeof(resolveInit.part3) == 2); + + resolveInit._cacheAddress = 0xcccccccc; +#ifdef STUB_LOGGING + resolveInit.cntr1 [0] = 0xff; + resolveInit.cntr1 [1] = 0x05; + static_assert_no_msg(sizeof(resolveInit.cntr1) == 2); + + resolveInit.c_call = &g_poly_call_counter; +#endif //STUB_LOGGING + resolveInit.part4 [0] = 0x3b; + resolveInit.part4 [1] = 0x10; + static_assert_no_msg(sizeof(resolveInit.part4) == 2); + + // resolveInit.mtOffset = offsetof(ResolveCacheElem,pMT) & 0xFF; + static_assert_no_msg(offsetof(ResolveCacheElem,pMT) == 0); + + resolveInit.part5 [0] = 0x75; + static_assert_no_msg(sizeof(resolveInit.part5) == 1); + + resolveInit.toMiss1 = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss1)+1); + + resolveInit.part6 [0] = 0x81; + resolveInit.part6 [1] = 0x78; + static_assert_no_msg(sizeof(resolveInit.part6) == 2); + + resolveInit.tokenOffset = offsetof(ResolveCacheElem,token) & 0xFF; + + resolveInit._token = 0xcccccccc; + + resolveInit.part7 [0] = 0x75; + static_assert_no_msg(sizeof(resolveInit.part7) == 1); + + resolveInit.part8 [0] = 0x8b; + resolveInit.part8 [1] = 0x40; + static_assert_no_msg(sizeof(resolveInit.part8) == 2); + + resolveInit.targetOffset = offsetof(ResolveCacheElem,target) & 0xFF; + + resolveInit.toMiss2 = offsetof(ResolveStub,miss)-(offsetof(ResolveStub,toMiss2)+1); + + resolveInit.part9 [0] = 0x5a; + resolveInit.part9 [1] = 0x83; + resolveInit.part9 [2] = 0xc4; + resolveInit.part9 [3] = 0x04; + resolveInit.part9 [4] = 0xff; + resolveInit.part9 [5] = 0xe0; + static_assert_no_msg(sizeof(resolveInit.part9) == 6); + + resolveInit.miss [0] = 0x5a; +// resolveInit.miss [1] = 0xb8; +// resolveInit._hashedTokenMov = 0xcccccccc; + resolveInit._slowEntryPoint [0] = 0x68; + resolveInit._tokenPush = 0xcccccccc; +#ifdef STUB_LOGGING + resolveInit.cntr2 [0] = 0xff; + resolveInit.cntr2 [1] = 0x05; + resolveInit.c_miss = &g_poly_miss_counter; +#endif //STUB_LOGGING + resolveInit.part10 [0] = 0xe9; + resolveInit._resolveWorkerDispl = 0xcccccccc; + + resolveInit.patch [0] = 0xe8; + 
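    // Illustrative aside, not part of the patch: the one-byte jump displacements computed in this
    // function (toPatcher, toMiss1, toMiss2, toResolveStub) are rel8 values, relative to the end of
    // the two-byte short jcc/jmp that contains them, hence the "+ 1" past the displacement byte's
    // own offset, e.g.
    //
    //     toMiss1 = offsetof(ResolveStub, miss) - (offsetof(ResolveStub, toMiss1) + 1);
    //
    // toResolveStub is the only backward jump, so its displacement is negative and is truncated
    // to a byte with "& 0xFF".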
resolveInit._backpatcherDispl = 0xcccccccc; + resolveInit.part11 [0] = 0xeb; + resolveInit.toResolveStub = (offsetof(ResolveStub, _resolveEntryPoint) - (offsetof(ResolveStub, toResolveStub) + 1)) & 0xFF; +}; + +void ResolveHolder::Initialize(ResolveHolder* pResolveHolderRX, + PCODE resolveWorkerTarget, PCODE patcherTarget, + size_t dispatchToken, UINT32 hashedToken, + void * cacheAddr, INT32 * counterAddr +#ifndef UNIX_X86_ABI + , size_t stackArgumentsSize +#endif + ) +{ + _stub = resolveInit; + + //fill in the stub specific fields + _stub._pCounter = counterAddr; + _stub._hashedToken = hashedToken << LOG2_PTRSIZE; + _stub._cacheAddress = (size_t) cacheAddr; + _stub._token = dispatchToken; +// _stub._hashedTokenMov = hashedToken; + _stub._tokenPush = dispatchToken; + _stub._resolveWorkerDispl = resolveWorkerTarget - ((PCODE) &pResolveHolderRX->_stub._resolveWorkerDispl + sizeof(DISPL)); + _stub._backpatcherDispl = patcherTarget - ((PCODE) &pResolveHolderRX->_stub._backpatcherDispl + sizeof(DISPL)); +#ifndef UNIX_X86_ABI + _stub._stackArgumentsSize = stackArgumentsSize; +#endif +} + +ResolveHolder* ResolveHolder::FromFailEntry(PCODE failEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveHolder* resolveHolder = (ResolveHolder*) ( failEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _failEntryPoint) ); + // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); + return resolveHolder; +} + +ResolveHolder* ResolveHolder::FromResolveEntry(PCODE resolveEntry) +{ + LIMITED_METHOD_CONTRACT; + ResolveHolder* resolveHolder = (ResolveHolder*) ( resolveEntry - offsetof(ResolveHolder, _stub) - offsetof(ResolveStub, _resolveEntryPoint) ); + // _ASSERTE(resolveHolder->_stub._resolveEntryPoint[0] == resolveInit._resolveEntryPoint[0]); + return resolveHolder; +} + void VTableCallHolder::Initialize(unsigned slot) { unsigned offsetOfIndirection = MethodTable::GetVtableOffset() + MethodTable::GetIndexOfVtableIndirection(slot) * TARGET_POINTER_SIZE; @@ -420,23 +1049,22 @@ VirtualCallStubManager::StubKind VirtualCallStubManager::predictStubKind(PCODE s WORD firstWord = *((WORD*) stubStartAddress); #ifndef STUB_LOGGING - if (firstWord == DISPATCH_STUB_FIRST_WORD) + if (firstWord == 0x3981) #else //STUB_LOGGING -#error if (firstWord == 0x05ff) #endif { stubKind = SK_DISPATCH; } - else if (firstWord == LOOKUP_STUB_FIRST_WORD) + else if (firstWord == 0x6850) { stubKind = SK_LOOKUP; } - else if (firstWord == RESOLVE_STUB_FIRST_WORD) + else if (firstWord == 0x8b50) { stubKind = SK_RESOLVE; } - else if (firstWord == VTABLECALL_STUB_FIRST_WORD) + else if (firstWord == 0x018b) { stubKind = SK_VTABLECALL; } diff --git a/src/coreclr/vm/loaderallocator.cpp b/src/coreclr/vm/loaderallocator.cpp index 7795d47a362bf..039bb3825837a 100644 --- a/src/coreclr/vm/loaderallocator.cpp +++ b/src/coreclr/vm/loaderallocator.cpp @@ -1049,7 +1049,7 @@ void LoaderAllocator::ActivateManagedTracking() #define COLLECTIBLE_HIGH_FREQUENCY_HEAP_SIZE (3 * GetOsPageSize()) #define COLLECTIBLE_STUB_HEAP_SIZE GetOsPageSize() #define COLLECTIBLE_CODEHEAP_SIZE (7 * GetOsPageSize()) -#define COLLECTIBLE_VIRTUALSTUBDISPATCH_HEAP_SPACE (14 * GetOsPageSize()) +#define COLLECTIBLE_VIRTUALSTUBDISPATCH_HEAP_SPACE (5 * GetOsPageSize()) void LoaderAllocator::Init(BaseDomain *pDomain, BYTE *pExecutableHeapMemory) { diff --git a/src/coreclr/vm/virtualcallstub.cpp b/src/coreclr/vm/virtualcallstub.cpp index 544a3bc58fc00..5ae47059bc082 100644 --- a/src/coreclr/vm/virtualcallstub.cpp +++ 
b/src/coreclr/vm/virtualcallstub.cpp @@ -560,16 +560,19 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA cache_entry_heap_reserve_size *= sizeof(ResolveCacheElem); cache_entry_heap_commit_size *= sizeof(ResolveCacheElem); - lookup_heap_reserve_size *= sizeof(LookupStub); - lookup_heap_commit_size *= sizeof(LookupStub); + lookup_heap_reserve_size *= sizeof(LookupHolder); + lookup_heap_commit_size *= sizeof(LookupHolder); - DWORD dispatchStubSize = sizeof(DispatchStub); + DWORD dispatchHolderSize = sizeof(DispatchHolder); +#ifdef TARGET_AMD64 + dispatchHolderSize = static_cast(DispatchHolder::GetHolderSize(DispatchStub::e_TYPE_SHORT)); +#endif - dispatch_heap_reserve_size *= dispatchStubSize; - dispatch_heap_commit_size *= dispatchStubSize; + dispatch_heap_reserve_size *= dispatchHolderSize; + dispatch_heap_commit_size *= dispatchHolderSize; - resolve_heap_reserve_size *= sizeof(ResolveStub); - resolve_heap_commit_size *= sizeof(ResolveStub); + resolve_heap_reserve_size *= sizeof(ResolveHolder); + resolve_heap_commit_size *= sizeof(ResolveHolder); vtable_heap_reserve_size *= static_cast(VTableCallHolder::GetHolderSize(0)); vtable_heap_commit_size *= static_cast(VTableCallHolder::GetHolderSize(0)); @@ -583,14 +586,14 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA cache_entry_heap_reserve_size = (DWORD) ALIGN_UP(cache_entry_heap_reserve_size, GetOsPageSize()); cache_entry_heap_commit_size = (DWORD) ALIGN_UP(cache_entry_heap_commit_size, GetOsPageSize()); - lookup_heap_reserve_size = (DWORD) ALIGN_UP(lookup_heap_reserve_size, 2 * GetOsPageSize()); - lookup_heap_commit_size = (DWORD) ALIGN_UP(lookup_heap_commit_size, 2 * GetOsPageSize()); + lookup_heap_reserve_size = (DWORD) ALIGN_UP(lookup_heap_reserve_size, GetOsPageSize()); + lookup_heap_commit_size = (DWORD) ALIGN_UP(lookup_heap_commit_size, GetOsPageSize()); - dispatch_heap_reserve_size = (DWORD) ALIGN_UP(dispatch_heap_reserve_size, 2 * GetOsPageSize()); - dispatch_heap_commit_size = (DWORD) ALIGN_UP(dispatch_heap_commit_size, 2 * GetOsPageSize()); + dispatch_heap_reserve_size = (DWORD) ALIGN_UP(dispatch_heap_reserve_size, GetOsPageSize()); + dispatch_heap_commit_size = (DWORD) ALIGN_UP(dispatch_heap_commit_size, GetOsPageSize()); - resolve_heap_reserve_size = (DWORD) ALIGN_UP(resolve_heap_reserve_size, 2 * GetOsPageSize()); - resolve_heap_commit_size = (DWORD) ALIGN_UP(resolve_heap_commit_size, 2 * GetOsPageSize()); + resolve_heap_reserve_size = (DWORD) ALIGN_UP(resolve_heap_reserve_size, GetOsPageSize()); + resolve_heap_commit_size = (DWORD) ALIGN_UP(resolve_heap_commit_size, GetOsPageSize()); vtable_heap_reserve_size = (DWORD) ALIGN_UP(vtable_heap_reserve_size, GetOsPageSize()); vtable_heap_commit_size = (DWORD) ALIGN_UP(vtable_heap_commit_size, GetOsPageSize()); @@ -614,17 +617,16 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA if (dwWastedReserveMemSize != 0) { DWORD cWastedPages = dwWastedReserveMemSize / GetOsPageSize(); - DWORD cPagesPerHeap = cWastedPages / 9; - DWORD cPagesRemainder = cWastedPages % 9; // We'll throw this at the resolve heap + DWORD cPagesPerHeap = cWastedPages / 6; + DWORD cPagesRemainder = cWastedPages % 6; // We'll throw this at the resolve heap indcell_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); cache_entry_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); - lookup_heap_reserve_size += 2 * cPagesPerHeap * GetOsPageSize(); - dispatch_heap_reserve_size += 2 * cPagesPerHeap * GetOsPageSize(); 
+ lookup_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); + dispatch_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); vtable_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); - resolve_heap_reserve_size += 2 * cPagesPerHeap * GetOsPageSize(); - resolve_heap_reserve_size += (cPagesRemainder & 0xFFFFFFFE) * GetOsPageSize(); - indcell_heap_reserve_size += (cPagesRemainder & 1) * GetOsPageSize(); + resolve_heap_reserve_size += cPagesPerHeap * GetOsPageSize(); + resolve_heap_reserve_size += cPagesRemainder * GetOsPageSize(); } CONSISTENCY_CHECK((indcell_heap_reserve_size + @@ -651,14 +653,14 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA cache_entry_heap_reserve_size = GetOsPageSize(); cache_entry_heap_commit_size = GetOsPageSize(); - lookup_heap_reserve_size = 4 * GetOsPageSize(); - lookup_heap_commit_size = 2 * GetOsPageSize(); + lookup_heap_reserve_size = GetOsPageSize(); + lookup_heap_commit_size = GetOsPageSize(); - dispatch_heap_reserve_size = 4 * GetOsPageSize(); - dispatch_heap_commit_size = 2 * GetOsPageSize(); + dispatch_heap_reserve_size = GetOsPageSize(); + dispatch_heap_commit_size = GetOsPageSize(); - resolve_heap_reserve_size = 4 * GetOsPageSize(); - resolve_heap_commit_size = 2 * GetOsPageSize(); + resolve_heap_reserve_size = GetOsPageSize(); + resolve_heap_commit_size = GetOsPageSize(); // Heap for the collectible case is carefully tuned to sum up to 16 pages. Today, we only use the // vtable jump stubs in the R2R scenario, which is unlikely to be loaded in the collectible context, @@ -691,7 +693,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder indcell_heap_holder( new LoaderHeap(indcell_heap_reserve_size, indcell_heap_commit_size, initReservedMem, indcell_heap_reserve_size, - NULL, UnlockedLoaderHeap::HeapKind::Data, FALSE)); + NULL, UnlockedLoaderHeap::HeapKind::Data)); initReservedMem += indcell_heap_reserve_size; @@ -699,7 +701,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder cache_entry_heap_holder( new LoaderHeap(cache_entry_heap_reserve_size, cache_entry_heap_commit_size, initReservedMem, cache_entry_heap_reserve_size, - &cache_entry_rangeList, UnlockedLoaderHeap::HeapKind::Data, FALSE)); + &cache_entry_rangeList, UnlockedLoaderHeap::HeapKind::Data)); initReservedMem += cache_entry_heap_reserve_size; @@ -707,7 +709,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder lookup_heap_holder( new LoaderHeap(lookup_heap_reserve_size, lookup_heap_commit_size, initReservedMem, lookup_heap_reserve_size, - &lookup_rangeList, UnlockedLoaderHeap::HeapKind::Interleaved, FALSE, LookupStub::GenerateCodePage, LookupStub::CodeSize)); + &lookup_rangeList, UnlockedLoaderHeap::HeapKind::Executable)); initReservedMem += lookup_heap_reserve_size; @@ -715,7 +717,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder dispatch_heap_holder( new LoaderHeap(dispatch_heap_reserve_size, dispatch_heap_commit_size, initReservedMem, dispatch_heap_reserve_size, - &dispatch_rangeList, UnlockedLoaderHeap::HeapKind::Interleaved, FALSE, DispatchStub::GenerateCodePage, DispatchStub::CodeSize)); + &dispatch_rangeList, UnlockedLoaderHeap::HeapKind::Executable)); initReservedMem += dispatch_heap_reserve_size; @@ -723,7 +725,7 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA NewHolder resolve_heap_holder( new LoaderHeap(resolve_heap_reserve_size, 
resolve_heap_commit_size, initReservedMem, resolve_heap_reserve_size, - &resolve_rangeList, UnlockedLoaderHeap::HeapKind::Interleaved, FALSE, ResolveStub::GenerateCodePage, ResolveStub::CodeSize)); + &resolve_rangeList, UnlockedLoaderHeap::HeapKind::Executable)); initReservedMem += resolve_heap_reserve_size; @@ -735,6 +737,9 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA initReservedMem += vtable_heap_reserve_size; + // Allocate the initial counter block + NewHolder m_counters_holder(new counter_block); + // // On success of every allocation, assign the objects and suppress the release // @@ -752,6 +757,16 @@ void VirtualCallStubManager::Init(BaseDomain *pDomain, LoaderAllocator *pLoaderA vtableCallers = vtableCallers_holder; vtableCallers_holder.SuppressRelease(); cache_entries = cache_entries_holder; cache_entries_holder.SuppressRelease(); + m_counters = m_counters_holder; m_counters_holder.SuppressRelease(); + + // Create the initial failure counter block + m_counters->next = NULL; + m_counters->used = 0; + m_cur_counter_block = m_counters; + + m_cur_counter_block_for_reclaim = m_counters; + m_cur_counter_block_for_reclaim_index = 0; + // Keep track of all of our managers VirtualCallStubManagerManager::GlobalManager()->AddStubManager(this); } @@ -808,6 +823,14 @@ VirtualCallStubManager::~VirtualCallStubManager() if (vtableCallers) { delete vtableCallers; vtableCallers = NULL;} if (cache_entries) { delete cache_entries; cache_entries = NULL;} + // Now get rid of the memory taken by the counter_blocks + while (m_counters != NULL) + { + counter_block *del = m_counters; + m_counters = m_counters->next; + delete del; + } + // This was the block reserved by Init for the heaps. // For the collectible case, the VSD logic does not allocate the memory. if (m_initialReservedMemForHeaps && !m_loaderAllocator->IsCollectible()) @@ -833,15 +856,18 @@ void VirtualCallStubManager::InitStatic() g_resetCacheIncr = (INT32) CLRConfig::GetConfigValue(CLRConfig::INTERNAL_VirtualCallStubResetCacheIncr); #endif // STUB_LOGGING +#ifndef STUB_DISPATCH_PORTABLE + DispatchHolder::InitializeStatic(); + ResolveHolder::InitializeStatic(); +#endif // !STUB_DISPATCH_PORTABLE + LookupHolder::InitializeStatic(); + g_resolveCache = new DispatchCache(); if(CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_VirtualCallStubLogging)) StartupLogging(); VirtualCallStubManagerManager::InitStatic(); - LookupStub::InitStatic(); - DispatchStub::InitStatic(); - ResolveStub::InitStatic(); } // Static shutdown code. @@ -903,6 +929,29 @@ i.e. need to be serialized and non-concurrant. 
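// Illustrative outline, not part of the patch: each call to Reclaim below tops up at most 16
// entries of the current counter_block by (STUB_MISS_COUNT_VALUE/10)+1, records where it stopped
// in m_cur_counter_block_for_reclaim_index, and advances to the next block in the chain once a
// block is exhausted, wrapping back to m_counters at the end. This is the "add a large chunk to
// the counter" behaviour that the ResolveStub comments refer to.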
*/ void VirtualCallStubManager::Reclaim() { LIMITED_METHOD_CONTRACT; + + UINT32 limit = min(counter_block::MAX_COUNTER_ENTRIES, + m_cur_counter_block_for_reclaim->used); + limit = min(m_cur_counter_block_for_reclaim_index + 16, limit); + + for (UINT32 i = m_cur_counter_block_for_reclaim_index; i < limit; i++) + { + m_cur_counter_block_for_reclaim->block[i] += (STUB_MISS_COUNT_VALUE/10)+1; + } + + // Increment the index by the number we processed + m_cur_counter_block_for_reclaim_index = limit; + + // If we ran to the end of the block, go to the next + if (m_cur_counter_block_for_reclaim_index == m_cur_counter_block->used) + { + m_cur_counter_block_for_reclaim = m_cur_counter_block_for_reclaim->next; + m_cur_counter_block_for_reclaim_index = 0; + + // If this was the last block in the chain, go back to the beginning + if (m_cur_counter_block_for_reclaim == NULL) + m_cur_counter_block_for_reclaim = m_counters; + } } #endif // !DACCESS_COMPILE @@ -1077,8 +1126,8 @@ PCODE VirtualCallStubManager::GetCallStub(TypeHandle ownerType, DWORD slot) { if ((stub = (PCODE)(lookups->Find(&probeL))) == CALL_STUB_EMPTY_ENTRY) { - LookupStub *pLookupStub = GenerateLookupStub(addrOfResolver, token.To_SIZE_T()); - stub = (PCODE) (lookups->Add((size_t)(pLookupStub->entryPoint()), &probeL)); + LookupHolder *pLookupHolder = GenerateLookupStub(addrOfResolver, token.To_SIZE_T()); + stub = (PCODE) (lookups->Add((size_t)(pLookupHolder->stub()->entryPoint()), &probeL)); } } @@ -1288,22 +1337,22 @@ size_t VirtualCallStubManager::GetTokenFromStubQuick(VirtualCallStubManager * pM if (kind == SK_DISPATCH) { _ASSERTE(pMgr->isDispatchingStub(stub)); - DispatchStub * pDispatchStub = (DispatchStub *) PCODEToPINSTR(stub); - ResolveStub * pResolveStub = ResolveStub::FromFailEntry(pDispatchStub->failTarget()); - _ASSERTE(pMgr->isResolvingStub(pResolveStub->resolveEntryPoint())); - return pResolveStub->token(); + DispatchStub * dispatchStub = (DispatchStub *) PCODEToPINSTR(stub); + ResolveHolder * resolveHolder = ResolveHolder::FromFailEntry(dispatchStub->failTarget()); + _ASSERTE(pMgr->isResolvingStub(resolveHolder->stub()->resolveEntryPoint())); + return resolveHolder->stub()->token(); } else if (kind == SK_RESOLVE) { _ASSERTE(pMgr->isResolvingStub(stub)); - ResolveStub * pResolveStub = ResolveStub::FromResolveEntry(stub); - return pResolveStub->token(); + ResolveHolder * resolveHolder = ResolveHolder::FromResolveEntry(stub); + return resolveHolder->stub()->token(); } else if (kind == SK_LOOKUP) { _ASSERTE(pMgr->isLookupStub(stub)); - LookupStub * pLookupStub = LookupStub::FromLookupEntry(stub); - return pLookupStub->token(); + LookupHolder * lookupHolder = LookupHolder::FromLookupEntry(stub); + return lookupHolder->stub()->token(); } else if (kind == SK_VTABLECALL) { @@ -1698,7 +1747,7 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, { //we have a target but not the dispatcher stub, lets build it //First we need a failure target (the resolver stub) - ResolveStub *pResolveStub = NULL; + ResolveHolder *pResolveHolder = NULL; ResolveEntry entryR; Prober probeR(&entryR); PCODE pBackPatchFcn; @@ -1734,7 +1783,7 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, } #endif // TARGET_X86 && !UNIX_X86_ABI - pResolveStub = GenerateResolveStub(pResolverFcn, + pResolveHolder = GenerateResolveStub(pResolverFcn, pBackPatchFcn, token.To_SIZE_T() #if defined(TARGET_X86) && !defined(UNIX_X86_ABI) @@ -1744,14 +1793,14 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, // Add the 
resolve entrypoint into the cache. //@TODO: Can we store a pointer to the holder rather than the entrypoint? - resolvers->Add((size_t)(pResolveStub->resolveEntryPoint()), &probeR); + resolvers->Add((size_t)(pResolveHolder->stub()->resolveEntryPoint()), &probeR); } else { - pResolveStub = ResolveStub::FromResolveEntry(addrOfResolver); + pResolveHolder = ResolveHolder::FromResolveEntry(addrOfResolver); } - CONSISTENCY_CHECK(CheckPointer(pResolveStub)); - stub = pResolveStub->resolveEntryPoint(); + CONSISTENCY_CHECK(CheckPointer(pResolveHolder)); + stub = pResolveHolder->stub()->resolveEntryPoint(); CONSISTENCY_CHECK(stub != NULL); } @@ -1761,7 +1810,7 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, // 3. The call site is currently wired to a lookup stub. If the call site is wired // to anything else, then we're never going to use the dispatch stub so there's // no use in creating it. - if (pResolveStub != NULL && stubKind == SK_LOOKUP) + if (pResolveHolder != NULL && stubKind == SK_LOOKUP) { DispatchEntry entryD; Prober probeD(&entryD); @@ -1770,13 +1819,13 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, { // We are allowed to create a reusable dispatch stub for all assemblies // this allows us to optimize the call interception case the same way - DispatchStub *pDispatchStub = NULL; + DispatchHolder *pDispatchHolder = NULL; PCODE addrOfDispatch = (PCODE)(dispatchers->Find(&probeD)); if (addrOfDispatch == CALL_STUB_EMPTY_ENTRY) { - PCODE addrOfFail = pResolveStub->failEntryPoint(); + PCODE addrOfFail = pResolveHolder->stub()->failEntryPoint(); bool reenteredCooperativeGCMode = false; - pDispatchStub = GenerateDispatchStub( + pDispatchHolder = GenerateDispatchStub( target, addrOfFail, objectType, token.To_SIZE_T(), &reenteredCooperativeGCMode); if (reenteredCooperativeGCMode) { @@ -1784,16 +1833,16 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, BOOL success = dispatchers->SetUpProber(token.To_SIZE_T(), (size_t)objectType, &probeD); _ASSERTE(success); } - dispatchers->Add((size_t)(pDispatchStub->entryPoint()), &probeD); + dispatchers->Add((size_t)(pDispatchHolder->stub()->entryPoint()), &probeD); } else { - pDispatchStub = DispatchStub::FromDispatchEntry(addrOfDispatch); + pDispatchHolder = DispatchHolder::FromDispatchEntry(addrOfDispatch); } // Now assign the entrypoint to stub - CONSISTENCY_CHECK(CheckPointer(pDispatchStub)); - stub = pDispatchStub->entryPoint(); + CONSISTENCY_CHECK(CheckPointer(pDispatchHolder)); + stub = pDispatchHolder->stub()->entryPoint(); CONSISTENCY_CHECK(stub != NULL); } else @@ -1891,16 +1940,16 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, Prober probeD(&entryD); if (dispatchers->SetUpProber(token.To_SIZE_T(), (size_t) objectType, &probeD)) { - DispatchStub *pDispatchStub = NULL; + DispatchHolder *pDispatchHolder = NULL; PCODE addrOfDispatch = (PCODE)(dispatchers->Find(&probeD)); if (addrOfDispatch == CALL_STUB_EMPTY_ENTRY) { // It is possible that we never created this monomorphic dispatch stub // so we may have to create it now - ResolveStub* pResolveStub = ResolveStub::FromResolveEntry(pCallSite->GetSiteTarget()); - PCODE addrOfFail = pResolveStub->stub()->failEntryPoint(); + ResolveHolder* pResolveHolder = ResolveHolder::FromResolveEntry(pCallSite->GetSiteTarget()); + PCODE addrOfFail = pResolveHolder->stub()->failEntryPoint(); bool reenteredCooperativeGCMode = false; - pDispatchStub = GenerateDispatchStub( + pDispatchHolder = GenerateDispatchStub( target, 
addrOfFail, objectType, token.To_SIZE_T(), &reenteredCooperativeGCMode); if (reenteredCooperativeGCMode) { @@ -1908,19 +1957,19 @@ PCODE VirtualCallStubManager::ResolveWorker(StubCallSite* pCallSite, BOOL success = dispatchers->SetUpProber(token.To_SIZE_T(), (size_t)objectType, &probeD); _ASSERTE(success); } - dispatchers->Add((size_t)(pDispatchStub->entryPoint()), &probeD); + dispatchers->Add((size_t)(pDispatchHolder->stub()->entryPoint()), &probeD); } else { - pDispatchStub = DispatchStub::FromDispatchEntry(addrOfDispatch); + pDispatchHolder = DispatchHolder::FromDispatchEntry(addrOfDispatch); } // increment the of times we changed a cache collision into a mono stub stats.worker_collide_to_mono++; // Now assign the entrypoint to stub - CONSISTENCY_CHECK(pDispatchStub != NULL); - stub = pDispatchStub->entryPoint(); + CONSISTENCY_CHECK(pDispatchHolder != NULL); + stub = pDispatchHolder->stub()->entryPoint(); CONSISTENCY_CHECK(stub != NULL); } } @@ -2426,19 +2475,20 @@ void VirtualCallStubManager::BackPatchWorker(StubCallSite* pCallSite) if (isDispatchingStub(callSiteTarget)) { - DispatchStub * dispatchStub = DispatchStub::FromDispatchEntry(callSiteTarget); + DispatchHolder * dispatchHolder = DispatchHolder::FromDispatchEntry(callSiteTarget); + DispatchStub * dispatchStub = dispatchHolder->stub(); //yes, patch it to point to the resolve stub //We can ignore the races now since we now know that the call site does go thru our //stub mechanisms, hence no matter who wins the race, we are correct. //We find the correct resolve stub by following the failure path in the dispatcher stub itself - PCODE failEntry = dispatchStub->failTarget(); - ResolveStub* resolveStub = ResolveStub::FromFailEntry(failEntry); + PCODE failEntry = dispatchStub->failTarget(); + ResolveStub* resolveStub = ResolveHolder::FromFailEntry(failEntry)->stub(); PCODE resolveEntry = resolveStub->resolveEntryPoint(); BackPatchSite(pCallSite, resolveEntry); LOG((LF_STUBS, LL_INFO10000, "BackPatchWorker call-site" FMT_ADDR "dispatchStub" FMT_ADDR "\n", - DBG_ADDR(pCallSite->GetReturnAddress()), DBG_ADDR(dispatchStub))); + DBG_ADDR(pCallSite->GetReturnAddress()), DBG_ADDR(dispatchHolder->stub()))); //Add back the default miss count to the counter being used by this resolve stub //Since resolve stub are shared among many dispatch stubs each dispatch stub @@ -2513,13 +2563,13 @@ void StubCallSite::SetSiteTarget(PCODE newTarget) /* Generate a dispatcher stub, pMTExpected is the method table to burn in the stub, and the two addrOf's are the addresses the stub is to transfer to depending on the test with pMTExpected */ -DispatchStub *VirtualCallStubManager::GenerateDispatchStub(PCODE addrOfCode, +DispatchHolder *VirtualCallStubManager::GenerateDispatchStub(PCODE addrOfCode, PCODE addrOfFail, void * pMTExpected, size_t dispatchToken, bool * pMayHaveReenteredCooperativeGCMode) { - CONTRACT (DispatchStub*) { + CONTRACT (DispatchHolder*) { THROWS; GC_TRIGGERS; INJECT_FAULT(COMPlusThrowOM();); @@ -2531,20 +2581,113 @@ DispatchStub *VirtualCallStubManager::GenerateDispatchStub(PCODE addr POSTCONDITION(CheckPointer(RETVAL)); } CONTRACT_END; - size_t dispatchStubSize = DispatchStub::size(); + size_t dispatchHolderSize = sizeof(DispatchHolder); + +#ifdef TARGET_AMD64 + // See comment around m_fShouldAllocateLongJumpDispatchStubs for explanation. 
+ if (m_fShouldAllocateLongJumpDispatchStubs + INDEBUG(|| g_pConfig->ShouldGenerateLongJumpDispatchStub())) + { + RETURN GenerateDispatchStubLong(addrOfCode, + addrOfFail, + pMTExpected, + dispatchToken, + pMayHaveReenteredCooperativeGCMode); + } + + dispatchHolderSize = DispatchHolder::GetHolderSize(DispatchStub::e_TYPE_SHORT); +#endif + + //allocate from the requisite heap and copy the template over it. + DispatchHolder * holder = (DispatchHolder*) (void*) + dispatch_heap->AllocAlignedMem(dispatchHolderSize, CODE_SIZE_ALIGN); + +#ifdef TARGET_AMD64 + if (!DispatchHolder::CanShortJumpDispatchStubReachFailTarget(addrOfFail, (LPCBYTE)holder)) + { + m_fShouldAllocateLongJumpDispatchStubs = TRUE; + RETURN GenerateDispatchStub(addrOfCode, addrOfFail, pMTExpected, dispatchToken, pMayHaveReenteredCooperativeGCMode); + } +#endif + + ExecutableWriterHolder dispatchWriterHolder(holder, dispatchHolderSize); + dispatchWriterHolder.GetRW()->Initialize(holder, addrOfCode, + addrOfFail, + (size_t)pMTExpected +#ifdef TARGET_AMD64 + , DispatchStub::e_TYPE_SHORT +#endif + ); + +#ifdef FEATURE_CODE_VERSIONING + MethodDesc *pMD = MethodTable::GetMethodDescForSlotAddress(addrOfCode); + if (pMD->IsVersionableWithVtableSlotBackpatch()) + { + EntryPointSlots::SlotType slotType; + TADDR slot = holder->stub()->implTargetSlot(&slotType); + pMD->RecordAndBackpatchEntryPointSlot(m_loaderAllocator, slot, slotType); + + // RecordAndBackpatchEntryPointSlot() may exit and reenter cooperative GC mode + *pMayHaveReenteredCooperativeGCMode = true; + } +#endif + + ClrFlushInstructionCache(holder->stub(), holder->stub()->size()); + + AddToCollectibleVSDRangeList(holder); + + //incr our counters + stats.stub_mono_counter++; + stats.stub_space += (UINT32)dispatchHolderSize; + LOG((LF_STUBS, LL_INFO10000, "GenerateDispatchStub for token" FMT_ADDR "and pMT" FMT_ADDR "at" FMT_ADDR "\n", + DBG_ADDR(dispatchToken), DBG_ADDR(pMTExpected), DBG_ADDR(holder->stub()))); + +#ifdef FEATURE_PERFMAP + PerfMap::LogStubs(__FUNCTION__, "GenerateDispatchStub", (PCODE)holder->stub(), holder->stub()->size()); +#endif + + RETURN (holder); +} + +#ifdef TARGET_AMD64 +//---------------------------------------------------------------------------- +/* Generate a dispatcher stub, pMTExpected is the method table to burn in the stub, and the two addrOf's +are the addresses the stub is to transfer to depending on the test with pMTExpected +*/ +DispatchHolder *VirtualCallStubManager::GenerateDispatchStubLong(PCODE addrOfCode, + PCODE addrOfFail, + void * pMTExpected, + size_t dispatchToken, + bool * pMayHaveReenteredCooperativeGCMode) +{ + CONTRACT (DispatchHolder*) { + THROWS; + GC_TRIGGERS; + INJECT_FAULT(COMPlusThrowOM();); + PRECONDITION(addrOfCode != NULL); + PRECONDITION(addrOfFail != NULL); + PRECONDITION(CheckPointer(pMTExpected)); + PRECONDITION(pMayHaveReenteredCooperativeGCMode != nullptr); + PRECONDITION(!*pMayHaveReenteredCooperativeGCMode); + POSTCONDITION(CheckPointer(RETVAL)); + } CONTRACT_END; //allocate from the requisite heap and copy the template over it. 
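Whether the short dispatch stub suffices comes down to whether a rel32 jump written into it can still reach the resolve stub's fail entry once the heaps drift apart in the 64-bit address space. For reference, here is a minimal sketch of that kind of reachability test, assuming a plain 5-byte x86-64 jmp rel32; the helper name and the fixed instruction length are illustrative only, and the real check is DispatchHolder::CanShortJumpDispatchStubReachFailTarget, which accounts for the stub's actual layout:

    #include <cstdint>

    // Hypothetical helper: can a rel32 jump emitted at 'jumpInstr' reach 'target'?
    // On x86-64 the displacement is measured from the end of the jump instruction.
    static bool Rel32Reachable(const uint8_t* jumpInstr, const uint8_t* target,
                               size_t jumpLen = 5 /* E9 + imm32 */)
    {
        intptr_t disp = (intptr_t)target - (intptr_t)(jumpInstr + jumpLen);
        return disp >= INT32_MIN && disp <= INT32_MAX;
    }

When a freshly allocated short stub fails the runtime's version of this test, the code above sets m_fShouldAllocateLongJumpDispatchStubs and retries, so this and all later dispatch stubs take the abs64 path.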
- DispatchStub * pStub = (DispatchStub*) (void*) - dispatch_heap->AllocAlignedMem(dispatchStubSize, 1);// CODE_SIZE_ALIGN); + size_t dispatchHolderSize = DispatchHolder::GetHolderSize(DispatchStub::e_TYPE_LONG); + DispatchHolder * holder = (DispatchHolder*) (void*)dispatch_heap->AllocAlignedMem(dispatchHolderSize, CODE_SIZE_ALIGN); + ExecutableWriterHolder dispatchWriterHolder(holder, dispatchHolderSize); - pStub->Initialize(addrOfCode, addrOfFail, (size_t)pMTExpected); + dispatchWriterHolder.GetRW()->Initialize(holder, addrOfCode, + addrOfFail, + (size_t)pMTExpected, + DispatchStub::e_TYPE_LONG); #ifdef FEATURE_CODE_VERSIONING MethodDesc *pMD = MethodTable::GetMethodDescForSlotAddress(addrOfCode); if (pMD->IsVersionableWithVtableSlotBackpatch()) { EntryPointSlots::SlotType slotType; - TADDR slot = pStub->implTargetSlot(&slotType); + TADDR slot = holder->stub()->implTargetSlot(&slotType); pMD->RecordAndBackpatchEntryPointSlot(m_loaderAllocator, slot, slotType); // RecordAndBackpatchEntryPointSlot() may exit and reenter cooperative GC mode @@ -2552,35 +2695,38 @@ DispatchStub *VirtualCallStubManager::GenerateDispatchStub(PCODE addr } #endif - AddToCollectibleVSDRangeList(pStub); + ClrFlushInstructionCache(holder->stub(), holder->stub()->size()); + + AddToCollectibleVSDRangeList(holder); //incr our counters stats.stub_mono_counter++; - stats.stub_space += (UINT32)dispatchStubSize; + stats.stub_space += static_cast(DispatchHolder::GetHolderSize(DispatchStub::e_TYPE_LONG)); LOG((LF_STUBS, LL_INFO10000, "GenerateDispatchStub for token" FMT_ADDR "and pMT" FMT_ADDR "at" FMT_ADDR "\n", - DBG_ADDR(dispatchToken), DBG_ADDR(pMTExpected), DBG_ADDR(pStub))); + DBG_ADDR(dispatchToken), DBG_ADDR(pMTExpected), DBG_ADDR(holder->stub()))); #ifdef FEATURE_PERFMAP - PerfMap::LogStubs(__FUNCTION__, "GenerateDispatchStub", (PCODE)pStub, pStub->size()); + PerfMap::LogStubs(__FUNCTION__, "GenerateDispatchStub", (PCODE)holder->stub(), holder->stub()->size()); #endif - RETURN (pStub); + RETURN (holder); } +#endif //---------------------------------------------------------------------------- /* Generate a resolve stub for the given dispatchToken. 
addrOfResolver is where to go if the inline cache check misses addrOfPatcher is who to call if the fail piece is being called too often by dispacher stubs */ -ResolveStub *VirtualCallStubManager::GenerateResolveStub(PCODE addrOfResolver, - PCODE addrOfPatcher, - size_t dispatchToken +ResolveHolder *VirtualCallStubManager::GenerateResolveStub(PCODE addrOfResolver, + PCODE addrOfPatcher, + size_t dispatchToken #if defined(TARGET_X86) && !defined(UNIX_X86_ABI) - , size_t stackArgumentsSize + , size_t stackArgumentsSize #endif - ) + ) { - CONTRACT (ResolveStub*) { + CONTRACT (ResolveHolder*) { THROWS; GC_TRIGGERS; INJECT_FAULT(COMPlusThrowOM();); @@ -2593,40 +2739,85 @@ ResolveStub *VirtualCallStubManager::GenerateResolveStub(PCODE addrOf _ASSERTE(addrOfResolver); + //get a counter for the fail piece + + UINT32 counter_index = counter_block::MAX_COUNTER_ENTRIES; + counter_block *cur_block = NULL; + + while (true) + { + cur_block = VolatileLoad(&m_cur_counter_block); + + if ((cur_block != NULL) && (cur_block->used < counter_block::MAX_COUNTER_ENTRIES)) + { + counter_index = FastInterlockIncrement((LONG*)&cur_block->used) - 1; + if (counter_index < counter_block::MAX_COUNTER_ENTRIES) + { + // Typical case we allocate the next free counter in the block + break; + } + } + + // Otherwise we have to create a new counter_block to serve as the head of m_cur_counter_block list + + // Create the new block in the main heap + counter_block *pNew = new counter_block; + + // Initialize the new block + pNew->next = cur_block; + pNew->used = 0; + + // Try to link in the new block + if (InterlockedCompareExchangeT(&m_cur_counter_block, pNew, cur_block) != cur_block) + { + // Lost a race to add pNew as new head + delete pNew; + } + } + + CONSISTENCY_CHECK(counter_index < counter_block::MAX_COUNTER_ENTRIES); + CONSISTENCY_CHECK(CheckPointer(cur_block)); + + // Initialize the default miss counter for this resolve stub + INT32* counterAddr = &(cur_block->block[counter_index]); + *counterAddr = STUB_MISS_COUNT_VALUE; + //allocate from the requisite heap and copy the templates for each piece over it. 
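The loop above is a small lock-free allocator: threads race to bump 'used' in the current block, and whoever overflows it tries to push a fresh block onto the head of the list, discarding it if another thread wins the exchange. Below is a standalone sketch of the same pattern, with std::atomic standing in for VolatileLoad, FastInterlockIncrement and InterlockedCompareExchangeT; the type and function names here are made up:

    #include <atomic>
    #include <cstdint>

    struct CounterBlock
    {
        static constexpr uint32_t kMaxEntries = 254;   // mirrors counter_block::MAX_COUNTER_ENTRIES

        CounterBlock*         next = nullptr;
        std::atomic<uint32_t> used{0};
        int32_t               block[kMaxEntries] = {};
    };

    static std::atomic<CounterBlock*> g_curBlock{nullptr};

    int32_t* ClaimCounterSlot()
    {
        for (;;)
        {
            CounterBlock* cur = g_curBlock.load(std::memory_order_acquire);

            if (cur != nullptr && cur->used.load(std::memory_order_relaxed) < CounterBlock::kMaxEntries)
            {
                uint32_t index = cur->used.fetch_add(1, std::memory_order_relaxed);
                if (index < CounterBlock::kMaxEntries)
                    return &cur->block[index];     // typical case: claimed the next free counter
                // The block filled up between the check and the increment; fall through.
            }

            // Allocate a new block and try to install it as the new list head.
            CounterBlock* fresh = new CounterBlock();
            fresh->next = cur;
            if (!g_curBlock.compare_exchange_strong(cur, fresh, std::memory_order_acq_rel))
                delete fresh;                      // lost the race; retry against the winner's block
        }
    }

In the runtime code above the claimed slot is then primed with STUB_MISS_COUNT_VALUE before its address is baked into the resolve stub.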
- ResolveStub * pResolveStub = (ResolveStub*) (void*) - resolve_heap->AllocAlignedMem(sizeof(ResolveStub), 1);// CODE_SIZE_ALIGN); + ResolveHolder * holder = (ResolveHolder*) (void*) + resolve_heap->AllocAlignedMem(sizeof(ResolveHolder), CODE_SIZE_ALIGN); + ExecutableWriterHolder resolveWriterHolder(holder, sizeof(ResolveHolder)); - pResolveStub->Initialize( + resolveWriterHolder.GetRW()->Initialize(holder, addrOfResolver, addrOfPatcher, dispatchToken, DispatchCache::HashToken(dispatchToken), - g_resolveCache->GetCacheBaseAddr(), STUB_MISS_COUNT_VALUE + g_resolveCache->GetCacheBaseAddr(), counterAddr #if defined(TARGET_X86) && !defined(UNIX_X86_ABI) , stackArgumentsSize #endif ); + ClrFlushInstructionCache(holder->stub(), holder->stub()->size()); - AddToCollectibleVSDRangeList(pResolveStub); + AddToCollectibleVSDRangeList(holder); //incr our counters stats.stub_poly_counter++; - stats.stub_space += sizeof(ResolveStub)+sizeof(size_t); + stats.stub_space += sizeof(ResolveHolder)+sizeof(size_t); LOG((LF_STUBS, LL_INFO10000, "GenerateResolveStub for token" FMT_ADDR "at" FMT_ADDR "\n", - DBG_ADDR(dispatchToken), DBG_ADDR(pResolveStub))); + DBG_ADDR(dispatchToken), DBG_ADDR(holder->stub()))); #ifdef FEATURE_PERFMAP - PerfMap::LogStubs(__FUNCTION__, "GenerateResolveStub", (PCODE)pResolveStub, pResolveStub->size()); + PerfMap::LogStubs(__FUNCTION__, "GenerateResolveStub", (PCODE)holder->stub(), holder->stub()->size()); #endif - RETURN (pResolveStub); + RETURN (holder); } //---------------------------------------------------------------------------- /* Generate a lookup stub for the given dispatchToken. addrOfResolver is where the stub always transfers control */ -LookupStub *VirtualCallStubManager::GenerateLookupStub(PCODE addrOfResolver, size_t dispatchToken) +LookupHolder *VirtualCallStubManager::GenerateLookupStub(PCODE addrOfResolver, size_t dispatchToken) { - CONTRACT (LookupStub*) { + CONTRACT (LookupHolder*) { THROWS; GC_TRIGGERS; INJECT_FAULT(COMPlusThrowOM();); @@ -2635,23 +2826,25 @@ LookupStub *VirtualCallStubManager::GenerateLookupStub(PCODE addrOfResolver, siz } CONTRACT_END; //allocate from the requisite heap and copy the template over it. 
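Note that these allocation paths never write through the executable mapping itself: the holder comes out of the RX heap, its contents are filled in through the writable alias returned by the ExecutableWriterHolder's GetRW(), and the instruction cache is flushed afterwards. The following is a rough, Linux-only sketch of the dual-mapping idea behind that pattern; memfd_create/mmap and the single 'ret' byte are illustrative, and the runtime's allocator is considerably more involved:

    // Requires Linux with glibc 2.27+ for memfd_create; assumes x86-64.
    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstring>
    #include <cstdio>

    int main()
    {
        size_t pageSize = (size_t)sysconf(_SC_PAGESIZE);

        // One set of physical pages, mapped twice: RX for execution, RW for writing.
        int fd = memfd_create("stub_pages", 0);
        if (fd < 0 || ftruncate(fd, pageSize) != 0)
            return 1;

        void* rx = mmap(nullptr, pageSize, PROT_READ | PROT_EXEC,  MAP_SHARED, fd, 0);
        void* rw = mmap(nullptr, pageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (rx == MAP_FAILED || rw == MAP_FAILED)
            return 1;

        // Write code through the RW alias; it becomes visible at the RX address.
        unsigned char ret = 0xC3;                  // x86 'ret'
        memcpy(rw, &ret, sizeof(ret));

        // On x86-64 no explicit icache flush is needed; other targets would need
        // the equivalent of ClrFlushInstructionCache here.
        reinterpret_cast<void (*)()>(rx)();
        printf("executed stub at %p, written via %p\n", rx, rw);

        munmap(rw, pageSize);
        munmap(rx, pageSize);
        close(fd);
        return 0;
    }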
- LookupStub* pStub = (LookupStub*)(void*)lookup_heap->AllocAlignedMem(sizeof(LookupStub), 1); // CODE_SIZE_ALIGN); - pStub->Initialize(addrOfResolver, dispatchToken); - ClrFlushInstructionCache(pStub, pStub->size()); + LookupHolder * holder = (LookupHolder*) (void*) lookup_heap->AllocAlignedMem(sizeof(LookupHolder), CODE_SIZE_ALIGN); + ExecutableWriterHolder lookupWriterHolder(holder, sizeof(LookupHolder)); + + lookupWriterHolder.GetRW()->Initialize(holder, addrOfResolver, dispatchToken); + ClrFlushInstructionCache(holder->stub(), holder->stub()->size()); - AddToCollectibleVSDRangeList(pStub); + AddToCollectibleVSDRangeList(holder); //incr our counters stats.stub_lookup_counter++; - stats.stub_space += sizeof(LookupStub); + stats.stub_space += sizeof(LookupHolder); LOG((LF_STUBS, LL_INFO10000, "GenerateLookupStub for token" FMT_ADDR "at" FMT_ADDR "\n", - DBG_ADDR(dispatchToken), DBG_ADDR(pStub))); + DBG_ADDR(dispatchToken), DBG_ADDR(holder->stub()))); #ifdef FEATURE_PERFMAP - PerfMap::LogStubs(__FUNCTION__, "GenerateLookupStub", (PCODE)pStub, pStub->size()); + PerfMap::LogStubs(__FUNCTION__, "GenerateLookupStub", (PCODE)holder->stub(), holder->stub()->size()); #endif - RETURN (pStub); + RETURN (holder); } //---------------------------------------------------------------------------- @@ -3854,322 +4047,3 @@ BOOL VirtualCallStubManagerManager::TraceManager( // Forward the call to the appropriate manager. return pMgr->TraceManager(thread, trace, pContext, pRetAddr); } - - -#ifndef DACCESS_COMPILE - -//#include "asmconstants.h" - -#ifdef STUB_LOGGING -extern size_t g_lookup_inline_counter; -extern size_t g_call_inline_counter; -extern size_t g_miss_inline_counter; -extern size_t g_call_cache_counter; -extern size_t g_miss_cache_counter; -#endif - -void LookupStub::Initialize(PCODE resolveWorkerTarget, size_t dispatchToken) -{ - LookupStubData *pData = GetData(); - pData->DispatchToken = dispatchToken; - pData->ResolveWorkerTarget = resolveWorkerTarget; -} - -void DispatchStub::Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT) -{ - DispatchStubData *pData = GetData(); - pData->ExpectedMT = expectedMT; - pData->ImplTarget = implTarget; - pData->FailTarget = failTarget; -} - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - #define ENUM_PAGE_SIZE(size) \ - extern "C" void LookupStubCode##size(); \ - extern "C" void LookupStubCode##size##_End(); - - ENUM_PAGE_SIZES - #undef ENUM_PAGE_SIZE -#else -extern "C" void LookupStubCode(); -extern "C" void LookupStubCode_End(); -#endif - -#ifdef TARGET_X86 -extern "C" size_t LookupStubCode_DispatchToken_Offset; -extern "C" size_t LookupStubCode_ResolveWorkerTarget_Offset; - -#define SYMBOL_VALUE(name) ((size_t)&name) - -#endif - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) -void (*LookupStub::LookupStubCode)(); -#endif - -void LookupStub::InitStatic() -{ -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - int pageSize = GetOsPageSize(); - #define ENUM_PAGE_SIZE(size) \ - case size: \ - LookupStubCode = LookupStubCode##size; \ - _ASSERTE(((BYTE*)LookupStubCode##size##_End - (BYTE*)LookupStubCode##size) <= LookupStub::CodeSize); \ - break; - - switch (pageSize) - { - ENUM_PAGE_SIZES - default: - EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); - } - #undef ENUM_PAGE_SIZE -#else - _ASSERTE(((BYTE*)LookupStubCode_End - (BYTE*)LookupStubCode) <= LookupStub::CodeSize); -#endif - _ASSERTE(VirtualCallStubManager::predictStubKind((PCODE)(void*)LookupStubCode) == 
VirtualCallStubManager::SK_LOOKUP); -} - -void LookupStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) -{ - int pageSize = GetOsPageSize(); - -#ifdef TARGET_X86 - int totalCodeSize = (pageSize / LookupStub::CodeSize) * LookupStub::CodeSize; - - for (int i = 0; i < pageSize; i += LookupStub::CodeSize) - { - memcpy(pageBase + i, (const void*)LookupStubCode, LookupStub::CodeSize); - - BYTE* pDispatchTokenSlot = pageBaseRX + i + pageSize + offsetof(LookupStubData, DispatchToken); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(LookupStubCode_DispatchToken_Offset)) = pDispatchTokenSlot; - - BYTE* pResolveWorkerTargetSlot = pageBaseRX + i + pageSize + offsetof(LookupStubData, ResolveWorkerTarget); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(LookupStubCode_ResolveWorkerTarget_Offset)) = pResolveWorkerTargetSlot; - } -#else // TARGET_X86 - FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)LookupStubCode), LookupStub::CodeSize, pageSize); -#endif // TARGET_X86 -} - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - #define ENUM_PAGE_SIZE(size) \ - extern "C" void DispatchStubCode##size(); \ - extern "C" void DispatchStubCode##size##_End(); \ - extern "C" void DispatchStubCode_ThisDeref##size(); - - ENUM_PAGE_SIZES - #undef ENUM_PAGE_SIZE -#else -extern "C" void DispatchStubCode(); -extern "C" void DispatchStubCode_End(); -#endif - -#ifdef TARGET_X86 -extern "C" size_t DispatchStubCode_ExpectedMT_Offset; -extern "C" size_t DispatchStubCode_ImplTarget_Offset; -extern "C" size_t DispatchStubCode_FailTarget_Offset; -#endif - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) -void (*DispatchStub::DispatchStubCode)(); -void (*DispatchStub::DispatchStubCode_ThisDeref)(); -#endif // TARGET_ARM64 && TARGET_UNIX - -void DispatchStub::InitStatic() -{ -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - int pageSize = GetOsPageSize(); - #define ENUM_PAGE_SIZE(size) \ - case size: \ - DispatchStubCode = DispatchStubCode##size; \ - DispatchStubCode_ThisDeref = DispatchStubCode_ThisDeref##size; \ - _ASSERTE(((BYTE*)DispatchStubCode##size##_End - (BYTE*)DispatchStubCode##size) <= DispatchStub::CodeSize); \ - break; - - switch (pageSize) - { - ENUM_PAGE_SIZES - default: - EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); - } - #undef ENUM_PAGE_SIZE -#else - _ASSERTE(((BYTE*)DispatchStubCode_End - (BYTE*)DispatchStubCode) <= DispatchStub::CodeSize); -#endif - _ASSERTE(VirtualCallStubManager::predictStubKind((PCODE)(void*)DispatchStubCode) == VirtualCallStubManager::SK_DISPATCH); -} - -void DispatchStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) -{ - int pageSize = GetOsPageSize(); - -#ifdef TARGET_X86 - int totalCodeSize = (pageSize / DispatchStub::CodeSize) * DispatchStub::CodeSize; - for (int i = 0; i <= pageSize - DispatchStub::CodeSize; i += DispatchStub::CodeSize) - { - memcpy(pageBase + i, (const void*)DispatchStubCode, DispatchStub::CodeSize); - - BYTE* pExpectedMTSlot = pageBaseRX + i + pageSize + offsetof(DispatchStubData, ExpectedMT); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(DispatchStubCode_ExpectedMT_Offset)) = pExpectedMTSlot; - - BYTE* pImplTargetSlot = pageBaseRX + i + pageSize + offsetof(DispatchStubData, ImplTarget); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(DispatchStubCode_ImplTarget_Offset)) = pImplTargetSlot; - - BYTE* pFailTargetSlot = pageBaseRX + i + pageSize + offsetof(DispatchStubData, FailTarget); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(DispatchStubCode_FailTarget_Offset)) = pFailTargetSlot; - } -#else // TARGET_X86 - 
FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)DispatchStubCode), DispatchStub::CodeSize, pageSize); -#endif // TARGET_X86 -} - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - #define ENUM_PAGE_SIZE(size) \ - extern "C" void ResolveStubCode##size(); \ - extern "C" void ResolveStubCode##size##_End(); \ - extern "C" void ResolveStubCode_ResolveEntry##size(); \ - extern "C" void ResolveStubCode_FailEntry##size(); \ - extern "C" void ResolveStubCode_SlowEntry##size(); \ - extern "C" void ResolveStubCode_ThisDeref##size(); - ENUM_PAGE_SIZES - #undef ENUM_PAGE_SIZE -#else -extern "C" void ResolveStubCode(); -extern "C" void ResolveStubCode_End(); -extern "C" void ResolveStubCode_ResolveEntry(); -extern "C" void ResolveStubCode_FailEntry(); -extern "C" void ResolveStubCode_SlowEntry(); -extern "C" void ResolveStubCode_ThisDeref(); -#endif - -#ifdef TARGET_X86 -extern "C" size_t ResolveStubCode_Counter_Offset; -extern "C" size_t ResolveStubCode_HashedToken_Offset; -extern "C" size_t ResolveStubCode_CacheAddress_Offset; -extern "C" size_t ResolveStubCode_Token_Offset1; -extern "C" size_t ResolveStubCode_Token_Offset2; -extern "C" size_t ResolveStubCode_ResolveWorkerTarget_Offset; -extern "C" size_t ResolveStubCode_PatcherTarget_Offset; -#endif - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) -void (*ResolveStub::ResolveStubCode)(); -void (*ResolveStub::ResolveStubCode_FailEntry)(); -void (*ResolveStub::ResolveStubCode_SlowEntry)(); -void (*ResolveStub::ResolveStubCode_ResolveEntry)(); -void (*ResolveStub::ResolveStubCode_ThisDeref)(); -#endif // TARGET_ARM64 && TARGET_UNIX - -void ResolveStub::InitStatic() -{ -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - int pageSize = GetOsPageSize(); - #define ENUM_PAGE_SIZE(size) \ - case size: ResolveStubCode = ResolveStubCode##size; \ - ResolveStubCode_FailEntry = ResolveStubCode_FailEntry##size; \ - ResolveStubCode_SlowEntry = ResolveStubCode_SlowEntry##size; \ - ResolveStubCode_ResolveEntry = ResolveStubCode_ResolveEntry##size; \ - _ASSERTE(((BYTE*)ResolveStubCode##size##_End - (BYTE*)ResolveStubCode##size) <= ResolveStub::CodeSize); \ - break; - - switch (pageSize) - { - ENUM_PAGE_SIZES - default: - EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); - } - #undef ENUM_PAGE_SIZE -#else - _ASSERTE(((BYTE*)ResolveStubCode_End - (BYTE*)ResolveStubCode) <= ResolveStub::CodeSize); -#endif - _ASSERTE(VirtualCallStubManager::predictStubKind((PCODE)(void*)ResolveStubCode) == VirtualCallStubManager::SK_RESOLVE); -} - -void ResolveStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX) -{ - int pageSize = GetOsPageSize(); - -#ifdef TARGET_X86 - int totalCodeSize = (pageSize / ResolveStub::CodeSize) * ResolveStub::CodeSize; - for (int i = 0; i <= pageSize - ResolveStub::CodeSize; i += ResolveStub::CodeSize) - { - memcpy(pageBase + i, (const void*)ResolveStubCode, ResolveStub::CodeSize); - - BYTE* pCounterSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, Counter); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_Counter_Offset)) = pCounterSlot; - - BYTE* pHashedTokenSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, HashedToken); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_HashedToken_Offset)) = pHashedTokenSlot; - - BYTE* pLookupCacheSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, CacheAddress); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_CacheAddress_Offset)) = pLookupCacheSlot; - - BYTE* pTokenSlot = pageBaseRX + i + pageSize + 
offsetof(ResolveStubData, Token); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_Token_Offset1)) = pTokenSlot; - - *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_Token_Offset2)) = pTokenSlot; - - BYTE* pResolveWorkerSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, ResolveWorkerTarget); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_ResolveWorkerTarget_Offset)) = pResolveWorkerSlot; - - BYTE* pBackpatcherSlot = pageBaseRX + i + pageSize + offsetof(ResolveStubData, PatcherTarget); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(ResolveStubCode_PatcherTarget_Offset)) = pBackpatcherSlot; - } -#else // TARGET_X86 - FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)ResolveStubCode), ResolveStub::CodeSize, pageSize); -#endif // TARGET_X86 -} - -void ResolveStub::Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32 counterValue -#if defined(TARGET_X86) && !defined(UNIX_X86_ABI) - , size_t stackArgumentsSize -#endif - ) -{ - ResolveStubData *pData = GetData(); - - pData->CacheAddress = (size_t)cacheAddr; - pData->HashedToken = hashedToken << LOG2_PTRSIZE; - pData->Token = dispatchToken; - pData->Counter = counterValue; - pData->ResolveWorkerTarget = resolveWorkerTarget; -#ifdef TARGET_X86 - pData->PatcherTarget = patcherTarget; -#ifndef UNIX_X86_ABI - pData->StackArgumentsSize = stackArgumentsSize; -#endif -#endif -} - -LookupStub* LookupStub::FromLookupEntry(PCODE lookupEntry) -{ - LIMITED_METHOD_CONTRACT; - LookupStub* pLookupStub = (LookupStub*)PCODEToPINSTR(lookupEntry); - return pLookupStub; -} - -DispatchStub* DispatchStub::FromDispatchEntry(PCODE dispatchEntry) -{ - LIMITED_METHOD_CONTRACT; - DispatchStub* pDispatchStub = (DispatchStub*)PCODEToPINSTR(dispatchEntry); - return pDispatchStub; -} - -ResolveStub* ResolveStub::FromFailEntry(PCODE failEntry) -{ - LIMITED_METHOD_CONTRACT; - ResolveStub* pResolveStub = (ResolveStub*) (PCODEToPINSTR(failEntry) - ((BYTE*)ResolveStubCode_FailEntry - (BYTE*)ResolveStubCode)); - return pResolveStub; -} - -ResolveStub* ResolveStub::FromResolveEntry(PCODE resolveEntry) -{ - LIMITED_METHOD_CONTRACT; - ResolveStub* pResolveStub = (ResolveStub*) (PCODEToPINSTR(resolveEntry) - ((BYTE*)ResolveStubCode_ResolveEntry - (BYTE*)ResolveStubCode)); - return pResolveStub; -} - -#endif // DACCESS_COMPILE diff --git a/src/coreclr/vm/virtualcallstub.h b/src/coreclr/vm/virtualcallstub.h index e9a9be18bdb76..ec1c99877e129 100644 --- a/src/coreclr/vm/virtualcallstub.h +++ b/src/coreclr/vm/virtualcallstub.h @@ -33,9 +33,9 @@ class Entry; class Prober; class VirtualCallStubManager; class VirtualCallStubManagerManager; -struct LookupStub; -struct DispatchStub; -struct ResolveStub; +struct LookupHolder; +struct DispatchHolder; +struct ResolveHolder; struct VTableCallHolder; ///////////////////////////////////////////////////////////////////////////////////// @@ -159,6 +159,7 @@ extern "C" void BackPatchWorkerStaticStub(PCODE returnAddr, TADDR siteAddrForReg #endif // TARGET_UNIX #endif // TARGET_X86 + typedef VPTR(class VirtualCallStubManager) PTR_VirtualCallStubManager; // VirtualCallStubManager is the heart of the stub dispatch logic. 
See the book of the runtime entry @@ -279,10 +280,17 @@ class VirtualCallStubManager : public StubManager lookup_heap(NULL), dispatch_heap(NULL), resolve_heap(NULL), +#ifdef TARGET_AMD64 + m_fShouldAllocateLongJumpDispatchStubs(FALSE), +#endif lookups(NULL), cache_entries(NULL), dispatchers(NULL), resolvers(NULL), + m_counters(NULL), + m_cur_counter_block(NULL), + m_cur_counter_block_for_reclaim(NULL), + m_cur_counter_block_for_reclaim_index(NULL), m_pNext(NULL) { LIMITED_METHOD_CONTRACT; @@ -303,7 +311,7 @@ class VirtualCallStubManager : public StubManager }; // peek at the assembly code and predict which kind of a stub we have - static StubKind predictStubKind(PCODE stubStartAddress); + StubKind predictStubKind(PCODE stubStartAddress); /* know thine own stubs. It is possible that when multiple virtualcallstub managers are built that these may need to become @@ -475,7 +483,7 @@ class VirtualCallStubManager : public StubManager private: //allocate and initialize a stub of the desired kind - DispatchStub *GenerateDispatchStub(PCODE addrOfCode, + DispatchHolder *GenerateDispatchStub(PCODE addrOfCode, PCODE addrOfFail, void *pMTExpected, size_t dispatchToken, @@ -484,33 +492,33 @@ class VirtualCallStubManager : public StubManager #ifdef TARGET_AMD64 // Used to allocate a long jump dispatch stub. See comment around // m_fShouldAllocateLongJumpDispatchStubs for explaination. - DispatchStub *GenerateDispatchStubLong(PCODE addrOfCode, + DispatchHolder *GenerateDispatchStubLong(PCODE addrOfCode, PCODE addrOfFail, void *pMTExpected, size_t dispatchToken, bool *pMayHaveReenteredCooperativeGCMode); #endif - ResolveStub *GenerateResolveStub(PCODE addrOfResolver, - PCODE addrOfPatcher, - size_t dispatchToken + ResolveHolder *GenerateResolveStub(PCODE addrOfResolver, + PCODE addrOfPatcher, + size_t dispatchToken #if defined(TARGET_X86) && !defined(UNIX_X86_ABI) - , size_t stackArgumentsSize + , size_t stackArgumentsSize #endif - ); + ); - LookupStub *GenerateLookupStub(PCODE addrOfResolver, + LookupHolder *GenerateLookupStub(PCODE addrOfResolver, size_t dispatchToken); VTableCallHolder* GenerateVTableCallStub(DWORD slot); - template - void AddToCollectibleVSDRangeList(STUB *pStub) + template + void AddToCollectibleVSDRangeList(STUB_HOLDER *holder) { if (m_loaderAllocator->IsCollectible()) { - parentDomain->GetCollectibleVSDRanges()->AddRange(reinterpret_cast(pStub), - reinterpret_cast(pStub) + pStub->size(), + parentDomain->GetCollectibleVSDRanges()->AddRange(reinterpret_cast(holder->stub()), + reinterpret_cast(holder->stub()) + holder->stub()->size(), this); } } @@ -721,12 +729,45 @@ class VirtualCallStubManager : public StubManager PTR_LoaderHeap resolve_heap; // resolve stubs go here PTR_LoaderHeap vtable_heap; // vtable-based jump stubs go here +#ifdef TARGET_AMD64 + // When we layout the stub heaps, we put them close together in a sequential order + // so that we maximize performance with respect to branch predictions. On AMD64, + // dispatch stubs use a rel32 jump on failure to the resolve stub. This works for + // a while because of the ordering, but as soon as we have to start allocating more + // memory for either the dispatch or resolve heaps we have a chance that we'll be + // further away than a rel32 jump can reach, because we're in a 64-bit address + // space. 
As such, this flag will indicate when we allocate the first dispatch stub + // that cannot reach a resolve stub, and when this happens we'll switch over to + // allocating the larger version of the dispatch stub which contains an abs64 jump. + //@TODO: This is a bit of a workaround, but the limitations of LoaderHeap require that we + //@TODO: take this approach. Hopefully in Orcas we'll have a chance to rewrite LoaderHeap. + BOOL m_fShouldAllocateLongJumpDispatchStubs; // Defaults to FALSE. +#endif + BucketTable * lookups; // hash table of lookups keyed by tokens BucketTable * cache_entries; // hash table of dispatch token/target structs for dispatch cache BucketTable * dispatchers; // hash table of dispatching stubs keyed by tokens/actualtype BucketTable * resolvers; // hash table of resolvers keyed by tokens/resolverstub BucketTable * vtableCallers; // hash table of vtable call stubs keyed by slot values + // This structure is used to keep track of the fail counters. + // We only need one fail counter per ResolveStub, + // and most programs use less than 250 ResolveStubs + // We allocate these on the main heap using "new counter block" + struct counter_block + { + static const UINT32 MAX_COUNTER_ENTRIES = 256-2; // 254 counters should be enough for most cases. + + counter_block * next; // the next block + UINT32 used; // the index of the next free entry + INT32 block[MAX_COUNTER_ENTRIES]; // the counters + }; + + counter_block *m_counters; // linked list of counter blocks of failure counters + counter_block *m_cur_counter_block; // current block for updating counts + counter_block *m_cur_counter_block_for_reclaim; // current block for updating + UINT32 m_cur_counter_block_for_reclaim_index; // index into the current block for updating + // Used to keep track of all the VCSManager objects in the system. PTR_VirtualCallStubManager m_pNext; // Linked list pointer @@ -1018,358 +1059,6 @@ class Entry #include -//#ifdef TARGET_AMD64 -#pragma pack(push, 1) -// since we are placing code, we want byte packing of the structs - -// Codes of the instruction in the stub where the instruction access violation -// is converted to NullReferenceException at the caller site. -#ifdef UNIX_AMD64_ABI -#define X64_INSTR_CMP_IND_THIS_REG_RAX 0x073948 -#define X64_INSTR_MOV_RAX_IND_THIS_REG 0x078b48 -#else // UNIX_AMD64_ABI -#define X64_INSTR_CMP_IND_THIS_REG_RAX 0x013948 -#define X64_INSTR_MOV_RAX_IND_THIS_REG 0x018b48 -#endif // UNIX_AMD64_ABI - -#define USES_LOOKUP_STUBS 1 - -/********************************************************************************************* -Stubs that contain code are all part of larger structs called Holders. There is a -Holder for each kind of stub, i.e XXXStub is contained with XXXHolder. Holders are -essentially an implementation trick that allowed rearranging the code sequences more -easily while trying out different alternatives, and for dealing with any alignment -issues in a way that was mostly immune to the actually code sequences. These Holders -should be revisited when the stub code sequences are fixed, since in many cases they -add extra space to a stub that is not really needed. - -Stubs are placed in cache and hash tables. Since unaligned access of data in memory -is very slow, the keys used in those tables should be aligned. The things used as keys -typically also occur in the generated code, e.g. a token as an immediate part of an instruction. 
-For now, to avoid alignment computations as different code strategies are tried out, the key -fields are all in the Holders. Eventually, many of these fields should be dropped, and the instruction -streams aligned so that the immediate fields fall on aligned boundaries. -*/ - -#if USES_LOOKUP_STUBS - -/*LookupStub************************************************************************************** -Virtual and interface call sites are initially setup to point at LookupStubs. -This is because the runtime type of the pointer is not yet known, -so the target cannot be resolved. Note: if the jit is able to determine the runtime type -of the pointer, it should be generating a direct call not a virtual or interface call. -This stub pushes a lookup token onto the stack to identify the sought after method, and then -jumps into the EE (VirtualCallStubManager::ResolveWorkerStub) to effectuate the lookup and -transfer of control to the appropriate target method implementation, perhaps patching of the call site -along the way to point to a more appropriate stub. Hence callsites that point to LookupStubs -get quickly changed to point to another kind of stub. -*/ -struct LookupStubData -{ - size_t DispatchToken; - PCODE ResolveWorkerTarget; -}; - -typedef DPTR(LookupStubData) PTR_LookupStubData; - -struct LookupStub -{ -#if defined(HOST_AMD64) - static const int CodeSize = 16; -#elif defined(HOST_X86) - static const int CodeSize = 16; -#elif defined(HOST_ARM64) - static const int CodeSize = 16; -#elif defined(HOST_ARM) - static const int CodeSize = 12; -#endif // HOST_AMD64 - - void Initialize(PCODE resolveWorkerTarget, size_t dispatchToken); - - static void InitStatic(); - - inline PCODE entryPoint() { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this); } - inline size_t token() { LIMITED_METHOD_CONTRACT; return GetData()->DispatchToken; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(LookupStub); } - - static LookupStub* FromLookupEntry(PCODE lookupEntry); - - static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); - -private: - PTR_LookupStubData GetData() const - { - return dac_cast(dac_cast(this) + GetOsPageSize()); - } - // The lookup entry point starts with a nop in order to allow us to quickly see - // if the stub is lookup stub or a dispatch stub. We can read thye first byte - // of a stub to find out what kind of a stub we have. - - BYTE _code[CodeSize]; - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - static void (*LookupStubCode)(); -#endif // TARGET_ARM64 && TARGET_UNIX) -}; - -#endif // USES_LOOKUP_STUBS - -/*DispatchStub************************************************************************************** -The structure of a full dispatch stub in memory is a DispatchStub followed contiguously in memory -by either a DispatchStubShort of a DispatchStubLong. DispatchStubShort is used when the resolve -stub (failTarget()) is reachable by a rel32 (DISPL) jump. We make a pretty good effort to make sure -that the stub heaps are set up so that this is the case. If we allocate enough stubs that the heap -end up allocating in a new block that is further away than a DISPL jump can go, then we end up using -a DispatchStubLong which is bigger but is a full 64-bit jump. */ - -/*DispatchStub************************************************************************************** -Monomorphic and mostly monomorphic call sites eventually point to DispatchStubs. -A dispatch stub has an expected type (expectedMT), target address (target) and fail address (failure). 
-If the calling frame does in fact have the type be of the expected type, then -control is transfered to the target address, the method implementation. If not, -then control is transfered to the fail address, a fail stub (see below) where a polymorphic -lookup is done to find the correct address to go to. - -implementation note: Order, choice of instructions, and branch directions -should be carefully tuned since it can have an inordinate effect on performance. Particular -attention needs to be paid to the effects on the BTB and branch prediction, both in the small -and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. -Note that since this stub is only used for mostly monomorphic callsites (ones that are not, get patched -to something else), therefore the conditional jump "jne failure" is mostly not taken, and hence it is important -that the branch prediction staticly predict this, which means it must be a forward jump. The alternative -is to reverse the order of the jumps and make sure that the resulting conditional jump "je implTarget" -is statically predicted as taken, i.e a backward jump. The current choice was taken since it was easier -to control the placement of the stubs than control the placement of the jitted code and the stubs. */ -struct DispatchStubData -{ - size_t ExpectedMT; - PCODE ImplTarget; - PCODE FailTarget; -}; - -typedef DPTR(DispatchStubData) PTR_DispatchStubData; - -/* DispatchHolders are the containers for DispatchStubs, they provide for any alignment of -stubs as necessary. DispatchStubs are placed in a hashtable and in a cache. The keys for both -are the pair expectedMT and token. Efficiency of the of the hash table is not a big issue, -since lookups in it are fairly rare. Efficiency of the cache is paramount since it is accessed frequently -(see ResolveStub below). Currently we are storing both of these fields in the DispatchHolder to simplify -alignment issues. If inlineMT in the stub itself was aligned, then it could be the expectedMT field. -While the token field can be logically gotten by following the failure target to the failEntryPoint -of the ResolveStub and then to the token over there, for perf reasons of cache access, it is duplicated here. -This allows us to use DispatchStubs in the cache. The alternative is to provide some other immutable struct -for the cache composed of the triplet (expectedMT, token, target) and some sort of reclaimation scheme when -they are thrown out of the cache via overwrites (since concurrency will make the obvious approaches invalid). -*/ - -/* @workaround for ee resolution - Since the EE does not currently have a resolver function that -does what we want, see notes in implementation of VirtualCallStubManager::Resolver, we are -using dispatch stubs to siumulate what we want. That means that inlineTarget, which should be immutable -is in fact written. Hence we have moved target out into the holder and aligned it so we can -atomically update it. 
When we get a resolver function that does what we want, we can drop this field, -and live with just the inlineTarget field in the stub itself, since immutability will hold.*/ - -#if !(defined(TARGET_ARM64) && defined(TARGET_UNIX)) -extern "C" void DispatchStubCode(); -extern "C" void DispatchStubCode_ThisDeref(); -#endif // !(TARGET_ARM64 && TARGET_UNIX) - - -struct DispatchStub -{ -#if defined(HOST_AMD64) - static const int CodeSize = 24; -#elif defined(HOST_X86) - static const int CodeSize = 24; -#elif defined(HOST_ARM64) - static const int CodeSize = 32; -#elif defined(HOST_ARM) - static const int CodeSize = 24; -#endif // HOST_AMD64 - - void Initialize(PCODE implTarget, PCODE failTarget, size_t expectedMT); - - static void InitStatic(); - - static DispatchStub* FromDispatchEntry(PCODE dispatchEntry); - static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); - - inline PCODE entryPoint() const { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this); } - inline size_t expectedMT() const { LIMITED_METHOD_CONTRACT; return GetData()->ExpectedMT; } - inline static size_t size() { WRAPPER_NO_CONTRACT; return sizeof(DispatchStub); } - - inline static size_t offsetOfThisDeref() - { - LIMITED_METHOD_CONTRACT; - return (BYTE*)DispatchStubCode_ThisDeref - (BYTE*)DispatchStubCode; - } - - inline PCODE implTarget() const - { - LIMITED_METHOD_CONTRACT; - return GetData()->ImplTarget; - } - - inline TADDR implTargetSlot(EntryPointSlots::SlotType *slotTypeRef) const - { - LIMITED_METHOD_CONTRACT; - _ASSERTE(slotTypeRef != nullptr); - - *slotTypeRef = EntryPointSlots::SlotType_Normal; - return (TADDR)&GetData()->ImplTarget; - } - - inline PCODE failTarget() const - { - return GetData()->FailTarget; - } - -private: - BYTE code[CodeSize]; - - PTR_DispatchStubData GetData() const - { - return dac_cast(dac_cast(this) + GetOsPageSize()); - } - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - static void (*DispatchStubCode)(); - static void (*DispatchStubCode_ThisDeref)(); -#endif -}; - -/*ResolveStub************************************************************************************** -Polymorphic call sites and monomorphic calls that fail end up in a ResolverStub. There is only -one resolver stub built for any given token, even though there may be many call sites that -use that token and many distinct types that are used in the calling call frames. A resolver stub -actually has two entry points, one for polymorphic call sites and one for dispatch stubs that fail on their -expectedMT test. There is a third part of the resolver stub that enters the ee when a decision should -be made about changing the callsite. Therefore, we have defined the resolver stub as three distinct pieces, -even though they are actually allocated as a single contiguous block of memory. These pieces are: - -A ResolveStub has two entry points: - -FailEntry - where the dispatch stub goes if the expected MT test fails. This piece of the stub does -a check to see how often we are actually failing. If failures are frequent, control transfers to the -patch piece to cause the call site to be changed from a mostly monomorphic callsite -(calls dispatch stub) to a polymorphic callsize (calls resolve stub). If failures are rare, control -transfers to the resolve piece (see ResolveStub). The failEntryPoint decrements a counter -every time it is entered. The ee at various times will add a large chunk to the counter. 
- -ResolveEntry - does a lookup via in a cache by hashing the actual type of the calling frame s - and the token identifying the (contract,method) pair desired. If found, control is transfered -to the method implementation. If not found in the cache, the token is pushed and the ee is entered via -the ResolveWorkerStub to do a full lookup and eventual transfer to the correct method implementation. Since -there is a different resolve stub for every token, the token can be inlined and the token can be pre-hashed. -The effectiveness of this approach is highly sensitive to the effectiveness of the hashing algorithm used, -as well as its speed. It turns out it is very important to make the hash function sensitive to all -of the bits of the method table, as method tables are laid out in memory in a very non-random way. Before -making any changes to the code sequences here, it is very important to measure and tune them as perf -can vary greatly, in unexpected ways, with seeming minor changes. - -Implementation note - Order, choice of instructions, and branch directions -should be carefully tuned since it can have an inordinate effect on performance. Particular -attention needs to be paid to the effects on the BTB and branch prediction, both in the small -and in the large, i.e. it needs to run well in the face of BTB overflow--using static predictions. -Note that this stub is called in highly polymorphic cases, but the cache should have been sized -and the hash function chosen to maximize the cache hit case. Hence the cmp/jcc instructions should -mostly be going down the cache hit route, and it is important that this be statically predicted as so. -Hence the 3 jcc instrs need to be forward jumps. As structured, there is only one jmp/jcc that typically -gets put in the BTB since all the others typically fall straight thru. Minimizing potential BTB entries -is important. 
*/ - -struct ResolveStubData -{ - size_t CacheAddress; - UINT32 HashedToken; - INT32 Counter; - size_t Token; - PCODE ResolveWorkerTarget; -#ifdef TARGET_X86 - PCODE PatcherTarget; -#ifndef UNIX_X86_ABI - size_t StackArgumentsSize; -#endif // UNIX_X86_ABI -#endif // TARGET_X86 -}; - -typedef DPTR(ResolveStubData) PTR_ResolveStubData; - -#if !(defined(TARGET_ARM64) && defined(TARGET_UNIX)) -extern "C" void ResolveStubCode(); -extern "C" void ResolveStubCode_FailEntry(); -extern "C" void ResolveStubCode_SlowEntry(); -extern "C" void ResolveStubCode_ResolveEntry(); -extern "C" void ResolveStubCode_ThisDeref(); -#endif // !(TARGET_ARM64 && TARGET_UNIX) - -struct ResolveStub -{ -#if defined(HOST_AMD64) - static const int CodeSize = 88; -#elif defined(HOST_X86) - static const int CodeSize = 88; -#elif defined(HOST_ARM64) - static const int CodeSize = 128; -#elif defined(HOST_ARM) - static const int CodeSize = 108; -#endif // HOST_AMD64 - - void Initialize(PCODE resolveWorkerTarget, PCODE patcherTarget, - size_t dispatchToken, UINT32 hashedToken, - void * cacheAddr, INT32 counterValue -#if defined(TARGET_X86) && !defined(UNIX_X86_ABI) - , size_t stackArgumentsSize -#endif - ); - - static void InitStatic(); - - static ResolveStub* FromFailEntry(PCODE resolveEntry); - static ResolveStub* FromResolveEntry(PCODE resolveEntry); - - static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX); - - inline PCODE failEntryPoint() { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this + ((BYTE*)ResolveStubCode_FailEntry - (BYTE*)ResolveStubCode)); } - inline PCODE resolveEntryPoint() { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this + ((BYTE*)ResolveStubCode_ResolveEntry - (BYTE*)ResolveStubCode)); } - inline PCODE slowEntryPoint() { LIMITED_METHOD_CONTRACT; return PINSTRToPCODE((TADDR)this + ((BYTE*)ResolveStubCode_SlowEntry - (BYTE*)ResolveStubCode)); } - - inline INT32* pCounter() { LIMITED_METHOD_CONTRACT; return &GetData()->Counter; } - inline UINT32 hashedToken() { LIMITED_METHOD_CONTRACT; return GetData()->HashedToken >> LOG2_PTRSIZE; } - inline size_t cacheAddress() { LIMITED_METHOD_CONTRACT; return GetData()->CacheAddress; } - inline size_t token() { LIMITED_METHOD_CONTRACT; return GetData()->Token; } - inline size_t size() { LIMITED_METHOD_CONTRACT; return sizeof(ResolveStub); } - - inline static size_t offsetOfThisDeref() - { - LIMITED_METHOD_CONTRACT; - return (BYTE*)ResolveStubCode_ThisDeref - (BYTE*)ResolveStubCode_ResolveEntry; - } - -#if defined(TARGET_X86) && !defined(UNIX_X86_ABI) - inline size_t stackArgumentsSize() { LIMITED_METHOD_CONTRACT; return GetData()->StackArgumentsSize; } -#endif - -private: - PTR_ResolveStubData GetData() const - { - return dac_cast(dac_cast(this) + GetOsPageSize()); - } - - BYTE code[CodeSize]; - -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - static void (*ResolveStubCode)(); - static void (*ResolveStubCode_FailEntry)(); - static void (*ResolveStubCode_SlowEntry)(); - static void (*ResolveStubCode_ResolveEntry)(); - static void (*ResolveStubCode_ThisDeref)(); -#endif // TARGET_ARM64 && TARGET_UNIX -}; - -//#endif // TARGET_AMD64 -#pragma pack(pop) - #if USES_LOOKUP_STUBS /********************************************************************************************** LookupEntry wraps LookupStubs and provide the concrete implementation of the abstract class Entry. 
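All of the stub structs removed here find their mutable state by stepping exactly one OS page past the stub's own address: code pages and data pages alternate, and a stub's data block sits at the same offset on the page that follows its code. Here is a toy illustration of that address arithmetic; StubData, the 4K page size and the 16-byte slot size are made-up stand-ins for the per-target CodeSize constants and GetOsPageSize():

    #include <cstdint>
    #include <cstdio>
    #include <new>
    #include <vector>

    struct StubData
    {
        size_t    token;
        uintptr_t target;
    };

    constexpr size_t kPageSize     = 4096;   // stand-in for GetOsPageSize()
    constexpr size_t kStubCodeSize = 16;     // stand-in for LookupStub::CodeSize

    // Same offset, one page above the stub itself (compare GetData() above).
    inline uint8_t* DataSlotForStub(uint8_t* stub)
    {
        return stub + kPageSize;
    }

    int main()
    {
        // Two adjacent "pages": [0, 4K) plays the RX code page, [4K, 8K) the RW data page.
        std::vector<uint8_t> pair(2 * kPageSize);
        uint8_t* codePage = pair.data();

        uint8_t* thirdStub = codePage + 2 * kStubCodeSize;            // slot at offset 32
        StubData* data = new (DataSlotForStub(thirdStub)) StubData{}; // lands at offset 4096 + 32

        data->token = 0x1234;
        printf("stub at %p, its data at %p\n", (void*)thirdStub, (void*)data);
        return 0;
    }

Keeping the data a whole page away is what lets the code pages stay read-execute while only the data pages ever need to be writable.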
@@ -1403,7 +1092,7 @@ class LookupEntry : public Entry { LIMITED_METHOD_CONTRACT; _ASSERTE(VirtualCallStubManager::isLookupStubStatic((PCODE)contents)); - stub = LookupStub::FromLookupEntry((PCODE)contents); + stub = LookupHolder::FromLookupEntry((PCODE)contents)->stub(); } //extract the token of the underlying lookup stub @@ -1525,7 +1214,7 @@ class ResolveEntry : public Entry { LIMITED_METHOD_CONTRACT; _ASSERTE(VirtualCallStubManager::isResolvingStubStatic((PCODE)contents)); - stub = ResolveStub::FromResolveEntry((PCODE)contents); + stub = ResolveHolder::FromResolveEntry((PCODE)contents)->stub(); } //extract the token of the underlying resolve stub inline size_t Token() { WRAPPER_NO_CONTRACT; return stub ? (size_t)(stub->token()) : 0; } @@ -1563,7 +1252,7 @@ class DispatchEntry : public Entry { LIMITED_METHOD_CONTRACT; _ASSERTE(VirtualCallStubManager::isDispatchingStubStatic((PCODE)contents)); - stub = DispatchStub::FromDispatchEntry((PCODE)contents); + stub = DispatchHolder::FromDispatchEntry((PCODE)contents)->stub(); } //extract the fields of the underlying dispatch stub @@ -1575,8 +1264,8 @@ class DispatchEntry : public Entry WRAPPER_NO_CONTRACT; if (stub) { - ResolveStub * pResolveStub = ResolveStub::FromFailEntry(stub->failTarget()); - size_t token = pResolveStub->token(); + ResolveHolder * resolveHolder = ResolveHolder::FromFailEntry(stub->failTarget()); + size_t token = resolveHolder->stub()->token(); _ASSERTE(token == VirtualCallStubManager::GetTokenFromStub((PCODE)stub)); return token; } From bd1c8baa8f366b80aba726295160f3d5d939274d Mon Sep 17 00:00:00 2001 From: Jan Vorlicek Date: Thu, 17 Mar 2022 16:53:39 +0100 Subject: [PATCH 10/10] Move FillStubCodePage to a better location --- src/coreclr/vm/common.h | 25 ------------------------- src/coreclr/vm/util.cpp | 18 ++++++++++++++++++ src/coreclr/vm/util.hpp | 9 +++++++++ 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/coreclr/vm/common.h b/src/coreclr/vm/common.h index ebe1a981e2636..6b1ded2b26ec0 100644 --- a/src/coreclr/vm/common.h +++ b/src/coreclr/vm/common.h @@ -417,31 +417,6 @@ extern DummyGlobalContract ___contract; #endif // defined(_DEBUG) -#define ENUM_PAGE_SIZES \ - ENUM_PAGE_SIZE(4096) \ - ENUM_PAGE_SIZE(8192) \ - ENUM_PAGE_SIZE(16384) \ - ENUM_PAGE_SIZE(32768) \ - ENUM_PAGE_SIZE(65536) - -inline void FillStubCodePage(BYTE* pageBase, const void* code, int codeSize, int pageSize) -{ - int totalCodeSize = (pageSize / codeSize) * codeSize; - - memcpy(pageBase, code, codeSize); - - int i; - for (i = codeSize; i < pageSize / 2; i *= 2) - { - memcpy(pageBase + i, pageBase, i); - } - - if (i != totalCodeSize) - { - memcpy(pageBase + i, pageBase, totalCodeSize - i); - } -} - // All files get to see all of these .inl files to make sure all files // get the benefit of inlining. 
#include "ceeload.inl" diff --git a/src/coreclr/vm/util.cpp b/src/coreclr/vm/util.cpp index 12a4ed3225739..a14950993b4b1 100644 --- a/src/coreclr/vm/util.cpp +++ b/src/coreclr/vm/util.cpp @@ -2244,4 +2244,22 @@ HRESULT GetFileVersion( // S_OK or error Volatile NormalizedTimer::s_frequency = -1.0; +void FillStubCodePage(BYTE* pageBase, const void* code, int codeSize, int pageSize) +{ + int totalCodeSize = (pageSize / codeSize) * codeSize; + + memcpy(pageBase, code, codeSize); + + int i; + for (i = codeSize; i < pageSize / 2; i *= 2) + { + memcpy(pageBase + i, pageBase, i); + } + + if (i != totalCodeSize) + { + memcpy(pageBase + i, pageBase, totalCodeSize - i); + } +} + #endif // !DACCESS_COMPILE diff --git a/src/coreclr/vm/util.hpp b/src/coreclr/vm/util.hpp index 81c7a64f05ae7..78a0061e15f32 100644 --- a/src/coreclr/vm/util.hpp +++ b/src/coreclr/vm/util.hpp @@ -1006,4 +1006,13 @@ class NormalizedTimer HRESULT GetFileVersion(LPCWSTR wszFilePath, ULARGE_INTEGER* pFileVersion); #endif // !TARGET_UNIX +#define ENUM_PAGE_SIZES \ + ENUM_PAGE_SIZE(4096) \ + ENUM_PAGE_SIZE(8192) \ + ENUM_PAGE_SIZE(16384) \ + ENUM_PAGE_SIZE(32768) \ + ENUM_PAGE_SIZE(65536) + +void FillStubCodePage(BYTE* pageBase, const void* code, int codeSize, int pageSize); + #endif /* _H_UTIL */
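FillStubCodePage, now in util.cpp, tiles a whole page with copies of one stub template by repeatedly doubling the already-copied prefix, which keeps the number of memcpy calls logarithmic in the page size. Below is a small self-contained check of that behaviour; the function body is copied from the hunk above, BYTE is assumed to be unsigned char, and the 16-byte/4K sizes are just example values:

    #include <cassert>
    #include <cstring>
    #include <vector>

    typedef unsigned char BYTE;

    void FillStubCodePage(BYTE* pageBase, const void* code, int codeSize, int pageSize)
    {
        int totalCodeSize = (pageSize / codeSize) * codeSize;

        memcpy(pageBase, code, codeSize);

        int i;
        for (i = codeSize; i < pageSize / 2; i *= 2)
        {
            memcpy(pageBase + i, pageBase, i);
        }

        if (i != totalCodeSize)
        {
            memcpy(pageBase + i, pageBase, totalCodeSize - i);
        }
    }

    int main()
    {
        const int pageSize = 4096, codeSize = 16;

        BYTE templ[codeSize];
        for (int b = 0; b < codeSize; b++)
            templ[b] = (BYTE)b;

        std::vector<BYTE> page(pageSize);
        FillStubCodePage(page.data(), templ, codeSize, pageSize);

        // Every 16-byte slot on the page must be an exact copy of the template.
        for (int off = 0; off + codeSize <= pageSize; off += codeSize)
            assert(memcmp(page.data() + off, templ, codeSize) == 0);

        return 0;
    }

The non-x86 GenerateCodePage paths shown earlier call this helper directly; the x86 variants instead copy stub by stub so they can rewrite the absolute data-slot addresses embedded in each copy.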