From bfef88129d20c1b4fbf3a56269d1a4e9c50bcc29 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Thu, 28 Jul 2016 23:36:12 +0100 Subject: [PATCH 1/8] WorkerThreadStart volatile read+cmpxchg loop --- src/vm/win32threadpool.cpp | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index 95fa5c0ca514..8600484e2726 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -1271,7 +1271,8 @@ void ThreadpoolMgr::MaybeAddWorkingWorker() _ASSERTE(!CLRThreadpoolHosted()); - ThreadCounter::Counts counts = WorkerCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + ThreadCounter::Counts counts = WorkerCounter.DangerousGetDirtyCounts(); ThreadCounter::Counts newCounts; while (true) { @@ -1325,7 +1326,9 @@ void ThreadpoolMgr::MaybeAddWorkingWorker() // Of course, there's no guarantee *that* will work - but hopefully enough time will have passed // to allow whoever's using all the memory right now to release some. // - counts = WorkerCounter.GetCleanCounts(); + + // counts volatile read paired with CompareExchangeCounts loop set + counts = WorkerCounter.DangerousGetDirtyCounts(); while (true) { // @@ -2255,7 +2258,8 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) ThreadCounter::Counts counts, oldCounts, newCounts; bool foundWork = true, wasNotRecalled = true; - counts = WorkerCounter.GetCleanCounts(); + // counts only used for Etw event + counts = WorkerCounter.DangerousGetDirtyCounts(); FireEtwThreadPoolWorkerThreadStart(counts.NumActive, counts.NumRetired, GetClrInstanceId()); #ifdef FEATURE_COMINTEROP @@ -2286,7 +2290,8 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) #ifdef FEATURE_COMINTEROP if (pThread->SetApartment(Thread::AS_InMTA, TRUE) != Thread::AS_InMTA) { - counts = WorkerCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + counts = WorkerCounter.DangerousGetDirtyCounts(); while (true) { newCounts = counts; @@ -2311,7 +2316,8 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) // make sure there's really work. If not, go back to sleep - counts = WorkerCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + counts = WorkerCounter.DangerousGetDirtyCounts(); while (true) { _ASSERTE(counts.NumActive > 0); @@ -2383,7 +2389,8 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) Retire: - counts = WorkerCounter.GetCleanCounts(); + // counts only used for Etw event + counts = WorkerCounter.DangerousGetDirtyCounts(); FireEtwThreadPoolWorkerThreadRetirementStart(counts.NumActive, counts.NumRetired, GetClrInstanceId()); // It's possible that some work came in just before we decremented the active thread count, in which @@ -2403,7 +2410,9 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) if (WAIT_OBJECT_0 == result) { foundWork = true; - counts = WorkerCounter.GetCleanCounts(); + + // counts only used for Etw event + counts = WorkerCounter.DangerousGetDirtyCounts(); FireEtwThreadPoolWorkerThreadRetirementStop(counts.NumActive, counts.NumRetired, GetClrInstanceId()); goto Work; } @@ -2435,7 +2444,9 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) // if we don't hit zero, then there's another retired thread that will pick up this signal. So it's ok // to exit. // - counts = WorkerCounter.GetCleanCounts(); + + // counts volatile read paired with CompareExchangeCounts loop set + counts = WorkerCounter.DangerousGetDirtyCounts(); while (true) { if (counts.NumRetired == 0) @@ -2491,7 +2502,8 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) DangerousNonHostedSpinLockHolder tal(&ThreadAdjustmentLock); - counts = WorkerCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + counts = WorkerCounter.DangerousGetDirtyCounts(); while (true) { if (counts.NumActive == counts.NumWorking) @@ -2550,7 +2562,8 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) _ASSERTE(!IsIoPending()); - counts = WorkerCounter.GetCleanCounts(); + // counts only used for Etw event + counts = WorkerCounter.DangerousGetDirtyCounts(); FireEtwThreadPoolWorkerThreadStop(counts.NumActive, counts.NumRetired, GetClrInstanceId()); return ERROR_SUCCESS; From 2de9a699402abc72b5cb998370838ecf6d84f00e Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 29 Jul 2016 01:23:54 +0100 Subject: [PATCH 2/8] GetCleanCounts to Volatile read on x64 Use GetCleanCounts where result is used directly and DangerousGetDirtyCounts when used as part of compare exchange loop --- src/vm/win32threadpool.cpp | 34 +++++++++++++++++++--------------- src/vm/win32threadpool.h | 6 ++++++ 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index 8600484e2726..3696ad5f37d0 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -2258,8 +2258,7 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) ThreadCounter::Counts counts, oldCounts, newCounts; bool foundWork = true, wasNotRecalled = true; - // counts only used for Etw event - counts = WorkerCounter.DangerousGetDirtyCounts(); + counts = WorkerCounter.GetCleanCounts(); FireEtwThreadPoolWorkerThreadStart(counts.NumActive, counts.NumRetired, GetClrInstanceId()); #ifdef FEATURE_COMINTEROP @@ -2389,8 +2388,7 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) Retire: - // counts only used for Etw event - counts = WorkerCounter.DangerousGetDirtyCounts(); + counts = WorkerCounter.GetCleanCounts(); FireEtwThreadPoolWorkerThreadRetirementStart(counts.NumActive, counts.NumRetired, GetClrInstanceId()); // It's possible that some work came in just before we decremented the active thread count, in which @@ -2411,8 +2409,7 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) { foundWork = true; - // counts only used for Etw event - counts = WorkerCounter.DangerousGetDirtyCounts(); + counts = WorkerCounter.GetCleanCounts(); FireEtwThreadPoolWorkerThreadRetirementStop(counts.NumActive, counts.NumRetired, GetClrInstanceId()); goto Work; } @@ -2562,8 +2559,7 @@ DWORD __stdcall ThreadpoolMgr::WorkerThreadStart(LPVOID lpArgs) _ASSERTE(!IsIoPending()); - // counts only used for Etw event - counts = WorkerCounter.DangerousGetDirtyCounts(); + counts = WorkerCounter.GetCleanCounts(); FireEtwThreadPoolWorkerThreadStop(counts.NumActive, counts.NumRetired, GetClrInstanceId()); return ERROR_SUCCESS; @@ -3846,7 +3842,9 @@ DWORD __stdcall ThreadpoolMgr::CompletionPortThreadStart(LPVOID lpArgs) // and the state of the rest of the IOCP threads, we need to figure out whether to de-activate (exit) this thread, retire this thread, // or transition to "working." // - oldCounts = CPThreadCounter.GetCleanCounts(); + + // counts volatile read paired with CompareExchangeCounts loop set + oldCounts = CPThreadCounter.DangerousGetDirtyCounts(); newCounts = oldCounts; enterRetirement = false; @@ -3929,7 +3927,8 @@ DWORD __stdcall ThreadpoolMgr::CompletionPortThreadStart(LPVOID lpArgs) // We can now exit; decrement the retired count. while (true) { - oldCounts = CPThreadCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + oldCounts = CPThreadCounter.DangerousGetDirtyCounts(); newCounts = oldCounts; newCounts.NumRetired--; if (oldCounts == CPThreadCounter.CompareExchangeCounts(newCounts, oldCounts)) @@ -3943,7 +3942,8 @@ DWORD __stdcall ThreadpoolMgr::CompletionPortThreadStart(LPVOID lpArgs) // put back into rotation -- we need a thread while (true) { - oldCounts = CPThreadCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + oldCounts = CPThreadCounter.DangerousGetDirtyCounts(); newCounts = oldCounts; newCounts.NumRetired--; newCounts.NumActive++; @@ -3984,7 +3984,8 @@ DWORD __stdcall ThreadpoolMgr::CompletionPortThreadStart(LPVOID lpArgs) //GC event, and some have not. while (true) { - oldCounts = CPThreadCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + oldCounts = CPThreadCounter.DangerousGetDirtyCounts(); newCounts = oldCounts; newCounts.NumWorking--; if (oldCounts == CPThreadCounter.CompareExchangeCounts(newCounts, oldCounts)) @@ -3997,7 +3998,8 @@ DWORD __stdcall ThreadpoolMgr::CompletionPortThreadStart(LPVOID lpArgs) while (true) { - oldCounts = CPThreadCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + oldCounts = CPThreadCounter.DangerousGetDirtyCounts(); newCounts = oldCounts; newCounts.NumWorking++; if (oldCounts == CPThreadCounter.CompareExchangeCounts(newCounts, oldCounts)) @@ -4272,7 +4274,8 @@ void ThreadpoolMgr::GrowCompletionPortThreadpoolIfNeeded() // if thread creation failed, we have to adjust the counts back down. while (true) { - oldCounts = CPThreadCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + oldCounts = CPThreadCounter.DangerousGetDirtyCounts(); newCounts = oldCounts; newCounts.NumActive--; newCounts.NumWorking--; @@ -4702,7 +4705,8 @@ DWORD __stdcall ThreadpoolMgr::GateThreadStart(LPVOID lpArgs) // IOCP threads are created as "active" and "working" while (true) { - oldCounts = CPThreadCounter.GetCleanCounts(); + // counts volatile read paired with CompareExchangeCounts loop set + oldCounts = CPThreadCounter.DangerousGetDirtyCounts(); newCounts = oldCounts; newCounts.NumActive++; newCounts.NumWorking++; diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h index ba800e5073ed..5574512a2532 100644 --- a/src/vm/win32threadpool.h +++ b/src/vm/win32threadpool.h @@ -375,6 +375,11 @@ class ThreadpoolMgr Counts GetCleanCounts() { +#ifdef _WIN64 + // VolatileLoad x64 bit read is atomic + return DangerousGetDirtyCounts(); +#else // !_WIN64 + // VolatileLoad may result in torn read LIMITED_METHOD_CONTRACT; Counts result; #ifndef DACCESS_COMPILE @@ -384,6 +389,7 @@ class ThreadpoolMgr result.AsLongLong = 0; //prevents prefast warning for DAC builds #endif return result; +#endif // !_WIN64 } // From a033207e0dbe403473c4c980d9b5646131d84776 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 29 Jul 2016 01:50:03 +0100 Subject: [PATCH 3/8] Reduce false sharing in ManagedPerAppDomainTPCount --- src/vm/threadpoolrequest.cpp | 3 +++ src/vm/threadpoolrequest.h | 29 +++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/vm/threadpoolrequest.cpp b/src/vm/threadpoolrequest.cpp index 111578e76842..5b970c81948f 100644 --- a/src/vm/threadpoolrequest.cpp +++ b/src/vm/threadpoolrequest.cpp @@ -30,11 +30,14 @@ #endif #include "appdomain.inl" +BYTE PerAppDomainTPCountList::padding1[64]; UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount; +BYTE PerAppDomainTPCountList::padding2[64]; //The list of all per-appdomain work-request counts. ArrayListStatic PerAppDomainTPCountList::s_appDomainIndexList; +BYTE PerAppDomainTPCountList::padding3[64]; //Make this point to unmanaged TP in case, no appdomains have initialized yet. LONG PerAppDomainTPCountList::s_ADHint=-1; diff --git a/src/vm/threadpoolrequest.h b/src/vm/threadpoolrequest.h index d0c47e677cd6..3c332e73cd73 100644 --- a/src/vm/threadpoolrequest.h +++ b/src/vm/threadpoolrequest.h @@ -172,9 +172,14 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { void DispatchWorkItem(bool* foundWork, bool* wasNotRecalled); private: - Volatile m_numRequestsPending; - ADID m_id; - TPIndex m_index; + struct { + BYTE padding1[64]; // padding to ensure own cache line + Volatile m_numRequestsPending; + BYTE padding2[64]; // padding to ensure own cache line + ADID m_id; + BYTE padding3[64]; // padding to ensure own cache line + TPIndex m_index; + }; }; //-------------------------------------------------------------------------- @@ -272,9 +277,14 @@ class UnManagedPerAppDomainTPCount : public IPerAppDomainTPCount { } private: - ULONG m_NumRequests; - Volatile m_outstandingThreadRequestCount; - SpinLock m_lock; + struct { + BYTE padding1[64]; // padding to ensure own cache line + ULONG m_NumRequests; + BYTE padding2[64]; // padding to ensure own cache line + Volatile m_outstandingThreadRequestCount; + BYTE padding3[64]; // padding to ensure own cache line + SpinLock m_lock; + }; }; //-------------------------------------------------------------------------- @@ -330,11 +340,14 @@ class PerAppDomainTPCountList{ private: static DWORD FindFirstFreeTpEntry(); + static BYTE padding1[64]; // padding to ensure own cache line static UnManagedPerAppDomainTPCount s_unmanagedTPCount; - //The list of all per-appdomain work-request counts. - static ArrayListStatic s_appDomainIndexList; + static BYTE padding2[64]; // padding to ensure own cache line + //The list of all per-appdomain work-request counts. + static ArrayListStatic s_appDomainIndexList; + static BYTE padding3[64]; // padding to ensure own cache line static LONG s_ADHint; }; From b8220e45b14ea4174e323b1ca7b74ac2fd128e0e Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 29 Jul 2016 18:01:33 +0100 Subject: [PATCH 4/8] Adjust fences and add padding --- src/vm/comthreadpool.cpp | 7 ++++--- src/vm/threadpoolrequest.cpp | 24 +++++++++++++++--------- src/vm/threadpoolrequest.h | 28 ++++++++++++++++------------ src/vm/win32threadpool.cpp | 16 ++++++++++------ src/vm/win32threadpool.h | 15 ++++++++++----- 5 files changed, 55 insertions(+), 35 deletions(-) diff --git a/src/vm/comthreadpool.cpp b/src/vm/comthreadpool.cpp index e9ffbaf92fbc..8231214a20eb 100644 --- a/src/vm/comthreadpool.cpp +++ b/src/vm/comthreadpool.cpp @@ -251,6 +251,7 @@ FCIMPL0(FC_BOOL_RET, ThreadPoolNative::NotifyRequestComplete) pThread->HasCriticalRegion() || pThread->HasThreadAffinity(); + // Read fenced call bool shouldAdjustWorkers = ThreadpoolMgr::ShouldAdjustMaxWorkersActive(); // @@ -266,15 +267,15 @@ FCIMPL0(FC_BOOL_RET, ThreadPoolNative::NotifyRequestComplete) { HELPER_METHOD_FRAME_BEGIN_RET_0(); + if (needReset) + pThread->InternalReset(FALSE, TRUE, TRUE, FALSE); + if (shouldAdjustWorkers) { ThreadpoolMgr::AdjustMaxWorkersActive(); tal.Release(); } - if (needReset) - pThread->InternalReset (FALSE, TRUE, TRUE, FALSE); - HELPER_METHOD_FRAME_END(); } diff --git a/src/vm/threadpoolrequest.cpp b/src/vm/threadpoolrequest.cpp index 5b970c81948f..3b5cb37c6b61 100644 --- a/src/vm/threadpoolrequest.cpp +++ b/src/vm/threadpoolrequest.cpp @@ -362,7 +362,7 @@ void UnManagedPerAppDomainTPCount::SetAppDomainRequestsActive() { WRAPPER_NO_CONTRACT; #ifndef DACCESS_COMPILE - LONG count = m_outstandingThreadRequestCount; + LONG count = VolatileLoad(&m_outstandingThreadRequestCount); while (count < (LONG)ThreadpoolMgr::NumberOfProcessors) { LONG prevCount = FastInterlockCompareExchange(&m_outstandingThreadRequestCount, count+1, count); @@ -383,7 +383,7 @@ void UnManagedPerAppDomainTPCount::SetAppDomainRequestsActive() bool FORCEINLINE UnManagedPerAppDomainTPCount::TakeActiveRequest() { LIMITED_METHOD_CONTRACT; - LONG count = m_outstandingThreadRequestCount; + LONG count = VolatileLoad(&m_outstandingThreadRequestCount); while (count > 0) { @@ -602,7 +602,7 @@ void ManagedPerAppDomainTPCount::SetAppDomainRequestsActive() _ASSERTE(m_id.m_dwId != 0); #ifndef DACCESS_COMPILE - LONG count = m_numRequestsPending; + LONG count = VolatileLoad(&m_numRequestsPending); while (count != ADUnloading) { LONG prev = FastInterlockCompareExchange(&m_numRequestsPending, count+1, count); @@ -631,12 +631,15 @@ void ManagedPerAppDomainTPCount::ClearAppDomainRequestsActive(BOOL bADU) if (bADU) { - m_numRequestsPending = ADUnloading; + VolatileStore(&m_numRequestsPending, ADUnloading); } else { - LONG count = m_numRequestsPending; - while (count != ADUnloading && count > 0) + LONG count = VolatileLoad(&m_numRequestsPending); + // Test is: count > 0 && count != ADUnloading + // Since: const ADUnloading == -1 + // Both are tested: (count > 0) means following also true (count != ADUnloading) + while (count > 0) { LONG prev = FastInterlockCompareExchange(&m_numRequestsPending, 0, count); if (prev == count) @@ -649,8 +652,11 @@ void ManagedPerAppDomainTPCount::ClearAppDomainRequestsActive(BOOL bADU) bool ManagedPerAppDomainTPCount::TakeActiveRequest() { LIMITED_METHOD_CONTRACT; - LONG count = m_numRequestsPending; - while (count != ADUnloading && count > 0) + LONG count = VolatileLoad(&m_numRequestsPending); + // Test is: count > 0 && count != ADUnloading + // Since: const ADUnloading == -1 + // Both are tested: (count > 0) means following also true (count != ADUnloading) + while (count > 0) { LONG prev = FastInterlockCompareExchange(&m_numRequestsPending, count-1, count); if (prev == count) @@ -681,7 +687,7 @@ void ManagedPerAppDomainTPCount::ClearAppDomainUnloading() // it should be, but if it's smaller then we will be permanently out of sync with the // AD. // - m_numRequestsPending = ThreadpoolMgr::NumberOfProcessors; + VolatileStore(&m_numRequestsPending, (LONG)ThreadpoolMgr::NumberOfProcessors); if (!CLRThreadpoolHosted() && ThreadpoolMgr::IsInitialized()) { ThreadpoolMgr::MaybeAddWorkingWorker(); diff --git a/src/vm/threadpoolrequest.h b/src/vm/threadpoolrequest.h index 3c332e73cd73..c1ee9e3d9d60 100644 --- a/src/vm/threadpoolrequest.h +++ b/src/vm/threadpoolrequest.h @@ -85,14 +85,16 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { inline void ResetState() { LIMITED_METHOD_CONTRACT; - m_numRequestsPending = 0; + VolatileStore(&m_numRequestsPending, (LONG)0); m_id.m_dwId = 0; } inline BOOL IsRequestPending() { LIMITED_METHOD_CONTRACT; - return m_numRequestsPending != ADUnloading && m_numRequestsPending > 0; + + LONG count = VolatileLoad(&m_numRequestsPending); + return count != ADUnloading && count > 0; } void SetAppDomainRequestsActive(); @@ -106,7 +108,7 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { //has started running yet. That implies, no requests should be pending //or dispatched to this structure yet. - _ASSERTE(m_numRequestsPending != ADUnloading); + _ASSERTE(VolatileLoad(&m_numRequestsPending) != ADUnloading); _ASSERTE(m_id.m_dwId == 0); m_id = id; @@ -119,7 +121,7 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { //has started running yet. That implies, no requests should be pending //or dispatched to this structure yet. - _ASSERTE(m_numRequestsPending != ADUnloading); + _ASSERTE(VolatileLoad(&m_numRequestsPending) != ADUnloading); _ASSERTE(m_id.m_dwId == 0); _ASSERTE(m_index.m_dwIndex == UNUSED_THREADPOOL_INDEX); @@ -135,7 +137,7 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { //added removed at this time. So, make sure that the per-appdomain structures that //have been cleared(reclaimed) don't have any pending requests to them. - _ASSERTE(m_numRequestsPending != ADUnloading); + _ASSERTE(VolatileLoad(&m_numRequestsPending) != ADUnloading); _ASSERTE(m_id.m_dwId == 0); return TRUE; @@ -159,14 +161,14 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { inline void SetAppDomainUnloading() { LIMITED_METHOD_CONTRACT; - m_numRequestsPending = ADUnloading; + VolatileStore(&m_numRequestsPending, ADUnloading); } inline void ClearAppDomainUnloading(); inline BOOL IsAppDomainUnloading() { - return m_numRequestsPending.Load() == ADUnloading; + return VolatileLoad(&m_numRequestsPending) == ADUnloading; } void DispatchWorkItem(bool* foundWork, bool* wasNotRecalled); @@ -174,7 +176,8 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { private: struct { BYTE padding1[64]; // padding to ensure own cache line - Volatile m_numRequestsPending; + // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange + LONG m_numRequestsPending; BYTE padding2[64]; // padding to ensure own cache line ADID m_id; BYTE padding3[64]; // padding to ensure own cache line @@ -217,13 +220,13 @@ class UnManagedPerAppDomainTPCount : public IPerAppDomainTPCount { { LIMITED_METHOD_CONTRACT; m_NumRequests = 0; - m_outstandingThreadRequestCount = 0; + VolatileStore(&m_outstandingThreadRequestCount, (LONG)0); } inline BOOL IsRequestPending() { LIMITED_METHOD_CONTRACT; - return m_outstandingThreadRequestCount != 0 ? TRUE : FALSE; + return VolatileLoad(&m_outstandingThreadRequestCount) != (LONG)0 ? TRUE : FALSE; } void SetAppDomainRequestsActive(); @@ -231,7 +234,7 @@ class UnManagedPerAppDomainTPCount : public IPerAppDomainTPCount { inline void ClearAppDomainRequestsActive(BOOL bADU) { LIMITED_METHOD_CONTRACT; - m_outstandingThreadRequestCount = 0; + VolatileStore(&m_outstandingThreadRequestCount, (LONG)0); } bool TakeActiveRequest(); @@ -281,7 +284,8 @@ class UnManagedPerAppDomainTPCount : public IPerAppDomainTPCount { BYTE padding1[64]; // padding to ensure own cache line ULONG m_NumRequests; BYTE padding2[64]; // padding to ensure own cache line - Volatile m_outstandingThreadRequestCount; + // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange + LONG m_outstandingThreadRequestCount; BYTE padding3[64]; // padding to ensure own cache line SpinLock m_lock; }; diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index 3696ad5f37d0..0df0f1e19ee9 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -95,9 +95,12 @@ LONG ThreadpoolMgr::cpuUtilizationAverage = 0; HillClimbing ThreadpoolMgr::HillClimbingInstance; -Volatile ThreadpoolMgr::PriorCompletedWorkRequests = 0; -Volatile ThreadpoolMgr::PriorCompletedWorkRequestsTime; -Volatile ThreadpoolMgr::NextCompletedWorkRequestsTime; +BYTE ThreadpoolMgr::padding1[64]; +LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0; +DWORD ThreadpoolMgr::PriorCompletedWorkRequestsTime; +DWORD ThreadpoolMgr::NextCompletedWorkRequestsTime; +BYTE ThreadpoolMgr::padding2[64]; + LARGE_INTEGER ThreadpoolMgr::CurrentSampleStartTime; int ThreadpoolMgr::ThreadAdjustmentInterval; @@ -1189,7 +1192,7 @@ void ThreadpoolMgr::AdjustMaxWorkersActive() DWORD currentTicks = GetTickCount(); LONG totalNumCompletions = Thread::GetTotalThreadPoolCompletionCount(); - LONG numCompletions = totalNumCompletions - PriorCompletedWorkRequests; + LONG numCompletions = totalNumCompletions - VolatileLoad(&PriorCompletedWorkRequests); LARGE_INTEGER startTime = CurrentSampleStartTime; LARGE_INTEGER endTime; @@ -1251,9 +1254,10 @@ void ThreadpoolMgr::AdjustMaxWorkersActive() } } - PriorCompletedWorkRequests = totalNumCompletions; + // Memory fences below writes + VolatileStore(&PriorCompletedWorkRequests, totalNumCompletions); PriorCompletedWorkRequestsTime = currentTicks; - NextCompletedWorkRequestsTime = PriorCompletedWorkRequestsTime + ThreadAdjustmentInterval; + NextCompletedWorkRequestsTime = currentTicks + ThreadAdjustmentInterval; CurrentSampleStartTime = endTime; } } diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h index 5574512a2532..39c53a10c4b2 100644 --- a/src/vm/win32threadpool.h +++ b/src/vm/win32threadpool.h @@ -1139,8 +1139,9 @@ class ThreadpoolMgr if (CLRThreadpoolHosted()) return false; - DWORD requiredInterval = NextCompletedWorkRequestsTime - PriorCompletedWorkRequestsTime; - DWORD elapsedInterval = GetTickCount() - PriorCompletedWorkRequestsTime; + DWORD priorTime = PriorCompletedWorkRequestsTime; + DWORD requiredInterval = VolatileLoad(&NextCompletedWorkRequestsTime) - priorTime; // fences above read + DWORD elapsedInterval = GetTickCount() - priorTime; if (elapsedInterval >= requiredInterval) { ThreadCounter::Counts counts = WorkerCounter.GetCleanCounts(); @@ -1304,9 +1305,13 @@ class ThreadpoolMgr static HillClimbing HillClimbingInstance; - static Volatile PriorCompletedWorkRequests; - static Volatile PriorCompletedWorkRequestsTime; - static Volatile NextCompletedWorkRequestsTime; + static BYTE padding1[64]; // padding to ensure own cache line + + static LONG PriorCompletedWorkRequests; + static DWORD PriorCompletedWorkRequestsTime; + static DWORD NextCompletedWorkRequestsTime; + + static BYTE padding2[64]; // padding to ensure own cache line static LARGE_INTEGER CurrentSampleStartTime; From cd0d60049463b3eb1a598976b494dfe6af8e572b Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sat, 30 Jul 2016 01:07:52 +0100 Subject: [PATCH 5/8] Align to reduce false sharing --- src/vm/threadpoolrequest.cpp | 162 ++++++++++++++++++----------------- src/vm/threadpoolrequest.h | 56 +++++------- src/vm/win32threadpool.cpp | 34 +++++--- src/vm/win32threadpool.h | 11 ++- 4 files changed, 131 insertions(+), 132 deletions(-) diff --git a/src/vm/threadpoolrequest.cpp b/src/vm/threadpoolrequest.cpp index 3b5cb37c6b61..8d47e6b810b5 100644 --- a/src/vm/threadpoolrequest.cpp +++ b/src/vm/threadpoolrequest.cpp @@ -30,16 +30,15 @@ #endif #include "appdomain.inl" -BYTE PerAppDomainTPCountList::padding1[64]; -UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount; +BYTE PerAppDomainTPCountList::s_padding[64 - sizeof(LONG)]; +// Make this point to unmanaged TP in case, no appdomains have initialized yet. +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) LONG PerAppDomainTPCountList::s_ADHint = -1; -BYTE PerAppDomainTPCountList::padding2[64]; +// Move out of from preceeding variables' cache line +DECLSPEC_ALIGN(64) UnManagedPerAppDomainTPCount PerAppDomainTPCountList::s_unmanagedTPCount; //The list of all per-appdomain work-request counts. -ArrayListStatic PerAppDomainTPCountList::s_appDomainIndexList; - -BYTE PerAppDomainTPCountList::padding3[64]; -//Make this point to unmanaged TP in case, no appdomains have initialized yet. -LONG PerAppDomainTPCountList::s_ADHint=-1; +ArrayListStatic PerAppDomainTPCountList::s_appDomainIndexList; void PerAppDomainTPCountList::InitAppDomainIndexList() { @@ -77,7 +76,14 @@ TPIndex PerAppDomainTPCountList::AddNewTPIndex() return index; } +#ifdef _MSC_VER + // Disable this warning - we intentionally want __declspec(align()) to insert trailing padding for us +#pragma warning(disable:4316) // Object allocated on the heap may not be aligned for this type. +#endif ManagedPerAppDomainTPCount * pAdCount = new ManagedPerAppDomainTPCount(index); +#ifdef _MSC_VER +#pragma warning(default:4316) // Object allocated on the heap may not be aligned for this type. +#endif pAdCount->ResetState(); IfFailThrow(s_appDomainIndexList.Append(pAdCount)); @@ -277,34 +283,33 @@ LONG PerAppDomainTPCountList::GetAppDomainIndexForThreadpoolDispatch() } CONTRACTL_END; + LONG hint = s_ADHint; + DWORD count = s_appDomainIndexList.GetCount(); IPerAppDomainTPCount * pAdCount; - DWORD DwnumADs = s_appDomainIndexList.GetCount(); DWORD Dwi; - LONG hint = s_ADHint; - LONG temphint = hint; - if (hint == -1) + if (hint != -1) { - pAdCount = &s_unmanagedTPCount; + pAdCount = dac_cast(s_appDomainIndexList.Get(hint)); } else { - pAdCount = dac_cast(s_appDomainIndexList.Get(hint)); + pAdCount = &s_unmanagedTPCount; } + //temphint ensures that the check for appdomains proceeds in a pure round robin fashion. + LONG temphint = hint; + _ASSERTE( pAdCount); if (pAdCount->TakeActiveRequest()) goto HintDone; - //temphint ensures that the check for appdomains proceeds in a pure round robin fashion. - temphint = hint; - //If there is no work in any appdomains, check the unmanaged queue, hint = -1; - for (Dwi=0;DwiIsAbortRequested()) - { - // thread was aborted, and may not have had a chance to tell us it has work. - ENTER_DOMAIN_ID(m_id) + ManagedThreadBase::ThreadPool(appDomainId, QueueUserWorkItemManagedCallback, wasNotRecalled); + } + + if (pThread->IsAbortRequested()) { - ThreadpoolMgr::SetAppDomainRequestsActive(); - ThreadpoolMgr::QueueUserWorkItem(NULL, - NULL, - 0, - FALSE); + // thread was aborted, and may not have had a chance to tell us it has work. + ENTER_DOMAIN_ID(m_id) + { + ThreadpoolMgr::SetAppDomainRequestsActive(); + ThreadpoolMgr::QueueUserWorkItem(NULL, + NULL, + 0, + FALSE); + } + END_DOMAIN_TRANSITION; } - END_DOMAIN_TRANSITION; } - } - // We should have released all locks. - _ASSERTE(g_fEEShutDown || pThread->m_dwLockCount == 0 || pThread->m_fRudeAborted); - - END_SO_INTOLERANT_CODE; + // We should have released all locks. + _ASSERTE(g_fEEShutDown || pThread->m_dwLockCount == 0 || pThread->m_fRudeAborted); + + END_SO_INTOLERANT_CODE; - *foundWork = true; + *foundWork = true; + } + else + { + __SwitchToThread(0, CALLER_LIMITS_SPINNING); + return; + } } #endif // !DACCESS_COMPILE diff --git a/src/vm/threadpoolrequest.h b/src/vm/threadpoolrequest.h index c1ee9e3d9d60..af381c02fc0b 100644 --- a/src/vm/threadpoolrequest.h +++ b/src/vm/threadpoolrequest.h @@ -175,14 +175,17 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { private: struct { - BYTE padding1[64]; // padding to ensure own cache line - // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange - LONG m_numRequestsPending; - BYTE padding2[64]; // padding to ensure own cache line ADID m_id; - BYTE padding3[64]; // padding to ensure own cache line TPIndex m_index; }; + struct { + DECLSPEC_ALIGN(64) struct { + BYTE padding1[64 - sizeof(LONG)]; + // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange + LONG m_numRequestsPending; + BYTE padding2[64]; + }; + }; }; //-------------------------------------------------------------------------- @@ -281,13 +284,16 @@ class UnManagedPerAppDomainTPCount : public IPerAppDomainTPCount { private: struct { - BYTE padding1[64]; // padding to ensure own cache line - ULONG m_NumRequests; - BYTE padding2[64]; // padding to ensure own cache line - // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange - LONG m_outstandingThreadRequestCount; - BYTE padding3[64]; // padding to ensure own cache line SpinLock m_lock; + ULONG m_NumRequests; + }; + struct { + DECLSPEC_ALIGN(64) struct { + BYTE padding1[64 - sizeof(LONG)]; + // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange + LONG m_outstandingThreadRequestCount; + BYTE padding2[64]; + }; }; }; @@ -344,32 +350,12 @@ class PerAppDomainTPCountList{ private: static DWORD FindFirstFreeTpEntry(); - static BYTE padding1[64]; // padding to ensure own cache line + static BYTE s_padding[64 - sizeof(LONG)]; + static LONG s_ADHint; static UnManagedPerAppDomainTPCount s_unmanagedTPCount; - static BYTE padding2[64]; // padding to ensure own cache line - //The list of all per-appdomain work-request counts. + //The list of all per-appdomain work-request counts. static ArrayListStatic s_appDomainIndexList; - - static BYTE padding3[64]; // padding to ensure own cache line - static LONG s_ADHint; }; -#endif //_THREADPOOL_REQUEST_H - - - - - - - - - - - - - - - - - +#endif //_THREADPOOL_REQUEST_H \ No newline at end of file diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index 0df0f1e19ee9..7219bdaf61e7 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -85,7 +85,8 @@ SVAL_IMPL(LONG,ThreadpoolMgr,MaxFreeCPThreads); // = MaxFreeCP Volatile ThreadpoolMgr::NumCPInfrastructureThreads = 0; // number of threads currently busy handling draining cycle -SVAL_IMPL(ThreadpoolMgr::ThreadCounter, ThreadpoolMgr, WorkerCounter); +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) SVAL_IMPL(ThreadpoolMgr::ThreadCounter, ThreadpoolMgr, WorkerCounter); SVAL_IMPL(LONG,ThreadpoolMgr,MinLimitTotalWorkerThreads); // = MaxLimitCPThreadsPerCPU * number of CPUS SVAL_IMPL(LONG,ThreadpoolMgr,MaxLimitTotalWorkerThreads); // = MaxLimitCPThreadsPerCPU * number of CPUS @@ -95,11 +96,10 @@ LONG ThreadpoolMgr::cpuUtilizationAverage = 0; HillClimbing ThreadpoolMgr::HillClimbingInstance; -BYTE ThreadpoolMgr::padding1[64]; -LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0; +// Cacheline aligned, 3 hot variables updated in a group +DECLSPEC_ALIGN(64) LONG ThreadpoolMgr::PriorCompletedWorkRequests = 0; DWORD ThreadpoolMgr::PriorCompletedWorkRequestsTime; DWORD ThreadpoolMgr::NextCompletedWorkRequestsTime; -BYTE ThreadpoolMgr::padding2[64]; LARGE_INTEGER ThreadpoolMgr::CurrentSampleStartTime; @@ -114,8 +114,12 @@ int ThreadpoolMgr::ThreadAdjustmentInterval; #define SUSPEND_TIME GATE_THREAD_DELAY+100 // milliseconds to suspend during SuspendProcessing LONG ThreadpoolMgr::Initialization=0; // indicator of whether the threadpool is initialized. -Volatile ThreadpoolMgr::LastDequeueTime; // used to determine if work items are getting thread starved -int ThreadpoolMgr::offset_counter = 0; + +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) unsigned int ThreadpoolMgr::LastDequeueTime; // used to determine if work items are getting thread starved + +// Move out of from preceeding variables' cache line +DECLSPEC_ALIGN(64) int ThreadpoolMgr::offset_counter = 0; SPTR_IMPL(WorkRequest,ThreadpoolMgr,WorkRequestHead); // Head of work request queue SPTR_IMPL(WorkRequest,ThreadpoolMgr,WorkRequestTail); // Head of work request queue @@ -138,15 +142,19 @@ CLRSemaphore* ThreadpoolMgr::RetiredWorkerSemaphore; CrstStatic ThreadpoolMgr::TimerQueueCriticalSection; HANDLE ThreadpoolMgr::TimerThread=NULL; Thread *ThreadpoolMgr::pTimerThread=NULL; -DWORD ThreadpoolMgr::LastTickCount; + +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) DWORD ThreadpoolMgr::LastTickCount; #ifdef _DEBUG DWORD ThreadpoolMgr::TickCountAdjustment=0; #endif -LONG ThreadpoolMgr::GateThreadStatus=GATE_THREAD_STATUS_NOT_RUNNING; +// Cacheline aligned, hot variable +DECLSPEC_ALIGN(64) LONG ThreadpoolMgr::GateThreadStatus=GATE_THREAD_STATUS_NOT_RUNNING; -ThreadpoolMgr::RecycledListsWrapper ThreadpoolMgr::RecycledLists; +// Move out of from preceeding variables' cache line +DECLSPEC_ALIGN(64) ThreadpoolMgr::RecycledListsWrapper ThreadpoolMgr::RecycledLists; ThreadpoolMgr::TimerInfo *ThreadpoolMgr::TimerInfosToBeRecycled = NULL; @@ -4818,7 +4826,7 @@ BOOL ThreadpoolMgr::SufficientDelaySinceLastDequeue() #define DEQUEUE_DELAY_THRESHOLD (GATE_THREAD_DELAY * 2) - unsigned delay = GetTickCount() - LastDequeueTime; + unsigned delay = GetTickCount() - VolatileLoad(&LastDequeueTime); unsigned tooLong; if(cpuUtilization < CpuUtilizationLow) @@ -4979,8 +4987,10 @@ DWORD __stdcall ThreadpoolMgr::TimerThreadStart(LPVOID p) if (pThread == NULL) return 0; - pTimerThread = pThread; - // Timer threads never die + if (pTimerThread != pThread) { + pTimerThread = pThread; + // Timer threads never die + } LastTickCount = GetTickCount(); diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h index 39c53a10c4b2..a7ab5a714548 100644 --- a/src/vm/win32threadpool.h +++ b/src/vm/win32threadpool.h @@ -373,6 +373,9 @@ class ThreadpoolMgr } counts; + // padding to ensure we get our own cache line + BYTE padding[64]; + Counts GetCleanCounts() { #ifdef _WIN64 @@ -529,7 +532,7 @@ class ThreadpoolMgr static inline void UpdateLastDequeueTime() { LIMITED_METHOD_CONTRACT; - LastDequeueTime = GetTickCount(); + VolatileStore(&LastDequeueTime, (unsigned int)GetTickCount()); } static BOOL CreateTimerQueueTimer(PHANDLE phNewTimer, @@ -1301,18 +1304,14 @@ class ThreadpoolMgr SVAL_DECL(LONG,MinLimitTotalWorkerThreads); // same as MinLimitTotalCPThreads SVAL_DECL(LONG,MaxLimitTotalWorkerThreads); // same as MaxLimitTotalCPThreads - static Volatile LastDequeueTime; // used to determine if work items are getting thread starved + static unsigned int LastDequeueTime; // used to determine if work items are getting thread starved static HillClimbing HillClimbingInstance; - static BYTE padding1[64]; // padding to ensure own cache line - static LONG PriorCompletedWorkRequests; static DWORD PriorCompletedWorkRequestsTime; static DWORD NextCompletedWorkRequestsTime; - static BYTE padding2[64]; // padding to ensure own cache line - static LARGE_INTEGER CurrentSampleStartTime; static int ThreadAdjustmentInterval; From 2e0332af6a6889e350c229dac5293b9e3d9b60a3 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Tue, 9 Aug 2016 21:40:09 +0100 Subject: [PATCH 6/8] PR feedback --- src/vm/threadpoolrequest.h | 36 ++++++++++++++---------------------- src/vm/win32threadpool.cpp | 6 ++---- src/vm/win32threadpool.h | 2 +- 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/src/vm/threadpoolrequest.h b/src/vm/threadpoolrequest.h index af381c02fc0b..07bcbb7dbad0 100644 --- a/src/vm/threadpoolrequest.h +++ b/src/vm/threadpoolrequest.h @@ -174,17 +174,13 @@ class ManagedPerAppDomainTPCount : public IPerAppDomainTPCount { void DispatchWorkItem(bool* foundWork, bool* wasNotRecalled); private: - struct { - ADID m_id; - TPIndex m_index; - }; - struct { - DECLSPEC_ALIGN(64) struct { - BYTE padding1[64 - sizeof(LONG)]; - // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange - LONG m_numRequestsPending; - BYTE padding2[64]; - }; + ADID m_id; + TPIndex m_index; + DECLSPEC_ALIGN(64) struct { + BYTE m_padding1[64 - sizeof(LONG)]; + // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange + LONG m_numRequestsPending; + BYTE m_padding2[64]; }; }; @@ -283,17 +279,13 @@ class UnManagedPerAppDomainTPCount : public IPerAppDomainTPCount { } private: - struct { - SpinLock m_lock; - ULONG m_NumRequests; - }; - struct { - DECLSPEC_ALIGN(64) struct { - BYTE padding1[64 - sizeof(LONG)]; - // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange - LONG m_outstandingThreadRequestCount; - BYTE padding2[64]; - }; + SpinLock m_lock; + ULONG m_NumRequests; + DECLSPEC_ALIGN(64) struct { + BYTE m_padding1[64 - sizeof(LONG)]; + // Only use with VolatileLoad+VolatileStore+FastInterlockCompareExchange + LONG m_outstandingThreadRequestCount; + BYTE m_padding2[64]; }; }; diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index 7219bdaf61e7..cc58cead3157 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -4987,10 +4987,8 @@ DWORD __stdcall ThreadpoolMgr::TimerThreadStart(LPVOID p) if (pThread == NULL) return 0; - if (pTimerThread != pThread) { - pTimerThread = pThread; - // Timer threads never die - } + pTimerThread = pThread; + // Timer threads never die LastTickCount = GetTickCount(); diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h index a7ab5a714548..0ca01f8b650b 100644 --- a/src/vm/win32threadpool.h +++ b/src/vm/win32threadpool.h @@ -378,12 +378,12 @@ class ThreadpoolMgr Counts GetCleanCounts() { + LIMITED_METHOD_CONTRACT; #ifdef _WIN64 // VolatileLoad x64 bit read is atomic return DangerousGetDirtyCounts(); #else // !_WIN64 // VolatileLoad may result in torn read - LIMITED_METHOD_CONTRACT; Counts result; #ifndef DACCESS_COMPILE result.AsLongLong = FastInterlockCompareExchangeLong(&counts.AsLongLong, 0, 0); From 519dcde5a6c4fe999e08219e7edd7c9f90e6c96e Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Wed, 10 Aug 2016 00:15:18 +0100 Subject: [PATCH 7/8] Insert MemoryBarrier revert Comthreadpool --- src/vm/comthreadpool.cpp | 7 +++---- src/vm/win32threadpool.cpp | 4 ++-- src/vm/win32threadpool.h | 3 ++- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/vm/comthreadpool.cpp b/src/vm/comthreadpool.cpp index 8231214a20eb..7f629b508b32 100644 --- a/src/vm/comthreadpool.cpp +++ b/src/vm/comthreadpool.cpp @@ -251,7 +251,6 @@ FCIMPL0(FC_BOOL_RET, ThreadPoolNative::NotifyRequestComplete) pThread->HasCriticalRegion() || pThread->HasThreadAffinity(); - // Read fenced call bool shouldAdjustWorkers = ThreadpoolMgr::ShouldAdjustMaxWorkersActive(); // @@ -267,15 +266,15 @@ FCIMPL0(FC_BOOL_RET, ThreadPoolNative::NotifyRequestComplete) { HELPER_METHOD_FRAME_BEGIN_RET_0(); - if (needReset) - pThread->InternalReset(FALSE, TRUE, TRUE, FALSE); - if (shouldAdjustWorkers) { ThreadpoolMgr::AdjustMaxWorkersActive(); tal.Release(); } + if (needReset) + pThread->InternalReset(FALSE, TRUE, TRUE, FALSE); + HELPER_METHOD_FRAME_END(); } diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index cc58cead3157..2ec7deded47a 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -1262,11 +1262,11 @@ void ThreadpoolMgr::AdjustMaxWorkersActive() } } - // Memory fences below writes - VolatileStore(&PriorCompletedWorkRequests, totalNumCompletions); + PriorCompletedWorkRequests = totalNumCompletions; PriorCompletedWorkRequestsTime = currentTicks; NextCompletedWorkRequestsTime = currentTicks + ThreadAdjustmentInterval; CurrentSampleStartTime = endTime; + MemoryBarrier(); } } diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h index 0ca01f8b650b..50eb708c394b 100644 --- a/src/vm/win32threadpool.h +++ b/src/vm/win32threadpool.h @@ -1142,8 +1142,9 @@ class ThreadpoolMgr if (CLRThreadpoolHosted()) return false; + MemoryBarrier(); DWORD priorTime = PriorCompletedWorkRequestsTime; - DWORD requiredInterval = VolatileLoad(&NextCompletedWorkRequestsTime) - priorTime; // fences above read + DWORD requiredInterval = NextCompletedWorkRequestsTime - priorTime; // fences above read DWORD elapsedInterval = GetTickCount() - priorTime; if (elapsedInterval >= requiredInterval) { From a0597dae2368fecea16c21f2d9f255476c330818 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Wed, 10 Aug 2016 01:10:43 +0100 Subject: [PATCH 8/8] Fix MemoryBarrier --- src/vm/win32threadpool.cpp | 6 +++--- src/vm/win32threadpool.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/vm/win32threadpool.cpp b/src/vm/win32threadpool.cpp index 2ec7deded47a..23ed9116114d 100644 --- a/src/vm/win32threadpool.cpp +++ b/src/vm/win32threadpool.cpp @@ -1263,10 +1263,10 @@ void ThreadpoolMgr::AdjustMaxWorkersActive() } PriorCompletedWorkRequests = totalNumCompletions; - PriorCompletedWorkRequestsTime = currentTicks; NextCompletedWorkRequestsTime = currentTicks + ThreadAdjustmentInterval; - CurrentSampleStartTime = endTime; - MemoryBarrier(); + MemoryBarrier(); // flush previous writes (especially NextCompletedWorkRequestsTime) + PriorCompletedWorkRequestsTime = currentTicks; + CurrentSampleStartTime = endTime;; } } diff --git a/src/vm/win32threadpool.h b/src/vm/win32threadpool.h index 50eb708c394b..45880d0b0f4c 100644 --- a/src/vm/win32threadpool.h +++ b/src/vm/win32threadpool.h @@ -1142,9 +1142,9 @@ class ThreadpoolMgr if (CLRThreadpoolHosted()) return false; - MemoryBarrier(); - DWORD priorTime = PriorCompletedWorkRequestsTime; - DWORD requiredInterval = NextCompletedWorkRequestsTime - priorTime; // fences above read + DWORD priorTime = PriorCompletedWorkRequestsTime; + MemoryBarrier(); // read fresh value for NextCompletedWorkRequestsTime below + DWORD requiredInterval = NextCompletedWorkRequestsTime - priorTime; DWORD elapsedInterval = GetTickCount() - priorTime; if (elapsedInterval >= requiredInterval) {