From 776053faf760acd17cedb51838191ba9f80a6244 Mon Sep 17 00:00:00 2001
From: Stephen Toub
Date: Tue, 20 Jul 2021 18:06:25 -0400
Subject: [PATCH] Revamp caching scheme in PoolingAsyncValueTaskMethodBuilder (#55955)

* Revamp caching scheme in PoolingAsyncValueTaskMethodBuilder

The current scheme caches one instance per thread in a ThreadStatic, and then has a locked stack that all threads contend on; to avoid blocking a thread while accessing the cache, locking is done with TryEnter rather than Enter, simply skipping the cache if there is any contention. The locked stack is capped by default at ProcessorCount*4 objects.

The new scheme is simpler: one instance per thread, one instance per core. This ends up meaning fewer objects may be cached, but it also almost entirely eliminates contention between threads trying to rent/return objects. As a result, under heavy load it can actually do a better job of using pooled objects, as it doesn't bail on using the cache in the face of contention. It also reduces concerns about larger machines being more negatively impacted by the caching. Under lighter load, since we don't cache as many objects, it does mean we may end up allocating a bit more, but generally not much more (and the object we do allocate is one reference field smaller). (A simplified sketch of this rent/return pattern appears after the patch.)

* Address PR feedback
---
 .../src/Internal/Padding.cs                   |   8 ++
 .../PoolingAsyncValueTaskMethodBuilder.cs     |   8 --
 .../PoolingAsyncValueTaskMethodBuilderT.cs    | 133 +++++++-----
 .../Sources/ManualResetValueTaskSourceCore.cs |  17 ++-
 4 files changed, 73 insertions(+), 93 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/Internal/Padding.cs b/src/libraries/System.Private.CoreLib/src/Internal/Padding.cs
index c9d1efe78389c..17f3643be3e46 100644
--- a/src/libraries/System.Private.CoreLib/src/Internal/Padding.cs
+++ b/src/libraries/System.Private.CoreLib/src/Internal/Padding.cs
@@ -21,4 +21,12 @@ internal static class PaddingHelpers
     internal struct PaddingFor32
     {
     }
+
+    /// <summary>Padded reference to an object.</summary>
+    [StructLayout(LayoutKind.Explicit, Size = PaddingHelpers.CACHE_LINE_SIZE)]
+    internal struct PaddedReference
+    {
+        [FieldOffset(0)]
+        public object? Object;
+    }
 }
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/CompilerServices/PoolingAsyncValueTaskMethodBuilder.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/CompilerServices/PoolingAsyncValueTaskMethodBuilder.cs
index 84996a0e062fd..9210434e90a16 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/CompilerServices/PoolingAsyncValueTaskMethodBuilder.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/CompilerServices/PoolingAsyncValueTaskMethodBuilder.cs
@@ -1,10 +1,8 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-using System.Globalization;
 using System.Runtime.InteropServices;
 using System.Threading.Tasks;
-
 using StateMachineBox = System.Runtime.CompilerServices.PoolingAsyncValueTaskMethodBuilder<System.Threading.Tasks.VoidTaskResult>.StateMachineBox;
 
 namespace System.Runtime.CompilerServices
@@ -13,12 +11,6 @@ namespace System.Runtime.CompilerServices
     [StructLayout(LayoutKind.Auto)]
     public struct PoolingAsyncValueTaskMethodBuilder
     {
-        /// <summary>Maximum number of boxes that are allowed to be cached per state machine type.</summary>
-        internal static readonly int s_valueTaskPoolingCacheSize =
-            int.TryParse(Environment.GetEnvironmentVariable("DOTNET_SYSTEM_THREADING_POOLINGASYNCVALUETASKSCACHESIZE"), NumberStyles.Integer, CultureInfo.InvariantCulture, out int result) && result > 0 ?
-                result :
-                Environment.ProcessorCount * 4; // arbitrary default value
-
         /// <summary>Sentinel object used to indicate that the builder completed synchronously and successfully.</summary>
         private static readonly StateMachineBox s_syncSuccessSentinel = PoolingAsyncValueTaskMethodBuilder<VoidTaskResult>.s_syncSuccessSentinel;
 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/CompilerServices/PoolingAsyncValueTaskMethodBuilderT.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/CompilerServices/PoolingAsyncValueTaskMethodBuilderT.cs
index 02d15bb92c3b9..a69dbc041216d 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/CompilerServices/PoolingAsyncValueTaskMethodBuilderT.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/CompilerServices/PoolingAsyncValueTaskMethodBuilderT.cs
@@ -7,6 +7,7 @@
 using System.Threading;
 using System.Threading.Tasks;
 using System.Threading.Tasks.Sources;
+using Internal;
 using Internal.Runtime.CompilerServices;
 
 namespace System.Runtime.CompilerServices
@@ -209,7 +210,7 @@ private static IAsyncStateMachineBox GetStateMachineBox<TStateMachine>(
             // cases is we lose the ability to properly step in the debugger, as the debugger uses that
             // object's identity to track this specific builder/state machine. As such, we proceed to
             // overwrite whatever's there anyway, even if it's non-null.
-            StateMachineBox<TStateMachine> box = StateMachineBox<TStateMachine>.GetOrCreateBox();
+            StateMachineBox<TStateMachine> box = StateMachineBox<TStateMachine>.RentFromCache();
             boxFieldRef = box; // important: this must be done before storing stateMachine into box.StateMachine!
             box.StateMachine = stateMachine;
             box.Context = currentContext;
@@ -284,114 +285,86 @@ private sealed class StateMachineBox<TStateMachine> :
         {
             /// <summary>Delegate used to invoke on an ExecutionContext when passed an instance of this box type.</summary>
             private static readonly ContextCallback s_callback = ExecutionContextCallback;
+            /// <summary>Per-core cache of boxes, with one box per core.</summary>
+            /// <remarks>Each element is padded to expected cache-line size so as to minimize false sharing.</remarks>
+            private static readonly PaddedReference[] s_perCoreCache = new PaddedReference[Environment.ProcessorCount];
             /// <summary>Thread-local cache of boxes. This currently only ever stores one.</summary>
             [ThreadStatic]
             private static StateMachineBox<TStateMachine>? t_tlsCache;
-            /// <summary>Lock used to protected the shared cache of boxes. 1 == held, 0 == not held.</summary>
-            /// <remarks>The code that uses this assumes a runtime without thread aborts.</remarks>
-            private static int s_cacheLock;
-            /// <summary>Singly-linked list cache of boxes.</summary>
-            private static StateMachineBox<TStateMachine>? s_cache;
-            /// <summary>The number of items stored in <see cref="s_cache"/>.</summary>
-            private static int s_cacheSize;
-
-            /// <summary>If this box is stored in the cache, the next box in the cache.</summary>
-            private StateMachineBox<TStateMachine>? _next;
+
             /// <summary>The state machine itself.</summary>
             public TStateMachine? StateMachine;
 
             /// <summary>Gets a box object to use for an operation. This may be a reused, pooled object, or it may be new.</summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)] // only one caller
-            internal static StateMachineBox<TStateMachine> GetOrCreateBox()
+            internal static StateMachineBox<TStateMachine> RentFromCache()
             {
-                StateMachineBox<TStateMachine>? box;
-
-                // First see if the thread-static cache of at most one box has one.
-                box = t_tlsCache;
+                // First try to get a box from the per-thread cache.
+                StateMachineBox<TStateMachine>? box = t_tlsCache;
                 if (box is not null)
                 {
                     t_tlsCache = null;
-                    return box;
                 }
-
-                // Try to acquire the lock to access the cache. If there's any contention, don't use the cache.
-                if (s_cache is not null && // hot read just to see if there's any point paying for the interlocked
-                    Interlocked.Exchange(ref s_cacheLock, 1) == 0)
+                else
                 {
-                    // If there are any instances cached, take one from the cache stack and use it.
-                    box = s_cache;
-                    if (box is not null)
+                    // If we can't, then try to get a box from the per-core cache.
+                    ref StateMachineBox<TStateMachine>? slot = ref PerCoreCacheSlot;
+                    if (slot is null ||
+                        (box = Interlocked.Exchange<StateMachineBox<TStateMachine>?>(ref slot, null)) is null)
                     {
-                        s_cache = box._next;
-                        box._next = null;
-                        s_cacheSize--;
-                        Debug.Assert(s_cacheSize >= 0, "Expected the cache size to be non-negative.");
-
-                        // Release the lock and return the box.
-                        Volatile.Write(ref s_cacheLock, 0);
-                        return box;
+                        // If we can't, just create a new one.
+                        box = new StateMachineBox<TStateMachine>();
                     }
-
-                    // No objects were cached. We'll just create a new instance.
-                    Debug.Assert(s_cacheSize == 0, "Expected cache size to be 0.");
-
-                    // Release the lock.
-                    Volatile.Write(ref s_cacheLock, 0);
                 }
 
-                // Couldn't quickly get a cached instance, so create a new instance.
-                return new StateMachineBox<TStateMachine>();
+                return box;
             }
 
-            /// <summary>Returns this instance to the cache, or drops it if the cache is full or this instance shouldn't be cached.</summary>
-            private void ReturnOrDropBox()
+            /// <summary>Returns this instance to the cache.</summary>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)] // only two callers
+            private void ReturnToCache()
             {
-                Debug.Assert(_next is null, "Expected box to not be part of cached list.");
-
                 // Clear out the state machine and associated context to avoid keeping arbitrary state referenced by
-                // lifted locals. We want to do this regardless of whether we end up caching the box or not, in case
-                // the caller keeps the box alive for an arbitrary period of time.
+                // lifted locals, and reset the instance for another await.
                 ClearStateUponCompletion();
-
-                // Reset the MRVTSC. We can either do this here, in which case we may be paying the (small) overhead
-                // to reset the box even if we're going to drop it, or we could do it while holding the lock, in which
-                // case we'll only reset it if necessary but causing the lock to be held for longer, thereby causing
-                // more contention. For now at least, we do it outside of the lock. (This must not be done after
-                // the lock is released, since at that point the instance could already be in use elsewhere.)
-                // We also want to increment the version number even if we're going to drop it, to maximize the chances
-                // that incorrectly double-awaiting a ValueTask will produce an error.
                 _valueTaskSource.Reset();
 
-                // If reusing the object would result in potentially wrapping around its version number, just throw it away.
-                // This provides a modicum of additional safety when ValueTasks are misused (helping to avoid the case where
-                // a ValueTask is illegally re-awaited and happens to do so at exactly 2^16 uses later on this exact same instance),
-                // at the expense of potentially incurring an additional allocation every 65K uses.
-                if ((ushort)_valueTaskSource.Version == ushort.MaxValue)
-                {
-                    return;
-                }
-
-                // If the thread static cache is empty, store this into it and bail.
+                // If the per-thread cache is empty, store this into it.
                 if (t_tlsCache is null)
                 {
                     t_tlsCache = this;
-                    return;
                 }
-
-                // Try to acquire the cache lock. If there's any contention, or if the cache is full, we just throw away the object.
-                if (Interlocked.Exchange(ref s_cacheLock, 1) == 0)
+                else
                 {
-                    if (s_cacheSize < PoolingAsyncValueTaskMethodBuilder.s_valueTaskPoolingCacheSize)
+                    // Otherwise, store it into the per-core cache.
+                    ref StateMachineBox<TStateMachine>? slot = ref PerCoreCacheSlot;
+                    if (slot is null)
                     {
-                        // Push the box onto the cache stack for subsequent reuse.
-                        _next = s_cache;
-                        s_cache = this;
-                        s_cacheSize++;
-                        Debug.Assert(s_cacheSize > 0 && s_cacheSize <= PoolingAsyncValueTaskMethodBuilder.s_valueTaskPoolingCacheSize, "Expected cache size to be within bounds.");
+                        // Try to avoid the write if we know the slot isn't empty (we may still have a benign race condition and
+                        // overwrite what's there if something arrived in the interim).
+                        Volatile.Write(ref slot, this);
                     }
+                }
+            }
 
-                    // Release the lock.
-                    Volatile.Write(ref s_cacheLock, 0);
+            /// <summary>Gets the slot in <see cref="s_perCoreCache"/> for the current core.</summary>
+            private static ref StateMachineBox<TStateMachine>? PerCoreCacheSlot
+            {
+                [MethodImpl(MethodImplOptions.AggressiveInlining)] // only two callers are RentFrom/ReturnToCache
+                get
+                {
+                    // Get the current processor ID. We need to ensure it fits within s_perCoreCache, so we
+                    // could % by its length, but we can do so instead by Environment.ProcessorCount, which will be a const
+                    // in tier 1, allowing better code gen, and then further use uints for even better code gen.
+                    Debug.Assert(s_perCoreCache.Length == Environment.ProcessorCount);
+                    int i = (int)((uint)Thread.GetCurrentProcessorId() % (uint)Environment.ProcessorCount);
+
+                    // We want an array of StateMachineBox<> objects, each consuming its own cache line so that
+                    // elements don't cause false sharing with each other. But we can't use StructLayout.Explicit
+                    // with generics. So we use object fields, but always reinterpret them (for all reads and writes
+                    // to avoid any safety issues) as StateMachineBox<> instances.
+                    Debug.Assert(s_perCoreCache[i].Object is null || s_perCoreCache[i].Object is StateMachineBox<TStateMachine>);
+                    return ref Unsafe.As<object?, StateMachineBox<TStateMachine>?>(ref s_perCoreCache[i].Object);
                 }
             }
 
@@ -429,7 +402,7 @@ public void MoveNext()
 
                 if (context is null)
                 {
-                    Debug.Assert(!(StateMachine is null));
+                    Debug.Assert(StateMachine is not null);
                     StateMachine.MoveNext();
                 }
                 else
@@ -447,8 +420,7 @@ TResult IValueTaskSource<TResult>.GetResult(short token)
                 }
                 finally
                 {
-                    // Reuse this instance if possible, otherwise clear and drop it.
-                    ReturnOrDropBox();
+                    ReturnToCache();
                 }
             }
 
@@ -461,8 +433,7 @@ void IValueTaskSource.GetResult(short token)
                 }
                 finally
                 {
-                    // Reuse this instance if possible, otherwise clear and drop it.
-                    ReturnOrDropBox();
+                    ReturnToCache();
                 }
             }
 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/Tasks/Sources/ManualResetValueTaskSourceCore.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/Tasks/Sources/ManualResetValueTaskSourceCore.cs
index 846e755904440..692f2fa9da824 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Threading/Tasks/Sources/ManualResetValueTaskSourceCore.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Threading/Tasks/Sources/ManualResetValueTaskSourceCore.cs
@@ -2,7 +2,6 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Diagnostics;
-using System.Diagnostics.CodeAnalysis;
 using System.Runtime.ExceptionServices;
 using System.Runtime.InteropServices;
 
@@ -91,14 +90,24 @@ public ValueTaskSourceStatus GetStatus(short token)
         [StackTraceHidden]
         public TResult GetResult(short token)
         {
-            ValidateToken(token);
-            if (!_completed)
+            if (token != _version || !_completed || _error is not null)
+            {
+                ThrowForFailedGetResult(token);
+            }
+
+            return _result!;
+        }
+
+        [StackTraceHidden]
+        private void ThrowForFailedGetResult(short token)
+        {
+            if (token != _version || !_completed)
             {
                 ThrowHelper.ThrowInvalidOperationException();
             }
 
             _error?.Throw();
-            return _result!;
+            Debug.Fail("Should not get here");
         }
 
         /// <summary>Schedules the continuation action for this operation.</summary>
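The following is a minimal, self-contained sketch of the per-thread plus per-core rent/return pattern described in the commit message. The ObjectCache type and its member names are hypothetical illustrations and are not part of the patch, and the sketch deliberately omits the cache-line padding (PaddedReference) and the Unsafe.As reinterpretation that the real StateMachineBox<TStateMachine> cache uses; it shows the caching policy, not a drop-in equivalent of the patched code.

using System;
using System.Threading;

// Illustrative sketch only: a simplified two-level cache in the spirit of the new scheme.
// One slot per thread (no synchronization needed) plus one slot per core
// (accessed with Interlocked/Volatile so racing renters and returners stay safe).
internal static class ObjectCache<T> where T : class, new()
{
    // Per-thread slot: rent/return on the same thread never contends.
    [ThreadStatic]
    private static T? t_threadLocal;

    // Per-core slots: contention is limited to threads racing on the same core's slot.
    private static readonly T?[] s_perCore = new T?[Environment.ProcessorCount];

    public static T Rent()
    {
        // 1. Prefer the per-thread slot.
        T? obj = t_threadLocal;
        if (obj is not null)
        {
            t_threadLocal = null;
            return obj;
        }

        // 2. Fall back to the current core's slot; Exchange ensures two racing
        //    renters can't both take the same instance.
        ref T? slot = ref s_perCore[PerCoreIndex()];
        if (slot is not null && (obj = Interlocked.Exchange(ref slot, null)) is not null)
        {
            return obj;
        }

        // 3. Nothing cached: allocate a new instance.
        return new T();
    }

    public static void Return(T obj)
    {
        // Mirror of Rent: per-thread slot first, then the per-core slot.
        if (t_threadLocal is null)
        {
            t_threadLocal = obj;
            return;
        }

        ref T? slot = ref s_perCore[PerCoreIndex()];
        if (slot is null)
        {
            // Benign race: a concurrent Return may overwrite this and drop one object.
            Volatile.Write(ref slot, obj);
        }
        // Both slots occupied: drop the object and let the GC reclaim it.
    }

    private static int PerCoreIndex() =>
        (int)((uint)Thread.GetCurrentProcessorId() % (uint)Environment.ProcessorCount);
}

Usage would look like var item = ObjectCache<SomeReusableType>.Rent(); followed later by ObjectCache<SomeReusableType>.Return(item);, where SomeReusableType stands in for any class with a parameterless constructor. Compared with the removed locked stack, the worst case here is dropping an object when both slots are already occupied, trading an occasional extra allocation for the absence of any cross-thread lock.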