Skip to content
This repository has been archived by the owner on Nov 1, 2020. It is now read-only.

Improve thread pool performance on Unix #6955

Merged
merged 21 commits into from
Feb 15, 2019
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
3a27429
Update LowLevelLifoSemaphore to more closely mimic CoreCLR.
filipnavara Feb 5, 2019
5e472b7
Add cache line separation
filipnavara Feb 6, 2019
938f134
Remove support for uninterruptible and/or prioritized waits in WaitSu…
filipnavara Feb 6, 2019
0dc3770
Remove obsolete comment
filipnavara Feb 7, 2019
41f83b9
Rename Wake/WaitForWake to ReleaseCore/WaitCore
filipnavara Feb 7, 2019
265f41a
Rename Counts.ExchangeSubtract to Subtract to reflect the semantics
filipnavara Feb 7, 2019
6b34732
Add missing sealed keyword and Assert
filipnavara Feb 7, 2019
9d2fad4
Make thread pool worker spin count configurable
filipnavara Feb 7, 2019
579bd96
Replicate the spinning algorithm from CLRLifoSemaphore
filipnavara Feb 7, 2019
750db85
Update Counts to use instance methods, add Assert for spinCount >= 0
filipnavara Feb 11, 2019
e7afe31
Revert "Remove support for uninterruptible and/or prioritized waits i…
filipnavara Feb 11, 2019
ddfb816
Switch LowLevelLifoSemaphore.Unix back to WaitSystem implementation
filipnavara Feb 11, 2019
81dc860
Remove obsolete internal API
filipnavara Feb 11, 2019
e60f673
Add comment
filipnavara Feb 11, 2019
2aca3bd
Fix asserts
filipnavara Feb 11, 2019
05e2183
Move more of the spinning logic to LowLevelSpinWaiter
filipnavara Feb 11, 2019
9a3678e
Remove obsolete reference to LowLevelLinq
filipnavara Feb 11, 2019
9435818
Replace SpinYieldThreshold with SpinSleep0Threshold
filipnavara Feb 14, 2019
4678e25
Update Asserts based on feedback
filipnavara Feb 14, 2019
cd0979d
Reorganize the uniprocessor spinning code paths
filipnavara Feb 14, 2019
2a109f3
Use Runtime.RuntimeImports.RhGetProcessCpuCount in LowLevelSpinWaiter…
filipnavara Feb 15, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ private bool JoinInternal(int millisecondsTimeout)
}
else
{
result = WaitHandle.WaitForSingleObject(waitHandle.DangerousGetHandle(), millisecondsTimeout, true);
result = WaitHandle.WaitForSingleObject(waitHandle.DangerousGetHandle(), millisecondsTimeout);
}

return result == (int)Interop.Constants.WaitObject0;
Expand Down
3 changes: 2 additions & 1 deletion src/System.Private.CoreLib/src/System.Private.CoreLib.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -268,11 +268,12 @@
<Compile Include="System\Threading\ManagedThreadId.cs" />
<Compile Include="System\Threading\Lock.cs" />
<Compile Include="System\Threading\Condition.cs" />
<Compile Include="System\Threading\FirstLevelSpinWaiter.cs" />
<Compile Include="System\Threading\Interlocked.cs" />
<Compile Include="System\Threading\LockHolder.cs" />
<Compile Include="System\Threading\LowLevelLifoSemaphore.cs" />
<Compile Include="System\Threading\LowLevelLock.cs" />
<Compile Include="System\Threading\LowLevelMonitor.cs" />
<Compile Include="System\Threading\LowLevelSpinWaiter.cs" />
<Compile Include="System\Threading\Monitor.cs" />
<Compile Include="System\Threading\ObjectHeader.cs" Condition="'$(UseSyncTable)' == 'true'" />
<Compile Include="System\Threading\Overlapped.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,16 @@ private static class WorkerThread
/// <summary>
/// Semaphore for controlling how many threads are currently working.
/// </summary>
private static LowLevelLifoSemaphore s_semaphore = new LowLevelLifoSemaphore(0, MaxPossibleThreadCount);

private static LowLevelLifoSemaphore s_semaphore = new LowLevelLifoSemaphore(0, MaxPossibleThreadCount, SemaphoreSpinCount);

/// <summary>
/// Maximum number of spins a thread pool worker thread performs before waiting for work
/// </summary>
private static int SemaphoreSpinCount
{
get => AppContextConfigHelper.GetInt16Config("ThreadPool_UnfairSemaphoreSpinLimit", 70, false);
}

private static void WorkerThreadStart()
{
ClrThreadPoolEventSource.Log.WorkerThreadStart(ThreadCounts.VolatileReadCounts(ref ThreadPoolInstance._separated.counts).numExistingThreads);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,74 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;

namespace System.Threading
{
/// <summary>
/// A LIFO semaphore.
/// Waits on this semaphore are uninterruptible.
/// </summary>
internal sealed class LowLevelLifoSemaphore : IDisposable
internal sealed partial class LowLevelLifoSemaphore : IDisposable
{
private WaitSubsystem.WaitableObject _semaphore;
private WaiterListEntry _waiterStackHead;
private LowLevelLock _waiterStackLock;
[ThreadStatic]
private static WaiterListEntry t_waitEntry;

public LowLevelLifoSemaphore(int initialSignalCount, int maximumSignalCount)
private void Create(int maximumSignalCount)
{
_semaphore = WaitSubsystem.WaitableObject.NewSemaphore(initialSignalCount, maximumSignalCount);
_waiterStackHead = null;
_waiterStackLock = new LowLevelLock();
}

public bool Wait(int timeoutMs)
public void Dispose()
{
return WaitSubsystem.Wait(_semaphore, timeoutMs, false, true);
}

public int Release(int count)
private bool WaitCore(int timeoutMs)
{
return WaitSubsystem.ReleaseSemaphore(_semaphore, count);
WaiterListEntry waitEntry = t_waitEntry ?? (t_waitEntry = new WaiterListEntry());
filipnavara marked this conversation as resolved.
Show resolved Hide resolved
waitEntry._monitor.Acquire();
try
{
_waiterStackLock.Acquire();
waitEntry._next = _waiterStackHead;
_waiterStackHead = waitEntry;
_waiterStackLock.Release();
return waitEntry._monitor.Wait(timeoutMs);
filipnavara marked this conversation as resolved.
Show resolved Hide resolved
}
finally
{
waitEntry._monitor.Release();
}
}

public void Dispose()
private void ReleaseCore(int count)
{
while (count-- > 0)
{
_waiterStackLock.Acquire();
WaiterListEntry waitEntry = _waiterStackHead;
_waiterStackHead = waitEntry?._next;
_waiterStackLock.Release();
if (waitEntry != null)
filipnavara marked this conversation as resolved.
Show resolved Hide resolved
{
waitEntry._monitor.Acquire();
waitEntry._monitor.Signal_Release();
}
}
}

class WaiterListEntry
{
public LowLevelMonitor _monitor;
public WaiterListEntry _next;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought that you may use this implementation for Mono. It shouldn't be necessary to change the implementation in CoreRT for that purpose, what it was doing before is more appropriate for CoreRT. It would probably be beneficial to separate LowLevelLifoSemaphore, which offers just the base LIFO semaphore functionality (no spinning, etc.), from ClrLifoSemaphore, which does all of the rest. Maybe ClrLifoSemaphore could use a better name but I would leave that for later, as there are more abstractions that could be done. ClrLifoSemaphore would use LowLevelLifoSemaphore. With that, the implementation of ClrLifoSemaphore can be shared between runtimes and the only runtime-specific dependency would be LowLevelLifoSemaphore, for which runtimes can provide an implementation that is suitable. For CoreRT, that would be using the wait subsystem, for Mono it may be this implementation, for CoreCLR it would be the PAL, and so on.

Copy link
Member Author

@filipnavara filipnavara Feb 7, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't split up LowLevelLifoSemaphore and ClrLifoSemaphore because technically there's only one user of it - the thread pool. There's probably never going to be another one because there's SemaphoreSlim, so as long as you specifically don't depend on uninterruptibility and LIFO order (at the same time) there's no reason to use this one. LowLevelLock already contained spinning code, so it seemed acceptable that spinning is part of the low-level code. The LowLevel prefix also makes it immediately obvious that it is uninterruptible (as long as one knows the convention or read the long comment in WaitSubsystem).

The runtime sharing is a bit of a PITA part that I had to put some thought into. I am trying hard to avoid a Mono-specific implementation (as long as I can avoid hurting CoreRT at the same time).

While the wait subsystem seems like an obvious choice on CoreRT it doesn't really save any code. In fact it makes the subsystem support LIFO order and uninterruptible Wait which is way more code than the primitive implementation here. Also the current implementation of WaitSubsystem has contention on a single lock which hurts the performance a bit (implementation detail, but a real problem).

My current plan is to break this dependency on WaitSubsystem on this one place and move LowLevelLock/LowLevelMonitor/LowLevelSemaphore into shared part of CoreLib as a feature (eg. FeaturePortableLowLevelSync). That would be shared by CoreRT and Mono. CoreCLR could use it (for testing at least), or simply skip the feature and implement the primitives on its own through existing PAL code.

Copy link
Member

@kouvel kouvel Feb 7, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Merging LowLevelLifoSemaphore and ClrLifoSemaphore is fine, the wait portions can still be specialized for runtimes, and I think they should be. I really don't want to use this implementation of a LIFO semaphore in CoreRT or CoreCLR, it's unnecessarily more complicated. The tracking being done here and additional overhead is unnecessary for those runtimes, which already do similar tracking to support all of the other types of waits and the tracking is shared so there is no additional overhead with uninterruptibility. Adding uninterruptibility in both runtimes was actually very easy. Conversely it was actually a lot more work to add interruptibility to CoreRT's wait subsystem, which started off uninterruptible during initial implementation. If it's not easy to modify Mono's semaphore to support uninterruptible waits, I would still prefer to separate out this implementation specifically for Mono.

The LowLevel prefix also makes it immediately obvious that it is uninterruptible (as long as one knows the convention or read the long comment in WaitSubsystem).

At the moment the LowLevel part doesn't just mean uninterruptible, it also means it's a simple implementation that directly uses system components. LowLevelLock does some spinning, in hindsight it probably would have made sense to separate it out. The idea was that the LowLevel portions would be system dependencies that may be specific to runtimes (which may also be shared if possible) and anything on top of them could be shared. Even if there is only one user currently, it can help to componentize the functionality.

While the wait subsystem seems like an obvious choice on CoreRT it doesn't really save any code. In fact it makes the subsystem support LIFO order and uninterruptible Wait which is way more code than the primitive implementation here.

Not really, most of the code is shared with interruptible waits so there is no overhead that is specific to uninterruptible waits.

Also the current implementation of WaitSubsystem has contention on a single lock which hurts the performance a bit (implementation detail, but a real problem).

I don't see how this implementation would be better. Typically these locks are uncontended. It's rare for two threads to both be releasing threads because the thread pool uses an interlocked operation to figure out how many threads need to be released and one of those two threads would release all of them. Perf is not a big concern but the wait subsystem is actually pretty fast.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if there is only one user currently, it can help to componentize the functionality.

Granted sometimes it can be advantageous to not componentize in order to add some features

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really don't want to use this implementation of a LIFO semaphore in CoreRT or CoreCLR, it's unnecessarily more complicated.

I do have different goals for this, which makes it hard agenda to push. I think it's not unnecessarily more complicated because the amount of code removed from the wait subsystem is more than the amount of code added here. I will do some more benchmarks of different variations, but I actually think that the contention on s_lock in WaitSubsystem hurts the performance significantly for heavily threaded code and this is an easy way out. The suggestion with CoreCLR was not about using it there for production code, but only as a means of bring up of the code in another runtime and for apples-to-apples performance comparison.

At the moment the LowLevel part doesn't just mean uninterruptible, it also means it's a simple implementation that directly uses system components.

The documentation states

    /// <see cref="LowLevelLock"/> and <see cref="LowLevelMonitor"/>
    ///   - These are "low level" in the sense they don't depend on this wait subsystem, and any waits done are not
    ///     interruptible

LowLevelSemaphore never used the system component in Unix, it used the wait subsystem, violating any common sense about the component layering. In that sense calling it "low level" is wrong. The only other shared property is the uninterruptibility. I think it should be implemented using system components, but if I fail to push that direction then it should definitely be renamed to avoid confusion.

Not really, most of the code is shared with interruptible waits so there is no overhead that is specific to uninterruptible waits.

No overhead in terms of performance for other code paths through the WaitSubsystem. Amount of code is different story though. Performance-wise it increases contention on the single lock. It's orthogonal issue to these code changes though.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re the naming of LowLevelSemaphore: I chose that name moreso to match the naming of LowLevelLinq (subset of API surface reimplemented for runtime use) than LowLevelLock or LowLevelMonitor (thin managed wrappers around native system components).

Copy link
Member Author

@filipnavara filipnavara Feb 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is also not necessary to port the wait subsystem, that is a runtime-specific piece currently. Perhaps it could be shared in the future but it doesn't have to be at the moment. Is there some other agenda you're trying to push for?

The whole point is to share as much code as possible between the runtimes.

Code that is runtime specific doesn't get as much testing as the one that is shared. Likewise the performance improvements have historically been done mostly on the CoreCLR side and didn't translate to CoreRT which was a shame. Now that Mono is sharing the CoreLib code there was a push that helped to fix the gap by moving more of the code to the shared part of the CoreLib.

Ideally I would like to see the wait subsystem shared between all three runtimes, but we are still quite far away from there and it may well never happen.

Decoupling individual components - thread pool, timers, low level synchronization primitives - was an attempt to get there faster, piece by piece. Unfortunately I underestimated the complexity of the LIFO semaphore implementation and its edge cases. I assumed I could do it in less than 100 lines and simplify the WaitSubsystem at the same time to the point that it would be a win-win situation overall (no regression, cleaner component separation, the benefits of code being shared and tested by 2 runtimes and allowing testing the whole managed pool on CoreCLR as well). Eventually I ended up with an implementation that is small enough and covers all the edge cases that were brought up, but it requires unmanaged code and it's not as mature as whatever WaitSubsystem does now.

This implementation adds another monitor per thread, allocates another object per thread, has issues with OOMs...

Yeah, it was flawed. I have completely rewritten it in the end, but it's out of scope of this PR. I solved couple of the issues by moving the LIFO semaphore to unmanaged code, reducing the number of PThread objects (1 mutex + 1 condition/waiting thread) and storing the LIFO entries on stack. (Unfinished code here: https://gist.github.com/filipnavara/4d58444435cb106bbc4e3ffa939e99be, missing comments and propert pthread_cond_t initalization on non-macOS systems).

The global lock is held for a very short time, I have not seen significant contention on that lock.

I have seen it in the heavy threading+locking benchmarks before, but it could have been flaw in the benchmark methodology. I went ahead and re-run a different combinations of the code with my benchmarks. It turns out that the additional logic with interlocked Counts loops avoids getting to the slow path in those heavy threading scenarios.

I think the name is appropriate, I'm not too particular about the name as long as it sort of works, it's difficult to get names right.

I understand it is difficult. I am not happy about the name, but I don't really have a better name right now. The "low level" part makes me feel it should not call into something "higher level" like WaitSubsystem, implementation detail or not.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

re the naming of LowLevelSemaphore: I chose that name moreso to match the naming of LowLevelLinq (subset of API surface reimplemented for runtime use) than LowLevelLock or LowLevelMonitor (thin managed wrappers around native system component

Ah, thanks for helpful historical note :-) LowLevelLinq was not really used in that part of code anymore, aside from a using in one of the files that seems like a leftover.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A good time to share the implementation would be if a system LIFO wait support were to be found. This implementation of the component can be implemented in managed or native code, would be fairly small, and perf doesn't really matter, that I don't see a huge benefit in sharing it at the moment, especially since there are better alternatives for some runtimes. Sharing the portion above this part (the ClrLifoSemaphore part of tracking spinners/waiters, etc.) is a lot more useful.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A good time to share the implementation would be if a system LIFO wait support were to be found.

Not going to happen, at least not in a portable way. Every other library I found (notably Facebook Folly, which is referenced from couple of academic papers) uses an implementation similar to the one I came up with.


public WaiterListEntry()
{
this._monitor = new LowLevelMonitor();
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,13 @@ namespace System.Threading
/// See https://msdn.microsoft.com/en-us/library/windows/desktop/aa365198(v=vs.85).aspx under How I/O Completion Ports Work.
/// From the docs "Threads that block their execution on an I/O completion port are released in last-in-first-out (LIFO) order."
/// </remarks>
internal sealed class LowLevelLifoSemaphore : IDisposable
internal sealed partial class LowLevelLifoSemaphore : IDisposable
{
private IntPtr _completionPort;

public LowLevelLifoSemaphore(int initialSignalCount, int maximumSignalCount)
private void Create(int maximumSignalCount)
{
Debug.Assert(initialSignalCount >= 0, "Windows LowLevelLifoSemaphore does not support a negative signal count"); // TODO: Track actual signal count to enable this
Debug.Assert(maximumSignalCount > 0);
Debug.Assert(initialSignalCount <= maximumSignalCount);

_completionPort =
Interop.Kernel32.CreateIoCompletionPort(new IntPtr(-1), IntPtr.Zero, UIntPtr.Zero, maximumSignalCount);
Expand All @@ -34,7 +32,6 @@ public LowLevelLifoSemaphore(int initialSignalCount, int maximumSignalCount)
exception.HResult = error;
throw exception;
}
Release(initialSignalCount);
}

~LowLevelLifoSemaphore()
Expand All @@ -45,7 +42,7 @@ public LowLevelLifoSemaphore(int initialSignalCount, int maximumSignalCount)
}
}

public bool Wait(int timeoutMs)
public bool WaitCore(int timeoutMs)
{
Debug.Assert(timeoutMs >= -1);

Expand All @@ -54,7 +51,7 @@ public bool Wait(int timeoutMs)
return success;
}

public int Release(int count)
public void ReleaseCore(int count)
{
Debug.Assert(count > 0);

Expand All @@ -68,7 +65,6 @@ public int Release(int count)
throw exception;
}
}
return 0; // TODO: Track actual signal count to calculate this
}

public void Dispose()
Expand Down
Loading