Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NonBacktracking Regex optimizations #102655

Merged
merged 63 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
34eba54
Regex automata optimizations
ieviev May 24, 2024
49607f4
off by one err
ieviev May 24, 2024
5ac29f3
wip reversal optimizations
ieviev May 26, 2024
e440dec
removing unnecessary overhead
ieviev May 26, 2024
627fd90
handle final position correctly
ieviev May 26, 2024
7ae6440
edge case workarounds, tests should be ok again
ieviev May 27, 2024
383f3e5
optimizing lookup initialization
ieviev May 27, 2024
5a2636c
more dfa overhead removed
ieviev May 28, 2024
57e5b8d
removed potential rewrite
ieviev May 28, 2024
4d275db
low memory variant
ieviev May 28, 2024
c35ed7e
some kind of compromise between speed and memory
ieviev May 28, 2024
868e02d
cheaper nullability checks
ieviev May 29, 2024
14afd18
nullability encoding
ieviev May 29, 2024
5f5ab55
nullability cached as bytes
ieviev May 29, 2024
dd121de
reverting some changes
ieviev May 30, 2024
723c5b6
testing nfa fallback
ieviev Jun 5, 2024
6bf4095
refactoring, work in progress
ieviev Jun 17, 2024
b10e600
refactoring to struct interfaces
ieviev Jun 18, 2024
d68bd3c
refactoring optimizations
ieviev Jun 18, 2024
153dfc3
fallback mode and bugfix
ieviev Jun 18, 2024
4aebe3e
reenable warnings
ieviev Jun 18, 2024
1e6f55c
anchor edge case
ieviev Jun 19, 2024
c6ad3ac
anchor edge cases
ieviev Jun 19, 2024
e10b43f
Apply suggestions from code review
ieviev Jun 19, 2024
f581755
Apply suggestions from code review
ieviev Jun 27, 2024
01a9684
rebased branch and some cleanup
ieviev Jun 27, 2024
341ce27
cleanup, removing unused features
ieviev Jun 27, 2024
1a28c69
cleanup
ieviev Jun 27, 2024
9bba84f
timeout limit changes
ieviev Jun 29, 2024
a957781
lookup allocation threshold and timeout limits
ieviev Jun 30, 2024
7e86855
char mapping
ieviev Jun 30, 2024
99b5717
empty array mapping
ieviev Jun 30, 2024
47c6b04
adding timeout check to create-derivative
ieviev Jun 30, 2024
22d23fa
some cleanup
ieviev Jun 30, 2024
761f897
comments and cleanup
ieviev Jun 30, 2024
53924eb
cleanup and comments
ieviev Jun 30, 2024
e66d3d3
reflecting new limits in tests
ieviev Jul 1, 2024
65c0b8b
rerunning tests
ieviev Jul 1, 2024
de085b4
retesting DFA timeout
ieviev Jul 1, 2024
5ef3b32
more precise regex memory limit for DFA mode
ieviev Jul 2, 2024
281446f
reverting change
ieviev Jul 2, 2024
8f78046
reverting reversal refactor
ieviev Jul 3, 2024
7157520
Apply suggestions from code review
ieviev Jul 3, 2024
931552d
variable naming
ieviev Jul 3, 2024
cc493f1
test for over 255 minterms
ieviev Jul 3, 2024
a0d2390
adding net directive around test
ieviev Jul 3, 2024
0691c58
all engines in minterms test
ieviev Jul 3, 2024
8ceb207
Apply suggestions from code review
ieviev Jul 3, 2024
379519b
Apply suggestions from code review
ieviev Jul 3, 2024
57c8f6d
simplifying code
ieviev Jul 3, 2024
2e57d42
state flag values down
ieviev Jul 3, 2024
60b1352
mintermclassifier changes
ieviev Jul 3, 2024
2900aad
reversal
ieviev Jul 4, 2024
764ded8
getstateflags
ieviev Jul 4, 2024
81d0dca
formatting
ieviev Jul 4, 2024
38f28b9
removing unused interface
ieviev Jul 4, 2024
cce1188
local function typo
ieviev Jul 4, 2024
8b946da
temporarily removing minterms test
ieviev Jul 5, 2024
d3430b3
re-adding minterms test
ieviev Jul 6, 2024
388c256
reenabling test for all engines
ieviev Jul 8, 2024
2704641
test bugfix
ieviev Jul 8, 2024
0abaabe
expected matches change
ieviev Jul 8, 2024
0a0f409
Review and clean up some code
stephentoub Jul 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@
<Compile Include="System\Text\RegularExpressions\Symbolic\MatchingState.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\DoublyLinkedList.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\ISolver.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversalKind.cs"/>
<Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversal.cs"/>
<Compile Include="System\Text\RegularExpressions\Symbolic\MintermClassifier.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\MintermGenerator.cs" />
<Compile Include="System\Text\RegularExpressions\Symbolic\RegexNodeConverter.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

#pragma warning disable CS8500 // takes address of managed type

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ internal sealed class BitVectorSolver : ISolver<BitVector>
internal readonly MintermClassifier _classifier;
private readonly BitVector[] _mintermVectors;

public BitVectorSolver(BDD[] minterms, CharSetSolver solver)
public BitVectorSolver(BDD[] minterms)
{
_minterms = minterms;

_classifier = new MintermClassifier(minterms, solver);
_classifier = new MintermClassifier(minterms);

var singleBitVectors = new BitVector[minterms.Length];
for (int i = 0; i < singleBitVectors.Length; i++)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;

namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>
/// Describes how the beginning of a match may be located by processing in reverse, once the
/// existence of the match has already been established.
/// </summary>
internal readonly struct MatchReversalInfo<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
{
    /// <summary>Gets which kind of reversal processing is needed.</summary>
    internal MatchReversalKind Kind { get; }

    /// <summary>Gets the known fixed length associated with the match, if any.</summary>
    /// <remarks>
    /// The meaning depends on <see cref="Kind"/>:
    /// for <see cref="MatchReversalKind.MatchStart"/> the value is ignored;
    /// for <see cref="MatchReversalKind.FixedLength"/> it is the length of the whole match, so the
    /// beginning is found by subtracting it from the end;
    /// for <see cref="MatchReversalKind.PartialFixedLength"/> it is the length of the fixed portion of the match.
    /// </remarks>
    internal int FixedLength { get; }

    /// <summary>Gets the adjusted start state used for partial fixed-length matches.</summary>
    /// <remarks>Non-null exactly when <see cref="Kind"/> is <see cref="MatchReversalKind.PartialFixedLength"/>.</remarks>
    internal MatchingState<TSet>? AdjustedStartState { get; }

    /// <summary>Creates the reversal details from the supplied values.</summary>
    /// <param name="kind">The kind of reversal processing; must be one of the defined <see cref="MatchReversalKind"/> values.</param>
    /// <param name="fixedLength">The fixed length to record; must be non-negative.</param>
    /// <param name="adjustedStartState">
    /// The adjusted start state; must be supplied if and only if <paramref name="kind"/> is
    /// <see cref="MatchReversalKind.PartialFixedLength"/>.
    /// </param>
    internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState<TSet>? adjustedStartState = null)
    {
        // Debug-only validation of the invariants documented on the parameters.
        Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength);
        Debug.Assert(fixedLength >= 0);
        Debug.Assert(kind is MatchReversalKind.PartialFixedLength
            ? adjustedStartState is not null
            : adjustedStartState is null);

        (Kind, FixedLength, AdjustedStartState) = (kind, fixedLength, adjustedStartState);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace System.Text.RegularExpressions.Symbolic
{
/// <summary>Identifies the strategy described by a <see cref="MatchReversalInfo{TSet}"/>.</summary>
internal enum MatchReversalKind
{
    /// <summary>The beginning of the match is found by running the regex in reverse.</summary>
    MatchStart = 0,

    /// <summary>
    /// The pattern ends in a fixed-length portion, which can be skipped when running the regex
    /// in reverse to find the beginning of the match.
    /// </summary>
    /// <remarks>
    /// Part of the match does not need reverse execution.
    /// The length of that fixed portion is stored in <see cref="MatchReversalInfo{TSet}.FixedLength"/>.
    /// </remarks>
    PartialFixedLength = 1,

    /// <summary>The whole pattern has a fixed length.</summary>
    /// <remarks>
    /// No reverse execution is needed to find the beginning of the match.
    /// The length of the match is stored in <see cref="MatchReversalInfo{TSet}.FixedLength"/>.
    /// </remarks>
    FixedLength = 2,
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
{
Node = node;
PrevCharKind = prevCharKind;
NullabilityInfo = BuildNullabilityInfo();
}

/// <summary>The regular expression that labels this state and gives it its semantics.</summary>
Expand Down Expand Up @@ -95,21 +96,37 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
}

/// <summary>Determines whether the node is nullable for the given context.</summary>
/// <remarks>
/// This is functionally equivalent to <see cref="SymbolicRegexNode{TSet}.IsNullableFor(uint)"/>, but using cached
/// answers stored in <see cref="NullabilityInfo"/>.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool IsNullableFor(uint nextCharKind)
{
Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
uint context = CharKind.Context(PrevCharKind, nextCharKind);
return Node.IsNullableFor(context);
Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount);
return (NullabilityInfo & (1 << (int)nextCharKind)) != 0;
}

/// <summary>Gets the nullability info for the matching state.</summary>
/// <remarks>
/// <list>
/// <item>00000 -> node cannot be nullable</item>
/// <item>00001 -> nullable for General</item>
/// <item>00010 -> nullable for BeginningEnd</item>
/// <item>00100 -> nullable for NewLine</item>
/// <item>01000 -> nullable for NewLineS</item>
/// <item>10000 -> nullable for WordLetter</item>
/// </list>
/// </remarks>
internal int NullabilityInfo { get; }

/// <summary>
/// Builds a <see cref="StateFlags"/> with the relevant flags set.
/// </summary>
/// <param name="solver">a solver for <typeparamref name="TSet"/></param>
/// <param name="isInitial">whether this state is an initial state</param>
/// <returns>the flags for this matching state</returns>
internal StateFlags BuildStateFlags(ISolver<TSet> solver, bool isInitial)
internal StateFlags BuildStateFlags(bool isInitial)
{
StateFlags info = 0;

Expand All @@ -118,11 +135,6 @@ internal StateFlags BuildStateFlags(ISolver<TSet> solver, bool isInitial)
info |= StateFlags.IsInitialFlag;
}

if (IsDeadend(solver))
{
info |= StateFlags.IsDeadendFlag;
}

if (Node.CanBeNullable)
{
info |= StateFlags.CanBeNullableFlag;
Expand All @@ -140,6 +152,22 @@ internal StateFlags BuildStateFlags(ISolver<TSet> solver, bool isInitial)
return info;
}

/// <summary>Builds the nullability information for the matching state.</summary>
/// <remarks>Nullability for each context is encoded in a bit. See <see cref="NullabilityInfo"/>.</remarks>
private byte BuildNullabilityInfo()
{
byte nullabilityInfo = 0;
if (Node.CanBeNullable)
{
for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++)
{
nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0);
}
}
ieviev marked this conversation as resolved.
Show resolved Hide resolved

return nullabilityInfo;
}

public override bool Equals(object? obj) =>
obj is MatchingState<TSet> s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;

namespace System.Text.RegularExpressions.Symbolic
Expand All @@ -20,81 +22,104 @@ namespace System.Text.RegularExpressions.Symbolic
/// </remarks>
internal sealed class MintermClassifier
{
/// <summary>An array used when there's a single minterm, in order to map every ASCII character to it trivially.</summary>
private static readonly int[] AllAsciiIsZeroMintermArray = new int[128];
/// <summary>Mapping for characters to minterms, used in the vast majority case when there are less than 256 minterms.</summary>
/// <remarks>_lookup[char] provides the minterm ID. If char &gt;= _lookup.Length, its minterm is 0.</remarks>
private readonly byte[]? _lookup;

/// <summary>Array providing fast mapping from an ASCII character (the array index) to its corresponding minterm ID.</summary>
private readonly int[] _ascii;
/// <summary>A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID.</summary>
/// <remarks>
/// The use of a multi-terminal BDD here is an implementation detail. Should we decide it's important to optimize non-ASCII inputs further,
/// or to consolidate the mechanism with the other engines, an alternative lookup algorithm / data structure could be employed.
/// </remarks>
private readonly BDD _nonAscii;
/// <summary>Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used.</summary>
/// <remarks>_intLookup[char] provides the minterm ID. If char &gt;= _intLookup.Length, its minterm is 0.</remarks>
private readonly int[]? _intLookup;

ieviev marked this conversation as resolved.
Show resolved Hide resolved
/// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
/// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
/// <param name="solver">The character set solver to use.</param>
public MintermClassifier(BDD[] minterms, CharSetSolver solver)
public MintermClassifier(BDD[] minterms)
{
Debug.Assert(minterms.Length > 0, "Requires at least");

if (minterms.Length == 1)
{
// With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
// For ASCII, use an array containing all zeros. For non-ASCII, use a BDD that maps everything to 0.
_ascii = AllAsciiIsZeroMintermArray;
_nonAscii = solver.ReplaceTrue(BDD.True, 0);
_lookup = [];
return;
}

// Create a multi-terminal BDD for mapping any character to its associated minterm.
BDD anyCharacterToMintermId = BDD.False;
for (int i = 0; i < minterms.Length; i++)
{
// Each supplied minterm BDD decides whether a given character maps to it or not.
// We need to combine all of those into a multi-terminal BDD that decides which
// minterm a character maps to. To do that, we take each minterm BDD and replace
// its True result with the ID of the minterm, such that a character that would
// have returned True for that BDD now returns the minterm ID.
BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i);
// Compute all minterm ranges. We do this here in order to determine the maximum character value
// in order to size the lookup array to minimize steady-state memory consumption of the potentially
// large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
// consumption to a minimum; doing so accommodates up to 255 minterms, which is the vast majority case.
// However, when there are more than 255 minterms, we need to use int[] _intLookup.
(uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);

// Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning
// is valid because every character belongs to exactly one minterm and thus will
// only map to an ID instead of False in exactly one of the input BDDs.
anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
int maxChar = -1;
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
{
(uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]);
charRangesPerMinterm[mintermId] = ranges;
maxChar = Math.Max(maxChar, (int)ranges[^1].Item2);
}

// Now that we have our mapping that supports any input character, we want to optimize for
// ASCII inputs. Rather than forcing every input ASCII character to consult the BDD at match
// time, we precompute a lookup table, where each ASCII character can be used to index into the
// array to determine the ID for its corresponding minterm.
var ascii = new int[128];
for (int i = 0; i < ascii.Length; i++)
// It's incredibly rare for a regex to use more than a couple hundred minterms,
// but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.)
if (minterms.Length > 255)
{
_intLookup = CreateLookup<int>(minterms, charRangesPerMinterm, maxChar);
}
else
{
ascii[i] = anyCharacterToMintermId.Find(i);
_lookup = CreateLookup<byte>(minterms, charRangesPerMinterm, maxChar);
}
_ascii = ascii;

// We can also further optimize the BDD in two ways:
// 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first
// for ASCII inputs and thus will never use the BDD for them. While optional (skipping this step will not
// affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD.
// 2. We can check if every character now maps to the same minterm ID (the same terminal in the
// multi-terminal BDD). This can be relatively common after (1) above is applied, as many
// patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*"). If every character
// in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one.
BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver.NonAscii);
nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD;
_nonAscii = nonAsciiBDD;
// Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
Array.Clear(charRangesPerMinterm, 0, minterms.Length);
ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);

// Allocates an array of _maxChar + 1 entries and, for every minterm ID from 1 upward,
// writes that ID into each character position covered by the minterm's ranges.
// Positions not covered by any of those ranges keep the array's default value.
static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
{
    var table = new T[_maxChar + 1];
    int mintermCount = minterms.Length;

    for (int id = 1; id < mintermCount; id++)
    {
        // The value stored is the same for every range of this minterm, so convert it once.
        T idValue = T.CreateTruncating(id);
        (uint, uint)[] ranges = charRangesPerMinterm[id];

        for (int r = 0; r < ranges.Length; r++)
        {
            // Each range is inclusive on both ends, hence the + 1 when computing its length.
            (uint first, uint last) = ranges[r];
            int length = (int)(last + 1 - first);
            table.AsSpan((int)first, length).Fill(idValue);
        }
    }

    return table;
}
}

/// <summary>Gets the ID of the minterm associated with the specified character.</summary>
/// <summary>Gets the ID of the minterm associated with the specified character. </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int GetMintermID(int c)
{
int[] ascii = _ascii;
return (uint)c < (uint)ascii.Length ? ascii[c] : _nonAscii.Find(c);
if (_lookup is not null)
{
byte[] lookup = _lookup;
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
}
else
{
int[] lookup = _intLookup!;
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
}
ieviev marked this conversation as resolved.
Show resolved Hide resolved
}
/// <summary>
/// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
/// Null if there are greater than 255 minterms.
/// </summary>
public byte[]? ByteLookup => _lookup;

/// <summary>
/// Gets a mapping from char to minterm for the rare case when there are &gt; 255 minterms.
/// Null in the common case where there are 255 or fewer minterms.
/// </summary>
public int[]? IntLookup => _intLookup;

/// <summary>
/// Maximum ordinal character for a non-0 minterm, used to conserve memory
/// </summary>
public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1;
}
}
Loading
Loading