Skip to content

Commit

Permalink
Broaden use of SearchValues in TryFindNextPossibleStartingPosition in…
Browse files Browse the repository at this point in the history
… Regex (#89205)

SearchValues has been updated to have an ASCII fast-path even for sets of values that are not exclusively ASCII. This means we can simplify TryFindNextPossibleStartingPosition in Regex to not track AsciiSet specially and instead just increase the number of characters we query the set for (from 5 to 128). That way, we'll use SearchValues rather than emitting our own helper, up until a (semi-arbitrary) point where we deem it impossible or infeasible to enumerate all the chars that make up the set.
  • Loading branch information
stephentoub authored Jul 20, 2023
1 parent 84b7c61 commit 0ccbbe7
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 80 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -399,54 +399,65 @@ private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictio
}

/// <summary>Adds a SearchValues instance declaration to the required helpers collection.</summary>
private static string EmitSearchValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
private static string EmitSearchValues(char[] chars, Dictionary<string, string[]> requiredHelpers)
{
Debug.Assert(RegexCharClass.IsAscii(asciiChars));
Array.Sort(chars);

// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
byte[] bitmap = new byte[16];
foreach (char c in asciiChars)
string fieldName;
if (RegexCharClass.IsAscii(chars))
{
bitmap[c >> 3] |= (byte)(1 << (c & 7));
// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
var bitmap = new byte[16];
foreach (char c in chars)
{
bitmap[c >> 3] |= (byte)(1 << (c & 7));
}

string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);

fieldName = hexBitmap switch
{
"FFFFFFFF000000000000000000000080" => "s_asciiControl",
"000000000000FF030000000000000000" => "s_asciiDigits",
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
"000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
"000000000000FF037E0000007E000000" => "s_asciiHexDigits",
"000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
"000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
"00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
"00000000010000000000000000000000" => "s_asciiSeparators",
"00000000100800700000004001000050" => "s_asciiSymbols",
"003E0000010000000000000000000000" => "s_asciiWhiteSpace",
"000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",

"00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
"FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
"FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
"FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
"FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
"FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
"FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
"FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
"FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
"FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
"FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",

_ => $"s_ascii_{hexBitmap.TrimStart('0')}"
};
}

string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);

string fieldName = hexBitmap switch
else
{
"FFFFFFFF000000000000000000000080" => "s_asciiControl",
"000000000000FF030000000000000000" => "s_asciiDigits",
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
"000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
"000000000000FF037E0000007E000000" => "s_asciiHexDigits",
"000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
"000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
"00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
"00000000010000000000000000000000" => "s_asciiSeparators",
"00000000100800700000004001000050" => "s_asciiSymbols",
"003E0000010000000000000000000000" => "s_asciiWhiteSpace",
"000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",

"00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
"FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
"FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
"FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
"FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
"FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
"FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
"FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
"FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
"FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
"FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",

_ => $"s_ascii_{hexBitmap.TrimStart('0')}"
};
using (SHA256 sha = SHA256.Create())
{
#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
fieldName = $"s_nonAscii_{BitConverter.ToString(sha.ComputeHash(Encoding.UTF8.GetBytes(chars))).Replace("-", "")}";
#pragma warning restore CA1850
}
}

if (!requiredHelpers.ContainsKey(fieldName))
{
Array.Sort(asciiChars);

string setLiteral = Literal(new string(asciiChars));
string setLiteral = Literal(new string(chars));

requiredHelpers.Add(fieldName, new string[]
{
Expand All @@ -465,12 +476,12 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
// a sequential walk). In order to do that search, we actually build up a set for all of the ASCII
// characters _not_ contained in the set, and then do a search for the inverse of that, which will be
// all of the target ASCII characters and all of non-ASCII.
var asciiChars = new List<char>();
var excludedAsciiChars = new List<char>();
for (int i = 0; i < 128; i++)
{
if (!RegexCharClass.CharInClass((char)i, set))
{
asciiChars.Add((char)i);
excludedAsciiChars.Add((char)i);
}
}

Expand Down Expand Up @@ -538,9 +549,9 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
lines.Add($"internal static int {helperName}(this ReadOnlySpan<char> span)");
lines.Add($"{{");
int uncheckedStart = lines.Count;
lines.Add(asciiChars.Count == 128 ?
lines.Add(excludedAsciiChars.Count == 128 ?
$" int i = span.IndexOfAnyExceptInRange('\0', '\u007f');" :
$" int i = span.IndexOfAnyExcept({EmitSearchValues(asciiChars.ToArray(), requiredHelpers)});");
$" int i = span.IndexOfAnyExcept({EmitSearchValues(excludedAsciiChars.ToArray(), requiredHelpers)});");
lines.Add($" if ((uint)i < (uint)span.Length)");
lines.Add($" {{");
lines.Add($" if (char.IsAscii(span[i]))");
Expand Down Expand Up @@ -1067,6 +1078,8 @@ void EmitFixedSet_LeftToRight()
string indexOf;
if (primarySet.Chars is not null)
{
Debug.Assert(primarySet.Chars.Length > 0);

// We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
if (primarySet.Negated)
Expand All @@ -1076,18 +1089,19 @@ void EmitFixedSet_LeftToRight()

indexOf = primarySet.Chars.Length switch
{
// 1, 2, 3 have dedicated optimized IndexOfAny overloads
1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
_ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",

// 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan<char> overload,
// but can also be handled via SearchValues
4 or 5 => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",

// > 5 can only be handled efficiently via SearchValues
_ => $"{span}.{indexOfAnyName}({EmitSearchValues(primarySet.Chars, requiredHelpers)})",
};
}
else if (primarySet.AsciiSet is not null)
{
// We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
Debug.Assert(!primarySet.Negated);
indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
}
else if (primarySet.Range is not null)
{
// We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
Expand All @@ -1102,8 +1116,8 @@ void EmitFixedSet_LeftToRight()
}
else
{
// We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
// will perform the search as efficiently as possible.
// We have an arbitrary set of characters that's really large or otherwise not enumerable.
// We use a custom IndexOfAny helper that will perform the search as efficiently as possible.
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,7 @@ void EmitFixedSet_LeftToRight()

if (primarySet.Chars is not null)
{
Debug.Assert(primarySet.Chars.Length > 0);
switch (primarySet.Chars.Length)
{
case 1:
Expand All @@ -926,19 +927,23 @@ void EmitFixedSet_LeftToRight()
Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
break;

default:
case 4 or 5:
// tmp = ...IndexOfAny("abcd");
// Note that this case differs slightly from the source generator, where it might choose to use
// SearchValues instead of a literal, but there's extra cost to doing so for RegexCompiler so
// it just always uses IndexOfAny(span).
Ldstr(new string(primarySet.Chars));
Call(s_stringAsSpanMethod);
Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
break;

default:
// tmp = ...IndexOfAny(s_searchValues);
LoadSearchValues(primarySet.Chars);
Call(primarySet.Negated ? s_spanIndexOfAnyExceptSearchValues : s_spanIndexOfAnySearchValues);
break;
}
}
else if (primarySet.AsciiSet is not null)
{
Debug.Assert(!primarySet.Negated);
LoadSearchValues(primarySet.AsciiSet);
Call(s_spanIndexOfAnySearchValues);
}
else if (primarySet.Range is not null)
{
if (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,6 @@ public FixedDistanceSet(char[]? chars, string set, int distance)
public int Distance;
/// <summary>As an alternative to <see cref="Chars"/>, a description of the single range the set represents, if it does.</summary>
public (char LowInclusive, char HighInclusive)? Range;
/// <summary>As an alternative to <see cref="Chars"/>, a description of the set of ASCII characters it represents, if it does.</summary>
public char[]? AsciiSet;
}

/// <summary>When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop.</summary>
Expand Down Expand Up @@ -593,7 +591,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
char[]? chars = primarySet.Chars;

ReadOnlySpan<char> span = textSpan.Slice(pos);
if (chars is not null)
if (chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} without SearchValues
{
int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
if (i >= 0)
Expand Down Expand Up @@ -660,7 +658,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,

int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength);

if (primarySet.Chars is not null)
if (primarySet.Chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except}
{
for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
TryFindRawFixedSets(root, results, ref distance, thorough);
#if DEBUG
results.ForEach(r => Debug.Assert(
!r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null,
!r.Negated && r.Chars is null && r.Range is null,
$"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}"));
#endif

Expand Down Expand Up @@ -225,31 +225,25 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)

// For every entry, try to get the chars that make up the set, if there are few enough.
// For any for which we couldn't get the small chars list, see if we can get other useful info.
Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
Span<char> scratch = stackalloc char[128]; // limit based on what's currently efficiently handled by SearchValues
for (int i = 0; i < results.Count; i++)
{
RegexFindOptimizations.FixedDistanceSet result = results[i];
result.Negated = RegexCharClass.IsNegated(result.Set);

int count = RegexCharClass.GetSetChars(result.Set, scratch);

if (count > 0)
{
result.Chars = scratch.Slice(0, count).ToArray();
}

if (thorough)
// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3 or more values that fit in a single range.
if (thorough &&
(result.Chars is null || result.Chars.Length > 2) &&
RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
{
// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
{
result.Chars = null;
result.Range = (lowInclusive, highInclusive);
}
else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars))
{
result.AsciiSet = asciiChars;
}
result.Chars = null;
result.Range = (lowInclusive, highInclusive);
}

results[i] = result;
Expand Down Expand Up @@ -472,8 +466,8 @@ public static void SortFixedDistanceSetsByQuality(List<RegexFindOptimizations.Fi
// for the fastest and that have the best chance of matching as few false positives as possible.
results.Sort(static (s1, s2) =>
{
char[]? s1Chars = s1.Chars ?? s1.AsciiSet;
char[]? s2Chars = s2.Chars ?? s2.AsciiSet;
char[]? s1Chars = s1.Chars;
char[]? s2Chars = s2.Chars;
int s1CharsLength = s1Chars?.Length ?? 0;
int s2CharsLength = s2Chars?.Length ?? 0;
bool s1Negated = s1.Negated;
Expand Down

0 comments on commit 0ccbbe7

Please sign in to comment.