From 99b76018b6e4edc4ce185dd5f3c5697c6941d88e Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 23 Feb 2024 06:28:30 -0500 Subject: [PATCH] Enable Regex to use SearchValues in compiled / source generator for IgnoreCase multi-strings (#98791) * Enable Regex to use SearchValues in compiled / source generator TryFindNextStartingPosition The analyzer determines a set of prefixes that can start any match, and then uses SearchValues with IndexOfAny to find the next one from that set. It's currently only enabled for case-insensitive; we need to do some more perf validation before enabling for case-sensitive. * Address PR feedback * Fix unit test --- .../gen/RegexGenerator.Emitter.cs | 36 +++ .../Text/RegularExpressions/RegexCharClass.cs | 15 + .../Text/RegularExpressions/RegexCompiler.cs | 32 +- .../RegexFindOptimizations.cs | 39 ++- .../Text/RegularExpressions/RegexNode.cs | 9 +- .../RegularExpressions/RegexPrefixAnalyzer.cs | 301 +++++++++++++++++- .../UnitTests/RegexPrefixAnalyzerTests.cs | 63 ++++ 7 files changed, 473 insertions(+), 22 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 7e7fed6cab65f..40897343cc471 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -732,6 +732,11 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w EmitIndexOfString_RightToLeft(); break; + case FindNextStartingPositionMode.LeadingStrings_LeftToRight: + case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight: + EmitIndexOfStrings_LeftToRight(); + break; + case FindNextStartingPositionMode.LeadingSet_LeftToRight: case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight: EmitFixedSet_LeftToRight(); @@ -1041,6 +1046,37 @@ UnicodeCategory.NonSpacingMark or } } + // Emits a 
case-sensitive or ordinal case-insensitive left-to-right search for any one of multiple leading prefixes. + void EmitIndexOfStrings_LeftToRight() + { + RegexFindOptimizations opts = regexTree.FindOptimizations; + Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight); + + string prefixes = string.Join(", ", opts.LeadingPrefixes.Select(prefix => Literal(prefix))); + StringComparison stringComparison = opts.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight ? + StringComparison.OrdinalIgnoreCase : + StringComparison.Ordinal; + string fieldName = GetSHA256FieldName($"s_indexOfAnyStrings_{stringComparison}_", prefixes); + + if (!requiredHelpers.ContainsKey(fieldName)) + { + requiredHelpers.Add(fieldName, + [ + $"/// Supports searching for the specified strings.", + $"internal static readonly SearchValues {fieldName} = SearchValues.Create([{prefixes}], StringComparison.{stringComparison});", // explicitly using an array in case prefixes is large + ]); + } + + writer.WriteLine($"// The pattern has multiple strings that could begin the match. Search for any of them."); + writer.WriteLine($"// If none can be found, there's no match."); + writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOfAny({HelpersTypeName}.{fieldName});"); + using (EmitBlock(writer, "if (i >= 0)")) + { + writer.WriteLine("base.runtextpos = pos + i;"); + writer.WriteLine("return true;"); + } + } + // Emits a case-sensitive right-to-left search for a substring.
void EmitIndexOfString_RightToLeft() { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index c56ad4b5b6e05..ed67df6819023 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1054,6 +1054,21 @@ public static bool IsAscii(ReadOnlySpan s) #endif } + /// Gets whether the set description string is for two ASCII letters that case to each other under OrdinalIgnoreCase rules. + public static bool SetContainsAsciiOrdinalIgnoreCaseCharacter(string set, Span twoChars) + { + Debug.Assert(twoChars.Length >= 2); + return + !IsNegated(set) && + GetSetChars(set, twoChars) == 2 && + twoChars[0] < 128 && + twoChars[1] < 128 && + twoChars[0] != twoChars[1] && + char.IsLetter(twoChars[0]) && + char.IsLetter(twoChars[1]) && + (twoChars[0] | 0x20) == (twoChars[1] | 0x20); + } + /// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents. /// This may enumerate negated characters if the set is negated. This will return false if the set has subtraction. 
private static bool CanEasilyEnumerateSetContents(string set) => diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index bac950c6db2f9..caf8479199d36 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -460,6 +460,8 @@ protected void EmitTryFindNextPossibleStartingPosition() { case FindNextStartingPositionMode.LeadingString_LeftToRight: case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: + case FindNextStartingPositionMode.LeadingStrings_LeftToRight: + case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight: case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: EmitIndexOfString_LeftToRight(); break; @@ -745,15 +747,19 @@ bool EmitAnchors() return false; } - // Emits a case-sensitive left-to-right search for a substring. + // Emits a case-sensitive left-to-right search for a substring or substrings. 
void EmitIndexOfString_LeftToRight() { RegexFindOptimizations opts = _regexTree.FindOptimizations; - Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight); + Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or + FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or + FindNextStartingPositionMode.FixedDistanceString_LeftToRight or + FindNextStartingPositionMode.LeadingStrings_LeftToRight or + FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight); using RentedLocalBuilder i = RentInt32Local(); - // int i = inputSpan.Slice(pos).IndexOf(prefix); + // int i = inputSpan.Slice(pos)... Ldloca(inputSpan); Ldloc(pos); if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight && @@ -763,11 +769,21 @@ void EmitIndexOfString_LeftToRight() Add(); } Call(s_spanSliceIntMethod); - string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? - opts.LeadingPrefix : - opts.FixedDistanceLiteral.String!; - LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal); - Call(s_spanIndexOfAnySearchValuesString); + + // ...IndexOfAny(searchValues); + if (opts.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight) + { + LoadSearchValues(opts.LeadingPrefixes, opts.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight ?
StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal); + Call(s_spanIndexOfAnySearchValuesString); + } + else + { + string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? + opts.LeadingPrefix : + opts.FixedDistanceLiteral.String!; + LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal); + Call(s_spanIndexOfAnySearchValuesString); + } Stloc(i); // if (i < 0) goto ReturnFalse; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index f40f48e35a6d9..a8dc9f4fd0e58 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -137,7 +137,28 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) return; } - // We're now left-to-right only and looking for sets. + // We're now left-to-right only and looking for multiple prefixes and/or sets. + + // If there are multiple leading strings, we can search for any of them. + if (compiled) + { + if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: true) is { Length: > 1 } caseInsensitivePrefixes) + { + LeadingPrefixes = caseInsensitivePrefixes; + FindMode = FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight; + return; + } + + // TODO: While some benchmarks benefit from this significantly, others regressed a bit (in particular those with few + // matches). 
Before enabling this, we need to investigate the performance impact on real-world scenarios, + // and see if there are ways to reduce the impact. + //if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: false) is { Length: > 1 } caseSensitivePrefixes) + //{ + // LeadingPrefixes = caseSensitivePrefixes; + // FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight; + // return; + //} + } // Build up a list of all of the sets that are a fixed distance from the start of the expression. List? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter); @@ -244,6 +265,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) /// Gets the leading prefix. May be an empty string. public string LeadingPrefix { get; } = string.Empty; + /// Gets the leading prefixes. May be an empty array. + public string[] LeadingPrefixes { get; } = Array.Empty(); + /// When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern. public (char Char, string? String, int Distance) FixedDistanceLiteral { get; } @@ -767,10 +791,16 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, return false; } + // Not supported in the interpreter, but we could end up here for patterns so complex the compiler gave up on them. + + case FindNextStartingPositionMode.LeadingStrings_LeftToRight: + case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight: + return true; + // Nothing special to look for. Just return true indicating this is a valid position to try to match. default: - Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch); + Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch, $"Unexpected FindMode {FindMode}"); return true; } } @@ -810,6 +840,11 @@ internal enum FindNextStartingPositionMode /// A multi-character ordinal case-insensitive substring at the beginning of the pattern. 
LeadingString_OrdinalIgnoreCase_LeftToRight, + /// Multiple leading prefix strings, any one of which may begin a match. + LeadingStrings_LeftToRight, + /// Multiple leading ordinal case-insensitive prefix strings, any one of which may begin a match. + LeadingStrings_OrdinalIgnoreCase_LeftToRight, + /// A set starting the pattern. LeadingSet_LeftToRight, /// A set starting the right-to-left pattern. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 5445f696423e4..335f9165856ff 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2561,14 +2561,7 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil { // In particular we want to look for sets that contain only the upper and lowercase variant // of the same ASCII letter. - if (RegexCharClass.IsNegated(child.Str!)
|| - RegexCharClass.GetSetChars(child.Str!, twoChars) != 2 || - twoChars[0] >= 128 || - twoChars[1] >= 128 || - twoChars[0] == twoChars[1] || - !char.IsLetter(twoChars[0]) || - !char.IsLetter(twoChars[1]) || - ((twoChars[0] | 0x20) != (twoChars[1] | 0x20))) + if (!RegexCharClass.SetContainsAsciiOrdinalIgnoreCaseCharacter(child.Str!, twoChars)) { break; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 1658e5bcdf2ad..35956b449390d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -11,6 +11,292 @@ namespace System.Text.RegularExpressions /// Detects various forms of prefixes in the regular expression that can help FindFirstChars optimize its search. internal static class RegexPrefixAnalyzer { + /// Finds an array of multiple prefixes that a node can begin with. + /// The node to search. + /// true to find ordinal ignore-case prefixes; false for case-sensitive. + /// + /// If a fixed set of prefixes is found, such that a match for this node is guaranteed to begin + /// with one of those prefixes, an array of those prefixes is returned. Otherwise, null. + /// + public static string[]? FindPrefixes(RegexNode node, bool ignoreCase) + { + // Minimum string length for prefixes to be useful. If any prefix has length 1, + // then we're generally better off just using IndexOfAny with chars. + const int MinPrefixLength = 2; + + // Arbitrary string length limit (with some wiggle room) to avoid creating strings that are longer than is useful and consuming too much memory. + const int MaxPrefixLength = 8; + + // Arbitrary limit on the number of prefixes to find. 
If we find more than this, we're likely to be spending too much time finding prefixes that won't be useful. + const int MaxPrefixes = 16; + + // Analyze the node to find prefixes. + List results = [new StringBuilder()]; + FindPrefixesCore(node, results, ignoreCase); + + // If we found too many prefixes or if any found is too short, fail. + if (results.Count > MaxPrefixes || !results.TrueForAll(sb => sb.Length >= MinPrefixLength)) + { + return null; + } + + // Return the prefixes. + string[] resultStrings = new string[results.Count]; + for (int i = 0; i < results.Count; i++) + { + resultStrings[i] = results[i].ToString(); + } + return resultStrings; + + // + // Updates the results list with found prefixes. All existing strings in the list are treated as existing + // discovered prefixes prior to the node being processed. The method returns true if subsequent nodes after + // this one should be examined, or returns false if they shouldn't be because the node wasn't guaranteed + // to be fully processed. + // + static bool FindPrefixesCore(RegexNode node, List results, bool ignoreCase) + { + // If we're too deep to analyze further, we can't trust what we've already computed, so stop iterating. + // Also bail if any of our results is already hitting the threshold + if (!StackHelper.TryEnsureSufficientExecutionStack() || + !results.TrueForAll(sb => sb.Length < MaxPrefixLength)) + { + return false; + } + + // These limits are approximations. We'll stop trying to make strings longer once we exceed the max length, + // and if we exceed the max number of prefixes by a non-trivial amount, we'll fail the operation. + Span setChars = stackalloc char[MaxPrefixes]; // limit how many chars we get from a set based on the max prefixes we care about + + // Loop down the left side of the tree, looking for a starting node we can handle. 
We only loop through + // atomic and capture nodes, as the child is guaranteed to execute once, as well as loops with a positive + // minimum and thus at least one guaranteed iteration. + while (true) + { + switch (node.Kind) + { + // These nodes are all guaranteed to execute at least once, so we can just + // skip through them to their child. + case RegexNodeKind.Atomic: + case RegexNodeKind.Capture: + case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M > 0: + node = node.Child(0); + continue; + + // Zero-width anchors and assertions don't impact a prefix and may be skipped over. + case RegexNodeKind.Bol: + case RegexNodeKind.Eol: + case RegexNodeKind.Boundary: + case RegexNodeKind.ECMABoundary: + case RegexNodeKind.NonBoundary: + case RegexNodeKind.NonECMABoundary: + case RegexNodeKind.Beginning: + case RegexNodeKind.Start: + case RegexNodeKind.EndZ: + case RegexNodeKind.End: + case RegexNodeKind.Empty: + case RegexNodeKind.UpdateBumpalong: + case RegexNodeKind.PositiveLookaround: + case RegexNodeKind.NegativeLookaround: + return true; + + // If we hit a single character, we can just return that character. + // This is only relevant for case-sensitive searches, as for case-insensitive we'd have sets for anything + // that produces a different result when case-folded, or for strings composed entirely of characters that + // don't participate in case conversion. Single character loops are handled the same as single characters + // up to the min iteration limit. We can continue processing after them as well if they're repeaters such + // that their min and max are the same. + case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or RegexNodeKind.Oneloopatomic when !ignoreCase || !RegexCharClass.ParticipatesInCaseConversion(node.Ch): + { + int reps = node.Kind is RegexNodeKind.One ? 
1 : node.M; + foreach (StringBuilder sb in results) + { + sb.Append(node.Ch, reps); + } + } + return node.Kind is RegexNodeKind.One || node.M == node.N; + + // If we hit a string, we can just return that string. + // As with One above, this is only relevant for case-sensitive searches. + case RegexNodeKind.Multi: + if (!ignoreCase) + { + foreach (StringBuilder sb in results) + { + sb.Append(node.Str); + } + } + else + { + // If we're ignoring case, then only append up through characters that don't participate in case conversion. + // If there are any beyond that, we can't go further and need to stop with what we have. + foreach (char c in node.Str!) + { + if (RegexCharClass.ParticipatesInCaseConversion(c)) + { + return false; + } + + foreach (StringBuilder sb in results) + { + sb.Append(c); + } + } + } + return true; + + // For case-sensitive, try to extract the characters that comprise it, and if there are + // any and there aren't more than the max number of prefixes, we can return + // them each as a prefix. Effectively, this is an alternation of the characters + // that comprise the set. For case-insensitive, we need the set to be two ASCII letters that case fold to the same thing. + // As with One and loops, set loops are handled the same as sets up to the min iteration limit. + case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic when !RegexCharClass.IsNegated(node.Str!): // negated sets are too complex to analyze + { + int charCount = RegexCharClass.GetSetChars(node.Str!, setChars); + if (charCount == 0) + { + return false; + } + + int reps = node.Kind is RegexNodeKind.Set ? 1 : node.M; + if (!ignoreCase) + { + int existingCount = results.Count; + + // Duplicate all of the existing strings for all of the new suffixes, other than the first. 
+ foreach (char suffix in setChars.Slice(1, charCount - 1)) + { + for (int existing = 0; existing < existingCount; existing++) + { + StringBuilder newSb = new StringBuilder().Append(results[existing]); + newSb.Append(suffix, reps); + results.Add(newSb); + } + } + + // Then append the first suffix to all of the existing strings. + for (int existing = 0; existing < existingCount; existing++) + { + results[existing].Append(setChars[0], reps); + } + } + else + { + // For ignore-case, we currently only handle the simple (but common) case of a set containing + // exactly the two case variants of a single ASCII letter. + if (!RegexCharClass.SetContainsAsciiOrdinalIgnoreCaseCharacter(node.Str!, setChars)) + { + return false; + } + + // Append it to each. + foreach (StringBuilder sb in results) + { + sb.Append(setChars[1], reps); + } + } + } + return node.Kind is RegexNodeKind.Set || node.N == node.M; + + case RegexNodeKind.Concatenate: + { + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + // Atomic and Capture nodes don't impact prefixes, so skip through them. + // Unlike earlier, however, we can't skip through loops, as a loop with + // more than one iteration impacts the matched sequence for the concatenation, + // and since we need a minimum of one, we'd only be able to skip a loop with + // both a min and max of 1, which in general is removed as superfluous during + // tree optimization. We could keep track of having traversed a loop and then + // stop processing the continuation after that, but that complexity isn't + // currently worthwhile. + if (!FindPrefixesCore(SkipThroughAtomicAndCapture(node.Child(i)), results, ignoreCase)) + { + return false; + } + } + } + return true; + + // For alternations, we need to find a prefix for every branch; if we can't compute a + // prefix for any one branch, we can't trust the results and need to give up, since we don't + // know if our set of prefixes is complete.
+ case RegexNodeKind.Alternate: + { + // If there are more children than our maximum, just give up immediately, as we + // won't be able to get a prefix for every branch and have it be within our max. + int childCount = node.ChildCount(); + Debug.Assert(childCount >= 2); // otherwise it would have been optimized out + if (childCount > MaxPrefixes) + { + return false; + } + + // Build up the list of all prefixes across all branches. + List? allBranchResults = null; + List? alternateBranchResults = [new StringBuilder()]; + for (int i = 0; i < childCount; i++) + { + _ = FindPrefixesCore(node.Child(i), alternateBranchResults, ignoreCase); + + Debug.Assert(alternateBranchResults.Count > 0); + foreach (StringBuilder sb in alternateBranchResults) + { + if (sb.Length == 0) + { + return false; + } + } + + if (allBranchResults is null) + { + allBranchResults = alternateBranchResults; + alternateBranchResults = [new StringBuilder()]; + } + else + { + allBranchResults.AddRange(alternateBranchResults); + alternateBranchResults.Clear(); + alternateBranchResults.Add(new StringBuilder()); + } + } + + // At this point, we know we can successfully incorporate the alternation's results + // into the main results. + + // Duplicate all of the existing strings for all of the new suffixes, other than the first. + int existingCount = results.Count; + for (int i = 1; i < allBranchResults!.Count; i++) + { + StringBuilder suffix = allBranchResults[i]; + for (int existing = 0; existing < existingCount; existing++) + { + StringBuilder newSb = new StringBuilder().Append(results[existing]); + newSb.Append(suffix); + results.Add(newSb); + } + } + + // Then append the first suffix to all of the existing strings. + for (int existing = 0; existing < existingCount; existing++) + { + results[existing].Append(allBranchResults[0]); + } + } + + // We don't know that we fully processed every branch, so we can't iterate through what comes after this node. 
+ // The results were successfully updated, but return false to indicate that nothing after this node should be examined. + return false; + + // Something else we don't recognize, so stop iterating. + default: + return false; + } + } + } + } + /// Computes the leading substring in ; may be empty. public static string FindPrefix(RegexNode node) { @@ -787,10 +1073,7 @@ public static (RegexNode LoopNode, (char Char, string? String, StringComparison // Find the first concatenation. We traverse through atomic and capture nodes as they don't effect flow control. (We don't // want to explore loops, even if they have a guaranteed iteration, because we may use information about the node to then // skip the node's execution in the matching algorithm, and we would need to special-case only skipping the first iteration.) - while (node.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture) - { - node = node.Child(0); - } + node = SkipThroughAtomicAndCapture(node); if (node.Kind != RegexNodeKind.Concatenate) { return null; @@ -1014,6 +1297,16 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le } } + /// Walk through a node's children as long as the nodes are atomic or capture. + private static RegexNode SkipThroughAtomicAndCapture(RegexNode node) + { + while (node.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture) + { + node = node.Child(0); + } + return node; + } + /// Percent occurrences in source text (100 * char count / total count). 
private static ReadOnlySpan Frequency => [ diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexPrefixAnalyzerTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexPrefixAnalyzerTests.cs index 9c592d7c57f60..acc77b5a49c0d 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexPrefixAnalyzerTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexPrefixAnalyzerTests.cs @@ -70,6 +70,69 @@ public void FindFirstCharClass_StressDeep() FindFirstCharClass(string.Concat(Enumerable.Repeat($"(a?", nesting).Concat(Enumerable.Repeat(")*", nesting))), 0, null); } + [Theory] + // case-sensitive + [InlineData("abc", new[] { "abc" }, false)] + [InlineData("(abc+|bcd+)", new[] { "abc", "bcd" }, false)] + [InlineData("(ab+c|bcd+)", new[] { "ab", "bcd" }, false)] + [InlineData("(ab+c|bcd+)*", null, false)] + [InlineData("(ab+c|bcd+)+", new[] { "ab", "bcd" }, false)] + [InlineData("(ab+c|bcd+){3,5}", new[] { "ab", "bcd" }, false)] + [InlineData("abc|def", new[] { "abc", "def" }, false)] + [InlineData("ab{4}c|def{5}|g{2,4}h", new[] { "abbbbc", "defffff", "gg" }, false)] + [InlineData("abc|def|(ghi|jklm)", new[] { "abc", "def", "ghi", "jklm" }, false)] + [InlineData("abc[def]ghi", new[] { "abcdghi", "abceghi", "abcfghi" }, false)] + [InlineData("abc[def]ghi|[jkl]m", new[] { "abcdghi", "abceghi", "abcfghi", "jm", "km", "lm" }, false)] + [InlineData("agggtaaa|tttaccct", new[] { "agggtaaa", "tttaccct" }, false)] + [InlineData("[cgt]gggtaaa|tttaccc[acg]", new[] { "cgggtaaa", "ggggtaaa", "tgggtaaa", "tttaccca", "tttacccc", "tttacccg" }, false)] + [InlineData("a[act]ggtaaa|tttacc[agt]t", new[] { "aaggtaaa", "acggtaaa", "atggtaaa", "tttaccat", "tttaccgt", "tttacctt" }, false)] + [InlineData("ag[act]gtaaa|tttac[agt]ct", new[] { "agagtaaa", "agcgtaaa", "agtgtaaa", "tttacact", "tttacgct", "tttactct" }, false)] + [InlineData("agg[act]taaa|ttta[agt]cct", new[] { "aggataaa", "aggctaaa", "aggttaaa", 
"tttaacct", "tttagcct", "tttatcct" }, false)] + [InlineData(@"\b(abc|def)\b", new[] { "abc", "def" }, false)] + [InlineData("^(abc|def)$", new[] { "abc", "def" }, false)] + [InlineData("abcdefg|h", null, false)] + [InlineData("abc[def]ghi|[jkl]", null, false)] + [InlineData("[12][45][789]", new[] { "147", "148", "149", "157", "158", "159", "247", "248", "249", "257", "258", "259" }, false)] + [InlineData("[12]a[45]b[789]c", new[] { "1a4b7c", "1a4b8c", "1a4b9c", "1a5b7c", "1a5b8c", "1a5b9c", "2a4b7c", "2a4b8c", "2a4b9c", "2a5b7c", "2a5b8c", "2a5b9c" }, false)] + // case-insensitive + [InlineData("[Aa][Bb][Cc]", new[] { "abc" }, true)] + [InlineData("[Aa][Bbc][Cc]", null, true)] + [InlineData(":[Aa]![Bb]@", new[] { ":a!b@" }, true)] + [InlineData("(?i)abc", new[] { "abc" }, true)] + [InlineData("(?i)(abc+|bcd+)", new[] { "abc", "bcd" }, true)] + [InlineData("(?i)(ab+c|bcd+)", new[] { "ab", "bcd" }, true)] + [InlineData("(?i)(ab+c|bcd+)*", null, true)] + [InlineData("(?i)(ab+c|bcd+)+", new[] { "ab", "bcd" }, true)] + [InlineData("(?i)(ab+c|bcd+){3,5}", new[] { "ab", "bcd" }, true)] + [InlineData("(?i)abc|def", new[] { "abc", "def" }, true)] + [InlineData("(?i)ab{4}c|def{5}|g{2,4}h", new[] { "abbbbc", "defffff", "gg" }, true)] + [InlineData("(?i)(((?>abc)|(?>def)))", new[] { "abc", "def" }, true)] + [InlineData("(?i)(abc|def|(ghi|jklm))", null, true)] + [InlineData("(?i)(abc|def|(ghi|jlmn))", new[] { "abc", "def", "ghi", "jlmn" }, true)] + [InlineData("abc", null, true)] + [InlineData("abc|def", null, true)] + [InlineData("abc|def|(ghi|jklm)", null, true)] + [InlineData("://[Aa][Bb]|[Cc]@!", new[] { "://ab", "c@!" 
}, true)] + public void FindPrefixes(string pattern, string[] expectedSet, bool ignoreCase) + { + RegexTree tree = RegexParser.Parse(pattern, RegexOptions.None, CultureInfo.InvariantCulture); + string[] actual = RegexPrefixAnalyzer.FindPrefixes(tree.Root, ignoreCase); + + if (expectedSet is null) + { + Assert.Null(actual); + } + else + { + Assert.NotNull(actual); + + Array.Sort(actual, StringComparer.Ordinal); + Array.Sort(expectedSet, StringComparer.Ordinal); + + Assert.Equal(expectedSet, actual); + } + } + private static string FormatSet(string set) { if (set is null)