diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index 05116c6861b7f..81e4a512a99d5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -105,16 +105,7 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C #if DEBUG if (IsDebug) { - Debug.Write($"Pattern: {pattern}"); - RegexOptions displayOptions = options & ~RegexOptions.Debug; - if (displayOptions != RegexOptions.None) - { - Debug.Write($"Options: {displayOptions}"); - } - if (matchTimeout != InfiniteMatchTimeout) - { - Debug.Write($"Timeout: {matchTimeout}"); - } + Debug.WriteLine($"Pattern: {pattern} Options: {options & ~RegexOptions.Debug} Timeout: {(matchTimeout == InfiniteMatchTimeout ? "infinite" : matchTimeout.ToString())}"); } #endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs index 9897fae933326..fc0193ace723b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs @@ -334,37 +334,35 @@ public int Scan(string text, int index, int beglimit, int endlimit) } #if DEBUG - /// - /// Used when dumping for debugging. - /// + /// Used when dumping for debugging. [ExcludeFromCodeCoverage] public override string ToString() => Pattern; [ExcludeFromCodeCoverage] public string Dump(string indent) { - StringBuilder sb = new StringBuilder(); + var sb = new StringBuilder(); sb.AppendLine($"{indent}BM Pattern: {Pattern}"); - sb.Append(indent + "Positive: "); - for (int i = 0; i < Positive.Length; i++) + + sb.Append($"{indent}Positive: "); + foreach (int i in Positive) { - sb.Append(Positive[i].ToString(CultureInfo.InvariantCulture) + " "); + sb.Append($"{i} "); } sb.AppendLine(); if (NegativeASCII != null) { - sb.Append(indent + "Negative table: "); + sb.Append($"{indent}Negative table: "); for (int i = 0; i < NegativeASCII.Length; i++) { if (NegativeASCII[i] != Pattern.Length) { - sb.Append(" {" + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + NegativeASCII[i].ToString(CultureInfo.InvariantCulture) + "}"); + sb.Append($" {{{Regex.Escape(((char)i).ToString())} {NegativeASCII[i]}}}"); } } } - sb.AppendLine(); return sb.ToString(); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 76288af6a890b..5d5f8be42d1a3 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -735,7 +735,7 @@ public static char SingletonChar(string set) return set[SetStartIndex]; } - public static bool IsMergeable(string? charClass) => + public static bool IsMergeable(string charClass) => charClass != null && !IsNegated(charClass) && !IsSubtraction(charClass); @@ -1541,7 +1541,7 @@ public static string SetDescription(string set) int categoryLength = set[CategoryLengthIndex]; int endPosition = SetStartIndex + setLength + categoryLength; - StringBuilder desc = new StringBuilder(); + var desc = new StringBuilder(); desc.Append('['); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs index e88cc04b5a90a..83b008818f2c8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs @@ -285,7 +285,7 @@ private static string OperatorDescription(int Opcode) [ExcludeFromCodeCoverage] public string OpcodeDescription(int offset) { - StringBuilder sb = new StringBuilder(); + var sb = new StringBuilder(); int opcode = Codes[offset]; sb.AppendFormat("{0:D6} ", offset); @@ -307,7 +307,7 @@ public string OpcodeDescription(int offset) case Notoneloopatomic: case Onelazy: case Notonelazy: - sb.Append(RegexCharClass.CharDescription((char)Codes[offset + 1])); + sb.Append("'").Append(RegexCharClass.CharDescription((char)Codes[offset + 1])).Append("'"); break; case Set: @@ -319,7 +319,7 @@ public string OpcodeDescription(int offset) break; case Multi: - sb.Append(Strings[Codes[offset + 1]]); + sb.Append('"').Append(Strings[Codes[offset + 1]]).Append('"'); break; case Ref: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 9fd58a334baf1..bcbfe8324c277 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -1772,7 +1772,7 @@ static bool NodeSupportsNonBacktrackingImplementation(RegexNode node, int maxDep // Its children must all also be supported. case RegexNode.Alternate: if (node.Next != null && - (node.Next.Type == RegexNode.Atomic || // atomic alternate + (node.IsAtomicByParent() || // atomic alternate (node.Next.Type == RegexNode.Capture && node.Next.Next is null))) // root alternate { goto case RegexNode.Concatenate; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs index dca3bf774ed59..da8d2e59d9f7a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs @@ -227,29 +227,20 @@ private static int AnchorFromType(int type) => [ExcludeFromCodeCoverage] public static string AnchorDescription(int anchors) { - StringBuilder sb = new StringBuilder(); - - if (0 != (anchors & Beginning)) - sb.Append(", Beginning"); - if (0 != (anchors & Start)) - sb.Append(", Start"); - if (0 != (anchors & Bol)) - sb.Append(", Bol"); - if (0 != (anchors & Boundary)) - sb.Append(", Boundary"); - if (0 != (anchors & ECMABoundary)) - sb.Append(", ECMABoundary"); - if (0 != (anchors & Eol)) - sb.Append(", Eol"); - if (0 != (anchors & End)) - sb.Append(", End"); - if (0 != (anchors & EndZ)) - sb.Append(", EndZ"); - - if (sb.Length >= 2) - return (sb.ToString(2, sb.Length - 2)); - - return "None"; + var sb = new StringBuilder(); + + if ((anchors & Beginning) != 0) sb.Append(", Beginning"); + if ((anchors & Start) != 0) sb.Append(", Start"); + if ((anchors & Bol) != 0) sb.Append(", Bol"); + if ((anchors & Boundary) != 0) sb.Append(", Boundary"); + if ((anchors & ECMABoundary) != 0) sb.Append(", ECMABoundary"); + if ((anchors & Eol) != 0) sb.Append(", Eol"); + if ((anchors & End) != 0) sb.Append(", End"); + if ((anchors & EndZ) != 0) sb.Append(", EndZ"); + + return sb.Length >= 2 ? + sb.ToString(2, sb.Length - 2) : + "None"; } #endif diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 06cca891cfe13..5c656c64a43b2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -227,9 +227,13 @@ internal RegexNode FinalOptimize() case Alternate: case Loop: case Lazyloop: - var atomic = new RegexNode(Atomic, Options); - atomic.AddChild(existingChild); - node.ReplaceChild(node.ChildCount() - 1, atomic); + // Make the node atomic if it isn't already (as conferred by a parent node being atomic). + if (!existingChild.IsAtomicByParent()) + { + var atomic = new RegexNode(Atomic, Options); + atomic.AddChild(existingChild); + node.ReplaceChild(node.ChildCount() - 1, atomic); + } break; } continue; @@ -299,7 +303,7 @@ internal RegexNode FinalOptimize() // Optimization: Unnecessary root atomic. // If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing // to backtrack into it, so we can remove it. - if (rootNode.Child(0).Type == Atomic) + while (rootNode.Child(0).Type == Atomic) { rootNode.ReplaceChild(0, rootNode.Child(0).Child(0)); } @@ -308,6 +312,29 @@ internal RegexNode FinalOptimize() return rootNode; } + /// Whether this node is considered to be atomic based on its parent. + /// + /// This is used to determine whether additional atomic nodes may be valuable to + /// be introduced into the tree. It should not be used to determine for sure whether + /// a node will be backtracked into. + /// + public bool IsAtomicByParent() + { + RegexNode? next = Next; + if (next is null) return false; + if (next.Type == Atomic) return true; + + // We only walk up one group as a balance between optimization and cost. + if ((next.Type != Concatenate && next.Type != Capture) || + next.Child(next.ChildCount() - 1) != this) + { + return false; + } + + next = next.Next; + return next != null && next.Type == Atomic; + } + /// /// Removes redundant nodes from the subtree, and returns a reduced subtree. /// @@ -545,137 +572,321 @@ private RegexNode ReduceSet() return this; } - /// - /// Combine adjacent sets/chars. - /// Basic optimization. Single-letter alternations can be replaced - /// by faster set specifications, and nested alternations with no - /// intervening operators can be flattened: - /// - /// a|b|c|def|g|h -> [a-c]|def|[gh] - /// apple|(?:orange|pear)|grape -> apple|orange|pear|grape - /// + /// Optimize an alternation. private RegexNode ReduceAlternation() { - int childCount = ChildCount(); - if (childCount == 0) + switch (ChildCount()) { - return new RegexNode(Nothing, Options); - } + case 0: + return new RegexNode(Nothing, Options); - if (childCount == 1) - { - return Child(0); - } + case 1: + return Child(0); - bool wasLastSet = false; - bool lastNodeCannotMerge = false; - RegexOptions optionsLast = 0; - RegexOptions optionsAt; - int i; - int j; - RegexNode at; - RegexNode prev; + default: + ReduceSingleLetterAndNestedAlternations(); + RegexNode newThis = StripEnation(Nothing); + return newThis != this ? newThis : ExtractCommonPrefix(); + } - List children = (List)Children!; - for (i = 0, j = 0; i < children.Count; i++, j++) + // This function performs two optimizations: + // - Single-letter alternations can be replaced by faster set specifications + // e.g. "a|b|c|def|g|h" -> "[a-c]|def|[gh]" + // - Nested alternations with no intervening operators can be flattened: + // e.g. "apple|(?:orange|pear)|grape" -> "apple|orange|pear|grape" + void ReduceSingleLetterAndNestedAlternations() { - at = children[i]; + bool wasLastSet = false; + bool lastNodeCannotMerge = false; + RegexOptions optionsLast = 0; + RegexOptions optionsAt; + int i; + int j; + RegexNode at; + RegexNode prev; + + List children = (List)Children!; + for (i = 0, j = 0; i < children.Count; i++, j++) + { + at = children[i]; - if (j < i) - children[j] = at; + if (j < i) + children[j] = at; - while (true) - { - if (at.Type == Alternate) + while (true) { - if (at.Children is List atChildren) + if (at.Type == Alternate) { - for (int k = 0; k < atChildren.Count; k++) + if (at.Children is List atChildren) + { + for (int k = 0; k < atChildren.Count; k++) + { + atChildren[k].Next = this; + } + children.InsertRange(i + 1, atChildren); + } + else { - atChildren[k].Next = this; + RegexNode atChild = (RegexNode)at.Children!; + atChild.Next = this; + children.Insert(i + 1, atChild); } - children.InsertRange(i + 1, atChildren); + j--; } - else + else if (at.Type == Set || at.Type == One) { - RegexNode atChild = (RegexNode)at.Children!; - atChild.Next = this; - children.Insert(i + 1, atChild); - } - j--; - } - else if (at.Type == Set || at.Type == One) - { - // Cannot merge sets if L or I options differ, or if either are negated. - optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase); + // Cannot merge sets if L or I options differ, or if either are negated. + optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase); - if (at.Type == Set) - { - if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at.Str)) + if (at.Type == Set) + { + if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at.Str!)) + { + wasLastSet = true; + lastNodeCannotMerge = !RegexCharClass.IsMergeable(at.Str!); + optionsLast = optionsAt; + break; + } + } + else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge) { wasLastSet = true; - lastNodeCannotMerge = !RegexCharClass.IsMergeable(at.Str); + lastNodeCannotMerge = false; optionsLast = optionsAt; break; } + + + // The last node was a Set or a One, we're a Set or One and our options are the same. + // Merge the two nodes. + j--; + prev = children[j]; + + RegexCharClass prevCharClass; + if (prev.Type == One) + { + prevCharClass = new RegexCharClass(); + prevCharClass.AddChar(prev.Ch); + } + else + { + prevCharClass = RegexCharClass.Parse(prev.Str!); + } + + if (at.Type == One) + { + prevCharClass.AddChar(at.Ch); + } + else + { + RegexCharClass atCharClass = RegexCharClass.Parse(at.Str!); + prevCharClass.AddCharClass(atCharClass); + } + + prev.Type = Set; + prev.Str = prevCharClass.ToStringClass(); + } + else if (at.Type == Nothing) + { + j--; } - else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge) + else { - wasLastSet = true; + wasLastSet = false; lastNodeCannotMerge = false; - optionsLast = optionsAt; - break; } + break; + } + } + + if (j < i) + { + children.RemoveRange(j, i - j); + } + } + // Analyzes all the branches of the alternation for text that's identical at the beginning + // of every branch. That text is then pulled out into its own one or multi node in a + // concatenation with the alternation (whose branches are updated to remove that prefix). + // This is valuable for a few reasons. One, it exposes potentially more text to the + // expression prefix analyzer used to influence FindFirstChar. Second, it exposes more + // potential alternation optimizations, e.g. if the same prefix is followed in two branches + // by sets that can be merged. Third, it reduces the amount of duplicated comparisons required + // if we end up backtracking into subsequent branches. + RegexNode ExtractCommonPrefix() + { + // To keep things relatively simple, we currently only handle: + // - Branches that are one or multi nodes, or that are concatenations beginning with one or multi nodes. + // - All branches having the same options. + // - Text, rather than also trying to combine identical sets that start each branch. + + Debug.Assert(Children is List); + var children = (List)Children; + Debug.Assert(children.Count >= 2); + + // Process the first branch to get the maximum possible common string. + RegexNode? startingNode = FindBranchOneMultiStart(children[0]); + if (startingNode is null) + { + return this; + } - // The last node was a Set or a One, we're a Set or One and our options are the same. - // Merge the two nodes. - j--; - prev = children[j]; + RegexOptions startingNodeOptions = startingNode.Options; + string? originalStartingString = startingNode.Str; + ReadOnlySpan startingSpan = startingNode.Type == One ? stackalloc char[1] { startingNode.Ch } : (ReadOnlySpan)originalStartingString; + Debug.Assert(startingSpan.Length > 0); + + // Now compare the rest of the branches against it. + for (int i = 1; i < children.Count; i++) + { + // Get the starting node of the next branch. + startingNode = FindBranchOneMultiStart(children[i]); + if (startingNode is null || startingNode.Options != startingNodeOptions) + { + return this; + } - RegexCharClass prevCharClass; - if (prev.Type == One) + // See if the new branch's prefix has a shared prefix with the current one. + // If it does, shorten to that; if it doesn't, bail. + if (startingNode.Type == One) + { + if (startingSpan[0] != startingNode.Ch) { - prevCharClass = new RegexCharClass(); - prevCharClass.AddChar(prev.Ch); + return this; } - else + + if (startingSpan.Length != 1) { - prevCharClass = RegexCharClass.Parse(prev.Str!); + startingSpan = startingSpan.Slice(0, 1); } + } + else + { + Debug.Assert(startingNode.Type == Multi); + Debug.Assert(startingNode.Str!.Length > 0); - if (at.Type == One) + int minLength = Math.Min(startingSpan.Length, startingNode.Str.Length); + int c = 0; + while (c < minLength && startingSpan[c] == startingNode.Str[c]) c++; + if (c == 0) { - prevCharClass.AddChar(at.Ch); + return this; + } + + startingSpan = startingSpan.Slice(0, c); + } + } + + // If we get here, we have a starting string prefix shared by all branches. + Debug.Assert(startingSpan.Length > 0); + + // Now remove the prefix from each branch. + for (int i = 0; i < children.Count; i++) + { + RegexNode branch = children[i]; + if (branch.Type == Concatenate) + { + ProcessOneOrMulti(branch.Child(0), startingSpan); + ReplaceChild(i, branch.Reduce()); + } + else + { + ProcessOneOrMulti(branch, startingSpan); + } + + // Remove the starting text from the one or multi node. This may end up changing + // the type of the node to be Empty if the starting text matches the node's full value. + static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan startingSpan) + { + if (node.Type == One) + { + Debug.Assert(startingSpan.Length == 1); + Debug.Assert(startingSpan[0] == node.Ch); + node.Type = Empty; + node.Ch = '\0'; } else { - RegexCharClass atCharClass = RegexCharClass.Parse(at.Str!); - prevCharClass.AddCharClass(atCharClass); + Debug.Assert(node.Type == Multi); + Debug.Assert(node.Str.AsSpan().StartsWith(startingSpan, StringComparison.Ordinal)); + if (node.Str!.Length == startingSpan.Length) + { + node.Type = Empty; + node.Str = null; + } + else if (node.Str.Length - 1 == startingSpan.Length) + { + node.Type = One; + node.Ch = node.Str[^1]; + node.Str = null; + } + else + { + node.Str = node.Str.Substring(startingSpan.Length); + } } + } + } - prev.Type = Set; - prev.Str = prevCharClass.ToStringClass(); + // We may have changed multiple branches to be Empty, but we only need to keep + // the first (keeping the rest would just duplicate work in backtracking, though + // it would also mean the original regex had at least two identical branches). + for (int firstEmpty = 0; firstEmpty < children.Count; firstEmpty++) + { + if (children[firstEmpty].Type != Empty) + { + continue; } - else if (at.Type == Nothing) + + // Found the first empty. Now starting after it, remove all subsequent found Empty nodes, + // pushing everything else down. (In the future, should we want to there's also the opportunity + // here to remove other duplication, but such duplication is a more egregious mistake on the + // part of the expression author.) + int i = firstEmpty + 1; + int j = i; + while (i < children.Count) { - j--; + if (children[i].Type != Empty) + { + if (j != i) + { + children[j] = children[i]; + } + j++; + } + i++; } - else + + if (j < i) { - wasLastSet = false; - lastNodeCannotMerge = false; + children.RemoveRange(j, i - j); } + break; } - } - if (j < i) - { - children.RemoveRange(j, i - j); - } + var concat = new RegexNode(Concatenate, Options); // use same options as the Alternate + concat.AddChild(startingSpan.Length == 1 ? // use same options as the branches + new RegexNode(One, startingNodeOptions) { Ch = startingSpan[0] } : + new RegexNode(Multi, startingNodeOptions) { Str = originalStartingString?.Length == startingSpan.Length ? originalStartingString : startingSpan.ToString() }); + concat.AddChild(this); // this will re-reduce the node, allowing for newly exposed possible optimizations in what came after the prefix + return concat; + + // Finds the starting one or multi of the branch, if it has one; otherwise, returns null. + // For simplicity, this only considers branches that are One or Multi, or a Concatenation + // beginning with a One or Multi. We don't traverse more than one level to avoid the + // complication of then having to later update that hierarchy when removing the prefix, + // but it could be done in the future if proven beneficial enough. + static RegexNode? FindBranchOneMultiStart(RegexNode branch) + { + if (branch.Type == Concatenate) + { + branch = branch.Child(0); + } - return StripEnation(Nothing); + return branch.Type == One || branch.Type == Multi ? branch : null; + } + } } /// @@ -1238,20 +1449,21 @@ public RegexNode MakeQuantifier(bool lazy, int min, int max) public void AddChild(RegexNode newChild) { - RegexNode reducedChild = newChild.Reduce(); - reducedChild.Next = this; + newChild.Next = this; // so that the child can see its parent while being reduced + newChild = newChild.Reduce(); + newChild.Next = this; // in case Reduce returns a different node that needs to be reparented if (Children is null) { - Children = reducedChild; + Children = newChild; } else if (Children is RegexNode currentChild) { - Children = new List() { currentChild, reducedChild }; + Children = new List() { currentChild, newChild }; } else { - ((List)Children).Add(reducedChild); + ((List)Children).Add(newChild); } } @@ -1301,7 +1513,6 @@ public int ChildCount() [ExcludeFromCodeCoverage] public string Description() { - string typeStr = Type switch { Oneloop => nameof(Oneloop), @@ -1344,17 +1555,15 @@ public string Description() _ => $"(unknown {Type})" }; - var argSb = new StringBuilder().Append(typeStr); - - if ((Options & RegexOptions.ExplicitCapture) != 0) argSb.Append("-C"); - if ((Options & RegexOptions.IgnoreCase) != 0) argSb.Append("-I"); - if ((Options & RegexOptions.RightToLeft) != 0) argSb.Append("-L"); - if ((Options & RegexOptions.Multiline) != 0) argSb.Append("-M"); - if ((Options & RegexOptions.Singleline) != 0) argSb.Append("-S"); - if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) argSb.Append("-X"); - if ((Options & RegexOptions.ECMAScript) != 0) argSb.Append("-E"); + var sb = new StringBuilder(typeStr); - argSb.Append(Indent()); + if ((Options & RegexOptions.ExplicitCapture) != 0) sb.Append("-C"); + if ((Options & RegexOptions.IgnoreCase) != 0) sb.Append("-I"); + if ((Options & RegexOptions.RightToLeft) != 0) sb.Append("-L"); + if ((Options & RegexOptions.Multiline) != 0) sb.Append("-M"); + if ((Options & RegexOptions.Singleline) != 0) sb.Append("-S"); + if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) sb.Append("-X"); + if ((Options & RegexOptions.ECMAScript) != 0) sb.Append("-E"); switch (Type) { @@ -1366,25 +1575,27 @@ public string Description() case Notonelazy: case One: case Notone: - argSb.Append(RegexCharClass.CharDescription(Ch)); + sb.Append(" '").Append(RegexCharClass.CharDescription(Ch)).Append('\''); break; case Capture: - argSb.Append("index = " + M); + sb.Append(' ').Append($"index = {M}"); if (N != -1) - argSb.Append(", unindex = " + N); + { + sb.Append($", unindex = {N}"); + } break; case Ref: case Testref: - argSb.Append("index = " + M); + sb.Append(' ').Append($"index = {M}"); break; case Multi: - argSb.Append(Str); + sb.Append(" \"").Append(Str).Append('"'); break; case Set: case Setloop: case Setloopatomic: case Setlazy: - argSb.Append(RegexCharClass.SetDescription(Str!)); + sb.Append(' ').Append(RegexCharClass.SetDescription(Str!)); break; } @@ -1401,19 +1612,16 @@ public string Description() case Setlazy: case Loop: case Lazyloop: - if (argSb[^1] != ' ') - argSb.Append(", "); - argSb.Append("min = " + M + ", max = "); - if (N == int.MaxValue) - argSb.Append("inf"); - else - argSb.Append(N); + sb.Append( + (M == 0 && N == int.MaxValue) ? "*" : + (M == 0 && N == 1) ? "?" : + (M == 1 && N == int.MaxValue) ? "+" : + (N == int.MaxValue) ? $"{{{M}, *}}" : + $"{{{M}, {N}}}"); break; } - string Indent() => new string(' ', Math.Max(1, 25 - argSb.Length)); - - return argSb.ToString(); + return sb.ToString(); } [ExcludeFromCodeCoverage] diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs index acaf1e2125844..45af2b52fc506 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs @@ -680,14 +680,18 @@ private static string StackDescription(int[] a, int index) sb.Append(a.Length); if (sb.Length < 8) + { sb.Append(' ', 8 - sb.Length); + } sb.Append('('); for (int i = index; i < a.Length; i++) { if (i > index) + { sb.Append(' '); + } sb.Append(a[i]); } @@ -704,12 +708,18 @@ internal virtual string TextposDescription() sb.Append(runtextpos); if (sb.Length < 8) + { sb.Append(' ', 8 - sb.Length); + } if (runtextpos > runtextbeg) + { sb.Append(RegexCharClass.CharDescription(runtext![runtextpos - 1])); + } else + { sb.Append('^'); + } sb.Append('>'); diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs index 133c95a3bfd51..ebca1004c7724 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs @@ -55,16 +55,20 @@ public static void RegexUnicodeChar() for (int i = 0; i < 100; i++) { - StringBuilder builder1 = new StringBuilder(); - StringBuilder builder2 = new StringBuilder(); + var builder1 = new StringBuilder(); + var builder2 = new StringBuilder(); + for (int j = 0; j < validCharLength; j++) { char c = validChars[random.Next(validChars.Count)]; builder1.Append(c); builder2.Append(c); } + for (int j = 0; j < invalidCharLength; j++) + { builder1.Append(invalidChars[random.Next(invalidChars.Count)]); + } string input = builder1.ToString(); Match match = regex.Match(input); @@ -94,18 +98,26 @@ public static void RegexUnicodeChar() for (int i = 0; i < 500; i++) { - StringBuilder builder1 = new StringBuilder(); - StringBuilder builder2 = new StringBuilder(); + var builder1 = new StringBuilder(); + var builder2 = new StringBuilder(); + for (int j = 0; j < invalidCharLength; j++) + { builder1.Append(invalidChars[random.Next(invalidChars.Count)]); + } + for (int j = 0; j < validCharLength; j++) { char c = validChars[random.Next(validChars.Count)]; builder1.Append(c); builder2.Append(c); } + for (int j = 0; j < invalidCharLength; j++) + { builder1.Append(invalidChars[random.Next(invalidChars.Count)]); + } + string input = builder1.ToString(); Match match = regex.Match(input); @@ -143,16 +155,21 @@ public static void RegexUnicodeChar() for (int i = 0; i < 100; i++) { - StringBuilder builder1 = new StringBuilder(); - StringBuilder builder2 = new StringBuilder(); + var builder1 = new StringBuilder(); + var builder2 = new StringBuilder(); + for (int j = 0; j < validCharLength; j++) { char c = validChars[random.Next(validChars.Count)]; builder1.Append(c); builder2.Append(c); } + for (int j = 0; j < invalidCharLength; j++) + { builder1.Append(invalidChars[random.Next(invalidChars.Count)]); + } + string input = builder1.ToString(); Match match = regex.Match(input); @@ -173,18 +190,26 @@ public static void RegexUnicodeChar() for (int i = 0; i < 100; i++) { - StringBuilder builder1 = new StringBuilder(); - StringBuilder builder2 = new StringBuilder(); + var builder1 = new StringBuilder(); + var builder2 = new StringBuilder(); + for (int j = 0; j < invalidCharLength; j++) + { builder1.Append(invalidChars[random.Next(invalidChars.Count)]); + } + for (int j = 0; j < validCharLength; j++) { char c = validChars[random.Next(validChars.Count)]; builder1.Append(c); builder2.Append(c); } + for (int j = 0; j < invalidCharLength; j++) + { builder1.Append(invalidChars[random.Next(invalidChars.Count)]); + } + string input = builder1.ToString(); Match match = regex.Match(input); diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index 7eb9e1d01b3bc..b622ec6c71508 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -277,8 +277,31 @@ private static int GetMinRequiredLength(Regex r) [InlineData("(?:a+){4}", "a{4,}")] [InlineData("(?:a{1,2}){4}", "a{4,8}")] // Alternation reduction + [InlineData("a|b", "[ab]")] [InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")] [InlineData("a|b|c|def|g|h", "[a-c]|def|[gh]")] + [InlineData("this|that|there|then|those", "th(?:is|at|ere|en|ose)")] + [InlineData("it's (?>this|that|there|then|those)", "it's (?>th(?:is|at|ere|en|ose))")] + [InlineData("it's (?>this|that|there|then|those)!", "it's (?>th(?:is|at|ere|en|ose))!")] + [InlineData("abcd|abce", "abc[de]")] + [InlineData("abcd|abef", "ab(?:cd|ef)")] + [InlineData("abcd|aefg", "a(?:bcd|efg)")] + [InlineData("abcd|abc|ab|a", "a(?:bcd|bc|b|)")] + [InlineData("abcde|abcdef", "abcde(?:|f)")] + [InlineData("abcdef|abcde", "abcde(?:f|)")] + [InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")] + [InlineData("(ab|ab*)bc", "(a(?:b|b*))bc")] + [InlineData("abc(?:defgh|defij)klmn", "abcdef(?:gh|ij)klmn")] + [InlineData("abc(defgh|defij)klmn", "abc(def(?:gh|ij))klmn")] + [InlineData("a[b-f]|a[g-k]", "a[b-k]")] + [InlineData("this|this", "this")] + [InlineData("this|this|this", "this")] + [InlineData("hello there|hello again|hello|hello|hello|hello", "hello(?: there| again|)")] + [InlineData("hello there|hello again|hello|hello|hello|hello|hello world", "hello(?: there| again|| world)")] + [InlineData("hello there|hello again|hello|hello|hello|hello|hello world|hello", "hello(?: there| again|| world)")] + [InlineData("abcd(?:(?i:e)|(?i:f))", "abcd(?i:[ef])")] + [InlineData("(?i:abcde)|(?i:abcdf)", "(?i:abcd[ef])")] + [InlineData("xyz(?:(?i:abcde)|(?i:abcdf))", "xyz(?i:abcd[ef])")] // Auto-atomicity [InlineData("a*b", "(?>a*)b")] [InlineData("a*b+", "(?>a*)b+")] @@ -294,11 +317,16 @@ private static int GetMinRequiredLength(Regex r) [InlineData("(?:abc*|def*)g", "(?:ab(?>c*)|de(?>f*))g")] [InlineData("(?:a[ce]*|b*)g", "(?:a(?>[ce]*)|(?>b*))g")] [InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")] + [InlineData("apple|(?:orange|pear)|grape", "apple|orange|pear|grape")] + [InlineData("(?>(?>(?>(?:abc)*)))", "(?:abc)*")] public void PatternsReduceIdentically(string pattern1, string pattern2) { AssertExtensions.Equal(GetRegexCodes(new Regex(pattern1)), GetRegexCodes(new Regex(pattern2))); - Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2))); Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2))); + if (!pattern1.Contains("?i:") && !pattern2.Contains("?i:")) + { + Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2))); + } } [Theory] @@ -341,6 +369,10 @@ public void PatternsReduceIdentically(string pattern1, string pattern2) [InlineData("[ace][ace]{0,2147483646}", "[ace]{0,2147483647}")] [InlineData("[ace]{2147482647}[ace]{1000}", "[ace]{2147483647}")] [InlineData("[ace]{0,2147482647}[ace]{0,1000}", "[ace]{0,2147483647}")] + // Not reducing branches of alternations with different casing + [InlineData("(?i:abcd)|abcd", "abcd|abcd")] + [InlineData("abcd|(?i:abcd)", "abcd|abcd")] + [InlineData("abc(?:(?i:e)|f)", "abc[ef]")] // Not applying auto-atomicity [InlineData("a*b*", "(?>a*)b*")] [InlineData("[^\n]*\n*", "(?>[^\n]*)\n")]