diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
index 05116c6861b7f..81e4a512a99d5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
@@ -105,16 +105,7 @@ private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, C
#if DEBUG
if (IsDebug)
{
- Debug.Write($"Pattern: {pattern}");
- RegexOptions displayOptions = options & ~RegexOptions.Debug;
- if (displayOptions != RegexOptions.None)
- {
- Debug.Write($"Options: {displayOptions}");
- }
- if (matchTimeout != InfiniteMatchTimeout)
- {
- Debug.Write($"Timeout: {matchTimeout}");
- }
+ Debug.WriteLine($"Pattern: {pattern} Options: {options & ~RegexOptions.Debug} Timeout: {(matchTimeout == InfiniteMatchTimeout ? "infinite" : matchTimeout.ToString())}");
}
#endif
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
index 9897fae933326..fc0193ace723b 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexBoyerMoore.cs
@@ -334,37 +334,35 @@ public int Scan(string text, int index, int beglimit, int endlimit)
}
#if DEBUG
- ///
- /// Used when dumping for debugging.
- ///
+ /// Used when dumping for debugging.
[ExcludeFromCodeCoverage]
public override string ToString() => Pattern;
[ExcludeFromCodeCoverage]
public string Dump(string indent)
{
- StringBuilder sb = new StringBuilder();
+ var sb = new StringBuilder();
sb.AppendLine($"{indent}BM Pattern: {Pattern}");
- sb.Append(indent + "Positive: ");
- for (int i = 0; i < Positive.Length; i++)
+
+ sb.Append($"{indent}Positive: ");
+ foreach (int i in Positive)
{
- sb.Append(Positive[i].ToString(CultureInfo.InvariantCulture) + " ");
+ sb.Append($"{i} ");
}
sb.AppendLine();
if (NegativeASCII != null)
{
- sb.Append(indent + "Negative table: ");
+ sb.Append($"{indent}Negative table: ");
for (int i = 0; i < NegativeASCII.Length; i++)
{
if (NegativeASCII[i] != Pattern.Length)
{
- sb.Append(" {" + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + NegativeASCII[i].ToString(CultureInfo.InvariantCulture) + "}");
+ sb.Append($" {{{Regex.Escape(((char)i).ToString())} {NegativeASCII[i]}}}");
}
}
}
-
sb.AppendLine();
return sb.ToString();
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index 76288af6a890b..5d5f8be42d1a3 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -735,7 +735,7 @@ public static char SingletonChar(string set)
return set[SetStartIndex];
}
- public static bool IsMergeable(string? charClass) =>
- charClass != null &&
- !IsNegated(charClass) &&
- !IsSubtraction(charClass);
+ /// <summary>Whether the set is non-null and neither IsNegated nor IsSubtraction reports true for it.</summary>
+ public static bool IsMergeable(string charClass) =>
+     charClass != null &&
+     !(IsNegated(charClass) || IsSubtraction(charClass));
@@ -1541,7 +1541,7 @@ public static string SetDescription(string set)
int categoryLength = set[CategoryLengthIndex];
int endPosition = SetStartIndex + setLength + categoryLength;
- StringBuilder desc = new StringBuilder();
+ var desc = new StringBuilder();
desc.Append('[');
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
index e88cc04b5a90a..83b008818f2c8 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
@@ -285,7 +285,7 @@ private static string OperatorDescription(int Opcode)
[ExcludeFromCodeCoverage]
public string OpcodeDescription(int offset)
{
- StringBuilder sb = new StringBuilder();
+ var sb = new StringBuilder();
int opcode = Codes[offset];
sb.AppendFormat("{0:D6} ", offset);
@@ -307,7 +307,7 @@ public string OpcodeDescription(int offset)
case Notoneloopatomic:
case Onelazy:
case Notonelazy:
- sb.Append(RegexCharClass.CharDescription((char)Codes[offset + 1]));
+ sb.Append('\'').Append(RegexCharClass.CharDescription((char)Codes[offset + 1])).Append('\''); // single-quote the character operand
break;
case Set:
@@ -319,7 +319,7 @@ public string OpcodeDescription(int offset)
break;
case Multi:
- sb.Append(Strings[Codes[offset + 1]]);
+ sb.Append('"').Append(Strings[Codes[offset + 1]]).Append('"');
break;
case Ref:
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index 9fd58a334baf1..bcbfe8324c277 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -1772,7 +1772,7 @@ static bool NodeSupportsNonBacktrackingImplementation(RegexNode node, int maxDep
// Its children must all also be supported.
case RegexNode.Alternate:
if (node.Next != null &&
- (node.Next.Type == RegexNode.Atomic || // atomic alternate
+ (node.IsAtomicByParent() || // atomic alternate
(node.Next.Type == RegexNode.Capture && node.Next.Next is null))) // root alternate
{
goto case RegexNode.Concatenate;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
index dca3bf774ed59..da8d2e59d9f7a 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
@@ -227,29 +227,23 @@ private static int AnchorFromType(int type) =>
[ExcludeFromCodeCoverage]
public static string AnchorDescription(int anchors)
{
- StringBuilder sb = new StringBuilder();
-
- if (0 != (anchors & Beginning))
- sb.Append(", Beginning");
- if (0 != (anchors & Start))
- sb.Append(", Start");
- if (0 != (anchors & Bol))
- sb.Append(", Bol");
- if (0 != (anchors & Boundary))
- sb.Append(", Boundary");
- if (0 != (anchors & ECMABoundary))
- sb.Append(", ECMABoundary");
- if (0 != (anchors & Eol))
- sb.Append(", Eol");
- if (0 != (anchors & End))
- sb.Append(", End");
- if (0 != (anchors & EndZ))
- sb.Append(", EndZ");
-
- if (sb.Length >= 2)
- return (sb.ToString(2, sb.Length - 2));
-
- return "None";
+ // Flag/name pairs, listed in the order their names appear in the description.
+ (int, string)[] knownAnchors =
+ {
+     (Beginning, "Beginning"), (Start, "Start"), (Bol, "Bol"), (Boundary, "Boundary"),
+     (ECMABoundary, "ECMABoundary"), (Eol, "Eol"), (End, "End"), (EndZ, "EndZ"),
+ };
+
+ var sb = new StringBuilder();
+ foreach ((int flag, string name) in knownAnchors)
+ {
+     if ((anchors & flag) != 0)
+     {
+         sb.Append(sb.Length == 0 ? name : ", " + name);
+     }
+ }
+
+ return sb.Length == 0 ? "None" : sb.ToString();
}
#endif
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index 06cca891cfe13..5c656c64a43b2 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -227,9 +227,13 @@ internal RegexNode FinalOptimize()
case Alternate:
case Loop:
case Lazyloop:
- var atomic = new RegexNode(Atomic, Options);
- atomic.AddChild(existingChild);
- node.ReplaceChild(node.ChildCount() - 1, atomic);
+ // Make the node atomic if it isn't already (as conferred by a parent node being atomic).
+ if (!existingChild.IsAtomicByParent())
+ {
+ var atomic = new RegexNode(Atomic, Options);
+ atomic.AddChild(existingChild);
+ node.ReplaceChild(node.ChildCount() - 1, atomic);
+ }
break;
}
continue;
@@ -299,7 +303,7 @@ internal RegexNode FinalOptimize()
// Optimization: Unnecessary root atomic.
// If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing
// to backtrack into it, so we can remove it.
- if (rootNode.Child(0).Type == Atomic)
+ while (rootNode.Child(0).Type == Atomic)
{
rootNode.ReplaceChild(0, rootNode.Child(0).Child(0));
}
@@ -308,6 +312,29 @@ internal RegexNode FinalOptimize()
return rootNode;
}
+ /// <summary>Gets whether an enclosing atomic group already makes this node atomic.</summary>
+ /// <remarks>
+ /// True when the parent is Atomic, or when this node is the last child of a Concatenate or Capture
+ /// whose own parent is Atomic. This is only a hint for deciding whether introducing more atomic nodes
+ /// may be valuable; it should not be used to determine for sure whether a node will be backtracked into.
+ /// </remarks>
+ public bool IsAtomicByParent()
+ {
+     RegexNode? parent = Next;
+     if (parent is null)
+     {
+         return false;
+     }
+
+     // Look through at most one grouping level, as a balance between optimization and cost.
+     bool isLastChildOfGroup =
+         (parent.Type == Concatenate || parent.Type == Capture) &&
+         parent.Child(parent.ChildCount() - 1) == this;
+
+     return parent.Type == Atomic ||
+         (isLastChildOfGroup && parent.Next?.Type == Atomic);
+ }
+
///
/// Removes redundant nodes from the subtree, and returns a reduced subtree.
///
@@ -545,137 +572,321 @@ private RegexNode ReduceSet()
return this;
}
- ///
- /// Combine adjacent sets/chars.
- /// Basic optimization. Single-letter alternations can be replaced
- /// by faster set specifications, and nested alternations with no
- /// intervening operators can be flattened:
- ///
- /// a|b|c|def|g|h -> [a-c]|def|[gh]
- /// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
- ///
+ /// Optimize an alternation.
private RegexNode ReduceAlternation()
{
- int childCount = ChildCount();
- if (childCount == 0)
+ switch (ChildCount())
{
- return new RegexNode(Nothing, Options);
- }
+ case 0:
+ return new RegexNode(Nothing, Options);
- if (childCount == 1)
- {
- return Child(0);
- }
+ case 1:
+ return Child(0);
- bool wasLastSet = false;
- bool lastNodeCannotMerge = false;
- RegexOptions optionsLast = 0;
- RegexOptions optionsAt;
- int i;
- int j;
- RegexNode at;
- RegexNode prev;
+ default:
+ ReduceSingleLetterAndNestedAlternations();
+ RegexNode newThis = StripEnation(Nothing);
+ return newThis != this ? newThis : ExtractCommonPrefix();
+ }
- List children = (List)Children!;
- for (i = 0, j = 0; i < children.Count; i++, j++)
+ // This function performs two optimizations:
+ // - Single-letter alternations can be replaced by faster set specifications
+ // e.g. "a|b|c|def|g|h" -> "[a-c]|def|[gh]"
+ // - Nested alternations with no intervening operators can be flattened:
+ // e.g. "apple|(?:orange|pear)|grape" -> "apple|orange|pear|grape"
+ void ReduceSingleLetterAndNestedAlternations()
{
- at = children[i];
+ bool wasLastSet = false;
+ bool lastNodeCannotMerge = false;
+ RegexOptions optionsLast = 0;
+ RegexOptions optionsAt;
+ int i;
+ int j;
+ RegexNode at;
+ RegexNode prev;
+
+ List children = (List)Children!;
+ for (i = 0, j = 0; i < children.Count; i++, j++)
+ {
+ at = children[i];
- if (j < i)
- children[j] = at;
+ if (j < i)
+ children[j] = at;
- while (true)
- {
- if (at.Type == Alternate)
+ while (true)
{
- if (at.Children is List atChildren)
+ if (at.Type == Alternate)
{
- for (int k = 0; k < atChildren.Count; k++)
+ if (at.Children is List atChildren)
+ {
+ for (int k = 0; k < atChildren.Count; k++)
+ {
+ atChildren[k].Next = this;
+ }
+ children.InsertRange(i + 1, atChildren);
+ }
+ else
{
- atChildren[k].Next = this;
+ RegexNode atChild = (RegexNode)at.Children!;
+ atChild.Next = this;
+ children.Insert(i + 1, atChild);
}
- children.InsertRange(i + 1, atChildren);
+ j--;
}
- else
+ else if (at.Type == Set || at.Type == One)
{
- RegexNode atChild = (RegexNode)at.Children!;
- atChild.Next = this;
- children.Insert(i + 1, atChild);
- }
- j--;
- }
- else if (at.Type == Set || at.Type == One)
- {
- // Cannot merge sets if L or I options differ, or if either are negated.
- optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
+ // Cannot merge sets if L or I options differ, or if either are negated.
+ optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
- if (at.Type == Set)
- {
- if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at.Str))
+ if (at.Type == Set)
+ {
+ if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at.Str!))
+ {
+ wasLastSet = true;
+ lastNodeCannotMerge = !RegexCharClass.IsMergeable(at.Str!);
+ optionsLast = optionsAt;
+ break;
+ }
+ }
+ else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge)
{
wasLastSet = true;
- lastNodeCannotMerge = !RegexCharClass.IsMergeable(at.Str);
+ lastNodeCannotMerge = false;
optionsLast = optionsAt;
break;
}
+
+
+ // The last node was a Set or a One, we're a Set or One and our options are the same.
+ // Merge the two nodes.
+ j--;
+ prev = children[j];
+
+ RegexCharClass prevCharClass;
+ if (prev.Type == One)
+ {
+ prevCharClass = new RegexCharClass();
+ prevCharClass.AddChar(prev.Ch);
+ }
+ else
+ {
+ prevCharClass = RegexCharClass.Parse(prev.Str!);
+ }
+
+ if (at.Type == One)
+ {
+ prevCharClass.AddChar(at.Ch);
+ }
+ else
+ {
+ RegexCharClass atCharClass = RegexCharClass.Parse(at.Str!);
+ prevCharClass.AddCharClass(atCharClass);
+ }
+
+ prev.Type = Set;
+ prev.Str = prevCharClass.ToStringClass();
+ }
+ else if (at.Type == Nothing)
+ {
+ j--;
}
- else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge)
+ else
{
- wasLastSet = true;
+ wasLastSet = false;
lastNodeCannotMerge = false;
- optionsLast = optionsAt;
- break;
}
+ break;
+ }
+ }
+
+ if (j < i)
+ {
+ children.RemoveRange(j, i - j);
+ }
+ }
+ // Analyzes all the branches of the alternation for text that's identical at the beginning
+ // of every branch. That text is then pulled out into its own one or multi node in a
+ // concatenation with the alternation (whose branches are updated to remove that prefix).
+ // This is valuable for a few reasons. One, it exposes potentially more text to the
+ // expression prefix analyzer used to influence FindFirstChar. Second, it exposes more
+ // potential alternation optimizations, e.g. if the same prefix is followed in two branches
+ // by sets that can be merged. Third, it reduces the amount of duplicated comparisons required
+ // if we end up backtracking into subsequent branches.
+ RegexNode ExtractCommonPrefix()
+ {
+ // To keep things relatively simple, we currently only handle:
+ // - Branches that are one or multi nodes, or that are concatenations beginning with one or multi nodes.
+ // - Left-to-right branches that all have the same options (under RightToLeft the shared text would need to come after the alternation).
+ // - Text, rather than also trying to combine identical sets that start each branch.
+
+ Debug.Assert(Children is List<RegexNode>);
+ var children = (List<RegexNode>)Children;
+ Debug.Assert(children.Count >= 2);
+
+ // Process the first branch to get the maximum possible common string, bailing if it has no text start or matching is right-to-left.
+ RegexNode? startingNode = FindBranchOneMultiStart(children[0]);
+ if (startingNode is null || ((Options | startingNode.Options) & RegexOptions.RightToLeft) != 0)
+ {
+ return this;
+ }
- // The last node was a Set or a One, we're a Set or One and our options are the same.
- // Merge the two nodes.
- j--;
- prev = children[j];
+ RegexOptions startingNodeOptions = startingNode.Options;
+ string? originalStartingString = startingNode.Str;
+ ReadOnlySpan<char> startingSpan = startingNode.Type == One ? stackalloc char[1] { startingNode.Ch } : (ReadOnlySpan<char>)originalStartingString;
+ Debug.Assert(startingSpan.Length > 0);
+
+ // Now compare the rest of the branches against it.
+ for (int i = 1; i < children.Count; i++)
+ {
+ // Get the starting node of the next branch.
+ startingNode = FindBranchOneMultiStart(children[i]);
+ if (startingNode is null || startingNode.Options != startingNodeOptions)
+ {
+ return this;
+ }
- RegexCharClass prevCharClass;
- if (prev.Type == One)
+ // See if the new branch's prefix has a shared prefix with the current one.
+ // If it does, shorten to that; if it doesn't, bail.
+ if (startingNode.Type == One)
+ {
+ if (startingSpan[0] != startingNode.Ch)
{
- prevCharClass = new RegexCharClass();
- prevCharClass.AddChar(prev.Ch);
+ return this;
}
- else
+
+ if (startingSpan.Length != 1)
{
- prevCharClass = RegexCharClass.Parse(prev.Str!);
+ startingSpan = startingSpan.Slice(0, 1);
}
+ }
+ else
+ {
+ Debug.Assert(startingNode.Type == Multi);
+ Debug.Assert(startingNode.Str!.Length > 0);
- if (at.Type == One)
+ int minLength = Math.Min(startingSpan.Length, startingNode.Str.Length);
+ int c = 0;
+ while (c < minLength && startingSpan[c] == startingNode.Str[c]) c++;
+ if (c == 0)
{
- prevCharClass.AddChar(at.Ch);
+ return this;
+ }
+
+ startingSpan = startingSpan.Slice(0, c);
+ }
+ }
+
+ // If we get here, we have a starting string prefix shared by all branches.
+ Debug.Assert(startingSpan.Length > 0);
+
+ // Now remove the prefix from each branch.
+ for (int i = 0; i < children.Count; i++)
+ {
+ RegexNode branch = children[i];
+ if (branch.Type == Concatenate)
+ {
+ ProcessOneOrMulti(branch.Child(0), startingSpan);
+ ReplaceChild(i, branch.Reduce());
+ }
+ else
+ {
+ ProcessOneOrMulti(branch, startingSpan);
+ }
+
+ // Remove the starting text from the one or multi node. This may end up changing
+ // the type of the node to be Empty if the starting text matches the node's full value.
+ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan<char> startingSpan)
+ {
+ if (node.Type == One)
+ {
+ Debug.Assert(startingSpan.Length == 1);
+ Debug.Assert(startingSpan[0] == node.Ch);
+ node.Type = Empty;
+ node.Ch = '\0';
}
else
{
- RegexCharClass atCharClass = RegexCharClass.Parse(at.Str!);
- prevCharClass.AddCharClass(atCharClass);
+ Debug.Assert(node.Type == Multi);
+ Debug.Assert(node.Str.AsSpan().StartsWith(startingSpan, StringComparison.Ordinal));
+ if (node.Str!.Length == startingSpan.Length)
+ {
+ node.Type = Empty;
+ node.Str = null;
+ }
+ else if (node.Str.Length - 1 == startingSpan.Length)
+ {
+ node.Type = One;
+ node.Ch = node.Str[^1];
+ node.Str = null;
+ }
+ else
+ {
+ node.Str = node.Str.Substring(startingSpan.Length);
+ }
}
+ }
+ }
- prev.Type = Set;
- prev.Str = prevCharClass.ToStringClass();
+ // We may have changed multiple branches to be Empty, but we only need to keep
+ // the first (keeping the rest would just duplicate work in backtracking, though
+ // it would also mean the original regex had at least two identical branches).
+ for (int firstEmpty = 0; firstEmpty < children.Count; firstEmpty++)
+ {
+ if (children[firstEmpty].Type != Empty)
+ {
+ continue;
}
- else if (at.Type == Nothing)
+
+ // Found the first empty. Now starting after it, remove all subsequent found Empty nodes,
+ // pushing everything else down. (In the future, should we want to, there's also the opportunity
+ // here to remove other duplication, but such duplication is a more egregious mistake on the
+ // part of the expression author.)
+ int i = firstEmpty + 1;
+ int j = i;
+ while (i < children.Count)
{
- j--;
+ if (children[i].Type != Empty)
+ {
+ if (j != i)
+ {
+ children[j] = children[i];
+ }
+ j++;
+ }
+ i++;
}
- else
+
+ if (j < i)
{
- wasLastSet = false;
- lastNodeCannotMerge = false;
+ children.RemoveRange(j, i - j);
}
+
break;
}
- }
- if (j < i)
- {
- children.RemoveRange(j, i - j);
- }
+ var concat = new RegexNode(Concatenate, Options); // use same options as the Alternate
+ concat.AddChild(startingSpan.Length == 1 ? // use same options as the branches
+ new RegexNode(One, startingNodeOptions) { Ch = startingSpan[0] } :
+ new RegexNode(Multi, startingNodeOptions) { Str = originalStartingString?.Length == startingSpan.Length ? originalStartingString : startingSpan.ToString() });
+ concat.AddChild(this); // this will re-reduce the node, allowing for newly exposed possible optimizations in what came after the prefix
+ return concat;
+
+ // Finds the starting one or multi of the branch, if it has one; otherwise, returns null.
+ // For simplicity, this only considers branches that are One or Multi, or a Concatenation
+ // beginning with a One or Multi. We don't traverse more than one level to avoid the
+ // complication of then having to later update that hierarchy when removing the prefix,
+ // but it could be done in the future if proven beneficial enough.
+ static RegexNode? FindBranchOneMultiStart(RegexNode branch)
+ {
+ if (branch.Type == Concatenate)
+ {
+ branch = branch.Child(0);
+ }
- return StripEnation(Nothing);
+ return branch.Type == One || branch.Type == Multi ? branch : null;
+ }
+ }
}
///
@@ -1238,20 +1449,21 @@ public RegexNode MakeQuantifier(bool lazy, int min, int max)
public void AddChild(RegexNode newChild)
{
+ newChild.Next = this; // so that the child can see its parent while being reduced
RegexNode reducedChild = newChild.Reduce();
- reducedChild.Next = this;
+ reducedChild.Next = this; // Reduce may hand back a different node, which must be parented as well
if (Children is null)
{
Children = reducedChild;
}
else if (Children is RegexNode currentChild)
{
Children = new List<RegexNode>() { currentChild, reducedChild };
}
else
{
((List<RegexNode>)Children).Add(reducedChild);
}
}
@@ -1301,7 +1513,6 @@ public int ChildCount()
[ExcludeFromCodeCoverage]
public string Description()
{
-
string typeStr = Type switch
{
Oneloop => nameof(Oneloop),
@@ -1344,17 +1555,15 @@ public string Description()
_ => $"(unknown {Type})"
};
- var argSb = new StringBuilder().Append(typeStr);
-
- if ((Options & RegexOptions.ExplicitCapture) != 0) argSb.Append("-C");
- if ((Options & RegexOptions.IgnoreCase) != 0) argSb.Append("-I");
- if ((Options & RegexOptions.RightToLeft) != 0) argSb.Append("-L");
- if ((Options & RegexOptions.Multiline) != 0) argSb.Append("-M");
- if ((Options & RegexOptions.Singleline) != 0) argSb.Append("-S");
- if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) argSb.Append("-X");
- if ((Options & RegexOptions.ECMAScript) != 0) argSb.Append("-E");
+ var sb = new StringBuilder(typeStr);
- argSb.Append(Indent());
+ if ((Options & RegexOptions.ExplicitCapture) != 0) sb.Append("-C");
+ if ((Options & RegexOptions.IgnoreCase) != 0) sb.Append("-I");
+ if ((Options & RegexOptions.RightToLeft) != 0) sb.Append("-L");
+ if ((Options & RegexOptions.Multiline) != 0) sb.Append("-M");
+ if ((Options & RegexOptions.Singleline) != 0) sb.Append("-S");
+ if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) sb.Append("-X");
+ if ((Options & RegexOptions.ECMAScript) != 0) sb.Append("-E");
switch (Type)
{
@@ -1366,25 +1575,27 @@ public string Description()
case Notonelazy:
case One:
case Notone:
- argSb.Append(RegexCharClass.CharDescription(Ch));
+ sb.Append(" '").Append(RegexCharClass.CharDescription(Ch)).Append('\'');
break;
case Capture:
- argSb.Append("index = " + M);
+ sb.Append(' ').Append($"index = {M}");
if (N != -1)
- argSb.Append(", unindex = " + N);
+ {
+ sb.Append($", unindex = {N}");
+ }
break;
case Ref:
case Testref:
- argSb.Append("index = " + M);
+ sb.Append(' ').Append($"index = {M}");
break;
case Multi:
- argSb.Append(Str);
+ sb.Append(" \"").Append(Str).Append('"');
break;
case Set:
case Setloop:
case Setloopatomic:
case Setlazy:
- argSb.Append(RegexCharClass.SetDescription(Str!));
+ sb.Append(' ').Append(RegexCharClass.SetDescription(Str!));
break;
}
@@ -1401,19 +1612,16 @@ public string Description()
case Setlazy:
case Loop:
case Lazyloop:
- if (argSb[^1] != ' ')
- argSb.Append(", ");
- argSb.Append("min = " + M + ", max = ");
- if (N == int.MaxValue)
- argSb.Append("inf");
- else
- argSb.Append(N);
+ sb.Append(
+ (M == 0 && N == int.MaxValue) ? "*" :
+ (M == 0 && N == 1) ? "?" :
+ (M == 1 && N == int.MaxValue) ? "+" :
+ (N == int.MaxValue) ? $"{{{M}, *}}" :
+ $"{{{M}, {N}}}");
break;
}
- string Indent() => new string(' ', Math.Max(1, 25 - argSb.Length));
-
- return argSb.ToString();
+ return sb.ToString();
}
[ExcludeFromCodeCoverage]
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
index acaf1e2125844..45af2b52fc506 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
@@ -680,14 +680,18 @@ private static string StackDescription(int[] a, int index)
sb.Append(a.Length);
if (sb.Length < 8)
+ {
sb.Append(' ', 8 - sb.Length);
+ }
sb.Append('(');
for (int i = index; i < a.Length; i++)
{
if (i > index)
+ {
sb.Append(' ');
+ }
sb.Append(a[i]);
}
@@ -704,12 +708,18 @@ internal virtual string TextposDescription()
sb.Append(runtextpos);
if (sb.Length < 8)
+ {
sb.Append(' ', 8 - sb.Length);
+ }
if (runtextpos > runtextbeg)
+ {
sb.Append(RegexCharClass.CharDescription(runtext![runtextpos - 1]));
+ }
else
+ {
sb.Append('^');
+ }
sb.Append('>');
diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs
index 133c95a3bfd51..ebca1004c7724 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.UnicodeChar.Tests.cs
@@ -55,16 +55,20 @@ public static void RegexUnicodeChar()
for (int i = 0; i < 100; i++)
{
- StringBuilder builder1 = new StringBuilder();
- StringBuilder builder2 = new StringBuilder();
+ var builder1 = new StringBuilder();
+ var builder2 = new StringBuilder();
+
for (int j = 0; j < validCharLength; j++)
{
char c = validChars[random.Next(validChars.Count)];
builder1.Append(c);
builder2.Append(c);
}
+
for (int j = 0; j < invalidCharLength; j++)
+ {
builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+ }
string input = builder1.ToString();
Match match = regex.Match(input);
@@ -94,18 +98,26 @@ public static void RegexUnicodeChar()
for (int i = 0; i < 500; i++)
{
- StringBuilder builder1 = new StringBuilder();
- StringBuilder builder2 = new StringBuilder();
+ var builder1 = new StringBuilder();
+ var builder2 = new StringBuilder();
+
for (int j = 0; j < invalidCharLength; j++)
+ {
builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+ }
+
for (int j = 0; j < validCharLength; j++)
{
char c = validChars[random.Next(validChars.Count)];
builder1.Append(c);
builder2.Append(c);
}
+
for (int j = 0; j < invalidCharLength; j++)
+ {
builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+ }
+
string input = builder1.ToString();
Match match = regex.Match(input);
@@ -143,16 +155,21 @@ public static void RegexUnicodeChar()
for (int i = 0; i < 100; i++)
{
- StringBuilder builder1 = new StringBuilder();
- StringBuilder builder2 = new StringBuilder();
+ var builder1 = new StringBuilder();
+ var builder2 = new StringBuilder();
+
for (int j = 0; j < validCharLength; j++)
{
char c = validChars[random.Next(validChars.Count)];
builder1.Append(c);
builder2.Append(c);
}
+
for (int j = 0; j < invalidCharLength; j++)
+ {
builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+ }
+
string input = builder1.ToString();
Match match = regex.Match(input);
@@ -173,18 +190,26 @@ public static void RegexUnicodeChar()
for (int i = 0; i < 100; i++)
{
- StringBuilder builder1 = new StringBuilder();
- StringBuilder builder2 = new StringBuilder();
+ var builder1 = new StringBuilder();
+ var builder2 = new StringBuilder();
+
for (int j = 0; j < invalidCharLength; j++)
+ {
builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+ }
+
for (int j = 0; j < validCharLength; j++)
{
char c = validChars[random.Next(validChars.Count)];
builder1.Append(c);
builder2.Append(c);
}
+
for (int j = 0; j < invalidCharLength; j++)
+ {
builder1.Append(invalidChars[random.Next(invalidChars.Count)]);
+ }
+
string input = builder1.ToString();
Match match = regex.Match(input);
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs
index 7eb9e1d01b3bc..b622ec6c71508 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs
@@ -277,8 +277,31 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("(?:a+){4}", "a{4,}")]
[InlineData("(?:a{1,2}){4}", "a{4,8}")]
// Alternation reduction
+ [InlineData("a|b", "[ab]")]
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
[InlineData("a|b|c|def|g|h", "[a-c]|def|[gh]")]
+ [InlineData("this|that|there|then|those", "th(?:is|at|ere|en|ose)")]
+ [InlineData("it's (?>this|that|there|then|those)", "it's (?>th(?:is|at|ere|en|ose))")]
+ [InlineData("it's (?>this|that|there|then|those)!", "it's (?>th(?:is|at|ere|en|ose))!")]
+ [InlineData("abcd|abce", "abc[de]")]
+ [InlineData("abcd|abef", "ab(?:cd|ef)")]
+ [InlineData("abcd|aefg", "a(?:bcd|efg)")]
+ [InlineData("abcd|abc|ab|a", "a(?:bcd|bc|b|)")]
+ [InlineData("abcde|abcdef", "abcde(?:|f)")]
+ [InlineData("abcdef|abcde", "abcde(?:f|)")]
+ [InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")]
+ [InlineData("(ab|ab*)bc", "(a(?:b|b*))bc")]
+ [InlineData("abc(?:defgh|defij)klmn", "abcdef(?:gh|ij)klmn")]
+ [InlineData("abc(defgh|defij)klmn", "abc(def(?:gh|ij))klmn")]
+ [InlineData("a[b-f]|a[g-k]", "a[b-k]")]
+ [InlineData("this|this", "this")]
+ [InlineData("this|this|this", "this")]
+ [InlineData("hello there|hello again|hello|hello|hello|hello", "hello(?: there| again|)")]
+ [InlineData("hello there|hello again|hello|hello|hello|hello|hello world", "hello(?: there| again|| world)")]
+ [InlineData("hello there|hello again|hello|hello|hello|hello|hello world|hello", "hello(?: there| again|| world)")]
+ [InlineData("abcd(?:(?i:e)|(?i:f))", "abcd(?i:[ef])")]
+ [InlineData("(?i:abcde)|(?i:abcdf)", "(?i:abcd[ef])")]
+ [InlineData("xyz(?:(?i:abcde)|(?i:abcdf))", "xyz(?i:abcd[ef])")]
// Auto-atomicity
[InlineData("a*b", "(?>a*)b")]
[InlineData("a*b+", "(?>a*)b+")]
@@ -294,11 +317,16 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("(?:abc*|def*)g", "(?:ab(?>c*)|de(?>f*))g")]
[InlineData("(?:a[ce]*|b*)g", "(?:a(?>[ce]*)|(?>b*))g")]
[InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")]
+ [InlineData("apple|(?:orange|pear)|grape", "apple|orange|pear|grape")]
+ [InlineData("(?>(?>(?>(?:abc)*)))", "(?:abc)*")]
public void PatternsReduceIdentically(string pattern1, string pattern2)
{
AssertExtensions.Equal(GetRegexCodes(new Regex(pattern1)), GetRegexCodes(new Regex(pattern2)));
- Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2)));
Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2)));
+ if (!(pattern1.Contains("?i:") || pattern2.Contains("?i:"))) // presumably: with an inline (?i:...) present, adding IgnoreCase need not change the codes
+ {
+ Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.IgnoreCase)), GetRegexCodes(new Regex(pattern2)));
+ }
}
[Theory]
@@ -341,6 +369,10 @@ public void PatternsReduceIdentically(string pattern1, string pattern2)
[InlineData("[ace][ace]{0,2147483646}", "[ace]{0,2147483647}")]
[InlineData("[ace]{2147482647}[ace]{1000}", "[ace]{2147483647}")]
[InlineData("[ace]{0,2147482647}[ace]{0,1000}", "[ace]{0,2147483647}")]
+ // Not reducing branches of alternations with different casing
+ [InlineData("(?i:abcd)|abcd", "abcd|abcd")]
+ [InlineData("abcd|(?i:abcd)", "abcd|abcd")]
+ [InlineData("abc(?:(?i:e)|f)", "abc[ef]")]
// Not applying auto-atomicity
[InlineData("a*b*", "(?>a*)b*")]
[InlineData("[^\n]*\n*", "(?>[^\n]*)\n")]