Skip to content

Commit

Permalink
Fix Regex if-then-else code generation (#63427)
Browse files Browse the repository at this point in the history
* Fix Regex if-then-else code generation

Code coverage revealed some gaps in testing around conditionals, which in turn led to discovering a) some bugs in both RegexCompiler and the source generator around expression conditionals, and b) divergence between the compiler and the source generator.  This adds tests to address the code coverage gaps and fixes the implementations to both address the bugs and bring the code much closer.  The main problem with expressional conditionals was flawed handling of backtracking, either not pushing the right state on to the stack or not initializing the state correctly in the first place.  We also failed to revert positional changes made in the expression condition when the condition failed to match.

* Address PR feedback

* A few more tweaks
  • Loading branch information
stephentoub authored Jan 10, 2022
1 parent 7242690 commit 9d5cfc5
Show file tree
Hide file tree
Showing 9 changed files with 542 additions and 364 deletions.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -308,11 +308,11 @@ private void ValidateFinalTreeInvariants()
break;

case Testref:
Debug.Assert(childCount is 1 or 2, $"Expected one or two children for {node.TypeName}, got {childCount}");
Debug.Assert(childCount == 2, $"Expected two children for {node.TypeName}, got {childCount}");
break;

case Testgroup:
Debug.Assert(childCount is 2 or 3, $"Expected two or three children for {node.TypeName}, got {childCount}");
Debug.Assert(childCount == 3, $"Expected three children for {node.TypeName}, got {childCount}");
break;

case Concatenate:
Expand Down Expand Up @@ -562,39 +562,20 @@ public bool IsAtomicByParent()
/// <summary>
/// Removes redundant nodes from the subtree, and returns an optimized subtree.
/// </summary>
internal RegexNode Reduce()
{
switch (Type)
internal RegexNode Reduce() =>
Type switch
{
case Alternate:
return ReduceAlternation();

case Concatenate:
return ReduceConcatenation();

case Loop:
case Lazyloop:
return ReduceLoops();

case Atomic:
return ReduceAtomic();

case Group:
return ReduceGroup();

case Set:
case Setloop:
case Setloopatomic:
case Setlazy:
return ReduceSet();

case Prevent:
return ReducePrevent();

default:
return this;
}
}
Alternate => ReduceAlternation(),
Atomic => ReduceAtomic(),
Concatenate => ReduceConcatenation(),
Group => ReduceGroup(),
Loop or Lazyloop => ReduceLoops(),
Prevent => ReducePrevent(),
Set or Setloop or Setloopatomic or Setlazy => ReduceSet(),
Testgroup => ReduceTestgroup(),
Testref => ReduceTestref(),
_ => this,
};

/// <summary>Remove an unnecessary Concatenation or Alternation node</summary>
/// <remarks>
Expand Down Expand Up @@ -1819,6 +1800,53 @@ private RegexNode ReducePrevent()
return this;
}

/// <summary>Optimizations for backreference conditionals.</summary>
private RegexNode ReduceTestref()
{
Debug.Assert(Type == Testref);
Debug.Assert(ChildCount() is 1 or 2);

// This isn't so much an optimization as it is changing the tree for consistency.
// We want all engines to be able to trust that every Testref will have two children,
// even though it's optional in the syntax. If it's missing a "not matched" branch,
// we add one that will match empty.
if (ChildCount() == 1)
{
AddChild(new RegexNode(Empty, Options));
}

return this;
}

/// <summary>Optimizations for expression conditionals.</summary>
private RegexNode ReduceTestgroup()
{
Debug.Assert(Type == Testgroup);
Debug.Assert(ChildCount() is 2 or 3);

// This isn't so much an optimization as it is changing the tree for consistency.
// We want all engines to be able to trust that every Testgroup will have three children,
// even though it's optional in the syntax. If it's missing a "not matched" branch,
// we add one that will match empty.
if (ChildCount() == 2)
{
AddChild(new RegexNode(Empty, Options));
}

// It's common for the condition to be an explicit positive lookahead, as specifying
// that eliminates any ambiguity in syntax as to whether the expression is to be matched
// as an expression or to be a reference to a capture group. After parsing, however,
// there's no ambiguity, and we can remove an extra level of positive lookahead, as the
// engines need to treat the condition as a zero-width positive, atomic assertion regardless.
RegexNode condition = Child(0);
if (condition.Type == Require && (condition.Options & RegexOptions.RightToLeft) == 0)
{
ReplaceChild(0, condition.Child(0));
}

return this;
}

/// <summary>
/// Determines whether node can be switched to an atomic loop. Subsequent is the node
/// immediately after 'node'.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,13 +288,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
Emit(RegexCode.Goto, 0);
PatchJump(Branchpos, _emitted.Length);
Emit(RegexCode.Forejump);
if (node.ChildCount() > 1)
{
break;
}

// else fallthrough
goto case 1;
break;
}
case 1:
PatchJump(_intStack.Pop(), _emitted.Length);
Expand Down Expand Up @@ -328,11 +322,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
PatchJump(Branchpos, _emitted.Length);
Emit(RegexCode.Getmark);
Emit(RegexCode.Forejump);

if (node.ChildCount() > 2)
break;
// else fallthrough
goto case 2;
break;
case 2:
PatchJump(_intStack.Pop(), _emitted.Length);
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ SymbolicRegexNode<BDD> ConvertSetloop(RegexNode node, bool isLazy)

bool IsDotStar(RegexNode node) => node.Type == RegexNode.Setloop && Convert(node, topLevel: false).IsAnyStar;

bool IsIntersect(RegexNode node) => node.Type == RegexNode.Testgroup && node.ChildCount() > 2 && IsNothing(node.Child(2));
bool IsIntersect(RegexNode node) => node.Type == RegexNode.Testgroup && IsNothing(node.Child(2));

bool TryGetIntersection(RegexNode node, [Diagnostics.CodeAnalysis.NotNullWhen(true)] out List<RegexNode>? conjuncts)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { engine, null, @"abcd(.*)f", @"abcabcdefg", options, new string[] { "abcdef", "e" } };
}

// Grouping Constructs Invalid Regular Expressions
// Grouping Constructs
yield return new object[] { engine, null, @"()", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
yield return new object[] { engine, null, @"(?<cat>)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
yield return new object[] { engine, null, @"(?'cat')", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
Expand All @@ -844,13 +844,15 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { engine, null, @"(?<=)", "cat", RegexOptions.None, new string[] { string.Empty } };
yield return new object[] { engine, null, @"(?>)", "cat", RegexOptions.None, new string[] { string.Empty } };

// Alternation construct Invalid Regular Expressions
// Alternation construct
yield return new object[] { engine, null, @"(?()|)", "(?()|)", RegexOptions.None, new string[] { "" } };

yield return new object[] { engine, null, @"(?(cat)|)", "cat", RegexOptions.None, new string[] { "" } };
yield return new object[] { engine, null, @"(?(cat)|)", "dog", RegexOptions.None, new string[] { "" } };

yield return new object[] { engine, null, @"(?(cat)catdog|)", "catdog", RegexOptions.None, new string[] { "catdog" } };
yield return new object[] { engine, null, @"(?(cat)cat\w\w\w)*", "catdogcathog", RegexOptions.None, new string[] { "catdogcathog" } };
yield return new object[] { engine, null, @"(?(?=cat)cat\w\w\w)*", "catdogcathog", RegexOptions.None, new string[] { "catdogcathog" } };
yield return new object[] { engine, null, @"(?(cat)catdog|)", "dog", RegexOptions.None, new string[] { "" } };
yield return new object[] { engine, null, @"(?(cat)dog|)", "dog", RegexOptions.None, new string[] { "" } };
yield return new object[] { engine, null, @"(?(cat)dog|)", "cat", RegexOptions.None, new string[] { "" } };
Expand All @@ -859,6 +861,9 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { engine, null, @"(?(cat)|catdog)", "catdog", RegexOptions.None, new string[] { "" } };
yield return new object[] { engine, null, @"(?(cat)|dog)", "dog", RegexOptions.None, new string[] { "dog" } };

yield return new object[] { engine, null, @"(?((\w{3}))\1\1|no)", "dogdogdog", RegexOptions.None, new string[] { "dogdog", "dog" } };
yield return new object[] { engine, null, @"(?((\w{3}))\1\1|no)", "no", RegexOptions.None, new string[] { "no", "" } };

// Invalid unicode
yield return new object[] { engine, null, "([\u0000-\uFFFF-[azAZ09]]|[\u0000-\uFFFF-[^azAZ09]])+", "azAZBCDE1234567890BCDEFAZza", RegexOptions.None, new string[] { "azAZBCDE1234567890BCDEFAZza", "a" } };
yield return new object[] { engine, null, "[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[\u0000-\uFFFF-[a]]]]]]+", "abcxyzABCXYZ123890", RegexOptions.None, new string[] { "bcxyzABCXYZ123890" } };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,12 +357,17 @@ public static IEnumerable<object[]> Match_MemberData()
yield return ("(?(dog2))", "dog2", RegexOptions.None, 0, 4, true, string.Empty);
yield return ("(?(a:b))", "a", RegexOptions.None, 0, 1, true, string.Empty);
yield return ("(?(a:))", "a", RegexOptions.None, 0, 1, true, string.Empty);
yield return ("(?(cat)cat|dog)", "cat", RegexOptions.None, 0, 3, true, "cat");
yield return ("(?((?=cat))cat|dog)", "cat", RegexOptions.None, 0, 3, true, "cat");
yield return ("(?(cat)|dog)", "cat", RegexOptions.None, 0, 3, true, string.Empty);
yield return ("(?(cat)|dog)", "catdog", RegexOptions.None, 0, 6, true, string.Empty);
yield return ("(?(cat)|dog)", "oof", RegexOptions.None, 0, 3, false, string.Empty);
yield return ("(?(cat)dog1|dog2)", "catdog1", RegexOptions.None, 0, 7, false, string.Empty);
yield return ("(?(cat)dog1|dog2)", "catdog2", RegexOptions.None, 0, 7, true, "dog2");
yield return ("(?(cat)dog1|dog2)", "catdog1dog2", RegexOptions.None, 0, 11, true, "dog2");
yield return (@"(?(\w+)\w+)dog", "catdog", RegexOptions.None, 0, 6, true, "catdog");
yield return (@"(?(abc)\w+|\w{0,2})dog", "catdog", RegexOptions.None, 0, 6, true, "atdog");
yield return (@"(?(abc)cat|\w{0,2})dog", "catdog", RegexOptions.None, 0, 6, true, "atdog");
yield return (@"(\w+|\d+)a+[ab]+", "123123aa", RegexOptions.None, 0, 8, true, "123123aa");
yield return ("(a|ab|abc|abcd)d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd");
yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.None, 0, 4, false, string.Empty);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,32 @@ public static IEnumerable<object[]> Matches_TestData()
}
};

yield return new object[]
{
engine,
@"(?(\w+)\w+|)", "abcd", RegexOptions.None,
new CaptureData[]
{
new CaptureData("abcd", 0, 4),
new CaptureData("", 4, 0),
}
};

if (!PlatformDetection.IsNetFramework)
{
// .NET Framework has some behavioral inconsistencies when there's no else branch.
yield return new object[]
{
engine,
@"(?(\w+)\w+)", "abcd", RegexOptions.None,
new CaptureData[]
{
new CaptureData("abcd", 0, 4),
new CaptureData("", 4, 0),
}
};
}

yield return new object[]
{
engine,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("xyz(?:(?i:abcde)|(?i:abcdf))", "xyz(?i:abcd[ef])")]
[InlineData("bonjour|hej|ciao|shalom|zdravo|pozdrav|hallo|hola|hello|hey|witam|tere|bonjou|salam|helo|sawubona", "(?>bonjou(?>r|)|h(?>e(?>j|(?>l(?>lo|o)|y))|allo|ola)|ciao|s(?>halom|a(?>lam|wubona))|zdravo|pozdrav|witam|tere)")]
[InlineData("\\w\\d123|\\w\\dabc", "\\w\\d(?:123|abc)")]
[InlineData("(a)(?(1)b)", "(a)(?(1)b|)")]
[InlineData("(abc)(?(1)def)", "(abc)(?(1)def|)")]
[InlineData("(?(a)a)", "(?(a)a|)")]
[InlineData("(?(abc)def)", "(?(abc)def|)")]
[InlineData("(?(\\w)\\d)", "(?(\\w)\\d|)")]
// Auto-atomicity
[InlineData("a*b", "(?>a*)b")]
[InlineData("a*b+", "(?>a*)b+")]
Expand Down

0 comments on commit 9d5cfc5

Please sign in to comment.