Skip to content

Commit

Permalink
Added new regex option RegexOptions.AnyNewLine
Browse files Browse the repository at this point in the history
  • Loading branch information
shishirchawla committed Jan 10, 2020
1 parent 0bdab8a commit 55c713b
Show file tree
Hide file tree
Showing 13 changed files with 324 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ public enum RegexOptions
RightToLeft = 64,
ECMAScript = 256,
CultureInvariant = 512,
AnyNewLine = 1024,
}
public abstract partial class RegexRunner
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ namespace System.Text.RegularExpressions
/// </summary>
public partial class Regex : ISerializable
{
private const int MaxOptionShift = 10;
private const int MaxOptionShift = 11;

protected internal string? pattern; // The string pattern provided
protected internal RegexOptions roptions; // the top-level options from the options string
Expand Down Expand Up @@ -137,7 +137,8 @@ internal static void ValidateOptions(RegexOptions options)
#if DEBUG
RegexOptions.Debug |
#endif
RegexOptions.CultureInvariant)) != 0)
RegexOptions.CultureInvariant |
RegexOptions.AnyNewLine)) != 0)
{
throw new ArgumentOutOfRangeException(nameof(options));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ internal sealed class RegexCode
public const int Beginning = 18; // \A
public const int Start = 19; // \G
public const int EndZ = 20; // \Z
public const int End = 21; // \Z
public const int End = 21; // \z

public const int Nothing = 22; // Reject!

Expand Down Expand Up @@ -88,6 +88,9 @@ internal sealed class RegexCode
public const int Notoneloopatomic = 44; // lef,back set,min,max (?> . {,n} )
public const int Setloopatomic = 45; // lef,back set,min,max (?> [\d]{,n} )

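public const int AnyEndZ = 46; // \Z, AnyNewLine variant: at end of text, or before a final "\r", "\n" or "\r\n"
public const int AnyEol = 47; // $, AnyNewLine variant: at end of text, or before '\r' or '\n'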

// Modifiers for alternate modes
public const int Mask = 63; // Mask to get unmodified ordinary operator
public const int Rtl = 64; // bit to indicate that we're reverse scanning.
Expand Down Expand Up @@ -168,13 +171,15 @@ public static int OpcodeSize(int opcode)
case Nothing:
case Bol:
case Eol:
case AnyEol:
case Boundary:
case Nonboundary:
case ECMABoundary:
case NonECMABoundary:
case Beginning:
case Start:
case EndZ:
case AnyEndZ:
case End:
case Nullmark:
case Setmark:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ protected void GenerateFindFirstChar()
}
}

if ((_anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End)) != 0)
if ((_anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.AnyEndZ | RegexFCD.End)) != 0)
{
if (!_code!.RightToLeft)
{
Expand Down Expand Up @@ -908,7 +908,7 @@ protected void GenerateFindFirstChar()
MarkLabel(l1);
}

if ((_anchors & RegexFCD.EndZ) != 0)
if ((_anchors & (RegexFCD.EndZ | RegexFCD.AnyEndZ)) != 0)
{
Label l1 = DefineLabel();
Ldthisfld(s_runtextposField);
Expand Down Expand Up @@ -978,6 +978,60 @@ protected void GenerateFindFirstChar()
MarkLabel(l2);
}

if ((_anchors & RegexFCD.AnyEndZ) != 0)
{
LocalBuilder diff = _temp1Local;
Label l1 = DefineLabel();
Label l2 = DefineLabel();
Label l3 = DefineLabel();
Ldthisfld(s_runtextendField);
Ldthisfld(s_runtextposField);
Sub();
Stloc(diff);
Ldloc(diff);
Ldc(2);
Bgt(l1);
Ldloc(diff);
Ldc(2);
Blt(l2);
Ldthisfld(s_runtextField);
Ldthisfld(s_runtextposField);
Callvirt(s_stringGetCharsMethod);
Ldc('\r');
Bne(l1);
Ldthisfld(s_runtextField);
Ldthisfld(s_runtextposField);
Ldc(1);
Add();
Callvirt(s_stringGetCharsMethod);
Ldc('\n');
Bne(l1);
Br(l3);

MarkLabel(l2);
Ldloc(diff);
Ldc(1);
Blt(l3);
Ldthisfld(s_runtextField);
Ldthisfld(s_runtextposField);
Callvirt(s_stringGetCharsMethod);
Ldc('\n');
Beq(l3);
Ldthisfld(s_runtextField);
Ldthisfld(s_runtextposField);
Callvirt(s_stringGetCharsMethod);
Ldc('\r');
Beq(l3);

MarkLabel(l1);
Ldthis();
Ldthisfld(s_runtextbegField);
Stfld(s_runtextposField);
Ldc(0);
Ret();
MarkLabel(l3);
}

if ((_anchors & RegexFCD.Start) != 0)
{
Label l1 = DefineLabel();
Expand Down Expand Up @@ -3361,6 +3415,23 @@ private void GenerateOneCode()
break;
}

case RegexCode.AnyEol:
//: if (Rightchars() > 0 && CharAt(Textpos()) != '\n' && CharAt(Textpos()) != '\r')
//: break Backward;
{
Label l1 = _labels![NextCodepos()];
Ldloc(_runtextposLocal!);
Ldloc(_runtextendLocal!);
Bge(l1);
Rightchar();
Ldc('\n');
Beq(l1);
Rightchar();
Ldc('\r');
BneFar(_backtrack);
break;
}

case RegexCode.Boundary:
case RegexCode.Nonboundary:
//: if (!IsBoundary(Textpos(), _textbeg, _textend))
Expand Down Expand Up @@ -3431,6 +3502,52 @@ private void GenerateOneCode()
BneFar(_backtrack);
break;

case RegexCode.AnyEndZ:
//: if (rightChars > 2)
//: break Backward;
//: if (rightChars == 1 && CharAt(Textpos()) != '\r' && CharAt(Textpos()) != '\n')
//: break Backward;
//: if (rightChars == 2 && (CharAt(Textpos()) != '\r' || CharAt(Textpos()+1) != '\n'))
//: break Backward;
{
LocalBuilder diff = _temp1Local!;
Label l1 = DefineLabel();

Ldloc(_runtextendLocal!);
Ldloc(_runtextposLocal!);
Sub();
Stloc(diff);
Ldloc(diff);
Ldc(2);
BgtFar(_backtrack);
Ldloc(diff);
Ldc(2);
Blt(l1);
Rightchar();
Ldc('\r');
BneFar(_backtrack);
Ldloc(_runtextLocal!);
Ldloc(_runtextposLocal!);
Ldc(1);
Add();
Callvirt(s_stringGetCharsMethod);
Ldc('\n');
BneFar(_backtrack);
Br(_labels![NextCodepos()]);

MarkLabel(l1);
Ldloc(diff);
Ldc(1);
Blt(_labels![NextCodepos()]);
Rightchar();
Ldc('\n');
Beq(_labels![NextCodepos()]);
Rightchar();
Ldc('\r');
BneFar(_backtrack);
break;
}

case RegexCode.End:
//: if (Rightchars() > 0)
//: break Backward;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ internal ref struct RegexFCD
public const int Boundary = 0x0040;
public const int ECMABoundary = 0x0080;

public const int AnyEndZ = 0x0100;
public const int AnyEol = 0x0200;

private readonly List<RegexFC> _fcStack;
private ValueListBuilder<int> _intStack; // must not be readonly
private bool _skipAllChildren; // don't process any more children at the current level
Expand Down Expand Up @@ -130,11 +133,13 @@ public static RegexPrefix Prefix(RegexTree tree)

case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.AnyEol:
case RegexNode.Boundary:
case RegexNode.ECMABoundary:
case RegexNode.Beginning:
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.AnyEndZ:
case RegexNode.End:
case RegexNode.Empty:
case RegexNode.Require:
Expand Down Expand Up @@ -185,11 +190,13 @@ public static int Anchors(RegexTree tree)

case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.AnyEol:
case RegexNode.Boundary:
case RegexNode.ECMABoundary:
case RegexNode.Beginning:
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.AnyEndZ:
case RegexNode.End:
return result | AnchorFromType(curNode.Type);

Expand Down Expand Up @@ -217,11 +224,13 @@ private static int AnchorFromType(int type) =>
{
RegexNode.Bol => Bol,
RegexNode.Eol => Eol,
RegexNode.AnyEol => AnyEol,
RegexNode.Boundary => Boundary,
RegexNode.ECMABoundary => ECMABoundary,
RegexNode.Beginning => Beginning,
RegexNode.Start => Start,
RegexNode.EndZ => EndZ,
RegexNode.AnyEndZ => AnyEndZ,
RegexNode.End => End,
_ => 0,
};
Expand All @@ -244,10 +253,14 @@ public static string AnchorDescription(int anchors)
sb.Append(", ECMABoundary");
if (0 != (anchors & Eol))
sb.Append(", Eol");
if (0 != (anchors & AnyEol))
sb.Append(", AnyEol");
if (0 != (anchors & End))
sb.Append(", End");
if (0 != (anchors & EndZ))
sb.Append(", EndZ");
if (0 != (anchors & AnyEndZ))
sb.Append(", AnyEndZ");

if (sb.Length >= 2)
return (sb.ToString(2, sb.Length - 2));
Expand Down Expand Up @@ -479,13 +492,15 @@ private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
case RegexNode.Nothing:
case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.AnyEol:
case RegexNode.Boundary:
case RegexNode.Nonboundary:
case RegexNode.ECMABoundary:
case RegexNode.NonECMABoundary:
case RegexNode.Beginning:
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.AnyEndZ:
case RegexNode.End:
PushFC(new RegexFC(true));
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,8 @@ private char CharAt(int j)

protected override bool FindFirstChar()
{
if (0 != (_code.Anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End)))
if (0 != (_code.Anchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ |
RegexFCD.AnyEndZ | RegexFCD.End)))
{
if (!_code.RightToLeft)
{
Expand All @@ -397,7 +398,7 @@ protected override bool FindFirstChar()
runtextpos = runtextend;
return false;
}
if (0 != (_code.Anchors & RegexFCD.EndZ) && runtextpos < runtextend - 1)
if (0 != (_code.Anchors & (RegexFCD.EndZ | RegexFCD.AnyEndZ)) && runtextpos < runtextend - 1)
{
runtextpos = runtextend - 1;
}
Expand All @@ -411,6 +412,11 @@ protected override bool FindFirstChar()
if ((0 != (_code.Anchors & RegexFCD.End) && runtextpos < runtextend) ||
(0 != (_code.Anchors & RegexFCD.EndZ) && (runtextpos < runtextend - 1 ||
(runtextpos == runtextend - 1 && CharAt(runtextpos) != '\n'))) ||
(0 != (_code.Anchors & RegexFCD.AnyEndZ) && (runtextpos < runtextend - 2 ||
(runtextpos == runtextend - 2 && (CharAt(runtextpos) != '\r'
|| CharAt(runtextpos+1) != '\n')) ||
(runtextpos == runtextend - 1 && CharAt(runtextpos) != '\n'
&& CharAt(runtextpos) != '\r'))) ||
(0 != (_code.Anchors & RegexFCD.Start) && runtextpos < runtextstart))
{
runtextpos = runtextbeg;
Expand Down Expand Up @@ -967,6 +973,12 @@ protected override void Go()
advance = 0;
continue;

case RegexCode.AnyEol:
if (Rightchars() > 0 && CharAt(Textpos()) != '\n' && CharAt(Textpos()) != '\r')
break;
advance = 0;
continue;

case RegexCode.Boundary:
if (!IsBoundary(Textpos(), runtextbeg, runtextend))
break;
Expand Down Expand Up @@ -1009,6 +1021,17 @@ protected override void Go()
advance = 0;
continue;

case RegexCode.AnyEndZ:
int rightChars = Rightchars();
if (rightChars > 2)
break;
if (rightChars == 1 && CharAt(Textpos()) != '\r' && CharAt(Textpos()) != '\n')
break;
if (rightChars == 2 && (CharAt(Textpos()) != '\r' || CharAt(Textpos() + 1) != '\n'))
break;
advance = 0;
continue;

case RegexCode.End:
if (Rightchars() > 0)
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,15 @@ internal sealed class RegexNode

public const int Bol = RegexCode.Bol; // ^
public const int Eol = RegexCode.Eol; // $
public const int AnyEol = RegexCode.AnyEol; // $ (AnyNewLine variant of Eol)
public const int Boundary = RegexCode.Boundary; // \b
public const int Nonboundary = RegexCode.Nonboundary; // \B
public const int ECMABoundary = RegexCode.ECMABoundary; // \b
public const int NonECMABoundary = RegexCode.NonECMABoundary; // \B
public const int Beginning = RegexCode.Beginning; // \A
public const int Start = RegexCode.Start; // \G
public const int EndZ = RegexCode.EndZ; // \Z
public const int AnyEndZ = RegexCode.AnyEndZ; // \Z (AnyNewLine variant of EndZ)
public const int End = RegexCode.End; // \z

public const int Oneloopatomic = RegexCode.Oneloopatomic; // c,n (?> a*)
Expand Down Expand Up @@ -978,6 +980,7 @@ public string Description()
if ((Options & RegexOptions.Singleline) != 0) argSb.Append("-S");
if ((Options & RegexOptions.IgnorePatternWhitespace) != 0) argSb.Append("-X");
if ((Options & RegexOptions.ECMAScript) != 0) argSb.Append("-E");
if ((Options & RegexOptions.AnyNewLine) != 0) argSb.Append("-A");

switch (Type)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@ public enum RegexOptions

ECMAScript = 0x0100, // "e"
CultureInvariant = 0x0200,
AnyNewLine = 0x0400, // "a", Treat "$" as (?=\r\z|\n\z|\r\n\z|\z)
}
}
Loading

0 comments on commit 55c713b

Please sign in to comment.