Skip to content

Commit

Permalink
Adding Regex.EnumerateMatches (#67794)
Browse files Browse the repository at this point in the history
* Adding Regex.EnumerateMatches()

* Addressing some feedback and implementing Count(span) on top of EnumerateMatches and cleaning up some code.

* Revert Regex.Count implementation over Enumerate

* PR Feedback
  • Loading branch information
joperezr committed Apr 13, 2022
1 parent b58d801 commit 89daf96
Show file tree
Hide file tree
Showing 11 changed files with 444 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ public static void CompileToAssembly(System.Text.RegularExpressions.RegexCompila
public static int Count(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntax(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; }
public static int Count(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntax(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; }
public static string Escape(string str) { throw null; }
// Span-based EnumerateMatches surface: one instance overload plus static overloads that add pattern, options and matchTimeout.
// The pattern annotations use the same spelling as the Count overloads (StringSyntaxAttribute.Regex constant, "options" passed directly).
public System.Text.RegularExpressions.Regex.ValueMatchEnumerator EnumerateMatches(System.ReadOnlySpan<char> input) { throw null; }
public static System.Text.RegularExpressions.Regex.ValueMatchEnumerator EnumerateMatches(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntax(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex)] string pattern) { throw null; }
public static System.Text.RegularExpressions.Regex.ValueMatchEnumerator EnumerateMatches(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntax(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; }
public static System.Text.RegularExpressions.Regex.ValueMatchEnumerator EnumerateMatches(System.ReadOnlySpan<char> input, [System.Diagnostics.CodeAnalysis.StringSyntax(System.Diagnostics.CodeAnalysis.StringSyntaxAttribute.Regex, "options")] string pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; }
public string[] GetGroupNames() { throw null; }
public int[] GetGroupNumbers() { throw null; }
public string GroupNameFromNumber(int i) { throw null; }
Expand Down Expand Up @@ -220,6 +224,14 @@ void System.Runtime.Serialization.ISerializable.GetObjectData(System.Runtime.Ser
protected bool UseOptionC() { throw null; }
protected internal bool UseOptionR() { throw null; }
protected internal static void ValidateMatchTimeout(System.TimeSpan matchTimeout) { }
// Declaration-only shape of the enumerator type returned by the EnumerateMatches overloads; every member body here is `throw null`.
public ref partial struct ValueMatchEnumerator
{
// Placeholder fields: presumably stand-ins for the implementation's private state (a reference-type field plus primitive fields) - confirm against the implementation.
private object _dummy;
private int _dummyPrimitive;
// Current and GetEnumerator are declared readonly; MoveNext is not.
public readonly System.Text.RegularExpressions.ValueMatch Current { get { throw null; } }
public readonly System.Text.RegularExpressions.Regex.ValueMatchEnumerator GetEnumerator() { throw null; }
public bool MoveNext() { throw null; }
}
}
[System.ObsoleteAttribute("Regex.CompileToAssembly is obsolete and not supported. Use the RegexGeneratorAttribute with the regular expression source generator instead.", DiagnosticId = "SYSLIB0036", UrlFormat = "https://aka.ms/dotnet-warnings/{0}")]
public partial class RegexCompilationInfo
Expand Down Expand Up @@ -359,4 +371,10 @@ public abstract partial class RegexRunnerFactory
protected RegexRunnerFactory() { }
protected internal abstract System.Text.RegularExpressions.RegexRunner CreateInstance();
}
// Declaration-only shape of the per-match value exposed by ValueMatchEnumerator.Current; property bodies here are `throw null`.
public readonly ref partial struct ValueMatch
{
// Placeholder field: presumably a stand-in for the implementation's private int state - confirm against the implementation.
private readonly int _dummyPrimitive;
// Read-only int properties describing the match.
public int Index { get { throw null; } }
public int Length { get { throw null; } }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<ItemGroup>
<Compile Include="System\Collections\HashtableExtensions.cs" />
<Compile Include="System\Collections\Generic\ValueListBuilder.Pop.cs" />
<Compile Include="System\Text\RegularExpressions\ValueMatch.cs" />
<Compile Include="System\Threading\StackHelper.cs" />
<Compile Include="System\Text\SegmentStringBuilder.cs" />
<Compile Include="System\Text\RegularExpressions\Capture.cs" />
Expand All @@ -23,6 +24,7 @@
<Compile Include="System\Text\RegularExpressions\Regex.Match.cs" />
<Compile Include="System\Text\RegularExpressions\Regex.Replace.cs" />
<Compile Include="System\Text\RegularExpressions\Regex.Split.cs" />
<Compile Include="System\Text\RegularExpressions\Regex.EnumerateMatches.cs" />
<Compile Include="System\Text\RegularExpressions\Regex.Timeout.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCaseBehavior.cs" />
<Compile Include="System\Text\RegularExpressions\RegexCaseEquivalences.Data.cs" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;

namespace System.Text.RegularExpressions
{
public partial class Regex
{
/// <summary>
/// Returns a <see cref="ValueMatchEnumerator"/> that walks every occurrence of the regular expression <paramref name="pattern"/> in the <paramref name="input"/> span.
/// </summary>
/// <remarks>
/// Matching is lazy: nothing is searched when this method returns, and every call to <see cref="ValueMatchEnumerator.MoveNext"/> performs exactly one match.
/// As a consequence, modifying the memory behind <paramref name="input"/> between calls to <see cref="ValueMatchEnumerator.MoveNext"/> changes the results that follow.
/// Both the enumerator and the per-match values it yields are ref structs, which is what makes this method amortized allocation free.
/// </remarks>
/// <param name="input">The span to search for matches.</param>
/// <param name="pattern">The regular expression pattern to match.</param>
/// <returns>A <see cref="ValueMatchEnumerator"/> to iterate over the matches.</returns>
/// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
/// <exception cref="RegexParseException">A regular expression parsing error occurred.</exception>
public static ValueMatchEnumerator EnumerateMatches(ReadOnlySpan<char> input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern)
{
    // Obtain the Regex for this pattern from the cache, then defer to the instance overload.
    Regex cachedRegex = RegexCache.GetOrAdd(pattern);
    return cachedRegex.EnumerateMatches(input);
}

/// <summary>
/// Returns a <see cref="ValueMatchEnumerator"/> that walks every occurrence of the regular expression <paramref name="pattern"/> in the <paramref name="input"/> span,
/// using the supplied matching <paramref name="options"/>.
/// </summary>
/// <remarks>
/// Matching is lazy: nothing is searched when this method returns, and every call to <see cref="ValueMatchEnumerator.MoveNext"/> performs exactly one match.
/// As a consequence, modifying the memory behind <paramref name="input"/> between calls to <see cref="ValueMatchEnumerator.MoveNext"/> changes the results that follow.
/// Both the enumerator and the per-match values it yields are ref structs, which is what makes this method amortized allocation free.
/// </remarks>
/// <param name="input">The span to search for matches.</param>
/// <param name="pattern">The regular expression pattern to match.</param>
/// <param name="options">A bitwise combination of the enumeration values that specify options for matching.</param>
/// <returns>A <see cref="ValueMatchEnumerator"/> to iterate over the matches.</returns>
/// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="options"/> is not a valid bitwise combination of RegexOptions values.</exception>
/// <exception cref="RegexParseException">A regular expression parsing error occurred.</exception>
public static ValueMatchEnumerator EnumerateMatches(ReadOnlySpan<char> input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options)
{
    // No timeout was supplied, so the cache lookup uses the default match timeout.
    Regex cachedRegex = RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout);
    return cachedRegex.EnumerateMatches(input);
}

/// <summary>
/// Returns a <see cref="ValueMatchEnumerator"/> that walks every occurrence of the regular expression <paramref name="pattern"/> in the <paramref name="input"/> span,
/// using the supplied matching <paramref name="options"/> and <paramref name="matchTimeout"/>.
/// </summary>
/// <remarks>
/// Matching is lazy: nothing is searched when this method returns, and every call to <see cref="ValueMatchEnumerator.MoveNext"/> performs exactly one match.
/// As a consequence, modifying the memory behind <paramref name="input"/> between calls to <see cref="ValueMatchEnumerator.MoveNext"/> changes the results that follow.
/// Both the enumerator and the per-match values it yields are ref structs, which is what makes this method amortized allocation free.
/// </remarks>
/// <param name="input">The span to search for matches.</param>
/// <param name="pattern">The regular expression pattern to match.</param>
/// <param name="options">A bitwise combination of the enumeration values that specify options for matching.</param>
/// <param name="matchTimeout">A time-out interval, or <see cref="InfiniteMatchTimeout"/> to indicate that the method should not time out.</param>
/// <returns>A <see cref="ValueMatchEnumerator"/> to iterate over the matches.</returns>
/// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="options"/> is not a valid bitwise combination of RegexOptions values, or <paramref name="matchTimeout"/> is negative, zero, or greater than approximately 24 days.</exception>
/// <exception cref="RegexParseException">A regular expression parsing error occurred.</exception>
public static ValueMatchEnumerator EnumerateMatches(ReadOnlySpan<char> input, [StringSyntax(StringSyntaxAttribute.Regex, "options")] string pattern, RegexOptions options, TimeSpan matchTimeout)
{
    // The caller-provided timeout is part of the cache lookup.
    Regex cachedRegex = RegexCache.GetOrAdd(pattern, options, matchTimeout);
    return cachedRegex.EnumerateMatches(input);
}

/// <summary>
/// Returns a <see cref="ValueMatchEnumerator"/> that walks every occurrence of this regular expression in the <paramref name="input"/> span.
/// </summary>
/// <remarks>
/// Matching is lazy: nothing is searched when this method returns, and every call to <see cref="ValueMatchEnumerator.MoveNext"/> performs exactly one match.
/// As a consequence, modifying the memory behind <paramref name="input"/> between calls to <see cref="ValueMatchEnumerator.MoveNext"/> changes the results that follow.
/// Both the enumerator and the per-match values it yields are ref structs, which is what makes this method amortized allocation free.
/// </remarks>
/// <param name="input">The span to search for matches.</param>
/// <returns>A <see cref="ValueMatchEnumerator"/> to iterate over the matches.</returns>
public ValueMatchEnumerator EnumerateMatches(ReadOnlySpan<char> input)
{
    // A right-to-left regex begins at the end of the span; every other regex begins at index 0.
    int initialPosition = RightToLeft ? input.Length : 0;
    return new ValueMatchEnumerator(this, input, initialPosition);
}

/// <summary>
/// Represents an enumerator containing the set of successful matches found by iteratively applying a regular expression pattern to the input span.
/// </summary>
/// <remarks>
/// The enumerator has no public constructor. The <see cref="Regex.EnumerateMatches(ReadOnlySpan{char})"/> method returns a <see cref="Regex.ValueMatchEnumerator"/>
/// object. The enumerator will lazily iterate over zero or more <see cref="ValueMatch"/> objects. If there is at least one successful match in the span, then
/// <see cref="MoveNext"/> returns <see langword="true"/> and <see cref="Current"/> will contain the first <see cref="ValueMatch"/>. If there are no successful matches,
/// then <see cref="MoveNext"/> returns <see langword="false"/> and <see cref="Current"/> keeps its default value; <see cref="Current"/> never throws.
///
/// This type is a ref struct since it stores the input span as a field in order to be able to lazily iterate over it.
/// </remarks>
public ref struct ValueMatchEnumerator
{
    private readonly Regex _regex;
    private readonly ReadOnlySpan<char> _input;
    private ValueMatch _current;
    // Position handed to the next match attempt; updated from the previous match's text position.
    private int _startAt;
    // Length of the previous match, or -1 before any match has been found.
    private int _prevLen;

    /// <summary>
    /// Creates an instance of the <see cref="ValueMatchEnumerator"/> for the passed in <paramref name="regex"/> which iterates over <paramref name="input"/>.
    /// </summary>
    /// <param name="regex">The <see cref="Regex"/> to use for finding matches.</param>
    /// <param name="input">The input span to iterate over.</param>
    /// <param name="startAt">The position where the engine should start looking for matches from.</param>
    internal ValueMatchEnumerator(Regex regex, ReadOnlySpan<char> input, int startAt)
    {
        _regex = regex;
        _input = input;
        _current = default;
        _startAt = startAt;
        _prevLen = -1;
    }

    /// <summary>
    /// Provides an enumerator that iterates through the matches in the input span.
    /// </summary>
    /// <returns>A copy of this enumerator.</returns>
    public readonly ValueMatchEnumerator GetEnumerator() => this;

    /// <summary>
    /// Advances the enumerator to the next match in the span.
    /// </summary>
    /// <remarks>
    /// Each call performs one match attempt over the input span, starting from the position recorded by the previous successful call.
    /// When no match is found, the enumerator's state (including <see cref="Current"/>) is left unchanged.
    /// </remarks>
    /// <returns>
    /// <see langword="true"/> if the enumerator was successfully advanced to the next element; <see langword="false"/> if the enumerator cannot find additional matches.
    /// </returns>
    public bool MoveNext()
    {
        Match? match = _regex.RunSingleMatch(quick: false, _prevLen, _input, _startAt);
        Debug.Assert(match != null, "Match shouldn't be null because we passed quick = false.");
        if (match != RegularExpressions.Match.Empty)
        {
            // Capture only the index and length; remember where to resume and how long this match was
            // so the next call can be told about the previous match.
            _current = new ValueMatch(match.Index, match.Length);
            _startAt = match._textpos;
            _prevLen = match.Length;
            return true;
        }
        return false;
    }

    /// <summary>
    /// Gets the <see cref="ValueMatch"/> element at the current position of the enumerator.
    /// </summary>
    /// <remarks>
    /// No exception is thrown when enumeration has not started or has already finished: before the first successful <see cref="MoveNext"/> call
    /// this returns the default <see cref="ValueMatch"/>, and after <see cref="MoveNext"/> returns <see langword="false"/> it still returns the last match that was found.
    /// </remarks>
    public readonly ValueMatch Current => _current;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public bool IsMatch(string input)
/// <returns><see langword="true"/> if the regular expression finds a match; otherwise, <see langword="false"/>.</returns>
/// <exception cref="RegexMatchTimeoutException">A time-out occurred.</exception>
public bool IsMatch(ReadOnlySpan<char> input) =>
RunSingleMatch(input, RightToLeft ? input.Length : 0) is null;
RunSingleMatch(quick: true, -1, input, RightToLeft ? input.Length : 0) is null;

/// <summary>
/// Searches the input string for one or more matches using the previous pattern and options,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ protected void InitializeReferences()
}

/// <summary>Internal worker which will scan the passed in span <paramref name="input"/> for a match. Used by public APIs.</summary>
internal Match? RunSingleMatch(ReadOnlySpan<char> input, int startat)
internal Match? RunSingleMatch(bool quick, int prevlen, ReadOnlySpan<char> input, int startat)
{
// For IsMatch, startat is always either 0 or input.Length since its public API doesn't have an overload
// that takes in startat. EnumerateMatches instead passes the text position recorded by the previous match.
Expand All @@ -416,13 +416,45 @@ protected void InitializeReferences()
try
{
runner.InitializeTimeout(internalMatchTimeout);
runner.InitializeForScan(this, input, startat, quick: true);
runner.InitializeForScan(this, input, startat, quick);

// If the previous match was empty (prevlen == 0), advance by one position before matching.
if (prevlen == 0)
{
if (RightToLeft)
{
if (runner.runtextstart == 0)
{
return RegularExpressions.Match.Empty;
}
runner.runtextpos--;
}
else
{
if (runner.runtextstart == input.Length)
{
return RegularExpressions.Match.Empty;
}
runner.runtextpos++;
}
}

runner.Scan(input);

// If runmatch is null it means that an override of Scan didn't implement it correctly, so we will
// let this null ref since there are lots of ways where you can end up in an erroneous state.
return runner.runmatch!.FoundMatch ? null : RegularExpressions.Match.Empty;
Match match = runner.runmatch!;
if (match!.FoundMatch)
{
if (quick)
{
return null;
}
match.Tidy(runner.runtextpos, 0);
return match;
}

return RegularExpressions.Match.Empty;
}
finally
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace System.Text.RegularExpressions
{
/// <summary>
/// Describes a single regular expression match by the position and length of the matched slice.
/// </summary>
/// <remarks>
/// The <see cref="ValueMatch"/> type is immutable and has no public constructor. Instances are obtained from the
/// <see cref="Regex.ValueMatchEnumerator.Current"/> property while iterating over the results of <see cref="Regex.EnumerateMatches(ReadOnlySpan{char})"/>.
/// </remarks>
public readonly ref struct ValueMatch
{
    /// <summary>
    /// Creates an instance of the <see cref="ValueMatch"/> type based on the passed in <paramref name="index"/> and <paramref name="length"/>.
    /// </summary>
    /// <param name="index">The position in the original span where the first character of the captured sliced span is found.</param>
    /// <param name="length">The length of the captured sliced span.</param>
    internal ValueMatch(int index, int length)
    {
        Index = index;
        Length = length;
    }

    /// <summary>
    /// Gets the position in the original span where the first character of the captured sliced span is found.
    /// </summary>
    public int Index { get; }

    /// <summary>
    /// Gets the length of the captured sliced span.
    /// </summary>
    public int Length { get; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

namespace System.Text.RegularExpressions.Tests
{
public class RegexCountTests
public partial class RegexCountTests
{
[Theory]
[MemberData(nameof(Count_ReturnsExpectedCount_TestData))]
Expand Down
Loading

0 comments on commit 89daf96

Please sign in to comment.