From daa28133c5c69fcb30eed67d27855efa08e03181 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 13 Apr 2021 15:03:29 -0700 Subject: [PATCH 01/17] First cut of look up table for speeding up Go() --- .../src/System/Text/RegularExpressions/RegexCode.cs | 6 +++++- .../Text/RegularExpressions/RegexInterpreter.cs | 13 ++++++++++++- .../System/Text/RegularExpressions/RegexWriter.cs | 13 ++++++++++++- .../tests/Regex.Match.Tests.cs | 13 +++++++++++++ 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs index e76e3ea160aaf..7ae68765c9077 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs @@ -14,6 +14,7 @@ // Strings and sets are indices into a string table. using System.Collections; +using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; @@ -106,11 +107,13 @@ internal sealed class RegexCode public readonly int LeadingAnchor; // the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc) public readonly bool RightToLeft; // true if right to left + public readonly Dictionary> FirstLetterToStringTableIndices; + public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount, Hashtable? caps, int capsize, RegexBoyerMoore? boyerMoorePrefix, (string CharClass, bool CaseInsensitive)[]? leadingCharClasses, - int leadingAnchor, bool rightToLeft) + int leadingAnchor, bool rightToLeft, Dictionary> firstLetterToStringTableIndices) { Debug.Assert(boyerMoorePrefix is null || leadingCharClasses is null); @@ -125,6 +128,7 @@ public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount, LeadingCharClasses = leadingCharClasses; LeadingAnchor = leadingAnchor; RightToLeft = rightToLeft; + FirstLetterToStringTableIndices = firstLetterToStringTableIndices; } public static bool OpcodeBacktracks(int Op) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index d557ec6c3aa73..f87cfcb5bfb3d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Globalization; @@ -1058,7 +1059,17 @@ protected override void Go() continue; case RegexCode.Multi: - if (!MatchString(_code.Strings[Operand(0)])) + int stringTableIndex = Operand(0); + char textChar = runtext![runtextpos]; + if (_code.FirstLetterToStringTableIndices.TryGetValue(textChar, out HashSet? stringTableIndices)) + { + if (!stringTableIndices.Contains(stringTableIndex)) + { + // We are trying a pattern that doesn't start with the right char, so there's no way we can match. + break; + } + } + if (!MatchString(_code.Strings[stringTableIndex])) { break; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index 2154947cfaa8d..c87778acaec81 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -172,14 +172,25 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree) int leadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree); // Convert the string table into an ordered string array. + var firstLetterToStringTableIndices = new Dictionary>(); var strings = new string[_stringTable.Count]; foreach (KeyValuePair stringEntry in _stringTable) { + if (firstLetterToStringTableIndices.TryGetValue(stringEntry.Key[0], out HashSet? indices)) + { + indices.Add(stringEntry.Value); + } + else + { + firstLetterToStringTableIndices.Add(stringEntry.Key[0], new HashSet() { stringEntry.Value }); + } strings[stringEntry.Value] = stringEntry.Key; } + + // Return all that in a RegexCode object. - return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl); + return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl, firstLetterToStringTableIndices); } /// diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 719f280c91726..d0c883c37ca3f 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1178,5 +1178,18 @@ public void Synchronized() AssertExtensions.Throws("inner", () => System.Text.RegularExpressions.Match.Synchronized(null)); } + + [Fact] + public void HowManyAlternationsAreChecked() + { + // We can statically determine if it's impossible for an alternation branch N + 1 to match after we've gotten to a certain place in matching branch N, e.g. given the alternation "abc|def" we know that once we match the 'a', there's no point in even considering the second branch. We should be able to utilize that knowledge to avoid unnecessarily checking branches when a previous one fails to match. + + Debugger.Launch(); + var regex = new Regex("(abc|def)xyz"); + var match = regex.Match("abqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqdefxyz"); + Assert.True(match.Success); + Assert.Equal("defxyz", match.Value); + Console.WriteLine("BH"); + } } } From 4a917c2f9be12820d0df942596364c95ec46c72d Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 13 Apr 2021 16:12:36 -0700 Subject: [PATCH 02/17] More efficient .* in RegexInterpreter --- .../Text/RegularExpressions/RegexCode.cs | 6 +-- .../RegularExpressions/RegexInterpreter.cs | 41 +++++++++++++------ .../Text/RegularExpressions/RegexWriter.cs | 13 +----- .../tests/Regex.Match.Tests.cs | 30 ++++++++++++-- 4 files changed, 57 insertions(+), 33 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs index 7ae68765c9077..e76e3ea160aaf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs @@ -14,7 +14,6 @@ // Strings and sets are indices into a string table. using System.Collections; -using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; @@ -107,13 +106,11 @@ internal sealed class RegexCode public readonly int LeadingAnchor; // the leading anchor, if one exists (RegexPrefixAnalyzer.Bol, etc) public readonly bool RightToLeft; // true if right to left - public readonly Dictionary> FirstLetterToStringTableIndices; - public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount, Hashtable? caps, int capsize, RegexBoyerMoore? boyerMoorePrefix, (string CharClass, bool CaseInsensitive)[]? leadingCharClasses, - int leadingAnchor, bool rightToLeft, Dictionary> firstLetterToStringTableIndices) + int leadingAnchor, bool rightToLeft) { Debug.Assert(boyerMoorePrefix is null || leadingCharClasses is null); @@ -128,7 +125,6 @@ public RegexCode(RegexTree tree, int[] codes, string[] strings, int trackcount, LeadingCharClasses = leadingCharClasses; LeadingAnchor = leadingAnchor; RightToLeft = rightToLeft; - FirstLetterToStringTableIndices = firstLetterToStringTableIndices; } public static bool OpcodeBacktracks(int Op) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index f87cfcb5bfb3d..2e9aa72c6e9fc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -1,7 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Globalization; @@ -21,6 +20,7 @@ internal sealed class RegexInterpreter : RegexRunner private int _codepos; private bool _rightToLeft; private bool _caseInsensitive; + private int _maxBacktrackPosition = -1; public RegexInterpreter(RegexCode code, CultureInfo culture) { @@ -224,6 +224,20 @@ private bool MatchString(string str) { if (runtextend - runtextpos < c) { + // If MatchString was called after a greedy op such as a .*, we would have zipped runtextpos to the end without really examining any characters. Reset to maxBacktrackPos here as an optimization + if (_maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition) + { + // If lastIndexOf is -1, we backtrack to the max extent possible. + runtextpos = _maxBacktrackPosition; + ReadOnlySpan runtextSpan = runtext.AsSpan(_maxBacktrackPosition); + int lastIndexOf = runtextSpan.LastIndexOf(str); + if (lastIndexOf > -1) + { + // Found the next position to match. Move runtextpos here + runtextpos = _maxBacktrackPosition + lastIndexOf; + } + } + return false; } @@ -1059,17 +1073,7 @@ protected override void Go() continue; case RegexCode.Multi: - int stringTableIndex = Operand(0); - char textChar = runtext![runtextpos]; - if (_code.FirstLetterToStringTableIndices.TryGetValue(textChar, out HashSet? stringTableIndices)) - { - if (!stringTableIndices.Contains(stringTableIndex)) - { - // We are trying a pattern that doesn't start with the right char, so there's no way we can match. - break; - } - } - if (!MatchString(_code.Strings[stringTableIndex])) + if (!MatchString(_code.Strings[Operand(0)])) { break; } @@ -1196,6 +1200,7 @@ protected override void Go() int len = Math.Min(Operand(1), Forwardchars()); char ch = (char)Operand(0); int i; + int tempMaxBacktrackPosition = runtextpos; if (!_rightToLeft && !_caseInsensitive) { @@ -1228,6 +1233,8 @@ protected override void Go() if (len > i && _operator == RegexCode.Notoneloop) { TrackPush(len - i - 1, runtextpos - Bump()); + Debug.Assert(_maxBacktrackPosition == -1); + _maxBacktrackPosition = tempMaxBacktrackPosition; } } advance = 2; @@ -1272,6 +1279,16 @@ protected override void Go() { int i = TrackPeek(); int pos = TrackPeek(1); + if (_maxBacktrackPosition != -1 && pos > _maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft) + { + // The Multi node has bumped us along already + int difference = pos - _maxBacktrackPosition; + Debug.Assert(difference > 0); + pos = runtextpos; + i -= difference; + // We shouldn't be backtracking anymore. + _maxBacktrackPosition = -1; + } runtextpos = pos; if (i > 0) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index c87778acaec81..2154947cfaa8d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -172,25 +172,14 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree) int leadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree); // Convert the string table into an ordered string array. - var firstLetterToStringTableIndices = new Dictionary>(); var strings = new string[_stringTable.Count]; foreach (KeyValuePair stringEntry in _stringTable) { - if (firstLetterToStringTableIndices.TryGetValue(stringEntry.Key[0], out HashSet? indices)) - { - indices.Add(stringEntry.Value); - } - else - { - firstLetterToStringTableIndices.Add(stringEntry.Key[0], new HashSet() { stringEntry.Value }); - } strings[stringEntry.Value] = stringEntry.Key; } - - // Return all that in a RegexCode object. - return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl, firstLetterToStringTableIndices); + return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, boyerMoorePrefix, leadingCharClasses, leadingAnchor, rtl); } /// diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index d0c883c37ca3f..fb4f96a24d90d 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1179,16 +1179,38 @@ public void Synchronized() AssertExtensions.Throws("inner", () => System.Text.RegularExpressions.Match.Synchronized(null)); } + [Theory] + [InlineData(".*foo", "hifoo")] + [InlineData("ab.*foo", "abhifoo")] + [InlineData("ab.*foo.*bar", "abhifooabcbar")] + [InlineData("ab.*foo.*", "abhifooabcbar")] + [InlineData(".*abc|ghi", "ghi")] + public void TestRegressions(string pattern, string input) + { + var regex = new Regex(pattern); + var match = regex.Match(input); + Assert.True(match.Success); + Assert.Equal(input, match.Value); + } + [Fact] public void HowManyAlternationsAreChecked() { // We can statically determine if it's impossible for an alternation branch N + 1 to match after we've gotten to a certain place in matching branch N, e.g. given the alternation "abc|def" we know that once we match the 'a', there's no point in even considering the second branch. We should be able to utilize that knowledge to avoid unnecessarily checking branches when a previous one fails to match. - Debugger.Launch(); - var regex = new Regex("(abc|def)xyz"); - var match = regex.Match("abqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqabqdefxyz"); + //Debugger.Launch(); + var regex = new Regex(".*(ss)"); + var match = regex.Match("Essential services are provided by regular exprs."); Assert.True(match.Success); - Assert.Equal("defxyz", match.Value); + Assert.Equal("Ess", match.Value); + Assert.Equal(0, match.Index); + Assert.Equal(1, match.Groups[1].Index); + + //var regex = new Regex(".*abc|ghi"); + //var match = regex.Match("ghi"); + //Assert.Equal(1, match.Groups.Count); + //Assert.Equal("ghi", match.Groups[0].Value); + Console.WriteLine("BH"); } } From 4cf9ce99c1e217b7565c924350c31ff8628446ce Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Mon, 19 Apr 2021 11:45:52 -0700 Subject: [PATCH 03/17] sq --- .../tests/Regex.Match.Tests.cs | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index fb4f96a24d90d..719f280c91726 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1178,40 +1178,5 @@ public void Synchronized() AssertExtensions.Throws("inner", () => System.Text.RegularExpressions.Match.Synchronized(null)); } - - [Theory] - [InlineData(".*foo", "hifoo")] - [InlineData("ab.*foo", "abhifoo")] - [InlineData("ab.*foo.*bar", "abhifooabcbar")] - [InlineData("ab.*foo.*", "abhifooabcbar")] - [InlineData(".*abc|ghi", "ghi")] - public void TestRegressions(string pattern, string input) - { - var regex = new Regex(pattern); - var match = regex.Match(input); - Assert.True(match.Success); - Assert.Equal(input, match.Value); - } - - [Fact] - public void HowManyAlternationsAreChecked() - { - // We can statically determine if it's impossible for an alternation branch N + 1 to match after we've gotten to a certain place in matching branch N, e.g. given the alternation "abc|def" we know that once we match the 'a', there's no point in even considering the second branch. We should be able to utilize that knowledge to avoid unnecessarily checking branches when a previous one fails to match. - - //Debugger.Launch(); - var regex = new Regex(".*(ss)"); - var match = regex.Match("Essential services are provided by regular exprs."); - Assert.True(match.Success); - Assert.Equal("Ess", match.Value); - Assert.Equal(0, match.Index); - Assert.Equal(1, match.Groups[1].Index); - - //var regex = new Regex(".*abc|ghi"); - //var match = regex.Match("ghi"); - //Assert.Equal(1, match.Groups.Count); - //Assert.Equal("ghi", match.Groups[0].Value); - - Console.WriteLine("BH"); - } } } From 11d00c16d748b7657a8744fcdd58f3a09378bf85 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 20 Apr 2021 11:32:08 -0700 Subject: [PATCH 04/17] Get more debug info --- .../src/System/Text/RegularExpressions/RegexInterpreter.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 2e9aa72c6e9fc..ea2095a84deec 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -1233,7 +1233,7 @@ protected override void Go() if (len > i && _operator == RegexCode.Notoneloop) { TrackPush(len - i - 1, runtextpos - Bump()); - Debug.Assert(_maxBacktrackPosition == -1); + Debug.Assert(_maxBacktrackPosition == -1, $"maxBacktrackPosition = {_maxBacktrackPosition}, runtext = {runtext}, runtextpos = {runtextpos}, ch = {ch}, code = {_code}, runregex = {runregex}"); _maxBacktrackPosition = tempMaxBacktrackPosition; } } From a0154702166484e06564885c1115165e3998b919 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Mon, 26 Apr 2021 10:02:34 -0700 Subject: [PATCH 05/17] Remove assert and add unit test --- .../src/System/Text/RegularExpressions/RegexInterpreter.cs | 1 - .../System.Text.RegularExpressions/tests/Regex.Match.Tests.cs | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index ea2095a84deec..47ce7a343366a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -1233,7 +1233,6 @@ protected override void Go() if (len > i && _operator == RegexCode.Notoneloop) { TrackPush(len - i - 1, runtextpos - Bump()); - Debug.Assert(_maxBacktrackPosition == -1, $"maxBacktrackPosition = {_maxBacktrackPosition}, runtext = {runtext}, runtextpos = {runtextpos}, ch = {ch}, code = {_code}, runregex = {runregex}"); _maxBacktrackPosition = tempMaxBacktrackPosition; } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 719f280c91726..5d1144aee8232 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -207,6 +207,7 @@ public static IEnumerable Match_Basic_TestData() // Using *, +, ?, {}: Actual - "a+\\.?b*\\.?c{2}" yield return new object[] { @"a+\.?b*\.+c{2}", "ab.cc", RegexOptions.None, 0, 5, true, "ab.cc" }; + yield return new object[] { @"[^a]+\.[^z]+", "zzzzz", RegexOptions.None, 0, 5, false, string.Empty }; // RightToLeft yield return new object[] { @"\s+\d+", "sdf 12sad", RegexOptions.RightToLeft, 0, 9, true, " 12" }; From d01e32c50a397c934af517d5c5da9b2efc3c5768 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Wed, 26 May 2021 11:03:10 -0700 Subject: [PATCH 06/17] Potential unit test --- .../tests/Regex.Match.Tests.cs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 5d1144aee8232..34a881748a121 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1179,5 +1179,13 @@ public void Synchronized() AssertExtensions.Throws("inner", () => System.Text.RegularExpressions.Match.Synchronized(null)); } + + [Fact] + public void Test() + { + var regex = new Regex(".*foo"); + var match = regex.Match("aBfoo"); + Assert.True(match.Success); + } } } From 7b15bacc9b52b99bb135f76e7390eabc060b8ea8 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Mon, 21 Jun 2021 10:40:18 -0700 Subject: [PATCH 07/17] temp --- .../System.Text.RegularExpressions/tests/Regex.Match.Tests.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 34a881748a121..dfca9702eec48 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1183,7 +1183,8 @@ public void Synchronized() [Fact] public void Test() { - var regex = new Regex(".*foo"); + Debugger.Launch(); + var regex = new Regex(".*Foo", RegexOptions.IgnoreCase); var match = regex.Match("aBfoo"); Assert.True(match.Success); } From c086dc8ef065a9926d890c1cb0a06143bbc6ed81 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Mon, 21 Jun 2021 17:33:14 -0700 Subject: [PATCH 08/17] Fix a bug --- .../src/System/Text/RegularExpressions/RegexInterpreter.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 47ce7a343366a..c8c70a6116c0e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -225,7 +225,7 @@ private bool MatchString(string str) if (runtextend - runtextpos < c) { // If MatchString was called after a greedy op such as a .*, we would have zipped runtextpos to the end without really examining any characters. Reset to maxBacktrackPos here as an optimization - if (_maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition) + if (!_caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition) { // If lastIndexOf is -1, we backtrack to the max extent possible. runtextpos = _maxBacktrackPosition; From c9621ddb9a9d64127423b82109ac328b6705da24 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Mon, 21 Jun 2021 17:38:49 -0700 Subject: [PATCH 09/17] sq --- .../tests/Regex.Match.Tests.cs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index dfca9702eec48..5d1144aee8232 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1179,14 +1179,5 @@ public void Synchronized() AssertExtensions.Throws("inner", () => System.Text.RegularExpressions.Match.Synchronized(null)); } - - [Fact] - public void Test() - { - Debugger.Launch(); - var regex = new Regex(".*Foo", RegexOptions.IgnoreCase); - var match = regex.Match("aBfoo"); - Assert.True(match.Success); - } } } From 485aac5a64a5eda0f2181d3572d50a8442d5ff72 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 22 Jun 2021 10:15:33 -0700 Subject: [PATCH 10/17] Add extra protection to the backtracking optimization --- .../src/System/Text/RegularExpressions/RegexInterpreter.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index c8c70a6116c0e..5021ea064f56f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -1278,7 +1278,7 @@ protected override void Go() { int i = TrackPeek(); int pos = TrackPeek(1); - if (_maxBacktrackPosition != -1 && pos > _maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft) + if (!_caseInsensitive && _maxBacktrackPosition != -1 && pos > _maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft) { // The Multi node has bumped us along already int difference = pos - _maxBacktrackPosition; From dba02791934c82be5a006021a60c73ee29b32d60 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 22 Jun 2021 11:26:15 -0700 Subject: [PATCH 11/17] Add unit test --- .../System.Text.RegularExpressions/tests/Regex.Match.Tests.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 5d1144aee8232..6e644b1260d8e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -204,6 +204,7 @@ public static IEnumerable Match_Basic_TestData() yield return new object[] { "abc", "abc", RegexOptions.None, 0, 3, true, "abc" }; yield return new object[] { "abc", "aBc", RegexOptions.None, 0, 3, false, string.Empty }; yield return new object[] { "abc", "aBc", RegexOptions.IgnoreCase, 0, 3, true, "aBc" }; + yield return new object[] { @"abc.*def", "abcghiDEF", RegexOptions.IgnoreCase, 0, 9, true, "abcghiDEF" }; // Using *, +, ?, {}: Actual - "a+\\.?b*\\.?c{2}" yield return new object[] { @"a+\.?b*\.+c{2}", "ab.cc", RegexOptions.None, 0, 5, true, "ab.cc" }; From 20594518adbe9e850b9fa07feeeb89a0c0cd7b8b Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Wed, 30 Jun 2021 15:16:23 -0700 Subject: [PATCH 12/17] Revert --- .../tests/Regex.Match.Tests.cs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 6e644b1260d8e..0d2134715c22b 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1180,5 +1180,14 @@ public void Synchronized() AssertExtensions.Throws("inner", () => System.Text.RegularExpressions.Match.Synchronized(null)); } + + [Fact] + public void Test() + { + Debugger.Launch(); + var regex = new Regex(".*[abc]string"); + var match1 = regex.Match("fooastring"); + var match2 = regex.Match("foostring"); + } } } From 2766f3003cb25df8d3100ef8210195971f6b8b03 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 23 Apr 2021 10:38:13 -0700 Subject: [PATCH 13/17] RegexCompiler changes --- .../Text/RegularExpressions/RegexCompiler.cs | 151 ++++++++++++++++-- .../tests/Regex.Match.Tests.cs | 20 ++- 2 files changed, 153 insertions(+), 18 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index b9b6e791ed572..bf3e9e6b29ea0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -64,6 +64,8 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!; + private static readonly MethodInfo s_stringAsSpanWithStartMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int) })!; + private static readonly MethodInfo s_spanLastIndexOfMethod = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; private static readonly MethodInfo s_stringIndexOfCharInt = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int) })!; @@ -90,6 +92,7 @@ internal abstract class RegexCompiler private LocalBuilder? _runstackLocal; private LocalBuilder? _textInfoLocal; // cached to avoid extraneous TLS hits from CurrentCulture and virtual calls to TextInfo private LocalBuilder? _loopTimeoutCounterLocal; // timeout counter for setrep and setloop + private LocalBuilder? _maxBacktrackPositionLocal; protected RegexOptions _options; // options protected RegexCode? _code; // the RegexCode object @@ -891,6 +894,8 @@ private void GenerateForwardSection() Mvfldloc(s_runtrackposField, _runtrackposLocal!); Mvfldloc(s_runstackField, _runstackLocal!); Mvfldloc(s_runstackposField, _runstackposLocal!); + Ldc(-1); + Stloc(_maxBacktrackPositionLocal!); _backpos = -1; @@ -1705,7 +1710,7 @@ protected void GenerateFindFirstChar() // if (!CharInClass(textSpan[i + 2], prefix[2], "...")) goto returnFalse; // ... Debug.Assert(charClassIndex == 0 || charClassIndex == 1); - for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++) + for (; charClassIndex < _leadingCharClasses.Length; charClassIndex++) { Debug.Assert(needLoop); Ldloca(textSpanLocal); @@ -3310,6 +3315,7 @@ protected void GenerateGo() } _runtextbegLocal = DeclareInt32(); _runtextendLocal = DeclareInt32(); + _maxBacktrackPositionLocal = DeclareInt32(); InitializeCultureForGoIfNecessary(); @@ -4258,7 +4264,63 @@ private void GenerateOneCode() //: break Backward; { string str = _strings![Operand(0)]; + Label multiCode = DefineLabel(); + if (!IsRightToLeft()) + { + // if (runtextend - runtextpos < c) + Ldloc(_runtextendLocal!); + Ldloc(_runtextposLocal!); + Sub(); + Ldc(str.Length); + BgeFar(multiCode); + // if (!_caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition) + if (!IsCaseInsensitive()) + { + Ldloc(_maxBacktrackPositionLocal!); + Ldc(-1); + BeqFar(_backtrack); + Ldloc(_runtextposLocal); + Ldloc(_maxBacktrackPositionLocal!); + BleFar(_backtrack); + // runtextpos = _maxBacktrackPosition; + Ldloc(_maxBacktrackPositionLocal!); + Stloc(_runtextposLocal!); + // ReadOnlySpan runtextSpan = runtext.AsSpan(_maxBacktrackPosition); + Ldloc(_runtextLocal!); + Ldloc(_maxBacktrackPositionLocal!); + using (RentedLocalBuilder textSpanLocal = RentReadOnlySpanCharLocal()) + { + Call(s_stringAsSpanWithStartMethod); + Stloc(textSpanLocal); + using (RentedLocalBuilder lastIndexOf = RentInt32Local()) + { + using (RentedLocalBuilder strAsSpanLocal = RentReadOnlySpanCharLocal()) + { + // int lastIndexOf = runtextSpan.LastIndexOf(str.AsSpan()); + Ldstr(str); + Call(s_stringAsSpanMethod); + Stloc(strAsSpanLocal); + Ldloc(textSpanLocal); + Ldloc(strAsSpanLocal); + Call(s_spanLastIndexOfMethod); + } + Stloc(lastIndexOf); + Ldloc(lastIndexOf); + Ldc(-1); + // if (lastIndexOf > -1) + BleFar(_backtrack); + // runtextpos = _maxBacktrackPosition + lastIndexOf; + Ldloc(lastIndexOf); + Ldloc(_maxBacktrackPositionLocal!); + Add(); + Stloc(_runtextposLocal!); + BrFar(_backtrack); + } + } + } + } + MarkLabel(multiCode); Ldc(str.Length); Ldloc(_runtextendLocal!); Ldloc(_runtextposLocal!); @@ -4598,6 +4660,9 @@ private void GenerateOneCode() using RentedLocalBuilder lenLocal = RentInt32Local(); using RentedLocalBuilder iLocal = RentInt32Local(); + using RentedLocalBuilder tempMaxBacktrackPositionLocal = RentInt32Local(); + Ldloc(_runtextposLocal); + Stloc(tempMaxBacktrackPositionLocal); if (!IsRightToLeft()) { @@ -4847,6 +4912,12 @@ private void GenerateOneCode() DoPush(); Track(); + // if (_operator == RegexCode.Notoneloop) maxBacktrackPosition = tempMaxBacktrackPosition + if (_regexopcode == RegexCode.Notoneloop) + { + Ldloc(tempMaxBacktrackPositionLocal); + Stloc(_maxBacktrackPositionLocal!); + } } break; } @@ -4870,28 +4941,74 @@ private void GenerateOneCode() //: if (i > 0) //: Track(i - 1, pos - 1); //: Advance(2); - PopTrack(); - Stloc(_runtextposLocal!); + Label noBacktrackPositionBranch = DefineLabel(); PopTrack(); using (RentedLocalBuilder posLocal = RentInt32Local()) { Stloc(posLocal); - Ldloc(posLocal); - Ldc(0); - BleFar(AdvanceLabel()); + PopTrack(); + using (RentedLocalBuilder iBacktrackLocal = RentInt32Local()) + { + Stloc(iBacktrackLocal); + // if (!_caseInsensitive && maxBacktrackPosition != -1 && pos > maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft) + if (!IsCaseInsensitive() && _regexopcode == (RegexCode.Notoneloop | RegexCode.Back) && !IsRightToLeft()) + { + Ldloc(_maxBacktrackPositionLocal!); + Ldc(-1); + Beq(noBacktrackPositionBranch); + Ldloc(posLocal); + Ldloc(_maxBacktrackPositionLocal!); + Ble(noBacktrackPositionBranch); + Ldloc(_runtextposLocal!); + Ldloc(posLocal); + Bge(noBacktrackPositionBranch); + /* + int difference = pos - maxBacktrackPosition; + pos = runtextpos; + i -= difference; + maxBacktrackPosition = -1; + */ + // int difference = pos - maxBacktrackPosition; + Ldloc(posLocal); + Ldloc(_maxBacktrackPositionLocal!); + Sub(); + using (RentedLocalBuilder differenceLocal = RentInt32Local()) + { + Stloc(differenceLocal); + // pos = runtextpos; + Ldloc(_runtextposLocal!); + Stloc(posLocal); + // i -= difference; + Ldloc(iBacktrackLocal); + Ldloc(differenceLocal); + Sub(); + Stloc(iBacktrackLocal); + // maxBacktrackPosition = -1; + Ldc(-1); + Stloc(_maxBacktrackPositionLocal!); + } + } + + MarkLabel(noBacktrackPositionBranch); + Ldloc(posLocal); + Stloc(_runtextposLocal!); + Ldloc(iBacktrackLocal); + Ldc(0); + BleFar(AdvanceLabel()); + ReadyPushTrack(); + Ldloc(iBacktrackLocal); + } + Ldc(1); + Sub(); + DoPush(); ReadyPushTrack(); - Ldloc(posLocal); + Ldloc(_runtextposLocal!); + Ldc(1); + Sub(IsRightToLeft()); + DoPush(); + Trackagain(); + Advance(); } - Ldc(1); - Sub(); - DoPush(); - ReadyPushTrack(); - Ldloc(_runtextposLocal!); - Ldc(1); - Sub(IsRightToLeft()); - DoPush(); - Trackagain(); - Advance(); break; case RegexCode.Onelazy: diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 0d2134715c22b..687c792439055 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Globalization; using System.Linq; +using System.Reflection; using System.Tests; using Microsoft.DotNet.RemoteExecutor; using Xunit; @@ -619,7 +620,7 @@ public void Match_Timeout_Loop_Throws(string pattern, RegexOptions options) public void Match_Timeout_Repetition_Throws(RegexOptions options) { int repetitionCount = 800_000_000; - var regex = new Regex(@"a\s{" + repetitionCount+ "}", options, TimeSpan.FromSeconds(1)); + var regex = new Regex(@"a\s{" + repetitionCount + "}", options, TimeSpan.FromSeconds(1)); string input = @"a" + new string(' ', repetitionCount) + @"b"; Assert.Throws(() => regex.Match(input)); } @@ -1183,6 +1184,23 @@ public void Synchronized() [Fact] public void Test() + { + + // Continue from here. AFAICT the code is complete. Need to start testing it. + Debugger.Launch(); + Regex.CompileToAssembly(new[] + { + new RegexCompilationInfo("hi.*foo", RegexOptions.None, "hifooabc", "", false), + }, new AssemblyName("testregex")); + + var regex = new Regex(@"hi.*foo", RegexOptions.Compiled); + var match = regex.Match("hifooabout"); + Assert.True(match.Success); + Assert.Equal("hifoo", match.Value); + } + + [Fact] + public void TestI() { Debugger.Launch(); var regex = new Regex(".*[abc]string"); From f136bdee1cfb53ec59f9e196784758f8203278b5 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Wed, 14 Jul 2021 12:02:00 -0700 Subject: [PATCH 14/17] sq --- .../src/System/Text/RegularExpressions/RegexCompiler.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index bf3e9e6b29ea0..b0c4adfc09637 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -4279,7 +4279,7 @@ private void GenerateOneCode() Ldloc(_maxBacktrackPositionLocal!); Ldc(-1); BeqFar(_backtrack); - Ldloc(_runtextposLocal); + Ldloc(_runtextposLocal!); Ldloc(_maxBacktrackPositionLocal!); BleFar(_backtrack); // runtextpos = _maxBacktrackPosition; @@ -4661,7 +4661,7 @@ private void GenerateOneCode() using RentedLocalBuilder lenLocal = RentInt32Local(); using RentedLocalBuilder iLocal = RentInt32Local(); using RentedLocalBuilder tempMaxBacktrackPositionLocal = RentInt32Local(); - Ldloc(_runtextposLocal); + Ldloc(_runtextposLocal!); Stloc(tempMaxBacktrackPositionLocal); if (!IsRightToLeft()) From 1db6349dd49ded57e24708893cc60d8a66713a08 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Wed, 14 Jul 2021 12:02:43 -0700 Subject: [PATCH 15/17] Remove debug unit tests --- .../tests/Regex.Match.Tests.cs | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 687c792439055..fca00f4687a41 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -1181,31 +1181,5 @@ public void Synchronized() AssertExtensions.Throws("inner", () => System.Text.RegularExpressions.Match.Synchronized(null)); } - - [Fact] - public void Test() - { - - // Continue from here. AFAICT the code is complete. Need to start testing it. - Debugger.Launch(); - Regex.CompileToAssembly(new[] - { - new RegexCompilationInfo("hi.*foo", RegexOptions.None, "hifooabc", "", false), - }, new AssemblyName("testregex")); - - var regex = new Regex(@"hi.*foo", RegexOptions.Compiled); - var match = regex.Match("hifooabout"); - Assert.True(match.Success); - Assert.Equal("hifoo", match.Value); - } - - [Fact] - public void TestI() - { - Debugger.Launch(); - var regex = new Regex(".*[abc]string"); - var match1 = regex.Match("fooastring"); - var match2 = regex.Match("foostring"); - } } } From 848aff04085cfc414d8c2751d3a18cb991095c4c Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Wed, 14 Jul 2021 13:36:31 -0700 Subject: [PATCH 16/17] Add a length to the AsSpan call --- .../src/System/Text/RegularExpressions/RegexCompiler.cs | 8 +++++--- .../System/Text/RegularExpressions/RegexInterpreter.cs | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index b0c4adfc09637..8972a875b9630 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -64,7 +64,6 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!; private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!; - private static readonly MethodInfo s_stringAsSpanWithStartMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int) })!; private static readonly MethodInfo s_spanLastIndexOfMethod = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; @@ -4285,12 +4284,15 @@ private void GenerateOneCode() // runtextpos = _maxBacktrackPosition; Ldloc(_maxBacktrackPositionLocal!); Stloc(_runtextposLocal!); - // ReadOnlySpan runtextSpan = runtext.AsSpan(_maxBacktrackPosition); + // ReadOnlySpan runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktractPosition); Ldloc(_runtextLocal!); Ldloc(_maxBacktrackPositionLocal!); + Ldloc(_runtextendLocal!); + Ldloc(_maxBacktrackPositionLocal!); + Sub(); using (RentedLocalBuilder textSpanLocal = RentReadOnlySpanCharLocal()) { - Call(s_stringAsSpanWithStartMethod); + Call(s_stringAsSpanIntIntMethod); Stloc(textSpanLocal); using (RentedLocalBuilder lastIndexOf = RentInt32Local()) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 5021ea064f56f..3e05926733fe9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -229,7 +229,7 @@ private bool MatchString(string str) { // If lastIndexOf is -1, we backtrack to the max extent possible. runtextpos = _maxBacktrackPosition; - ReadOnlySpan runtextSpan = runtext.AsSpan(_maxBacktrackPosition); + ReadOnlySpan runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktrackPosition); int lastIndexOf = runtextSpan.LastIndexOf(str); if (lastIndexOf > -1) { From 69a9c1dd3a986b51c0047512b22f874bec0555a1 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 16 Jul 2021 14:57:46 -0700 Subject: [PATCH 17/17] Address RegexCompiler comments and add unit tests --- .../Text/RegularExpressions/RegexCompiler.cs | 57 +++++++------------ .../tests/Regex.Match.Tests.cs | 56 ++++++++++++++++++ 2 files changed, 78 insertions(+), 35 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 8972a875b9630..ce09848b867cf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -4272,7 +4272,7 @@ private void GenerateOneCode() Sub(); Ldc(str.Length); BgeFar(multiCode); - // if (!_caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition) + // if (!caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition) if (!IsCaseInsensitive()) { Ldloc(_maxBacktrackPositionLocal!); @@ -4290,29 +4290,24 @@ private void GenerateOneCode() Ldloc(_runtextendLocal!); Ldloc(_maxBacktrackPositionLocal!); Sub(); - using (RentedLocalBuilder textSpanLocal = RentReadOnlySpanCharLocal()) + using (RentedLocalBuilder runtextSpanLocal = RentReadOnlySpanCharLocal()) { Call(s_stringAsSpanIntIntMethod); - Stloc(textSpanLocal); - using (RentedLocalBuilder lastIndexOf = RentInt32Local()) + Stloc(runtextSpanLocal); + using (RentedLocalBuilder lastIndexOfLocal = RentInt32Local()) { - using (RentedLocalBuilder strAsSpanLocal = RentReadOnlySpanCharLocal()) - { - // int lastIndexOf = runtextSpan.LastIndexOf(str.AsSpan()); - Ldstr(str); - Call(s_stringAsSpanMethod); - Stloc(strAsSpanLocal); - Ldloc(textSpanLocal); - Ldloc(strAsSpanLocal); - Call(s_spanLastIndexOfMethod); - } - Stloc(lastIndexOf); - Ldloc(lastIndexOf); - Ldc(-1); + // int lastIndexOf = runtextSpan.LastIndexOf(str.AsSpan()); + Ldloc(runtextSpanLocal); + Ldstr(str); + Call(s_stringAsSpanMethod); + Call(s_spanLastIndexOfMethod); + Stloc(lastIndexOfLocal); // if (lastIndexOf > -1) + Ldloc(lastIndexOfLocal); + Ldc(-1); BleFar(_backtrack); - // runtextpos = _maxBacktrackPosition + lastIndexOf; - Ldloc(lastIndexOf); + // runtextpos = lastIndexOf + _maxBacktrackPosition; + Ldloc(lastIndexOfLocal); Ldloc(_maxBacktrackPositionLocal!); Add(); Stloc(_runtextposLocal!); @@ -4952,7 +4947,7 @@ private void GenerateOneCode() using (RentedLocalBuilder iBacktrackLocal = RentInt32Local()) { Stloc(iBacktrackLocal); - // if (!_caseInsensitive && maxBacktrackPosition != -1 && pos > maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft) + // if (!caseInsensitive && maxBacktrackPosition != -1 && pos > maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft) if (!IsCaseInsensitive() && _regexopcode == (RegexCode.Notoneloop | RegexCode.Back) && !IsRightToLeft()) { Ldloc(_maxBacktrackPositionLocal!); @@ -4971,24 +4966,16 @@ private void GenerateOneCode() maxBacktrackPosition = -1; */ // int difference = pos - maxBacktrackPosition; + Ldloc(iBacktrackLocal); Ldloc(posLocal); Ldloc(_maxBacktrackPositionLocal!); Sub(); - using (RentedLocalBuilder differenceLocal = RentInt32Local()) - { - Stloc(differenceLocal); - // pos = runtextpos; - Ldloc(_runtextposLocal!); - Stloc(posLocal); - // i -= difference; - Ldloc(iBacktrackLocal); - Ldloc(differenceLocal); - Sub(); - Stloc(iBacktrackLocal); - // maxBacktrackPosition = -1; - Ldc(-1); - Stloc(_maxBacktrackPositionLocal!); - } + Sub(); + Stloc(iBacktrackLocal); + Ldloc(_runtextposLocal!); + Stloc(posLocal); + Ldc(-1); + Stloc(_maxBacktrackPositionLocal!); } MarkLabel(noBacktrackPositionBranch); diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index fca00f4687a41..723fc68e53bab 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -384,6 +384,62 @@ public static IEnumerable Match_Basic_TestData() yield return new object[] { "\u05D0(?:\u05D1|\u05D2|\u05D3)", "\u05D0\u05D2", options, 0, 2, true, "\u05D0\u05D2" }; yield return new object[] { "\u05D0(?:\u05D1|\u05D2|\u05D3)", "\u05D0\u05D4", options, 0, 0, false, "" }; } + + // .* Perf Optimization: Case sensitive + foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.Compiled }) + { + yield return new object[] { @".*\nfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @"a.*\nfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @".*\nFoo", $"\nFooThis should match", options, 0, 21, true, "\nFoo" }; + yield return new object[] { @".*\nfoo", "\nfooThis should match", options, 4, 17, false, "" }; + + yield return new object[] { @".*\dfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @".*\dFoo", "This1Foo should match", options, 0, 21, true, "This1Foo" }; + yield return new object[] { @".*\dFoo", "This1foo should 2Foo match", options, 0, 26, true, "This1foo should 2Foo" }; + yield return new object[] { @".*\dFoo", "This1foo shouldn't 2foo match", options, 0, 29, false, "" }; + yield return new object[] { @".*\dfoo", "This1foo shouldn't 2foo match", options, 24, 5, false, "" }; + + yield return new object[] { @".*\dfoo", "1fooThis1foo should 1foo match", options, 4, 9, true, "This1foo" }; + yield return new object[] { @".*\dfoo", "This shouldn't match 1foo", options, 0, 20, false, "" }; + } + + // .* Perf Optimization: Case insensitive + foreach (RegexOptions options in new[] { RegexOptions.IgnoreCase, RegexOptions.IgnoreCase | RegexOptions.Compiled }) + { + yield return new object[] { @".*\nFoo", "\nfooThis should match", options, 0, 21, true, "\nfoo" }; + yield return new object[] { @".*\dFoo", "This1foo should match", options, 0, 21, true, "This1foo" }; + yield return new object[] { @".*\dFoo", "This1foo should 2FoO match", options, 0, 26, true, "This1foo should 2FoO" }; + yield return new object[] { @".*\dFoo", "This1Foo should 2fOo match", options, 0, 26, true, "This1Foo should 2fOo" }; + yield return new object[] { @".*\dfoo", "1fooThis1FOO should 1foo match", options, 4, 9, true, "This1FOO" }; + } + + // .* Perf Optimization: RTL, Case-sensitive + foreach (RegexOptions options in new[] { RegexOptions.None | RegexOptions.RightToLeft, RegexOptions.Compiled | RegexOptions.RightToLeft }) + { + yield return new object[] { @".*\nfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @"a.*\nfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @".*\nFoo", $"This should match\nFoo", options, 0, 21, true, "This should match\nFoo" }; + yield return new object[] { @".*\nfoo", "This should matchfoo\n", options, 4, 13, false, "" }; + + yield return new object[] { @".*\dfoo", "This shouldn't match", options, 0, 20, false, "" }; + yield return new object[] { @".*\dFoo", "This1Foo should match", options, 0, 21, true, "This1Foo" }; + yield return new object[] { @".*\dFoo", "This1foo should 2Foo match", options, 0, 26, true, "This1foo should 2Foo" }; + yield return new object[] { @".*\dFoo", "This1foo shouldn't 2foo match", options, 0, 29, false, "" }; + yield return new object[] { @".*\dfoo", "This1foo shouldn't 2foo match", options, 19, 0, false, "" }; + + yield return new object[] { @".*\dfoo", "1fooThis2foo should 1foo match", options, 8, 4, true, "2foo" }; + yield return new object[] { @".*\dfoo", "This shouldn't match 1foo", options, 0, 20, false, "" }; + } + + // .* Perf Optimization: RTL, case insensitive + foreach (RegexOptions options in new[] { RegexOptions.IgnoreCase | RegexOptions.RightToLeft, RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.RightToLeft }) + { + yield return new object[] { @".*\nFoo", "\nfooThis should match", options, 0, 21, true, "\nfoo" }; + yield return new object[] { @".*\dFoo", "This1foo should match", options, 0, 21, true, "This1foo" }; + yield return new object[] { @".*\dFoo", "This1foo should 2FoO match", options, 0, 26, true, "This1foo should 2FoO" }; + yield return new object[] { @".*\dFoo", "This1Foo should 2fOo match", options, 0, 26, true, "This1Foo should 2fOo" }; + yield return new object[] { @".*\dfoo", "1fooThis2FOO should 1foo match", options, 8, 4, true, "2FOO" }; + } } public static IEnumerable Match_Basic_TestData_NetCore()