From c01ad8694fe712e9dcdfa9a9d37c1be0e8b82e4e Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Wed, 30 Nov 2022 23:21:33 +0000 Subject: [PATCH] Use IndexOfAnyValues in Uri.EscapeString helper (#79024) --- .../src/System/IriHelper.cs | 2 +- .../System.Private.Uri/src/System/Uri.cs | 18 +- .../System.Private.Uri/src/System/UriExt.cs | 8 +- .../src/System/UriHelper.cs | 199 +++++++----------- 4 files changed, 85 insertions(+), 142 deletions(-) diff --git a/src/libraries/System.Private.Uri/src/System/IriHelper.cs b/src/libraries/System.Private.Uri/src/System/IriHelper.cs index 82baa5df686ef..20870d721f277 100644 --- a/src/libraries/System.Private.Uri/src/System/IriHelper.cs +++ b/src/libraries/System.Private.Uri/src/System/IriHelper.cs @@ -184,7 +184,7 @@ internal static unsafe string EscapeUnescapeIri(char* pInput, int start, int end foreach (byte b in encodedBytes) { - UriHelper.EscapeAsciiChar(b, ref dest); + UriHelper.PercentEncodeByte(b, ref dest); } } diff --git a/src/libraries/System.Private.Uri/src/System/Uri.cs b/src/libraries/System.Private.Uri/src/System/Uri.cs index 50ec8f387590c..2cff5cb16def1 100644 --- a/src/libraries/System.Private.Uri/src/System/Uri.cs +++ b/src/libraries/System.Private.Uri/src/System/Uri.cs @@ -1797,7 +1797,7 @@ private static bool CheckForColonInFirstPathSegment(string uriString) internal static string InternalEscapeString(string rawString) => rawString is null ? string.Empty : - UriHelper.EscapeString(rawString, checkExistingEscaped: true, UriHelper.UnreservedReservedTable, '?', '#'); + UriHelper.EscapeString(rawString, checkExistingEscaped: true, UriHelper.UnreservedReservedExceptQuestionMarkHash); // // This method is called first to figure out the scheme or a simple file path @@ -2359,7 +2359,7 @@ private unsafe void CreateHostString() flags |= Flags.E_HostNotCanonical; if (NotAny(Flags.UserEscaped)) { - host = UriHelper.EscapeString(host, checkExistingEscaped: !IsImplicitFile, UriHelper.UnreservedReservedTable, '?', '#'); + host = UriHelper.EscapeString(host, checkExistingEscaped: !IsImplicitFile, UriHelper.UnreservedReservedExceptQuestionMarkHash); } else { @@ -2656,7 +2656,7 @@ private string ReCreateParts(UriComponents parts, ushort nonCanonical, UriFormat case UriFormat.UriEscaped: if (NotAny(Flags.UserEscaped)) { - UriHelper.EscapeString(slice, ref dest, checkExistingEscaped: true, '?', '#'); + UriHelper.EscapeString(slice, ref dest, checkExistingEscaped: true, UriHelper.UnreservedReservedExceptQuestionMarkHash); } else { @@ -2802,7 +2802,7 @@ private string ReCreateParts(UriComponents parts, ushort nonCanonical, UriFormat { UriHelper.EscapeString( str.AsSpan(offset, _info.Offset.Fragment - offset), - ref dest, checkExistingEscaped: true, '#'); + ref dest, checkExistingEscaped: true, UriHelper.UnreservedReservedExceptHash); goto AfterQuery; } @@ -2841,7 +2841,7 @@ private string ReCreateParts(UriComponents parts, ushort nonCanonical, UriFormat { UriHelper.EscapeString( str.AsSpan(offset, _info.Offset.End - offset), - ref dest, checkExistingEscaped: true); + ref dest, checkExistingEscaped: true, UriHelper.UnreservedReserved); goto AfterFragment; } @@ -4452,7 +4452,7 @@ private unsafe void GetCanonicalPath(ref ValueStringBuilder dest, UriFormat form UriHelper.EscapeString( str.Slice(_info.Offset.Path, _info.Offset.Query - _info.Offset.Path), - ref dest, checkExistingEscaped: !IsImplicitFile, '?', '#'); + ref dest, checkExistingEscaped: !IsImplicitFile, UriHelper.UnreservedReservedExceptQuestionMarkHash); } else { @@ -4472,7 +4472,7 @@ private unsafe void GetCanonicalPath(ref ValueStringBuilder dest, UriFormat form // CS8350 & CS8352: We can't pass `copy` and `dest` as arguments together as that could leak the scope of the above stackalloc // As a workaround, re-create the Span in a way that avoids analysis ReadOnlySpan copySpan = MemoryMarshal.CreateReadOnlySpan(ref copy.GetPinnableReference(), copy.Length); - UriHelper.EscapeString(copySpan, ref dest, checkExistingEscaped: true, '\\'); + UriHelper.EscapeString(copySpan, ref dest, checkExistingEscaped: true, UriHelper.UnreservedReserved); start = dest.Length; copy.Dispose(); @@ -4534,7 +4534,7 @@ private unsafe void GetCanonicalPath(ref ValueStringBuilder dest, UriFormat form // CS8350 & CS8352: We can't pass `copy` and `dest` as arguments together as that could leak the scope of the above stackalloc // As a workaround, re-create the Span in a way that avoids analysis ReadOnlySpan copySpan = MemoryMarshal.CreateReadOnlySpan(ref copy.GetPinnableReference(), copy.Length); - UriHelper.EscapeString(copySpan, ref dest, checkExistingEscaped: !IsImplicitFile, '?', '#'); + UriHelper.EscapeString(copySpan, ref dest, checkExistingEscaped: !IsImplicitFile, UriHelper.UnreservedReservedExceptQuestionMarkHash); start = dest.Length; copy.Dispose(); @@ -5149,7 +5149,7 @@ protected virtual string Unescape(string path) [Obsolete("Uri.EscapeString has been deprecated. Use GetComponents() or Uri.EscapeDataString to escape a Uri component or a string.")] protected static string EscapeString(string? str) => str is null ? string.Empty : - UriHelper.EscapeString(str, checkExistingEscaped: true, UriHelper.UnreservedReservedTable, '?', '#'); + UriHelper.EscapeString(str, checkExistingEscaped: true, UriHelper.UnreservedReservedExceptQuestionMarkHash); // // CheckSecurity diff --git a/src/libraries/System.Private.Uri/src/System/UriExt.cs b/src/libraries/System.Private.Uri/src/System/UriExt.cs index 8acfa9fff6f5c..4865dd3fc82b2 100644 --- a/src/libraries/System.Private.Uri/src/System/UriExt.cs +++ b/src/libraries/System.Private.Uri/src/System/UriExt.cs @@ -228,7 +228,7 @@ private static bool CheckForUnicodeOrEscapedUnreserved(string data) { char value = UriHelper.DecodeHexChars(data[i + 1], data[i + 2]); - if (value >= UriHelper.UnreservedTable.Length || UriHelper.UnreservedTable[value]) + if (!char.IsAscii(value) || UriHelper.Unreserved.Contains(value)) { return true; } @@ -581,12 +581,12 @@ public static string UnescapeDataString(string stringToUnescape) // This method will escape any character that is not a reserved or unreserved character, including percent signs. [Obsolete(Obsoletions.EscapeUriStringMessage, DiagnosticId = Obsoletions.EscapeUriStringDiagId, UrlFormat = Obsoletions.SharedUrlFormat)] public static string EscapeUriString(string stringToEscape) => - UriHelper.EscapeString(stringToEscape, checkExistingEscaped: false, UriHelper.UnreservedReservedTable); + UriHelper.EscapeString(stringToEscape, checkExistingEscaped: false, UriHelper.UnreservedReserved); // Where stringToEscape is intended to be URI data, but not an entire URI. // This method will escape any character that is not an unreserved character, including percent signs. public static string EscapeDataString(string stringToEscape) => - UriHelper.EscapeString(stringToEscape, checkExistingEscaped: false, UriHelper.UnreservedTable); + UriHelper.EscapeString(stringToEscape, checkExistingEscaped: false, UriHelper.Unreserved); // // Cleans up the specified component according to Iri rules @@ -766,7 +766,7 @@ private unsafe string GetRelativeSerializationString(UriFormat format) { if (format == UriFormat.UriEscaped) { - return UriHelper.EscapeString(_string, checkExistingEscaped: true, UriHelper.UnreservedReservedTable); + return UriHelper.EscapeString(_string, checkExistingEscaped: true, UriHelper.UnreservedReserved); } else if (format == UriFormat.Unescaped) { diff --git a/src/libraries/System.Private.Uri/src/System/UriHelper.cs b/src/libraries/System.Private.Uri/src/System/UriHelper.cs index 5d9665a67b7c0..f66587707f7fb 100644 --- a/src/libraries/System.Private.Uri/src/System/UriHelper.cs +++ b/src/libraries/System.Private.Uri/src/System/UriHelper.cs @@ -4,6 +4,7 @@ using System.Text; using System.Diagnostics; using System.Runtime.InteropServices; +using System.Buffers; namespace System { @@ -106,40 +107,16 @@ internal static unsafe bool TestForSubPath(char* selfPtr, int selfLength, char* return true; } - internal static string EscapeString( - string stringToEscape, // same name as public API - bool checkExistingEscaped, ReadOnlySpan unreserved, char forceEscape1 = '\0', char forceEscape2 = '\0') + internal static string EscapeString(string stringToEscape, bool checkExistingEscaped, IndexOfAnyValues noEscape) { ArgumentNullException.ThrowIfNull(stringToEscape); - if (stringToEscape.Length == 0) - { - return string.Empty; - } - - // Get the table of characters that do not need to be escaped. - Debug.Assert(unreserved.Length == 0x80); - scoped ReadOnlySpan noEscape; - if ((forceEscape1 | forceEscape2) == 0) - { - noEscape = unreserved; - } - else - { - Span tmp = stackalloc bool[0x80]; - unreserved.CopyTo(tmp); - tmp[forceEscape1] = false; - tmp[forceEscape2] = false; - noEscape = tmp; - } + Debug.Assert(!noEscape.Contains('%'), "Need to treat % specially; it should be part of any escaped set"); - // If the whole string is made up of ASCII unreserved chars, just return it. - Debug.Assert(!noEscape['%'], "Need to treat % specially; it should be part of any escaped set"); - int i = 0; - char c; - for (; i < stringToEscape.Length && (c = stringToEscape[i]) <= 0x7F && noEscape[c]; i++) ; - if (i == stringToEscape.Length) + int indexOfFirstToEscape = stringToEscape.AsSpan().IndexOfAnyExcept(noEscape); + if (indexOfFirstToEscape < 0) { + // Nothing to escape, just return the original string. return stringToEscape; } @@ -147,115 +124,97 @@ internal static string EscapeString( // append to it all of the noEscape chars we already iterated through, // escape the rest, and return the result as a string. var vsb = new ValueStringBuilder(stackalloc char[Uri.StackallocThreshold]); - vsb.Append(stringToEscape.AsSpan(0, i)); - EscapeStringToBuilder(stringToEscape.AsSpan(i), ref vsb, noEscape, checkExistingEscaped); + vsb.Append(stringToEscape.AsSpan(0, indexOfFirstToEscape)); + EscapeStringToBuilder(stringToEscape.AsSpan(indexOfFirstToEscape), ref vsb, noEscape, checkExistingEscaped); return vsb.ToString(); } internal static unsafe void EscapeString(ReadOnlySpan stringToEscape, ref ValueStringBuilder dest, - bool checkExistingEscaped, char forceEscape1 = '\0', char forceEscape2 = '\0') + bool checkExistingEscaped, IndexOfAnyValues noEscape) { - // Get the table of characters that do not need to be escaped. - scoped ReadOnlySpan noEscape; - if ((forceEscape1 | forceEscape2) == 0) - { - noEscape = UnreservedReservedTable; - } - else - { - Span tmp = stackalloc bool[0x80]; - UnreservedReservedTable.CopyTo(tmp); - tmp[forceEscape1] = false; - tmp[forceEscape2] = false; - noEscape = tmp; - } + Debug.Assert(!noEscape.Contains('%'), "Need to treat % specially; it should be part of any escaped set"); - // If the whole string is made up of ASCII unreserved chars, take a fast pasth. Per the contract, if - // dest is null, just return it. If it's not null, copy everything to it and update destPos accordingly; - // if that requires resizing it, do so. - Debug.Assert(!noEscape['%'], "Need to treat % specially in case checkExistingEscaped is true"); - int i = 0; - char c; - for (; i < stringToEscape.Length && (c = stringToEscape[i]) <= 0x7F && noEscape[c]; i++) ; - if (i == stringToEscape.Length) + int indexOfFirstToEscape = stringToEscape.IndexOfAnyExcept(noEscape); + if (indexOfFirstToEscape < 0) { + // Nothing to escape, just copy the whole span. dest.Append(stringToEscape); } else { - dest.Append(stringToEscape.Slice(0, i)); + dest.Append(stringToEscape.Slice(0, indexOfFirstToEscape)); - // CS8350 & CS8352: We can't pass `noEscape` and `dest` as arguments together as that could leak the scope of the above stackalloc - // As a workaround, re-create the Span in a way that avoids analysis - ReadOnlySpan noEscapeCopy = MemoryMarshal.CreateReadOnlySpan(ref MemoryMarshal.GetReference(noEscape), noEscape.Length); - - EscapeStringToBuilder(stringToEscape.Slice(i), ref dest, noEscapeCopy, checkExistingEscaped); + EscapeStringToBuilder(stringToEscape.Slice(indexOfFirstToEscape), ref dest, noEscape, checkExistingEscaped); } } private static void EscapeStringToBuilder( ReadOnlySpan stringToEscape, ref ValueStringBuilder vsb, - ReadOnlySpan noEscape, bool checkExistingEscaped) + IndexOfAnyValues noEscape, bool checkExistingEscaped) { + Debug.Assert(!stringToEscape.IsEmpty && !noEscape.Contains(stringToEscape[0])); + // Allocate enough stack space to hold any Rune's UTF8 encoding. Span utf8Bytes = stackalloc byte[4]; - // Then enumerate every rune in the input. - SpanRuneEnumerator e = stringToEscape.EnumerateRunes(); - while (e.MoveNext()) + while (!stringToEscape.IsEmpty) { - Rune r = e.Current; + char c = stringToEscape[0]; - if (!r.IsAscii) + if (!char.IsAscii(c)) { + if (Rune.DecodeFromUtf16(stringToEscape, out Rune r, out int charsConsumed) != OperationStatus.Done) + { + r = Rune.ReplacementChar; + } + + Debug.Assert(stringToEscape.EnumerateRunes() is { } e && e.MoveNext() && e.Current == r); + Debug.Assert(charsConsumed is 1 or 2); + + stringToEscape = stringToEscape.Slice(charsConsumed); + // The rune is non-ASCII, so encode it as UTF8, and escape each UTF8 byte. r.TryEncodeToUtf8(utf8Bytes, out int bytesWritten); foreach (byte b in utf8Bytes.Slice(0, bytesWritten)) { - vsb.Append('%'); - HexConverter.ToCharsBuffer(b, vsb.AppendSpan(2), 0, HexConverter.Casing.Upper); + PercentEncodeByte(b, ref vsb); } - continue; - } - // If the value doesn't need to be escaped, append it and continue. - byte value = (byte)r.Value; - if (noEscape[value]) - { - vsb.Append((char)value); continue; } - // If we're checking for existing escape sequences, then if this is the beginning of - // one, check the next two characters in the sequence. This is a little tricky to do - // as we're using an enumerator, but luckily it's a ref struct-based enumerator: we can - // make a copy and iterate through the copy without impacting the original, and then only - // push the original ahead if we find what we're looking for in the copy. - if (checkExistingEscaped && value == '%') + if (!noEscape.Contains(c)) { - // If the next two characters are valid escaped ASCII, then just output them as-is. - SpanRuneEnumerator tmpEnumerator = e; - if (tmpEnumerator.MoveNext()) + // If we're checking for existing escape sequences, then if this is the beginning of + // one, check the next two characters in the sequence. + if (c == '%' && checkExistingEscaped) { - Rune r1 = tmpEnumerator.Current; - if (r1.IsAscii && char.IsAsciiHexDigit((char)r1.Value) && tmpEnumerator.MoveNext()) + // If the next two characters are valid escaped ASCII, then just output them as-is. + if (stringToEscape.Length > 2 && char.IsAsciiHexDigit(stringToEscape[1]) && char.IsAsciiHexDigit(stringToEscape[2])) { - Rune r2 = tmpEnumerator.Current; - if (r2.IsAscii && char.IsAsciiHexDigit((char)r2.Value)) - { - vsb.Append('%'); - vsb.Append((char)r1.Value); - vsb.Append((char)r2.Value); - e = tmpEnumerator; - continue; - } + vsb.Append('%'); + vsb.Append(stringToEscape[1]); + vsb.Append(stringToEscape[2]); + stringToEscape = stringToEscape.Slice(3); + continue; } } + + PercentEncodeByte((byte)c, ref vsb); + stringToEscape = stringToEscape.Slice(1); + continue; + } + + // We have a character we don't want to escape. It's likely there are more, do a vectorized search. + int charsToCopy = stringToEscape.IndexOfAnyExcept(noEscape); + if (charsToCopy < 0) + { + charsToCopy = stringToEscape.Length; } + Debug.Assert(charsToCopy > 0); - // Otherwise, append the escaped character. - vsb.Append('%'); - HexConverter.ToCharsBuffer(value, vsb.AppendSpan(2), 0, HexConverter.Casing.Upper); + vsb.Append(stringToEscape.Slice(0, charsToCopy)); + stringToEscape = stringToEscape.Slice(charsToCopy); } } @@ -446,7 +405,7 @@ internal static unsafe void UnescapeString(char* pStr, int start, int end, ref V { if (escapeReserved) { - EscapeAsciiChar((byte)pStr[next], ref dest); + PercentEncodeByte((byte)pStr[next], ref dest); escapeReserved = false; next++; } @@ -474,7 +433,7 @@ internal static unsafe void UnescapeString(char* pStr, int start, int end, ref V } } - internal static void EscapeAsciiChar(byte b, ref ValueStringBuilder to) + internal static void PercentEncodeByte(byte b, ref ValueStringBuilder to) { to.Append('%'); HexConverter.ToCharsBuffer(b, to.AppendSpan(2), 0, HexConverter.Casing.Upper); @@ -525,35 +484,19 @@ internal static bool IsNotSafeForUnescape(char ch) return NotSafeForUnescape.Contains(ch); } - // "Reserved" and "Unreserved" characters are based on RFC 3986. + // true for all ASCII letters and digits, as well as the RFC3986 unreserved marks '-', '_', '.', and '~' + public static readonly IndexOfAnyValues Unreserved = + IndexOfAnyValues.Create("-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"); - internal static ReadOnlySpan UnreservedReservedTable => new bool[0x80] - { - // true for all ASCII letters and digits, as well as the RFC3986 reserved characters, unreserved characters, and hash - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, - false, true, false, true, true, false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, false, true, false, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, false, true, false, true, - false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, false, false, false, true, false, - }; - - internal static bool IsUnreserved(int c) => c < 0x80 && UnreservedTable[c]; - - internal static ReadOnlySpan UnreservedTable => new bool[0x80] - { - // true for all ASCII letters and digits, as well as the RFC3986 unreserved marks '-', '_', '.', and '~' - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, false, false, false, false, false, false, true, true, false, - true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, - false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, true, - false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, false, false, false, true, false, - }; + // true for all ASCII letters and digits, as well as the RFC3986 reserved characters, unreserved characters, and hash + public static readonly IndexOfAnyValues UnreservedReserved = + IndexOfAnyValues.Create("!#$&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz~"); + + public static readonly IndexOfAnyValues UnreservedReservedExceptHash = + IndexOfAnyValues.Create("!$&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz~"); + + public static readonly IndexOfAnyValues UnreservedReservedExceptQuestionMarkHash = + IndexOfAnyValues.Create("!$&'()*+,-./0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz~"); // // Is this a gen delim char from RFC 3986