Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Invariant Mode Case Mapping #55520

Merged
merged 10 commits into from
Jul 15, 2021
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ public CategoryCasingInfo(CodePoint codePoint)
break;
}

if (Program.IncludeCasingData)
// For compatibility reasons we are not mapping the Turkish I's nor Latin small letter long S with invariant casing.
if (Program.IncludeCasingData && codePoint.Value != 0x0130 && codePoint.Value != 0x0131 && codePoint.Value != 0x017f)
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
{
_data.offsetToSimpleUppercase = (ushort)(codePoint.SimpleUppercaseMapping - codePoint.Value);
_data.offsetToSimpleLowercase = (ushort)(codePoint.SimpleLowercaseMapping - codePoint.Value);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ namespace GenUnicodeProp
{
internal static class Program
{
internal static bool Verbose = false;
internal static bool IncludeCasingData = false;
internal static bool Verbose;
internal static bool IncludeCasingData;

private const string SOURCE_NAME = "CharUnicodeInfoData.cs";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ internal CodePoint(int value, ParsedUnicodeData parsedData)
/// <remarks>
/// See https://www.unicode.org/reports/tr44/#PropList.txt.
/// </remarks>
public CodePointFlags Flags { get; } = default; // default is "no flags"
public CodePointFlags Flags { get; } // default is "no flags"

/// <summary>
/// The general Unicode category of this code point.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ public static IEnumerable<object[]> IndexOf_TestData()
yield return new object[] { "Hello", "L", 0, 5, CompareOptions.OrdinalIgnoreCase, 2 };
yield return new object[] { "Hello", "h", 0, 5, CompareOptions.OrdinalIgnoreCase, 0 };

yield return new object[] { "Hello\u00D3\u00D4", "\u00F3\u00F4", 0, 7, CompareOptions.OrdinalIgnoreCase, 5 };
yield return new object[] { "Hello\u00D3\u00D4", "\u00F3\u00F5", 0, 7, CompareOptions.OrdinalIgnoreCase, -1 };

yield return new object[] { "Hello\U00010400", "\U00010428", 0, 7, CompareOptions.OrdinalIgnoreCase, 5 };


// Long strings
yield return new object[] { new string('b', 100) + new string('a', 5555), "aaaaaaaaaaaaaaa", 0, 5655, CompareOptions.None, 100 };
yield return new object[] { new string('b', 101) + new string('a', 5555), new string('a', 5000), 0, 5656, CompareOptions.None, 101 };
Expand Down Expand Up @@ -159,6 +165,12 @@ public static IEnumerable<object[]> LastIndexOf_TestData()
yield return new object[] { "Hello", "L", 4, 5, CompareOptions.OrdinalIgnoreCase, 3 };
yield return new object[] { "Hello", "h", 4, 5, CompareOptions.OrdinalIgnoreCase, 0 };


yield return new object[] { "Hello\u00D3\u00D4\u00D3\u00D4", "\u00F3\u00F4", 8, 9, CompareOptions.OrdinalIgnoreCase, 7 };
yield return new object[] { "Hello\u00D3\u00D4\u00D3\u00D4", "\u00F3\u00F5", 8, 9, CompareOptions.OrdinalIgnoreCase, -1 };

yield return new object[] { "Hello\U00010400\U00010400", "\U00010428", 8, 9, CompareOptions.OrdinalIgnoreCase, 7 };

// Long strings
yield return new object[] { new string('a', 5555) + new string('b', 100), "aaaaaaaaaaaaaaa", 5654, 5655, CompareOptions.None, 5540 };
yield return new object[] { new string('b', 101) + new string('a', 5555), new string('a', 5000), 5655, 5656, CompareOptions.None, 656 };
Expand Down Expand Up @@ -237,6 +249,10 @@ public static IEnumerable<object[]> IsPrefix_TestData()
yield return new object[] { "FooBar", "Foo\u0400Bar", CompareOptions.Ordinal, false };
yield return new object[] { "FooBA\u0300R", "FooB\u00C0R", CompareOptions.IgnoreNonSpace, false };

yield return new object[] { "\u00D3\u00D4\u00D3\u00D4Hello", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, true };
yield return new object[] { "\u00D3\u00D4Hello\u00D3\u00D4", "\u00F3\u00F5", CompareOptions.OrdinalIgnoreCase, false };
yield return new object[] { "\U00010400\U00010400Hello", "\U00010428", CompareOptions.OrdinalIgnoreCase, true };

// Ignore symbols
yield return new object[] { "Test's can be interesting", "Tests", CompareOptions.IgnoreSymbols, false };
yield return new object[] { "Test's can be interesting", "Tests", CompareOptions.None, false };
Expand Down Expand Up @@ -277,6 +293,11 @@ public static IEnumerable<object[]> IsSuffix_TestData()
yield return new object[] { "FooBar", "Foo\u0400Bar", CompareOptions.Ordinal, false };
yield return new object[] { "FooBA\u0300R", "FooB\u00C0R", CompareOptions.IgnoreNonSpace, false };

yield return new object[] { "\u00D3\u00D4\u00D3\u00D4Hello", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, false };
yield return new object[] { "\u00D3\u00D4Hello\u00D3\u00D4", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, true };
yield return new object[] { "\U00010400\U00010400Hello", "\U00010428", CompareOptions.OrdinalIgnoreCase, false };
yield return new object[] { "Hello\U00010400", "\U00010428", CompareOptions.OrdinalIgnoreCase, true };

// Weightless characters
yield return new object[] { "", "\u200d", CompareOptions.None, false };
yield return new object[] { "", "\u200d", CompareOptions.IgnoreCase, false };
Expand Down Expand Up @@ -327,6 +348,21 @@ public static IEnumerable<object[]> Compare_TestData()

yield return new object[] { "", "'", CompareOptions.None, -1 };

yield return new object[] { "\u00D3\u00D4", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, 0 };
yield return new object[] { "\U00010400", "\U00010428", CompareOptions.OrdinalIgnoreCase, 0 };
yield return new object[] { "\u00D3\u00D4", "\u00F3\u00F4", CompareOptions.IgnoreCase, 0 };
yield return new object[] { "\U00010400", "\U00010428", CompareOptions.IgnoreCase, 0 };

yield return new object[] { "\u00D3\u00D4G", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, 1 };
yield return new object[] { "\U00010400G", "\U00010428", CompareOptions.OrdinalIgnoreCase, 1 };
yield return new object[] { "\u00D3\u00D4G", "\u00F3\u00F4", CompareOptions.IgnoreCase, 1 };
yield return new object[] { "\U00010400G", "\U00010428", CompareOptions.IgnoreCase, 1 };

yield return new object[] { "\u00D3\u00D4", "\u00F3\u00F4G", CompareOptions.OrdinalIgnoreCase, -1 };
yield return new object[] { "\U00010400", "\U00010428G", CompareOptions.OrdinalIgnoreCase, -1 };
yield return new object[] { "\u00D3\u00D4", "\u00F3\u00F4G", CompareOptions.IgnoreCase, -1 };
yield return new object[] { "\U00010400", "\U00010428G", CompareOptions.IgnoreCase, -1 };

// Hungarian
yield return new object[] { "dzsdzs", "ddzs", CompareOptions.Ordinal, 1 };
yield return new object[] { "dzsdzs", "ddzs", CompareOptions.None, 1 };
Expand All @@ -349,6 +385,14 @@ public static IEnumerable<object[]> Compare_TestData()
yield return new object[] { "llegar", "lugar", CompareOptions.None, -1 };

yield return new object[] { "\u3042", "\u30A1", CompareOptions.IgnoreKanaType | CompareOptions.IgnoreWidth | CompareOptions.IgnoreCase, -1 };

// Surrogates

yield return new object[] { "Hello\uFE6A", "Hello\U0001F601", CompareOptions.IgnoreCase, -1 };
yield return new object[] { "Hello\U0001F601", "Hello\uFE6A", CompareOptions.IgnoreCase, 1 };
yield return new object[] { "\uDBFF", "\uD800\uDC00", CompareOptions.IgnoreCase, -1 };
yield return new object[] { "\uD800\uDC00", "\uDBFF", CompareOptions.IgnoreCase, 1 };
yield return new object[] { "abcdefg\uDBFF", "abcdefg\uD800\uDC00", CompareOptions.IgnoreCase, -1 };
}

public static IEnumerable<object[]> ToLower_TestData()
Expand All @@ -375,21 +419,24 @@ public static IEnumerable<object[]> ToLower_TestData()
yield return new object[] { "EMBEDDED\0NuLL\0Byte\0", "embedded\0null\0byte\0", true };

// LATIN CAPITAL LETTER O WITH ACUTE, which has a lower case variant.
yield return new object[] { "\u00D3", "\u00F3", false };
yield return new object[] { "\u00D3", "\u00F3", true };

// SNOWMAN, which does not have a lower case variant.
yield return new object[] { "\u2603", "\u2603", true };

// RAINBOW (outside the BMP and does not case)
yield return new object[] { "\U0001F308", "\U0001F308", true };

// Surrogate casing
yield return new object[] { "\U00010400", "\U00010428", true };

// Unicode defines some codepoints which expand into multiple codepoints
// when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
// these sorts of expansions, since it would cause string lengths to change when cased,
// which is non-intuitive. In addition, there are some context sensitive mappings which
// we also don't perform.
// Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule).
yield return new object[] { "\u03A3", "\u03C3", false };
yield return new object[] { "\u03A3", "\u03C3", true };
}

public static IEnumerable<object[]> ToUpper_TestData()
Expand All @@ -415,15 +462,18 @@ public static IEnumerable<object[]> ToUpper_TestData()

yield return new object[] { "embedded\0NuLL\0Byte\0", "EMBEDDED\0NULL\0BYTE\0", true };

// LATIN SMALL LETTER O WITH ACUTE, which has an upper case variant.
yield return new object[] { "\u00F3", "\u00D3", false };
// LATIN SMALL LETTER O WITH ACUTE, mapped to LATIN CAPITAL LETTER O WITH ACUTE.
yield return new object[] { "\u00F3", "\u00D3", true };

// SNOWMAN, which does not have an upper case variant.
yield return new object[] { "\u2603", "\u2603", true };

// RAINBOW (outside the BMP and does not case)
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
yield return new object[] { "\U0001F308", "\U0001F308", true };

// Surrogate casing
yield return new object[] { "\U00010428", "\U00010400", true };

// Unicode defines some codepoints which expand into multiple codepoints
// when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
// these sorts of expansions, since it would cause string lengths to change when cased,
Expand All @@ -439,7 +489,7 @@ public static IEnumerable<object[]> ToUpper_TestData()
// as part of casing.
yield return new object[] { "\u0149", "\u0149", true };

yield return new object[] { "\u03C3", "\u03A3", false };
yield return new object[] { "\u03C3", "\u03A3", true };
}

public static IEnumerable<object[]> GetAscii_TestData()
Expand Down Expand Up @@ -722,7 +772,7 @@ public unsafe void TestGetSortKeyLength_OverlongArgument(int inputLength)
[InlineData("Hello", CompareOptions.IgnoreCase, "HELLO")]
[InlineData("Hello", CompareOptions.IgnoreCase | CompareOptions.IgnoreWidth, "HELLO")]
[InlineData("Hell\u00F6", CompareOptions.None, "Hell\u00F6")] // U+00F6 = LATIN SMALL LETTER O WITH DIAERESIS
[InlineData("Hell\u00F6", CompareOptions.IgnoreCase, "HELL\u00F6")] // note the final "o with diaeresis" isn't capitalized
[InlineData("Hell\u00F6", CompareOptions.IgnoreCase, "HELL\u00D6")]
public unsafe void TestSortKey_FromSpan(string input, CompareOptions options, string expected)
{
byte[] expectedOutputBytes = GetExpectedInvariantOrdinalSortKey(expected);
Expand Down Expand Up @@ -1125,8 +1175,9 @@ public void TestHashing()
[InlineData('A', 'A', 'a')]
[InlineData('i', 'I', 'i')] // to verify that we don't special-case the Turkish I in the invariant globalization mode
[InlineData('I', 'I', 'i')]
[InlineData(0x00C1, 0x00C1, 0x00C1)] // U+00C1 LATIN CAPITAL LETTER A WITH ACUTE
[InlineData(0x00E1, 0x00E1, 0x00E1)] // U+00E1 LATIN SMALL LETTER A WITH ACUTE
[InlineData('\u017f', '\u017f', '\u017f')] // Latin small letter long S shouldn't be case mapped in the invariant mode.
[InlineData(0x00C1, 0x00C1, 0x00E1)] // U+00C1 LATIN CAPITAL LETTER A WITH ACUTE
[InlineData(0x00E1, 0x00C1, 0x00E1)] // U+00E1 LATIN SMALL LETTER A WITH ACUTE
[InlineData(0x00D7, 0x00D7, 0x00D7)] // U+00D7 MULTIPLICATION SIGN
public void TestRune(int original, int expectedToUpper, int expectedToLower)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\IdnMapping.Icu.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\IdnMapping.Nls.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\InternalGlobalizationHelper.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\InvariantModeCasing.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\ISOWeek.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\JapaneseCalendar.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\JapaneseCalendar.Icu.cs" />
Expand All @@ -363,6 +364,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\SortVersion.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\StringInfo.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\StrongBidiCategory.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\SurrogateCasing.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\TaiwanCalendar.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\TaiwanLunisolarCalendar.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\TextElementEnumerator.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,72 @@ private static double GetNumericValueNoBoundsCheck(uint codePoint)
}
}

/// <summary>
/// Maps a UTF-16 code unit through the <c>UppercaseValues</c> delta table.
/// </summary>
/// <param name="codePoint">The character whose table entry is looked up.</param>
/// <returns>
/// <paramref name="codePoint"/> plus the signed 16-bit delta stored for it, truncated to <see cref="char"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static char ToUpper(char codePoint)
{
    // The returned index counts 16-bit elements, not bytes.
    nuint entryIndex = GetCategoryCasingTableOffsetNoBoundsChecks((uint)codePoint);

    // Reinterpret the byte table as a run of shorts and read the entry for this character.
    ref short tableStart = ref Unsafe.As<byte, short>(ref MemoryMarshal.GetReference(UppercaseValues));
    short addend = Unsafe.Add(ref tableStart, (nint)entryIndex);

    // The stored value is used as-is on little-endian hosts and byte-swapped everywhere else.
    if (!BitConverter.IsLittleEndian)
    {
        addend = BinaryPrimitives.ReverseEndianness(addend);
    }

    int delta = addend;
    return (char)(delta + codePoint);
}

/// <summary>
/// Maps a Unicode scalar value through the <c>UppercaseValues</c> delta table.
/// </summary>
/// <param name="codePoint">The code point whose table entry is looked up.</param>
/// <returns><paramref name="codePoint"/> plus the signed 16-bit delta stored for it.</returns>
/// <remarks>
/// When <c>UnicodeUtility.IsValidCodePoint</c> rejects the input,
/// <c>ThrowHelper.ThrowArgumentOutOfRangeException</c> is invoked before any table access.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static uint ToUpper(uint codePoint)
{
    bool isValid = UnicodeUtility.IsValidCodePoint(codePoint);
    if (!isValid)
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.codePoint);
    }

    // The returned index counts 16-bit elements, not bytes.
    nuint entryIndex = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint);

    // Reinterpret the byte table as a run of shorts and read the entry for this code point.
    ref short tableStart = ref Unsafe.As<byte, short>(ref MemoryMarshal.GetReference(UppercaseValues));
    short addend = Unsafe.Add(ref tableStart, (nint)entryIndex);

    // The stored value is used as-is on little-endian hosts and byte-swapped everywhere else.
    if (!BitConverter.IsLittleEndian)
    {
        addend = BinaryPrimitives.ReverseEndianness(addend);
    }

    // Widen to int first so the signed delta is sign-extended before the unsigned add.
    int delta = addend;
    return (uint)delta + codePoint;
}

/// <summary>
/// Maps a UTF-16 code unit through the <c>LowercaseValues</c> delta table.
/// </summary>
/// <param name="codePoint">The character whose table entry is looked up.</param>
/// <returns>
/// <paramref name="codePoint"/> plus the signed 16-bit delta stored for it, truncated to <see cref="char"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static char ToLower(char codePoint)
{
    // The returned index counts 16-bit elements, not bytes.
    nuint entryIndex = GetCategoryCasingTableOffsetNoBoundsChecks((uint)codePoint);

    // Reinterpret the byte table as a run of shorts and read the entry for this character.
    ref short tableStart = ref Unsafe.As<byte, short>(ref MemoryMarshal.GetReference(LowercaseValues));
    short addend = Unsafe.Add(ref tableStart, (nint)entryIndex);

    // The stored value is used as-is on little-endian hosts and byte-swapped everywhere else.
    if (!BitConverter.IsLittleEndian)
    {
        addend = BinaryPrimitives.ReverseEndianness(addend);
    }

    int delta = addend;
    return (char)(delta + codePoint);
}

/// <summary>
/// Maps a Unicode scalar value through the <c>LowercaseValues</c> delta table.
/// </summary>
/// <param name="codePoint">The code point whose table entry is looked up.</param>
/// <returns><paramref name="codePoint"/> plus the signed 16-bit delta stored for it.</returns>
/// <remarks>
/// When <c>UnicodeUtility.IsValidCodePoint</c> rejects the input,
/// <c>ThrowHelper.ThrowArgumentOutOfRangeException</c> is invoked before any table access.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static uint ToLower(uint codePoint)
{
    bool isValid = UnicodeUtility.IsValidCodePoint(codePoint);
    if (!isValid)
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.codePoint);
    }

    // The returned index counts 16-bit elements, not bytes.
    nuint entryIndex = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint);

    // Reinterpret the byte table as a run of shorts and read the entry for this code point.
    ref short tableStart = ref Unsafe.As<byte, short>(ref MemoryMarshal.GetReference(LowercaseValues));
    short addend = Unsafe.Add(ref tableStart, (nint)entryIndex);

    // The stored value is used as-is on little-endian hosts and byte-swapped everywhere else.
    if (!BitConverter.IsLittleEndian)
    {
        addend = BinaryPrimitives.ReverseEndianness(addend);
    }

    // Widen to int first so the signed delta is sign-extended before the unsigned add.
    int delta = addend;
    return (uint)delta + codePoint;
}

/*
* GetUnicodeCategory
* ==================
Expand Down
Loading