Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid allocations for Unicode data tries #15074

Merged
merged 1 commit into from
Mar 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,595 changes: 1,080 additions & 515 deletions src/Avalonia.Base/Media/TextFormatting/Unicode/BiDi.trie.cs

Large diffs are not rendered by default.

1,271 changes: 861 additions & 410 deletions src/Avalonia.Base/Media/TextFormatting/Unicode/GraphemeBreak.trie.cs

Large diffs are not rendered by default.

28 changes: 8 additions & 20 deletions src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeData.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.CompilerServices;

namespace Avalonia.Media.TextFormatting.Unicode
{
Expand Down Expand Up @@ -30,17 +29,6 @@ internal static class UnicodeData
internal const int BIDIPAIREDBRACKEDTYPE_MASK = (1 << BIDIPAIREDBRACKEDTYPE_BITS) - 1;
internal const int BIDICLASS_MASK = (1 << BIDICLASS_BITS) - 1;

private static readonly UnicodeTrie s_unicodeDataTrie;
private static readonly UnicodeTrie s_graphemeBreakTrie;
private static readonly UnicodeTrie s_biDiTrie;

static UnicodeData()
{
s_unicodeDataTrie = new UnicodeTrie(UnicodeDataTrie.Data);
s_graphemeBreakTrie = new UnicodeTrie(GraphemeBreakTrie.Data);
s_biDiTrie = new UnicodeTrie(BidiTrie.Data);
}

/// <summary>
/// Gets the <see cref="GeneralCategory"/> for a Unicode codepoint.
/// </summary>
Expand All @@ -49,7 +37,7 @@ static UnicodeData()
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static GeneralCategory GetGeneralCategory(uint codepoint)
{
// NOTE(review): this text appears to be a rendered diff with the +/- markers stripped; the two
// returns below are presumably the before/after versions of one statement — confirm against the repository.
// Both forms look the codepoint up in the Unicode data trie, keep only the bits selected by
// CATEGORY_MASK and cast the result to GeneralCategory.
return (GeneralCategory)(s_unicodeDataTrie.Get(codepoint) & CATEGORY_MASK);
return (GeneralCategory)(UnicodeDataTrie.Trie.Get(codepoint) & CATEGORY_MASK);
}

/// <summary>
Expand All @@ -60,7 +48,7 @@ public static GeneralCategory GetGeneralCategory(uint codepoint)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Script GetScript(uint codepoint)
{
// NOTE(review): this text appears to be a rendered diff with the +/- markers stripped; the two
// returns below are presumably the before/after versions of one statement — confirm against the repository.
// Both forms read the packed value from the Unicode data trie, shift it right by SCRIPT_SHIFT,
// mask it with SCRIPT_MASK and cast the result to Script.
return (Script)((s_unicodeDataTrie.Get(codepoint) >> SCRIPT_SHIFT) & SCRIPT_MASK);
return (Script)((UnicodeDataTrie.Trie.Get(codepoint) >> SCRIPT_SHIFT) & SCRIPT_MASK);
}

/// <summary>
Expand All @@ -71,7 +59,7 @@ public static Script GetScript(uint codepoint)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static BidiClass GetBiDiClass(uint codepoint)
{
// NOTE(review): this text appears to be a rendered diff with the +/- markers stripped; the two
// returns below are presumably the before/after versions of one statement — confirm against the repository.
// Both forms read the packed value from the BiDi trie, shift it right by BIDICLASS_SHIFT,
// mask it with BIDICLASS_MASK and cast the result to BidiClass.
return (BidiClass)((s_biDiTrie.Get(codepoint) >> BIDICLASS_SHIFT) & BIDICLASS_MASK);
return (BidiClass)((BidiTrie.Trie.Get(codepoint) >> BIDICLASS_SHIFT) & BIDICLASS_MASK);
}

/// <summary>
Expand All @@ -82,7 +70,7 @@ public static BidiClass GetBiDiClass(uint codepoint)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static BidiPairedBracketType GetBiDiPairedBracketType(uint codepoint)
{
// NOTE(review): this text appears to be a rendered diff with the +/- markers stripped; the two
// returns below are presumably the before/after versions of one statement — confirm against the repository.
// Both forms read the packed value from the BiDi trie, shift it right by
// BIDIPAIREDBRACKEDTYPE_SHIFT, mask it with BIDIPAIREDBRACKEDTYPE_MASK and cast the
// result to BidiPairedBracketType.
return (BidiPairedBracketType)((s_biDiTrie.Get(codepoint) >> BIDIPAIREDBRACKEDTYPE_SHIFT) & BIDIPAIREDBRACKEDTYPE_MASK);
return (BidiPairedBracketType)((BidiTrie.Trie.Get(codepoint) >> BIDIPAIREDBRACKEDTYPE_SHIFT) & BIDIPAIREDBRACKEDTYPE_MASK);
}

/// <summary>
Expand All @@ -93,7 +81,7 @@ public static BidiPairedBracketType GetBiDiPairedBracketType(uint codepoint)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Codepoint GetBiDiPairedBracket(uint codepoint)
{
// NOTE(review): this text appears to be a rendered diff with the +/- markers stripped; the two
// returns below are presumably the before/after versions of one statement (they differ only in
// the trie accessor and a redundant pair of parentheses) — confirm against the repository.
// Both forms mask the BiDi trie value with BIDIPAIREDBRACKED_MASK (no shift is applied)
// and wrap the result in a Codepoint.
return new Codepoint((s_biDiTrie.Get(codepoint) & BIDIPAIREDBRACKED_MASK));
return new Codepoint(BidiTrie.Trie.Get(codepoint) & BIDIPAIREDBRACKED_MASK);
}

/// <summary>
Expand All @@ -104,7 +92,7 @@ public static Codepoint GetBiDiPairedBracket(uint codepoint)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static LineBreakClass GetLineBreakClass(uint codepoint)
{
// NOTE(review): this text appears to be a rendered diff with the +/- markers stripped; the two
// returns below are presumably the before/after versions of one statement — confirm against the repository.
// Both forms read the packed value from the Unicode data trie, shift it right by
// LINEBREAK_SHIFT, mask it with LINEBREAK_MASK and cast the result to LineBreakClass.
return (LineBreakClass)((s_unicodeDataTrie.Get(codepoint) >> LINEBREAK_SHIFT) & LINEBREAK_MASK);
return (LineBreakClass)((UnicodeDataTrie.Trie.Get(codepoint) >> LINEBREAK_SHIFT) & LINEBREAK_MASK);
}

/// <summary>
Expand All @@ -115,7 +103,7 @@ public static LineBreakClass GetLineBreakClass(uint codepoint)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static GraphemeBreakClass GetGraphemeClusterBreak(uint codepoint)
{
// NOTE(review): this text appears to be a rendered diff with the +/- markers stripped; the two
// returns below are presumably the before/after versions of one statement — confirm against the repository.
// Both forms cast the grapheme-break trie value directly to GraphemeBreakClass;
// unlike the other accessors, no shift or mask is applied.
return (GraphemeBreakClass)s_graphemeBreakTrie.Get(codepoint);
return (GraphemeBreakClass)GraphemeBreakTrie.Trie.Get(codepoint);
}
}
}
3,383 changes: 2,288 additions & 1,095 deletions src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeData.trie.cs

Large diffs are not rendered by default.

131 changes: 16 additions & 115 deletions src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeTrie.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,98 +16,25 @@
// Copied from: https://github.com/toptensoftware/RichTextKit

using System;
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;

namespace Avalonia.Media.TextFormatting.Unicode
{
internal class UnicodeTrie
internal ref struct UnicodeTrie
{
private readonly uint[] _data;
private readonly int _highStart;
private readonly uint _errorValue;

/// <summary>
/// Initializes a new instance of the <see cref="UnicodeTrie"/> class.
/// </summary>
/// <param name="rawData">The uncompressed trie data.</param>
public UnicodeTrie(ReadOnlySpan<byte> rawData)
{
// NOTE(review): this constructor seems to be on the removed side of the rendered diff
// (the hunk shrinks the file and the new constructor takes ReadOnlySpan<uint>) — confirm.
// The header is read from the start of rawData.
var header = UnicodeTrieHeader.Parse(rawData);
// DataLength is treated as a byte count: it is divided by sizeof(uint) to size the array.
int length = header.DataLength;
uint[] data = new uint[length / sizeof(uint)];

// The trie payload is the last `length` bytes of rawData; it is reinterpreted as uint
// values and copied into the newly allocated array.
MemoryMarshal.Cast<byte, uint>(rawData.Slice(rawData.Length - length))
.CopyTo(data);

_highStart = header.HighStart;
_errorValue = header.ErrorValue;
_data = data;
}

/// <summary>
/// Initializes a new instance of the <see cref="UnicodeTrie"/> class.
/// </summary>
/// <param name="stream">The stream containing the data.</param>
public UnicodeTrie(Stream stream)
public UnicodeTrie(ReadOnlySpan<uint> data, int highStart, uint errorValue)
{
// Read the header info
using (var br = new BinaryReader(stream, Encoding.UTF8, true))
{
_highStart = br.ReadInt32();
_errorValue = br.ReadUInt32();
_data = new uint[br.ReadInt32() / sizeof(uint)];
}

// Read the data in compressed format.
using (var br = new BinaryReader(stream, Encoding.UTF8, true))
{
for (int i = 0; i < _data.Length; i++)
{
_data[i] = br.ReadUInt32();
}
}
Data = data;
HighStart = highStart;
ErrorValue = errorValue;
}

public ReadOnlySpan<uint> Data { get; }

/// <summary>
/// Initializes a new instance of the <see cref="UnicodeTrie"/> class.
/// </summary>
/// <param name="data">The uncompressed trie data.</param>
/// <param name="highStart">The start of the last range which ends at U+10ffff.</param>
/// <param name="errorValue">The value for out-of-range code points and illegal UTF-8.</param>
public UnicodeTrie(uint[] data, int highStart, uint errorValue)
{
// NOTE(review): this array-based overload seems to be on the removed side of the rendered
// diff, superseded by the ReadOnlySpan<uint> constructor — confirm against the repository.
// The array reference is stored as-is; no copy of the data is made here.
_data = data;
_highStart = highStart;
_errorValue = errorValue;
}

/// <summary>
/// Saves the <see cref="UnicodeTrie"/> to the stream in a compressed format.
/// </summary>
/// <param name="stream">The output stream.</param>
internal void Save(Stream stream)
{
// Write the header info
using (var bw = new BinaryWriter(stream, Encoding.UTF8, true))
{
bw.Write(_highStart);
bw.Write(_errorValue);
bw.Write(_data.Length * sizeof(uint));
}
public int HighStart { get; }

// Write the data.
using (var bw = new BinaryWriter(stream, Encoding.UTF8, true))
{
for (int i = 0; i < _data.Length; i++)
{
bw.Write(_data[i]);
}
}
}
public uint ErrorValue { get; }

/// <summary>
/// Get the value for a code point as stored in the trie.
Expand All @@ -118,14 +45,14 @@ internal void Save(Stream stream)
public uint Get(uint codePoint)
{
uint index;
ref uint dataBase = ref MemoryMarshal.GetReference(_data.AsSpan());
ref uint dataBase = ref MemoryMarshal.GetReference(Data);

if (codePoint is < 0x0d800 or (> 0x0dbff and <= 0x0ffff))
{
// Ordinary BMP code point, excluding leading surrogates.
// BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
// 16 bit data is stored in the index array itself.
index = _data[codePoint >> UnicodeTrieBuilder.SHIFT_2];
index = Data[(int)(codePoint >> UnicodeTrieBuilder.SHIFT_2)];
index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
return Unsafe.Add(ref dataBase, (nint)index);
}
Expand All @@ -138,55 +65,29 @@ public uint Get(uint codePoint)
// For this function, we need the code point data.
// Note: this expression could be refactored for slightly improved efficiency, but
// surrogate code points will be so rare in practice that it's not worth it.
index = _data[UnicodeTrieBuilder.LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UnicodeTrieBuilder.SHIFT_2)];
index = Data[(int)(UnicodeTrieBuilder.LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UnicodeTrieBuilder.SHIFT_2))];
index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
return Unsafe.Add(ref dataBase, (nint)index);
}

if (codePoint < _highStart)
if (codePoint < HighStart)
{
// Supplemental code point, use two-level lookup.
index = UnicodeTrieBuilder.INDEX_1_OFFSET - UnicodeTrieBuilder.OMITTED_BMP_INDEX_1_LENGTH + (codePoint >> UnicodeTrieBuilder.SHIFT_1);
index = _data[index];
index = Data[(int)index];
index += (codePoint >> UnicodeTrieBuilder.SHIFT_2) & UnicodeTrieBuilder.INDEX_2_MASK;
index = _data[index];
index = Data[(int)index];
index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
return Unsafe.Add(ref dataBase, (nint)index);
}

if (codePoint <= 0x10ffff)
{
return Unsafe.Add(ref dataBase, (nint)(_data.Length - UnicodeTrieBuilder.DATA_GRANULARITY));
return Data[Data.Length - UnicodeTrieBuilder.DATA_GRANULARITY];
}

// Fall through. The code point is outside of the legal range of 0..0x10ffff.
return _errorValue;
}

[StructLayout(LayoutKind.Sequential, Pack = 1)]
private struct UnicodeTrieHeader
{
public int HighStart
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get;
}

public uint ErrorValue
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get;
}

public int DataLength
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static UnicodeTrieHeader Parse(ReadOnlySpan<byte> data)
=> MemoryMarshal.Cast<byte, UnicodeTrieHeader>(data)[0];
return ErrorValue;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -325,19 +325,6 @@ public uint Get(int c, bool fromLSCP = true)
return _data[block + (c & DATA_MASK)];
}

// Serializes the trie via Save into an in-memory stream and returns that stream's buffer.
// NOTE(review): the hunk header (19 lines -> 6 lines) suggests this method is on the removed
// side of the rendered diff — confirm against the repository.
public byte[] ToBuffer()
{
// NOTE(review): the MemoryStream is never disposed.
var mem = new MemoryStream();
Save(mem);
// NOTE(review): MemoryStream.GetBuffer() returns the underlying buffer, which can be longer
// than the bytes actually written; ToArray() would return exactly the written bytes.
return mem.GetBuffer();
}

// Freezes the builder into a UnicodeTrie and delegates writing to that trie's Save method.
// NOTE(review): the hunk header (19 lines -> 6 lines) suggests this method is on the removed
// side of the rendered diff — confirm against the repository.
public void Save(Stream stream)
{
var trie = this.Freeze();
trie.Save(stream);
}

public UnicodeTrie Freeze()
{
int allIndexesLength, i;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public GraphemeBreakClassTrieGeneratorTests(ITestOutputHelper outputHelper)
_outputHelper = outputHelper;
}

[Fact(/*Skip = "Only run when we update the trie."*/)]
[Fact(Skip = "Only run when we update the trie.")]
public void Should_Enumerate()
{
var generator = new GraphemeBreakTestDataGenerator();
Expand Down Expand Up @@ -77,7 +77,7 @@ private bool Run(GraphemeBreakData t)
return true;
}

[Fact(/*Skip = "Only run when we update the trie."*/)]
[Fact(Skip = "Only run when we update the trie.")]
public void Should_Enumerate_Other()
{
const string text = "ABCDEFGHIJ";
Expand All @@ -96,7 +96,7 @@ public void Should_Enumerate_Other()
Assert.Equal(10, count);
}

[Fact(/*Skip = "Only run when we update the trie."*/)]
[Fact(Skip = "Only run when we update the trie.")]
public void Should_Generate_Trie()
{
GraphemeBreakClassTrieGenerator.Execute();
Expand Down
Loading
Loading