-
-
Notifications
You must be signed in to change notification settings - Fork 51
/
Copy pathSharedStringsExample.cs
138 lines (119 loc) · 5.78 KB
/
SharedStringsExample.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/*
* Copyright 2020 James Courtney
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using FlatSharp.Internal;
namespace Samples.SharedStrings;
/// <summary>
/// Demonstrates FlatSharp's automatic string deduplication. The sample builds a collection of rows
/// whose cells are (Key, Value) pairs and serializes it three ways: with shared strings turned off,
/// with the default shared string writer, and with a custom writer. Sharing the column names means
/// the name does not have to be serialized again for every cell.
/// </summary>
public class SharedStringsExample : IFlatSharpSample
{
    /// <summary>
    /// Serializes the same database with three differently configured serializers and prints
    /// the number of bytes each one wrote.
    /// </summary>
    public void Run()
    {
        // Build a database holding 10000 rows.
        var rows = Enumerable.Range(0, 10000).Select(CreateRow).ToArray();
        var database = new Database { Rows = rows };

        // Sizes a buffer for the given serializer, writes the database into it, and reports the bytes written.
        int Serialize(ISerializer<Database> serializer)
        {
            byte[] buffer = new byte[serializer.GetMaxSize(database)];
            return serializer.Write(buffer, database);
        }

        // A derived serializer with shared strings switched off. The delegate configures the writer.
        ISerializer<Database> withoutSharing = Database.Serializer.WithSettings(settings => settings.DisableSharedStrings());

        // The stock serializer: shared strings are enabled by default.
        ISerializer<Database> withDefaultSharing = Database.Serializer;

        // A derived serializer that uses our own shared string writer (defined at the bottom of this file).
        ISerializer<Database> withCustomSharing = Database.Serializer.WithSettings(
            settings => settings.UseSharedStringWriter(() => new PerfectSharedStringWriter()));

        int unsharedSize = Serialize(withoutSharing);
        int defaultSharedSize = Serialize(withDefaultSharing);
        int customSharedSize = Serialize(withCustomSharing);

        Console.WriteLine($"Serialized size without shared strings: {unsharedSize}");

        // The next two are expected to match because there are so few shared strings. With large
        // numbers of them, the custom writer produces smaller output but is considerably slower.
        Console.WriteLine($"Serialized size with shared strings: {defaultSharedSize}");
        Console.WriteLine($"Serialized size with custom shared strings: {customSharedSize}");
    }

    /// <summary>
    /// Builds a row of three cells. Cell names are "Column" followed by a number taken modulo 500,
    /// starting at <paramref name="row"/> and advancing by one per cell; each value is a new GUID string.
    /// </summary>
    /// <param name="row">The index of the row, used to pick the first column name.</param>
    /// <returns>A row containing three columns.</returns>
    public static Row CreateRow(int row)
    {
        const int ColumnsPerRow = 3;
        const int DistinctColumnNames = 500;

        var cells = new Column[ColumnsPerRow];
        int nameIndex = row;

        for (int i = 0; i < cells.Length; i++)
        {
            cells[i] = new Column
            {
                ColumnName = "Column" + (nameIndex++ % DistinctColumnNames),
                Value = Guid.NewGuid().ToString(),
            };
        }

        return new Row { Values = cells };
    }
}
/// <summary>
/// A "perfect" shared string writer implementation, which guarantees that a given string is written only once
/// per serialize operation. This class gives optimal compression results, but will be considerably slower than
/// FlatSharp's default implementation, which uses a hashtable with flush-on-evict semantics and may write
/// shared strings more than once.
/// </summary>
public sealed class PerfectSharedStringWriter : ISharedStringWriter
{
    // Maps each distinct string to every buffer offset that must end up pointing at it.
    private readonly Dictionary<string, List<int>> stringOffsetMap = new Dictionary<string, List<int>>();

    /// <summary>
    /// True while there are strings waiting to be flushed; false once they have been flushed or reset.
    /// </summary>
    public bool IsDirty => this.stringOffsetMap.Count > 0;

    /// <summary>
    /// Called when FlatSharp has finished a serialize operation. This is the signal to flush any strings that the
    /// string writer is hanging onto. Each pending string is written once and every recorded offset is patched
    /// to point at it.
    /// </summary>
    /// <param name="writer">The span writer used to write into <paramref name="data"/>.</param>
    /// <param name="data">The destination buffer.</param>
    /// <param name="context">The serialization context for the current write operation.</param>
    public void FlushWrites<TSpanWriter>(TSpanWriter writer, Span<byte> data, SerializationContext context) where TSpanWriter : ISpanWriter
    {
        foreach (var kvp in this.stringOffsetMap)
        {
            string str = kvp.Key;
            List<int> offsets = kvp.Value;

            // Write the string.
            int stringOffset = writer.WriteAndProvisionString(data, str, context);

            // Update all the pointers that need to point to that string.
            foreach (var offset in offsets)
            {
                writer.WriteUOffset(data, offset, stringOffset);
            }
        }

        // Nothing is pending any more: drop the flushed entries so that IsDirty reports false,
        // a repeated flush cannot write the same strings again, and the strings are not kept
        // alive until the next Reset.
        this.stringOffsetMap.Clear();
    }

    /// <summary>
    /// Prepares to write. In this case, we just need to clear the internal map for a new write operation,
    /// since the same SharedStringWriter is reused.
    /// </summary>
    public void Reset()
    {
        this.stringOffsetMap.Clear();
    }

    /// <summary>
    /// Records a shared string by storing the string mapped to the offsets at which the string occurs in the buffer.
    /// Nothing is written to the buffer here; the actual write is deferred to <see cref="FlushWrites"/>.
    /// </summary>
    /// <param name="spanWriter">The span writer (unused until flush).</param>
    /// <param name="data">The destination buffer (unused until flush).</param>
    /// <param name="offset">The buffer offset that must later point at <paramref name="value"/>.</param>
    /// <param name="value">The string to share.</param>
    /// <param name="context">The serialization context (unused until flush).</param>
    public void WriteSharedString<TSpanWriter>(TSpanWriter spanWriter, Span<byte> data, int offset, string value, SerializationContext context)
        where TSpanWriter : ISpanWriter
    {
        if (!this.stringOffsetMap.TryGetValue(value, out List<int>? offsets))
        {
            // First occurrence of this string: start a new offset list for it.
            offsets = new List<int>();
            this.stringOffsetMap[value] = offsets;
        }

        offsets.Add(offset);
    }
}