Skip to content

Commit

Permalink
Support reading and writing decimals with arbitrary length fixed-length byte array columns and writing with int32 or int64 (#482)
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreeve authored Aug 28, 2024
1 parent 4deb370 commit 227d838
Show file tree
Hide file tree
Showing 13 changed files with 669 additions and 128 deletions.
348 changes: 305 additions & 43 deletions csharp.test/TestDecimal.cs

Large diffs are not rendered by default.

41 changes: 1 addition & 40 deletions csharp.test/TestDecimal128.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public static void TestRoundTrip(int scale)

list.Add(decimal.MaxValue);

var multiplier = Decimal128.GetScaleMultiplier(scale);
var multiplier = DecimalConverter.GetScaleMultiplier(scale, precision: 29);
var decimals = list.Select(v => v / multiplier).ToArray();

foreach (var value in decimals)
Expand All @@ -36,15 +36,6 @@ public static void TestRoundTrip(int scale)
}
}

[Test]
public static void TestScaleMultiplier()
{
    // Each entry pairs a decimal scale with the power of ten expected as its multiplier.
    var expectations = new (int Scale, decimal Multiplier)[]
    {
        (0, 1M),
        (1, 10M),
        (2, 100M),
        (28, 1e+028M),
    };

    foreach (var (scale, expected) in expectations)
    {
        Assert.AreEqual(expected, Decimal128.GetScaleMultiplier(scale));
    }
}

[Test]
[SetCulture("en-US")]
public static void TestScaleOverflow()
Expand Down Expand Up @@ -96,35 +87,5 @@ public static void TestAgainstThirdParty()
var read = (decimal[]) rowGroupReader.ReadColumn(fileReader.Schema.GetDataFields()[0]).Data;
Assert.AreEqual(values, read);
}

[Test]
public static void TestThrowsWithUnsupportedPrecision()
{
    // Declare a decimal column with a precision of 28 and a scale of 3.
    using var logicalType = LogicalType.Decimal(precision: 28, scale: 3);
    var schemaColumns = new Column[] {new Column<decimal>("Decimal", logicalType)};

    using var resizableBuffer = new ResizableBuffer();
    using var output = new BufferOutputStream(resizableBuffer);
    using var writer = new ParquetFileWriter(output, schemaColumns);
    using var rowGroup = writer.AppendRowGroup();

    // Requesting the logical writer is the step expected to reject the precision.
    TestDelegate createLogicalWriter = () => { rowGroup.NextColumn().LogicalWriter<decimal>(); };
    var error = Assert.Throws<NotSupportedException>(createLogicalWriter);
    Assert.That(error!.Message, Does.Contain("29 digits of precision"));

    writer.Close();
}

[Test]
public static void TestThrowsWithUnsupportedLength()
{
    // Declare a decimal column with an explicit fixed length of 13 bytes.
    using var logicalType = LogicalType.Decimal(precision: 29, scale: 3);
    var schemaColumns = new Column[] {new Column(typeof(decimal), "Decimal", logicalType, 13)};

    using var resizableBuffer = new ResizableBuffer();
    using var output = new BufferOutputStream(resizableBuffer);
    using var writer = new ParquetFileWriter(output, schemaColumns);
    using var rowGroup = writer.AppendRowGroup();

    // Requesting the logical writer is the step expected to reject the length.
    TestDelegate createLogicalWriter = () => { rowGroup.NextColumn().LogicalWriter<decimal>(); };
    var error = Assert.Throws<NotSupportedException>(createLogicalWriter);
    Assert.That(error!.Message, Does.Contain("16 bytes of decimal length"));

    writer.Close();
}
}
}
4 changes: 2 additions & 2 deletions csharp.test/TestLogicalTypeRoundtrip.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2076,7 +2076,7 @@ private static ExpectedColumn[] CreateExpectedColumns()
Min = -10m,
Max = ((NumRows - 1m) * (NumRows - 1m) * (NumRows - 1m)) / 1000 - 10,
Converter = (v, descr) => LogicalRead.ToDecimal(
(FixedLenByteArray) v, Decimal128.GetScaleMultiplier(descr.TypeScale))
(FixedLenByteArray) v, DecimalConverter.GetScaleMultiplier(descr.TypeScale, descr.TypePrecision))
},
new ExpectedColumn
{
Expand All @@ -2091,7 +2091,7 @@ private static ExpectedColumn[] CreateExpectedColumns()
Min = -9.999m,
Max = ((NumRows - 1m) * (NumRows - 1m) * (NumRows - 1m)) / 1000 - 10,
Converter = (v, descr) => LogicalRead.ToDecimal(
(FixedLenByteArray) v, Decimal128.GetScaleMultiplier(descr.TypeScale))
(FixedLenByteArray) v, DecimalConverter.GetScaleMultiplier(descr.TypeScale, descr.TypePrecision))
},
new ExpectedColumn
{
Expand Down
31 changes: 28 additions & 3 deletions csharp/Column.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public class Column
#pragma warning disable RS0027

public Column(Type logicalSystemType, string name, LogicalType? logicalTypeOverride = null)
: this(logicalSystemType, name, logicalTypeOverride, GetTypeLength(logicalSystemType))
: this(logicalSystemType, name, logicalTypeOverride, GetTypeLength(logicalSystemType, logicalTypeOverride))
{
LogicalSystemType = logicalSystemType ?? throw new ArgumentNullException(nameof(logicalSystemType));
Name = name ?? throw new ArgumentNullException(nameof(name));
Expand Down Expand Up @@ -107,11 +107,36 @@ public static GroupNode CreateSchemaNode(Column[] columns, LogicalTypeFactory lo

#pragma warning restore RS0026

private static unsafe int GetTypeLength(Type logicalSystemType)
private static unsafe int GetTypeLength(Type logicalSystemType, LogicalType? logicalTypeOverride)
{
if (logicalSystemType == typeof(decimal) || logicalSystemType == typeof(decimal?))
{
return sizeof(Decimal128);
if (!(logicalTypeOverride is DecimalLogicalType decimalType))
{
throw new ArgumentException("decimal type requires a DecimalLogicalType override");
}

// Older versions of ParquetSharp only supported writing with a precision of 29,
// corresponding to the maximum precision supported by C# decimal values.
// Decimals were written as 16 byte arrays and reading only supported 16 byte arrays.
// So for backwards compatibility, if the precision is 29 we still write 16 byte values.
if (decimalType.Precision == 29)
{
return sizeof(Decimal128);
}

// For other precisions, work out the size of array required
var typeLength = 1;
while (true)
{
var maxPrecision = DecimalConverter.MaxPrecision(typeLength);
if (maxPrecision >= decimalType.Precision)
{
return typeLength;
}

++typeLength;
}
}

if (logicalSystemType == typeof(Guid) || logicalSystemType == typeof(Guid?))
Expand Down
10 changes: 0 additions & 10 deletions csharp/Decimal128.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,16 +101,6 @@ public decimal ToDecimal(decimal multiplier)
return unscaled / multiplier;
}

/// <summary>
/// Returns ten raised to the power of <paramref name="scale"/> as a decimal.
/// </summary>
/// <param name="scale">Decimal scale; must lie in [0, 28].</param>
/// <returns>The power of ten corresponding to the scale.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when the scale is outside [0, 28].</exception>
public static decimal GetScaleMultiplier(int scale)
{
    var scaleInRange = scale >= 0 && scale <= 28;
    if (!scaleInRange)
    {
        throw new ArgumentOutOfRangeException(nameof(scale), "scale must be a value in [0, 28]");
    }

    var powerOfTen = Math.Pow(10, scale);
    return (decimal) powerOfTen;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void TwosComplement(uint* ptr)
{
Expand Down
140 changes: 140 additions & 0 deletions csharp/DecimalConverter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
using System;
using System.Runtime.CompilerServices;

namespace ParquetSharp
{
/// <summary>
/// This is a more flexible converter for decimal data stored in arbitrary length byte arrays,
/// as opposed to Decimal128 which only works with 16 byte values but is more performant.
/// Values are read and written as big-endian two's complement integers holding the unscaled value.
/// </summary>
internal static class DecimalConverter
{
    /// <summary>
    /// Reads a big-endian two's complement integer from <paramref name="byteArray"/> and divides it
    /// by <paramref name="multiplier"/> to produce the decimal value.
    /// </summary>
    /// <param name="byteArray">Pointer and length of the bytes to decode. A zero length decodes to zero.</param>
    /// <param name="multiplier">Power of ten corresponding to the column scale.</param>
    /// <returns>The decoded value.</returns>
    /// <exception cref="OverflowException">
    /// Thrown when the magnitude needs more than the 12 bytes this method accumulates into a decimal.
    /// </exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static unsafe decimal ReadDecimal(ByteArray byteArray, decimal multiplier)
    {
        if (byteArray.Length == 0)
        {
            return new decimal(0);
        }

        // Read into little-Endian ordered array
        var tmp = stackalloc byte[byteArray.Length];
        for (var byteIdx = 0; byteIdx < byteArray.Length; ++byteIdx)
        {
            tmp[byteArray.Length - byteIdx - 1] = *((byte*) byteArray.Pointer + byteIdx);
        }

        // The top bit of the most significant byte is the sign bit.
        // Negative values are negated in place so that tmp always holds the magnitude.
        var negative = false;
        if ((tmp[byteArray.Length - 1] & (1 << 7)) == 1 << 7)
        {
            negative = true;
            TwosComplement(tmp, byteArray.Length);
        }

        // Accumulate at most 12 bytes of magnitude, least significant byte first.
        var unscaled = new decimal(tmp[0]);
        var numUsableBytes = Math.Min(byteArray.Length, 12);
        decimal byteMultiplier = 1;
        for (var byteIdx = 1; byteIdx < numUsableBytes; ++byteIdx)
        {
            byteMultiplier *= 256;
            unscaled += byteMultiplier * tmp[byteIdx];
        }

        // Any non-zero byte beyond the usable ones means the magnitude cannot be represented.
        for (var byteIdx = numUsableBytes; byteIdx < byteArray.Length; ++byteIdx)
        {
            if (tmp[byteIdx] > 0)
            {
                throw new OverflowException("Decimal value is not representable as a .NET Decimal");
            }
        }

        if (negative)
        {
            unscaled *= -1;
        }

        return unscaled / multiplier;
    }

    /// <summary>
    /// Multiplies <paramref name="value"/> by <paramref name="multiplier"/>, truncates the result to an
    /// integer and writes it into <paramref name="byteArray"/> as a big-endian two's complement integer.
    /// </summary>
    /// <param name="value">The value to encode.</param>
    /// <param name="byteArray">Pointer and length of the destination bytes.</param>
    /// <param name="multiplier">Power of ten corresponding to the column scale.</param>
    /// <exception cref="OverflowException">
    /// Thrown when scaling the value overflows a decimal, or when the unscaled value does not fit in the
    /// signed two's complement range of the destination length.
    /// </exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static unsafe void WriteDecimal(decimal value, ByteArray byteArray, decimal multiplier)
    {
        decimal unscaled;

        try
        {
            unscaled = decimal.Truncate(value * multiplier);
        }
        catch (OverflowException exception)
        {
            throw new OverflowException($"value {value:E} is too large for decimal scale {Math.Log10((double) multiplier)}", exception);
        }

        var negative = unscaled < 0;
        if (negative)
        {
            unscaled *= -1;
        }

        var length = byteArray.Length;

        // Compute little-endian representation of unscaled value
        var tmp = stackalloc byte[length];
        for (var byteIdx = 0; byteIdx < length; ++byteIdx)
        {
            var remainder = unscaled % 256;
            tmp[byteIdx] = (byte) remainder;
            unscaled = (unscaled - remainder) / 256;
        }

        if (unscaled != 0)
        {
            throw new OverflowException(
                $"value {value:E} is too large to be represented by {byteArray.Length} bytes with decimal scale {Math.Log10((double) multiplier)}");
        }

        if (negative)
        {
            TwosComplement(tmp, length);
        }

        // The magnitude fitting in the available bytes is not sufficient: the most significant bit is
        // the sign bit, so it must agree with the sign of the value. Without this check a value such as
        // 200 written to a single byte would be stored as 0xC8 and read back as -56.
        if (length > 0 && ((tmp[length - 1] & 0x80) != 0) != negative)
        {
            throw new OverflowException(
                $"value {value:E} is too large to be represented by {byteArray.Length} bytes with decimal scale {Math.Log10((double) multiplier)}");
        }

        // Reverse bytes to get big-Endian representation, writing into output
        for (var i = 0; i < length; ++i)
        {
            *((byte*) byteArray.Pointer + i) = tmp[length - i - 1];
        }
    }

    /// <summary>
    /// Computes the largest number of decimal digits for which every value fits in a signed two's
    /// complement integer of <paramref name="typeLength"/> bytes.
    /// </summary>
    /// <param name="typeLength">Length of the byte array in bytes.</param>
    /// <returns>The maximum precision, or 0 when <paramref name="typeLength"/> is not positive.</returns>
    public static int MaxPrecision(int typeLength)
    {
        // A non-positive length would make the logarithm argument negative (NaN),
        // and casting NaN to int does not produce a well-defined value.
        if (typeLength <= 0)
        {
            return 0;
        }

        return (int) Math.Floor(Math.Log10(Math.Pow(2.0, 8.0 * typeLength - 1) - 1));
    }

    /// <summary>
    /// Returns ten raised to the power of <paramref name="scale"/> as a decimal.
    /// </summary>
    /// <param name="scale">Decimal scale; must lie in [0, <paramref name="precision"/>].</param>
    /// <param name="precision">Decimal precision, used as the upper bound for the scale.</param>
    /// <returns>The power of ten corresponding to the scale.</returns>
    /// <exception cref="ArgumentOutOfRangeException">Thrown when the scale is negative or exceeds the precision.</exception>
    /// <remarks>
    /// The result is computed in double precision and converted to decimal, so a scale whose power of
    /// ten exceeds the range of <see cref="decimal"/> causes the conversion to throw an <see cref="OverflowException"/>.
    /// </remarks>
    public static decimal GetScaleMultiplier(int scale, int precision)
    {
        if (scale < 0 || scale > precision)
        {
            throw new ArgumentOutOfRangeException(nameof(scale), $"scale must be in the range [0, precision ({precision})]");
        }

        return (decimal) Math.Pow(10, scale);
    }

    /// <summary>
    /// Negates, in place, the little-endian two's complement integer stored in
    /// <paramref name="byteArray"/>: every byte is inverted and one is added, propagating the carry.
    /// Callers must pass a <paramref name="length"/> of at least one, as the first byte is always accessed.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static unsafe void TwosComplement(byte* byteArray, int length)
    {
        byte carry = 0;
        byteArray[0] = AddCarry((byte) ~byteArray[0], 1, ref carry);
        for (int i = 1; i < length; ++i)
        {
            byteArray[i] = AddCarry((byte) ~byteArray[i], 0, ref carry);
        }
    }

    /// <summary>
    /// Adds two bytes plus the incoming carry, returning the low eight bits of the sum and
    /// storing the overflow into <paramref name="carry"/>.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static byte AddCarry(byte left, byte right, ref byte carry)
    {
        var r = (uint) left + right + carry;
        carry = (byte) (r >> 8);
        return (byte) r;
    }
}
}
28 changes: 24 additions & 4 deletions csharp/LogicalRead.cs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ public static Delegate GetConverter(ColumnDescriptor columnDescriptor, ColumnChu

if (typeof(TLogical) == typeof(decimal))
{
var multiplier = Decimal128.GetScaleMultiplier(columnDescriptor.TypeScale);
var multiplier = DecimalConverter.GetScaleMultiplier(columnDescriptor.TypeScale, columnDescriptor.TypePrecision);
if (typeof(TPhysical) == typeof(int))
{
return (LogicalRead<decimal, int>.Converter) ((s, _, d, _) => LogicalRead.ConvertDecimal32(s, d, multiplier));
Expand All @@ -136,13 +136,15 @@ public static Delegate GetConverter(ColumnDescriptor columnDescriptor, ColumnChu
}
if (typeof(TPhysical) == typeof(FixedLenByteArray))
{
return (LogicalRead<decimal, FixedLenByteArray>.Converter) ((s, _, d, _) => LogicalRead.ConvertDecimal128(s, d, multiplier));
return TypeUtils.UseDecimal128(columnDescriptor)
? (LogicalRead<decimal, FixedLenByteArray>.Converter) ((s, _, d, _) => LogicalRead.ConvertDecimal128(s, d, multiplier))
: (LogicalRead<decimal, FixedLenByteArray>.Converter) ((s, _, d, _) => LogicalRead.ConvertDecimal(s, d, multiplier, columnDescriptor.TypeLength));
}
}

if (typeof(TLogical) == typeof(decimal?))
{
var multiplier = Decimal128.GetScaleMultiplier(columnDescriptor.TypeScale);
var multiplier = DecimalConverter.GetScaleMultiplier(columnDescriptor.TypeScale, columnDescriptor.TypePrecision);
if (typeof(TPhysical) == typeof(int))
{
return (LogicalRead<decimal?, int>.Converter) ((s, dl, d, del) => LogicalRead.ConvertDecimal32(s, dl, d, multiplier, del));
Expand All @@ -153,7 +155,9 @@ public static Delegate GetConverter(ColumnDescriptor columnDescriptor, ColumnChu
}
if (typeof(TPhysical) == typeof(FixedLenByteArray))
{
return (LogicalRead<decimal?, FixedLenByteArray>.Converter) ((s, dl, d, del) => LogicalRead.ConvertDecimal128(s, dl, d, multiplier, del));
return TypeUtils.UseDecimal128(columnDescriptor)
? (LogicalRead<decimal?, FixedLenByteArray>.Converter) ((s, dl, d, del) => LogicalRead.ConvertDecimal128(s, dl, d, multiplier, del))
: (LogicalRead<decimal?, FixedLenByteArray>.Converter) ((s, dl, d, del) => LogicalRead.ConvertDecimal(s, dl, d, multiplier, columnDescriptor.TypeLength, del));
}
}

Expand Down Expand Up @@ -511,6 +515,22 @@ public static void ConvertDecimal128(ReadOnlySpan<FixedLenByteArray> source, Rea
}
}

/// <summary>
/// Converts each fixed-length byte array in <paramref name="source"/> to a decimal via
/// <c>DecimalConverter.ReadDecimal</c>, writing one result per element of <paramref name="destination"/>.
/// </summary>
/// <param name="source">Physical values; one is read for every destination element.</param>
/// <param name="destination">Receives the converted values.</param>
/// <param name="multiplier">Scale multiplier passed through to the converter.</param>
/// <param name="typeLength">Number of bytes in each fixed-length value.</param>
public static void ConvertDecimal(ReadOnlySpan<FixedLenByteArray> source, Span<decimal> destination, decimal multiplier, int typeLength)
{
    var index = 0;
    while (index < destination.Length)
    {
        var raw = new ByteArray(source[index].Pointer, typeLength);
        destination[index] = DecimalConverter.ReadDecimal(raw, multiplier);
        ++index;
    }
}

/// <summary>
/// Converts fixed-length byte arrays to nullable decimals via <c>DecimalConverter.ReadDecimal</c>.
/// A destination element is null when its definition level differs from
/// <paramref name="definedLevel"/>; otherwise the next unread value of <paramref name="source"/> is converted.
/// </summary>
/// <param name="source">Physical values, consumed only for defined elements.</param>
/// <param name="defLevels">Definition level for each destination element.</param>
/// <param name="destination">Receives the converted values or nulls.</param>
/// <param name="multiplier">Scale multiplier passed through to the converter.</param>
/// <param name="typeLength">Number of bytes in each fixed-length value.</param>
/// <param name="definedLevel">Definition level that marks an element as having a value.</param>
public static void ConvertDecimal(ReadOnlySpan<FixedLenByteArray> source, ReadOnlySpan<short> defLevels, Span<decimal?> destination, decimal multiplier, int typeLength, short definedLevel)
{
    var nextValue = 0;
    for (var row = 0; row < destination.Length; ++row)
    {
        if (defLevels[row] == definedLevel)
        {
            var raw = new ByteArray(source[nextValue].Pointer, typeLength);
            ++nextValue;
            destination[row] = DecimalConverter.ReadDecimal(raw, multiplier);
        }
        else
        {
            destination[row] = null;
        }
    }
}

public static void ConvertUuid(ReadOnlySpan<FixedLenByteArray> source, Span<Guid> destination)
{
for (int i = 0; i < destination.Length; ++i)
Expand Down
11 changes: 5 additions & 6 deletions csharp/LogicalTypeFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,12 @@ public virtual unsafe (Type physicalType, Type logicalType) GetSystemTypes(Colum
}
case PhysicalType.FixedLenByteArray:
{
if (descriptor.TypeLength != sizeof(Decimal128))
var maxPrecision = DecimalConverter.MaxPrecision(descriptor.TypeLength);
if (descriptor.TypePrecision > maxPrecision)
{
throw new NotSupportedException($"only {sizeof(Decimal128)} bytes of decimal length is supported with fixed-length byte array data");
}
if (descriptor.TypePrecision > 29)
{
throw new NotSupportedException("only max 29 digits of decimal precision is supported with fixed-length byte array data");
throw new NotSupportedException(
$"A maximum of {maxPrecision} digits of decimal precision is supported with fixed length byte arrays " +
$"of length {descriptor.TypeLength} (specified precision is {descriptor.TypePrecision})");
}
return (typeof(FixedLenByteArray), nullable ? typeof(decimal?) : typeof(decimal));
}
Expand Down
Loading

0 comments on commit 227d838

Please sign in to comment.