Skip to content

Commit

Permalink
feat: Add Int128 IO support for csv & ipc (#20535)
Browse files Browse the repository at this point in the history
  • Loading branch information
lukemanley authored Jan 3, 2025
1 parent 5c9bb71 commit 5f44997
Show file tree
Hide file tree
Showing 15 changed files with 88 additions and 13 deletions.
4 changes: 4 additions & 0 deletions crates/polars-arrow/src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ unsafe impl DictionaryKey for i64 {
const KEY_TYPE: IntegerType = IntegerType::Int64;
const MAX_USIZE_VALUE: usize = i64::MAX as usize;
}
unsafe impl DictionaryKey for i128 {
const KEY_TYPE: IntegerType = IntegerType::Int128;
const MAX_USIZE_VALUE: usize = i128::MAX as usize;
}
unsafe impl DictionaryKey for u8 {
const KEY_TYPE: IntegerType = IntegerType::UInt8;
const MAX_USIZE_VALUE: usize = u8::MAX as usize;
Expand Down
1 change: 1 addition & 0 deletions crates/polars-arrow/src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,7 @@ impl From<IntegerType> for ArrowDataType {
IntegerType::Int16 => ArrowDataType::Int16,
IntegerType::Int32 => ArrowDataType::Int32,
IntegerType::Int64 => ArrowDataType::Int64,
IntegerType::Int128 => ArrowDataType::Int128,
IntegerType::UInt8 => ArrowDataType::UInt8,
IntegerType::UInt16 => ArrowDataType::UInt16,
IntegerType::UInt32 => ArrowDataType::UInt32,
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-arrow/src/datatypes/physical_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ pub enum IntegerType {
Int32,
/// A signed 64-bit integer.
Int64,
/// A signed 128-bit integer.
Int128,
/// An unsigned 8-bit integer.
UInt8,
/// An unsigned 16-bit integer.
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-arrow/src/io/ipc/read/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ fn deserialize_integer(int: arrow_format::ipc::IntRef) -> PolarsResult<IntegerTy
(32, false) => IntegerType::UInt32,
(64, true) => IntegerType::Int64,
(64, false) => IntegerType::UInt64,
_ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32 or 64."),
(128, true) => IntegerType::Int128,
_ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32, 64 or 128."),
})
}

Expand Down
3 changes: 2 additions & 1 deletion crates/polars-arrow/src/io/ipc/write/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ pub(crate) fn serialize_dictionary(
) -> arrow_format::ipc::DictionaryEncoding {
use IntegerType::*;
let is_signed = match index_type {
Int8 | Int16 | Int32 | Int64 => true,
Int8 | Int16 | Int32 | Int64 | Int128 => true,
UInt8 | UInt16 | UInt32 | UInt64 => false,
};

Expand All @@ -336,6 +336,7 @@ pub(crate) fn serialize_dictionary(
Int16 | UInt16 => 16,
Int32 | UInt32 => 32,
Int64 | UInt64 => 64,
Int128 => 128,
};

let index_type = arrow_format::ipc::Int {
Expand Down
1 change: 1 addition & 0 deletions crates/polars-arrow/src/util/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ macro_rules! match_integer_type {(
Int16 => __with_ty__! { i16 },
Int32 => __with_ty__! { i32 },
Int64 => __with_ty__! { i64 },
Int128 => __with_ty__! { i128 },
UInt8 => __with_ty__! { u8 },
UInt16 => __with_ty__! { u16 },
UInt32 => __with_ty__! { u32 },
Expand Down
1 change: 1 addition & 0 deletions crates/polars-compute/src/comparisons/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ macro_rules! compare {
PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
Expand Down
1 change: 1 addition & 0 deletions crates/polars-compute/src/comparisons/dyn_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ macro_rules! compare {
PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>, lhs, rhs, $op),
PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>, lhs, rhs, $op),
PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>, lhs, rhs, $op),
PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>, lhs, rhs, $op),
PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>, lhs, rhs, $op),
PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>, lhs, rhs, $op),
PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>, lhs, rhs, $op),
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-compute/src/comparisons/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ macro_rules! compare {
PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
Expand Down Expand Up @@ -196,6 +197,7 @@ macro_rules! compare_broadcast {
PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
Expand Down
1 change: 1 addition & 0 deletions crates/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ dtype-u8 = ["polars-core/dtype-u8"]
dtype-u16 = ["polars-core/dtype-u16"]
dtype-i8 = ["polars-core/dtype-i8"]
dtype-i16 = ["polars-core/dtype-i16"]
dtype-i128 = ["polars-core/dtype-i128"]
dtype-categorical = ["polars-core/dtype-categorical"]
dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"]
object = ["polars-core/object"]
Expand Down
26 changes: 26 additions & 0 deletions crates/polars-io/src/csv/read/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ impl PrimitiveParser for Int64Type {
atoi_simd::parse_skipped(bytes).ok()
}
}
#[cfg(feature = "dtype-i128")]
impl PrimitiveParser for Int128Type {
#[inline]
fn parse(bytes: &[u8]) -> Option<i128> {
atoi_simd::parse_skipped(bytes).ok()
}
}

trait ParsedBuffer {
fn parse_bytes(
Expand Down Expand Up @@ -522,6 +529,8 @@ pub fn init_buffers(
&DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
&DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
&DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
#[cfg(feature = "dtype-i128")]
&DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)),
#[cfg(feature = "dtype-u8")]
&DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -594,6 +603,8 @@ pub enum Buffer {
Int16(PrimitiveChunkedBuilder<Int16Type>),
Int32(PrimitiveChunkedBuilder<Int32Type>),
Int64(PrimitiveChunkedBuilder<Int64Type>),
#[cfg(feature = "dtype-i128")]
Int128(PrimitiveChunkedBuilder<Int128Type>),
#[cfg(feature = "dtype-u8")]
UInt8(PrimitiveChunkedBuilder<UInt8Type>),
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -628,6 +639,8 @@ impl Buffer {
Buffer::Int16(v) => v.finish().into_series(),
Buffer::Int32(v) => v.finish().into_series(),
Buffer::Int64(v) => v.finish().into_series(),
#[cfg(feature = "dtype-i128")]
Buffer::Int128(v) => v.finish().into_series(),
#[cfg(feature = "dtype-u8")]
Buffer::UInt8(v) => v.finish().into_series(),
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -701,6 +714,8 @@ impl Buffer {
Buffer::Int16(v) => v.append_null(),
Buffer::Int32(v) => v.append_null(),
Buffer::Int64(v) => v.append_null(),
#[cfg(feature = "dtype-i128")]
Buffer::Int128(v) => v.append_null(),
#[cfg(feature = "dtype-u8")]
Buffer::UInt8(v) => v.append_null(),
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -745,6 +760,8 @@ impl Buffer {
Buffer::Int16(_) => DataType::Int16,
Buffer::Int32(_) => DataType::Int32,
Buffer::Int64(_) => DataType::Int64,
#[cfg(feature = "dtype-i128")]
Buffer::Int128(_) => DataType::Int128,
#[cfg(feature = "dtype-u8")]
Buffer::UInt8(_) => DataType::UInt8,
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -824,6 +841,15 @@ impl Buffer {
missing_is_null,
None,
),
#[cfg(feature = "dtype-i128")]
Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
#[cfg(feature = "dtype-u8")]
UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuffer>::parse_bytes(
buf,
Expand Down
1 change: 1 addition & 0 deletions crates/polars-io/src/csv/write/write_impl/serializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@ pub(super) fn serializer_for<'a>(
DataType::UInt32 => quote_if_always!(integer_serializer::<u32>),
DataType::Int64 => quote_if_always!(integer_serializer::<i64>),
DataType::UInt64 => quote_if_always!(integer_serializer::<u64>),
DataType::Int128 => quote_if_always!(integer_serializer::<i128>),
DataType::Float32 => match options.float_precision {
Some(precision) => match options.float_scientific {
Some(true) => {
Expand Down
1 change: 1 addition & 0 deletions crates/polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ dtype-i16 = [
]
dtype-i128 = [
"polars-core/dtype-i128",
"polars-io/dtype-i128",
"polars-lazy?/dtype-i128",
"polars-ops/dtype-i128",
"polars-time?/dtype-i128",
Expand Down
23 changes: 17 additions & 6 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,12 +259,12 @@ def test_csv_missing_utf8_is_empty_string() -> None:

def test_csv_int_types() -> None:
f = io.StringIO(
"u8,i8,u16,i16,u32,i32,u64,i64\n"
"0,0,0,0,0,0,0,0\n"
"0,-128,0,-32768,0,-2147483648,0,-9223372036854775808\n"
"255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807\n"
"01,01,01,01,01,01,01,01\n"
"01,-01,01,-01,01,-01,01,-01\n"
"u8,i8,u16,i16,u32,i32,u64,i64,i128\n"
"0,0,0,0,0,0,0,0,0\n"
"0,-128,0,-32768,0,-2147483648,0,-9223372036854775808,-170141183460469231731687303715884105728\n"
"255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807,170141183460469231731687303715884105727\n"
"01,01,01,01,01,01,01,01,01\n"
"01,-01,01,-01,01,-01,01,-01,01\n"
)
df = pl.read_csv(
f,
Expand All @@ -277,6 +277,7 @@ def test_csv_int_types() -> None:
"i32": pl.Int32,
"u64": pl.UInt64,
"i64": pl.Int64,
"i128": pl.Int128,
},
)

Expand All @@ -295,6 +296,16 @@ def test_csv_int_types() -> None:
[0, -9223372036854775808, 9223372036854775807, 1, -1],
dtype=pl.Int64,
),
"i128": pl.Series(
[
0,
-170141183460469231731687303715884105728,
170141183460469231731687303715884105727,
1,
1,
],
dtype=pl.Int128,
),
}
),
)
Expand Down
31 changes: 26 additions & 5 deletions py-polars/tests/unit/io/test_ipc.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ def test_select_columns_from_buffer(stream: bool) -> None:
"a": [1],
"b": [2],
"c": [3],
}
},
schema={"a": pl.Int64(), "b": pl.Int128(), "c": pl.UInt8()},
)

f = io.BytesIO()
Expand All @@ -109,7 +110,8 @@ def test_select_columns_from_buffer(stream: bool) -> None:
"b": [2],
"c": [3],
"a": [1],
}
},
schema={"b": pl.Int128(), "c": pl.UInt8(), "a": pl.Int64()},
)
assert_frame_equal(expected, actual)

Expand Down Expand Up @@ -142,14 +144,33 @@ def test_compressed_simple(compression: IpcCompression, stream: bool) -> None:

@pytest.mark.parametrize("compression", COMPRESSIONS)
def test_ipc_schema(compression: IpcCompression) -> None:
df = pl.DataFrame({"a": [1, 2], "b": ["a", None], "c": [True, False]})
schema = {
"i64": pl.Int64(),
"i128": pl.Int128(),
"u8": pl.UInt8(),
"f32": pl.Float32(),
"f64": pl.Float64(),
"str": pl.String(),
"bool": pl.Boolean(),
}
df = pl.DataFrame(
{
"i64": [1, 2],
"i128": [1, 2],
"u8": [1, 2],
"f32": [1, 2],
"f64": [1, 2],
"str": ["a", None],
"bool": [True, False],
},
schema=schema,
)

f = io.BytesIO()
df.write_ipc(f, compression=compression)
f.seek(0)

expected = {"a": pl.Int64(), "b": pl.String(), "c": pl.Boolean()}
assert pl.read_ipc_schema(f) == expected
assert pl.read_ipc_schema(f) == schema


@pytest.mark.write_disk
Expand Down

0 comments on commit 5f44997

Please sign in to comment.