From ada490ce0b88a2b5b7162f13dba0bb97556c3e9b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 2 Jan 2025 17:43:56 -0500 Subject: [PATCH] feat: Add `Int128` IO support for csv & ipc --- .../polars-arrow/src/array/dictionary/mod.rs | 4 +++ crates/polars-arrow/src/datatypes/mod.rs | 1 + .../src/datatypes/physical_type.rs | 2 ++ crates/polars-arrow/src/io/ipc/read/schema.rs | 3 +- .../polars-arrow/src/io/ipc/write/schema.rs | 3 +- crates/polars-arrow/src/util/macros.rs | 1 + .../polars-compute/src/comparisons/array.rs | 1 + .../src/comparisons/dyn_array.rs | 1 + crates/polars-compute/src/comparisons/list.rs | 2 ++ crates/polars-io/Cargo.toml | 1 + crates/polars-io/src/csv/read/buffer.rs | 26 ++++++++++++++++ .../src/csv/write/write_impl/serializer.rs | 1 + crates/polars/Cargo.toml | 1 + py-polars/tests/unit/io/test_csv.py | 23 ++++++++++---- py-polars/tests/unit/io/test_ipc.py | 31 ++++++++++++++++--- 15 files changed, 88 insertions(+), 13 deletions(-) diff --git a/crates/polars-arrow/src/array/dictionary/mod.rs b/crates/polars-arrow/src/array/dictionary/mod.rs index 3f44dd604980..8d31109d8f19 100644 --- a/crates/polars-arrow/src/array/dictionary/mod.rs +++ b/crates/polars-arrow/src/array/dictionary/mod.rs @@ -81,6 +81,10 @@ unsafe impl DictionaryKey for i64 { const KEY_TYPE: IntegerType = IntegerType::Int64; const MAX_USIZE_VALUE: usize = i64::MAX as usize; } +unsafe impl DictionaryKey for i128 { + const KEY_TYPE: IntegerType = IntegerType::Int128; + const MAX_USIZE_VALUE: usize = i128::MAX as usize; +} unsafe impl DictionaryKey for u8 { const KEY_TYPE: IntegerType = IntegerType::UInt8; const MAX_USIZE_VALUE: usize = u8::MAX as usize; diff --git a/crates/polars-arrow/src/datatypes/mod.rs b/crates/polars-arrow/src/datatypes/mod.rs index d3bc5417a9d8..cc7a081a81cc 100644 --- a/crates/polars-arrow/src/datatypes/mod.rs +++ b/crates/polars-arrow/src/datatypes/mod.rs @@ -455,6 +455,7 @@ impl From for ArrowDataType { IntegerType::Int16 => ArrowDataType::Int16, IntegerType::Int32 => ArrowDataType::Int32, IntegerType::Int64 => ArrowDataType::Int64, + IntegerType::Int128 => ArrowDataType::Int128, IntegerType::UInt8 => ArrowDataType::UInt8, IntegerType::UInt16 => ArrowDataType::UInt16, IntegerType::UInt32 => ArrowDataType::UInt32, diff --git a/crates/polars-arrow/src/datatypes/physical_type.rs b/crates/polars-arrow/src/datatypes/physical_type.rs index 732a129055a6..f75d8e644f4c 100644 --- a/crates/polars-arrow/src/datatypes/physical_type.rs +++ b/crates/polars-arrow/src/datatypes/physical_type.rs @@ -76,6 +76,8 @@ pub enum IntegerType { Int32, /// A signed 64-bit integer. Int64, + /// A signed 128-bit integer. + Int128, /// An unsigned 8-bit integer. UInt8, /// An unsigned 16-bit integer. diff --git a/crates/polars-arrow/src/io/ipc/read/schema.rs b/crates/polars-arrow/src/io/ipc/read/schema.rs index d9bb3b21828e..3ed84d3005bd 100644 --- a/crates/polars-arrow/src/io/ipc/read/schema.rs +++ b/crates/polars-arrow/src/io/ipc/read/schema.rs @@ -72,7 +72,8 @@ fn deserialize_integer(int: arrow_format::ipc::IntRef) -> PolarsResult IntegerType::UInt32, (64, true) => IntegerType::Int64, (64, false) => IntegerType::UInt64, - _ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32 or 64."), + (128, true) => IntegerType::Int128, + _ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32, 64 or 128."), }) } diff --git a/crates/polars-arrow/src/io/ipc/write/schema.rs b/crates/polars-arrow/src/io/ipc/write/schema.rs index a7e15bbdf464..bdceb58acc5d 100644 --- a/crates/polars-arrow/src/io/ipc/write/schema.rs +++ b/crates/polars-arrow/src/io/ipc/write/schema.rs @@ -327,7 +327,7 @@ pub(crate) fn serialize_dictionary( ) -> arrow_format::ipc::DictionaryEncoding { use IntegerType::*; let is_signed = match index_type { - Int8 | Int16 | Int32 | Int64 => true, + Int8 | Int16 | Int32 | Int64 | Int128 => true, UInt8 | UInt16 | UInt32 | UInt64 => false, }; @@ -336,6 +336,7 @@ pub(crate) fn serialize_dictionary( Int16 | UInt16 => 16, Int32 | UInt32 => 32, Int64 | UInt64 => 64, + Int128 => 128, }; let index_type = arrow_format::ipc::Int { diff --git a/crates/polars-arrow/src/util/macros.rs b/crates/polars-arrow/src/util/macros.rs index fb5bd61ebba0..2153d2cb3a07 100644 --- a/crates/polars-arrow/src/util/macros.rs +++ b/crates/polars-arrow/src/util/macros.rs @@ -57,6 +57,7 @@ macro_rules! match_integer_type {( Int16 => __with_ty__! { i16 }, Int32 => __with_ty__! { i32 }, Int64 => __with_ty__! { i64 }, + Int128 => __with_ty__! { i128 }, UInt8 => __with_ty__! { u8 }, UInt16 => __with_ty__! { u16 }, UInt32 => __with_ty__! { u32 }, diff --git a/crates/polars-compute/src/comparisons/array.rs b/crates/polars-compute/src/comparisons/array.rs index facde12a5c37..210a8a0489aa 100644 --- a/crates/polars-compute/src/comparisons/array.rs +++ b/crates/polars-compute/src/comparisons/array.rs @@ -205,6 +205,7 @@ macro_rules! compare { PH::Dictionary(I::Int16) => call_binary!(DictionaryArray), PH::Dictionary(I::Int32) => call_binary!(DictionaryArray), PH::Dictionary(I::Int64) => call_binary!(DictionaryArray), + PH::Dictionary(I::Int128) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray), diff --git a/crates/polars-compute/src/comparisons/dyn_array.rs b/crates/polars-compute/src/comparisons/dyn_array.rs index 3ee3d802f09f..07fd4bbd9a9d 100644 --- a/crates/polars-compute/src/comparisons/dyn_array.rs +++ b/crates/polars-compute/src/comparisons/dyn_array.rs @@ -68,6 +68,7 @@ macro_rules! compare { PH::Dictionary(I::Int16) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::Int32) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::Int64) => call_binary!(DictionaryArray, lhs, rhs, $op), + PH::Dictionary(I::Int128) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray, lhs, rhs, $op), PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray, lhs, rhs, $op), diff --git a/crates/polars-compute/src/comparisons/list.rs b/crates/polars-compute/src/comparisons/list.rs index f7e18b79c0e7..c7c1db50ed70 100644 --- a/crates/polars-compute/src/comparisons/list.rs +++ b/crates/polars-compute/src/comparisons/list.rs @@ -99,6 +99,7 @@ macro_rules! compare { PH::Dictionary(I::Int16) => call_binary!(DictionaryArray), PH::Dictionary(I::Int32) => call_binary!(DictionaryArray), PH::Dictionary(I::Int64) => call_binary!(DictionaryArray), + PH::Dictionary(I::Int128) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray), @@ -196,6 +197,7 @@ macro_rules! compare_broadcast { PH::Dictionary(I::Int16) => call_binary!(DictionaryArray), PH::Dictionary(I::Int32) => call_binary!(DictionaryArray), PH::Dictionary(I::Int64) => call_binary!(DictionaryArray), + PH::Dictionary(I::Int128) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray), PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray), diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index f4ff3dd431c7..4c3ed4b04a76 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -82,6 +82,7 @@ dtype-u8 = ["polars-core/dtype-u8"] dtype-u16 = ["polars-core/dtype-u16"] dtype-i8 = ["polars-core/dtype-i8"] dtype-i16 = ["polars-core/dtype-i16"] +dtype-i128 = ["polars-core/dtype-i128"] dtype-categorical = ["polars-core/dtype-categorical"] dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"] object = ["polars-core/object"] diff --git a/crates/polars-io/src/csv/read/buffer.rs b/crates/polars-io/src/csv/read/buffer.rs index 22b8b34d3676..a13ab46e585e 100644 --- a/crates/polars-io/src/csv/read/buffer.rs +++ b/crates/polars-io/src/csv/read/buffer.rs @@ -82,6 +82,13 @@ impl PrimitiveParser for Int64Type { atoi_simd::parse_skipped(bytes).ok() } } +#[cfg(feature = "dtype-i128")] +impl PrimitiveParser for Int128Type { + #[inline] + fn parse(bytes: &[u8]) -> Option { + atoi_simd::parse_skipped(bytes).ok() + } +} trait ParsedBuffer { fn parse_bytes( @@ -522,6 +529,8 @@ pub fn init_buffers( &DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)), &DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)), &DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)), + #[cfg(feature = "dtype-i128")] + &DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)), #[cfg(feature = "dtype-u8")] &DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)), #[cfg(feature = "dtype-u16")] @@ -594,6 +603,8 @@ pub enum Buffer { Int16(PrimitiveChunkedBuilder), Int32(PrimitiveChunkedBuilder), Int64(PrimitiveChunkedBuilder), + #[cfg(feature = "dtype-i128")] + Int128(PrimitiveChunkedBuilder), #[cfg(feature = "dtype-u8")] UInt8(PrimitiveChunkedBuilder), #[cfg(feature = "dtype-u16")] @@ -628,6 +639,8 @@ impl Buffer { Buffer::Int16(v) => v.finish().into_series(), Buffer::Int32(v) => v.finish().into_series(), Buffer::Int64(v) => v.finish().into_series(), + #[cfg(feature = "dtype-i128")] + Buffer::Int128(v) => v.finish().into_series(), #[cfg(feature = "dtype-u8")] Buffer::UInt8(v) => v.finish().into_series(), #[cfg(feature = "dtype-u16")] @@ -701,6 +714,8 @@ impl Buffer { Buffer::Int16(v) => v.append_null(), Buffer::Int32(v) => v.append_null(), Buffer::Int64(v) => v.append_null(), + #[cfg(feature = "dtype-i128")] + Buffer::Int128(v) => v.append_null(), #[cfg(feature = "dtype-u8")] Buffer::UInt8(v) => v.append_null(), #[cfg(feature = "dtype-u16")] @@ -745,6 +760,8 @@ impl Buffer { Buffer::Int16(_) => DataType::Int16, Buffer::Int32(_) => DataType::Int32, Buffer::Int64(_) => DataType::Int64, + #[cfg(feature = "dtype-i128")] + Buffer::Int128(_) => DataType::Int128, #[cfg(feature = "dtype-u8")] Buffer::UInt8(_) => DataType::UInt8, #[cfg(feature = "dtype-u16")] @@ -824,6 +841,15 @@ impl Buffer { missing_is_null, None, ), + #[cfg(feature = "dtype-i128")] + Int128(buf) => as ParsedBuffer>::parse_bytes( + buf, + bytes, + ignore_errors, + needs_escaping, + missing_is_null, + None, + ), #[cfg(feature = "dtype-u8")] UInt8(buf) => as ParsedBuffer>::parse_bytes( buf, diff --git a/crates/polars-io/src/csv/write/write_impl/serializer.rs b/crates/polars-io/src/csv/write/write_impl/serializer.rs index 6a4f964d88b3..5229455022cc 100644 --- a/crates/polars-io/src/csv/write/write_impl/serializer.rs +++ b/crates/polars-io/src/csv/write/write_impl/serializer.rs @@ -535,6 +535,7 @@ pub(super) fn serializer_for<'a>( DataType::UInt32 => quote_if_always!(integer_serializer::), DataType::Int64 => quote_if_always!(integer_serializer::), DataType::UInt64 => quote_if_always!(integer_serializer::), + DataType::Int128 => quote_if_always!(integer_serializer::), DataType::Float32 => match options.float_precision { Some(precision) => match options.float_scientific { Some(true) => { diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index eab4dd534314..25a1bcbb0489 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -330,6 +330,7 @@ dtype-i16 = [ ] dtype-i128 = [ "polars-core/dtype-i128", + "polars-io/dtype-i128", "polars-lazy?/dtype-i128", "polars-ops/dtype-i128", "polars-time?/dtype-i128", diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 43b05892d65d..8d258eb2ba5e 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -259,12 +259,12 @@ def test_csv_missing_utf8_is_empty_string() -> None: def test_csv_int_types() -> None: f = io.StringIO( - "u8,i8,u16,i16,u32,i32,u64,i64\n" - "0,0,0,0,0,0,0,0\n" - "0,-128,0,-32768,0,-2147483648,0,-9223372036854775808\n" - "255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807\n" - "01,01,01,01,01,01,01,01\n" - "01,-01,01,-01,01,-01,01,-01\n" + "u8,i8,u16,i16,u32,i32,u64,i64,i128\n" + "0,0,0,0,0,0,0,0,0\n" + "0,-128,0,-32768,0,-2147483648,0,-9223372036854775808,-170141183460469231731687303715884105728\n" + "255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807,170141183460469231731687303715884105727\n" + "01,01,01,01,01,01,01,01,01\n" + "01,-01,01,-01,01,-01,01,-01,01\n" ) df = pl.read_csv( f, @@ -277,6 +277,7 @@ def test_csv_int_types() -> None: "i32": pl.Int32, "u64": pl.UInt64, "i64": pl.Int64, + "i128": pl.Int128, }, ) @@ -295,6 +296,16 @@ def test_csv_int_types() -> None: [0, -9223372036854775808, 9223372036854775807, 1, -1], dtype=pl.Int64, ), + "i128": pl.Series( + [ + 0, + -170141183460469231731687303715884105728, + 170141183460469231731687303715884105727, + 1, + 1, + ], + dtype=pl.Int128, + ), } ), ) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 84e6436cb10e..bdd7a47b38fe 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -95,7 +95,8 @@ def test_select_columns_from_buffer(stream: bool) -> None: "a": [1], "b": [2], "c": [3], - } + }, + schema={"a": pl.Int64(), "b": pl.Int128(), "c": pl.UInt8()}, ) f = io.BytesIO() @@ -109,7 +110,8 @@ def test_select_columns_from_buffer(stream: bool) -> None: "b": [2], "c": [3], "a": [1], - } + }, + schema={"b": pl.Int128(), "c": pl.UInt8(), "a": pl.Int64()}, ) assert_frame_equal(expected, actual) @@ -142,14 +144,33 @@ def test_compressed_simple(compression: IpcCompression, stream: bool) -> None: @pytest.mark.parametrize("compression", COMPRESSIONS) def test_ipc_schema(compression: IpcCompression) -> None: - df = pl.DataFrame({"a": [1, 2], "b": ["a", None], "c": [True, False]}) + schema = { + "i64": pl.Int64(), + "i128": pl.Int128(), + "u8": pl.UInt8(), + "f32": pl.Float32(), + "f64": pl.Float64(), + "str": pl.String(), + "bool": pl.Boolean(), + } + df = pl.DataFrame( + { + "i64": [1, 2], + "i128": [1, 2], + "u8": [1, 2], + "f32": [1, 2], + "f64": [1, 2], + "str": ["a", None], + "bool": [True, False], + }, + schema=schema, + ) f = io.BytesIO() df.write_ipc(f, compression=compression) f.seek(0) - expected = {"a": pl.Int64(), "b": pl.String(), "c": pl.Boolean()} - assert pl.read_ipc_schema(f) == expected + assert pl.read_ipc_schema(f) == schema @pytest.mark.write_disk