Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add Int128 IO support for csv & ipc #20535

Merged
merged 1 commit into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions crates/polars-arrow/src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ unsafe impl DictionaryKey for i64 {
const KEY_TYPE: IntegerType = IntegerType::Int64;
const MAX_USIZE_VALUE: usize = i64::MAX as usize;
}
unsafe impl DictionaryKey for i128 {
const KEY_TYPE: IntegerType = IntegerType::Int128;
const MAX_USIZE_VALUE: usize = i128::MAX as usize;
}
unsafe impl DictionaryKey for u8 {
const KEY_TYPE: IntegerType = IntegerType::UInt8;
const MAX_USIZE_VALUE: usize = u8::MAX as usize;
Expand Down
1 change: 1 addition & 0 deletions crates/polars-arrow/src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,7 @@ impl From<IntegerType> for ArrowDataType {
IntegerType::Int16 => ArrowDataType::Int16,
IntegerType::Int32 => ArrowDataType::Int32,
IntegerType::Int64 => ArrowDataType::Int64,
IntegerType::Int128 => ArrowDataType::Int128,
IntegerType::UInt8 => ArrowDataType::UInt8,
IntegerType::UInt16 => ArrowDataType::UInt16,
IntegerType::UInt32 => ArrowDataType::UInt32,
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-arrow/src/datatypes/physical_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ pub enum IntegerType {
Int32,
/// A signed 64-bit integer.
Int64,
/// A signed 128-bit integer.
Int128,
/// An unsigned 8-bit integer.
UInt8,
/// An unsigned 16-bit integer.
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-arrow/src/io/ipc/read/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ fn deserialize_integer(int: arrow_format::ipc::IntRef) -> PolarsResult<IntegerTy
(32, false) => IntegerType::UInt32,
(64, true) => IntegerType::Int64,
(64, false) => IntegerType::UInt64,
_ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32 or 64."),
(128, true) => IntegerType::Int128,
_ => polars_bail!(oos = "IPC: indexType can only be 8, 16, 32, 64 or 128."),
})
}

Expand Down
3 changes: 2 additions & 1 deletion crates/polars-arrow/src/io/ipc/write/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ pub(crate) fn serialize_dictionary(
) -> arrow_format::ipc::DictionaryEncoding {
use IntegerType::*;
let is_signed = match index_type {
Int8 | Int16 | Int32 | Int64 => true,
Int8 | Int16 | Int32 | Int64 | Int128 => true,
UInt8 | UInt16 | UInt32 | UInt64 => false,
};

Expand All @@ -336,6 +336,7 @@ pub(crate) fn serialize_dictionary(
Int16 | UInt16 => 16,
Int32 | UInt32 => 32,
Int64 | UInt64 => 64,
Int128 => 128,
};

let index_type = arrow_format::ipc::Int {
Expand Down
1 change: 1 addition & 0 deletions crates/polars-arrow/src/util/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ macro_rules! match_integer_type {(
Int16 => __with_ty__! { i16 },
Int32 => __with_ty__! { i32 },
Int64 => __with_ty__! { i64 },
Int128 => __with_ty__! { i128 },
UInt8 => __with_ty__! { u8 },
UInt16 => __with_ty__! { u16 },
UInt32 => __with_ty__! { u32 },
Expand Down
1 change: 1 addition & 0 deletions crates/polars-compute/src/comparisons/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ macro_rules! compare {
PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
Expand Down
1 change: 1 addition & 0 deletions crates/polars-compute/src/comparisons/dyn_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ macro_rules! compare {
PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>, lhs, rhs, $op),
PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>, lhs, rhs, $op),
PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>, lhs, rhs, $op),
PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>, lhs, rhs, $op),
PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>, lhs, rhs, $op),
PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>, lhs, rhs, $op),
PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>, lhs, rhs, $op),
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-compute/src/comparisons/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ macro_rules! compare {
PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
Expand Down Expand Up @@ -196,6 +197,7 @@ macro_rules! compare_broadcast {
PH::Dictionary(I::Int16) => call_binary!(DictionaryArray<i16>),
PH::Dictionary(I::Int32) => call_binary!(DictionaryArray<i32>),
PH::Dictionary(I::Int64) => call_binary!(DictionaryArray<i64>),
PH::Dictionary(I::Int128) => call_binary!(DictionaryArray<i128>),
PH::Dictionary(I::UInt8) => call_binary!(DictionaryArray<u8>),
PH::Dictionary(I::UInt16) => call_binary!(DictionaryArray<u16>),
PH::Dictionary(I::UInt32) => call_binary!(DictionaryArray<u32>),
Expand Down
1 change: 1 addition & 0 deletions crates/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ dtype-u8 = ["polars-core/dtype-u8"]
dtype-u16 = ["polars-core/dtype-u16"]
dtype-i8 = ["polars-core/dtype-i8"]
dtype-i16 = ["polars-core/dtype-i16"]
dtype-i128 = ["polars-core/dtype-i128"]
dtype-categorical = ["polars-core/dtype-categorical"]
dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"]
object = ["polars-core/object"]
Expand Down
26 changes: 26 additions & 0 deletions crates/polars-io/src/csv/read/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ impl PrimitiveParser for Int64Type {
atoi_simd::parse_skipped(bytes).ok()
}
}
#[cfg(feature = "dtype-i128")]
impl PrimitiveParser for Int128Type {
#[inline]
fn parse(bytes: &[u8]) -> Option<i128> {
atoi_simd::parse_skipped(bytes).ok()
}
}

trait ParsedBuffer {
fn parse_bytes(
Expand Down Expand Up @@ -522,6 +529,8 @@ pub fn init_buffers(
&DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
&DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
&DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
#[cfg(feature = "dtype-i128")]
&DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)),
#[cfg(feature = "dtype-u8")]
&DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -594,6 +603,8 @@ pub enum Buffer {
Int16(PrimitiveChunkedBuilder<Int16Type>),
Int32(PrimitiveChunkedBuilder<Int32Type>),
Int64(PrimitiveChunkedBuilder<Int64Type>),
#[cfg(feature = "dtype-i128")]
Int128(PrimitiveChunkedBuilder<Int128Type>),
#[cfg(feature = "dtype-u8")]
UInt8(PrimitiveChunkedBuilder<UInt8Type>),
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -628,6 +639,8 @@ impl Buffer {
Buffer::Int16(v) => v.finish().into_series(),
Buffer::Int32(v) => v.finish().into_series(),
Buffer::Int64(v) => v.finish().into_series(),
#[cfg(feature = "dtype-i128")]
Buffer::Int128(v) => v.finish().into_series(),
#[cfg(feature = "dtype-u8")]
Buffer::UInt8(v) => v.finish().into_series(),
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -701,6 +714,8 @@ impl Buffer {
Buffer::Int16(v) => v.append_null(),
Buffer::Int32(v) => v.append_null(),
Buffer::Int64(v) => v.append_null(),
#[cfg(feature = "dtype-i128")]
Buffer::Int128(v) => v.append_null(),
#[cfg(feature = "dtype-u8")]
Buffer::UInt8(v) => v.append_null(),
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -745,6 +760,8 @@ impl Buffer {
Buffer::Int16(_) => DataType::Int16,
Buffer::Int32(_) => DataType::Int32,
Buffer::Int64(_) => DataType::Int64,
#[cfg(feature = "dtype-i128")]
Buffer::Int128(_) => DataType::Int128,
#[cfg(feature = "dtype-u8")]
Buffer::UInt8(_) => DataType::UInt8,
#[cfg(feature = "dtype-u16")]
Expand Down Expand Up @@ -824,6 +841,15 @@ impl Buffer {
missing_is_null,
None,
),
#[cfg(feature = "dtype-i128")]
Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
#[cfg(feature = "dtype-u8")]
UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuffer>::parse_bytes(
buf,
Expand Down
1 change: 1 addition & 0 deletions crates/polars-io/src/csv/write/write_impl/serializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@ pub(super) fn serializer_for<'a>(
DataType::UInt32 => quote_if_always!(integer_serializer::<u32>),
DataType::Int64 => quote_if_always!(integer_serializer::<i64>),
DataType::UInt64 => quote_if_always!(integer_serializer::<u64>),
DataType::Int128 => quote_if_always!(integer_serializer::<i128>),
DataType::Float32 => match options.float_precision {
Some(precision) => match options.float_scientific {
Some(true) => {
Expand Down
1 change: 1 addition & 0 deletions crates/polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ dtype-i16 = [
]
dtype-i128 = [
"polars-core/dtype-i128",
"polars-io/dtype-i128",
"polars-lazy?/dtype-i128",
"polars-ops/dtype-i128",
"polars-time?/dtype-i128",
Expand Down
23 changes: 17 additions & 6 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,12 +259,12 @@ def test_csv_missing_utf8_is_empty_string() -> None:

def test_csv_int_types() -> None:
f = io.StringIO(
"u8,i8,u16,i16,u32,i32,u64,i64\n"
"0,0,0,0,0,0,0,0\n"
"0,-128,0,-32768,0,-2147483648,0,-9223372036854775808\n"
"255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807\n"
"01,01,01,01,01,01,01,01\n"
"01,-01,01,-01,01,-01,01,-01\n"
"u8,i8,u16,i16,u32,i32,u64,i64,i128\n"
"0,0,0,0,0,0,0,0,0\n"
"0,-128,0,-32768,0,-2147483648,0,-9223372036854775808,-170141183460469231731687303715884105728\n"
"255,127,65535,32767,4294967295,2147483647,18446744073709551615,9223372036854775807,170141183460469231731687303715884105727\n"
"01,01,01,01,01,01,01,01,01\n"
"01,-01,01,-01,01,-01,01,-01,01\n"
)
df = pl.read_csv(
f,
Expand All @@ -277,6 +277,7 @@ def test_csv_int_types() -> None:
"i32": pl.Int32,
"u64": pl.UInt64,
"i64": pl.Int64,
"i128": pl.Int128,
},
)

Expand All @@ -295,6 +296,16 @@ def test_csv_int_types() -> None:
[0, -9223372036854775808, 9223372036854775807, 1, -1],
dtype=pl.Int64,
),
"i128": pl.Series(
[
0,
-170141183460469231731687303715884105728,
170141183460469231731687303715884105727,
1,
1,
],
dtype=pl.Int128,
),
}
),
)
Expand Down
31 changes: 26 additions & 5 deletions py-polars/tests/unit/io/test_ipc.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ def test_select_columns_from_buffer(stream: bool) -> None:
"a": [1],
"b": [2],
"c": [3],
}
},
schema={"a": pl.Int64(), "b": pl.Int128(), "c": pl.UInt8()},
)

f = io.BytesIO()
Expand All @@ -109,7 +110,8 @@ def test_select_columns_from_buffer(stream: bool) -> None:
"b": [2],
"c": [3],
"a": [1],
}
},
schema={"b": pl.Int128(), "c": pl.UInt8(), "a": pl.Int64()},
)
assert_frame_equal(expected, actual)

Expand Down Expand Up @@ -142,14 +144,33 @@ def test_compressed_simple(compression: IpcCompression, stream: bool) -> None:

@pytest.mark.parametrize("compression", COMPRESSIONS)
def test_ipc_schema(compression: IpcCompression) -> None:
df = pl.DataFrame({"a": [1, 2], "b": ["a", None], "c": [True, False]})
schema = {
"i64": pl.Int64(),
"i128": pl.Int128(),
"u8": pl.UInt8(),
"f32": pl.Float32(),
"f64": pl.Float64(),
"str": pl.String(),
"bool": pl.Boolean(),
}
df = pl.DataFrame(
{
"i64": [1, 2],
"i128": [1, 2],
"u8": [1, 2],
"f32": [1, 2],
"f64": [1, 2],
"str": ["a", None],
"bool": [True, False],
},
schema=schema,
)

f = io.BytesIO()
df.write_ipc(f, compression=compression)
f.seek(0)

expected = {"a": pl.Int64(), "b": pl.String(), "c": pl.Boolean()}
assert pl.read_ipc_schema(f) == expected
assert pl.read_ipc_schema(f) == schema


@pytest.mark.write_disk
Expand Down
Loading