Skip to content

Commit

Permalink
feat(rust): Impl serde for array dtype (#13168)
Browse files Browse the repository at this point in the history
  • Loading branch information
reswqa authored Dec 27, 2023
1 parent 4304ffe commit 01217d2
Show file tree
Hide file tree
Showing 12 changed files with 261 additions and 64 deletions.
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ impl NullArray {
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to [`crate::datatypes::PhysicalType::Null`].
pub fn try_new(data_type: ArrowDataType, length: usize) -> PolarsResult<Self> {
if data_type.to_physical_type() != PhysicalType::Null {
polars_bail!(ComputeError: "NullArray can only be initialized with a DataType whose physical type is Boolean");
polars_bail!(ComputeError: "NullArray can only be initialized with a DataType whose physical type is Null");
}

Ok(Self { data_type, length })
Expand Down
33 changes: 30 additions & 3 deletions crates/polars-arrow/src/legacy/array/fixed_size_list.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use polars_error::PolarsResult;

use crate::array::{ArrayRef, FixedSizeListArray};
use crate::array::{ArrayRef, FixedSizeListArray, NullArray};
use crate::bitmap::MutableBitmap;
use crate::datatypes::ArrowDataType;
use crate::legacy::array::{convert_inner_type, is_nested_null};
use crate::legacy::kernels::concatenate::concatenate_owned_unchecked;

#[derive(Default)]
Expand Down Expand Up @@ -34,6 +35,8 @@ impl AnonymousBuilder {
}

pub fn push_null(&mut self) {
self.arrays
.push(NullArray::new(ArrowDataType::Null, self.width).boxed());
match &mut self.validity {
Some(validity) => validity.push(false),
None => self.init_validity(),
Expand All @@ -48,8 +51,32 @@ impl AnonymousBuilder {
}

pub fn finish(self, inner_dtype: Option<&ArrowDataType>) -> PolarsResult<FixedSizeListArray> {
let values = concatenate_owned_unchecked(&self.arrays)?;
let inner_dtype = inner_dtype.unwrap_or_else(|| self.arrays[0].data_type());
let mut inner_dtype = inner_dtype.unwrap_or_else(|| self.arrays[0].data_type());

if is_nested_null(inner_dtype) {
for arr in &self.arrays {
if !is_nested_null(arr.data_type()) {
inner_dtype = arr.data_type();
break;
}
}
};

// convert nested null arrays to the correct dtype.
let arrays = self
.arrays
.iter()
.map(|arr| {
if is_nested_null(arr.data_type()) {
convert_inner_type(&**arr, inner_dtype)
} else {
arr.to_boxed()
}
})
.collect::<Vec<_>>();

let values = concatenate_owned_unchecked(&arrays)?;

let data_type = FixedSizeListArray::default_datatype(inner_dtype.clone(), self.width);
Ok(FixedSizeListArray::new(
data_type,
Expand Down
44 changes: 2 additions & 42 deletions crates/polars-arrow/src/legacy/array/list.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use polars_error::PolarsResult;

use crate::array::{new_null_array, Array, ArrayRef, ListArray, NullArray, StructArray};
use crate::array::{new_null_array, Array, ArrayRef, ListArray, NullArray};
use crate::bitmap::MutableBitmap;
use crate::compute::concatenate;
use crate::datatypes::ArrowDataType;
use crate::legacy::array::is_nested_null;
use crate::legacy::kernels::concatenate::concatenate_owned_unchecked;
use crate::legacy::prelude::*;
use crate::offset::Offsets;
Expand Down Expand Up @@ -162,44 +163,3 @@ impl<'a> AnonymousBuilder<'a> {
))
}
}

fn is_nested_null(data_type: &ArrowDataType) -> bool {
match data_type {
ArrowDataType::Null => true,
ArrowDataType::LargeList(field) => is_nested_null(field.data_type()),
ArrowDataType::Struct(fields) => {
fields.iter().all(|field| is_nested_null(field.data_type()))
},
_ => false,
}
}

/// Cast null arrays to inner type and ensure that all offsets remain correct
pub fn convert_inner_type(array: &dyn Array, dtype: &ArrowDataType) -> Box<dyn Array> {
match dtype {
ArrowDataType::LargeList(field) => {
let array = array.as_any().downcast_ref::<LargeListArray>().unwrap();
let inner = array.values();
let new_values = convert_inner_type(inner.as_ref(), field.data_type());
let dtype = LargeListArray::default_datatype(new_values.data_type().clone());
LargeListArray::new(
dtype,
array.offsets().clone(),
new_values,
array.validity().cloned(),
)
.boxed()
},
ArrowDataType::Struct(fields) => {
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
let inner = array.values();
let new_values = inner
.iter()
.zip(fields)
.map(|(arr, field)| convert_inner_type(arr.as_ref(), field.data_type()))
.collect::<Vec<_>>();
StructArray::new(dtype.clone(), new_values, array.validity().cloned()).boxed()
},
_ => new_null_array(dtype.clone(), array.len()),
}
}
57 changes: 56 additions & 1 deletion crates/polars-arrow/src/legacy/array/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use crate::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use crate::array::{
new_null_array, Array, BinaryArray, BooleanArray, FixedSizeListArray, ListArray,
PrimitiveArray, StructArray, Utf8Array,
};
use crate::bitmap::MutableBitmap;
use crate::datatypes::ArrowDataType;
use crate::legacy::utils::CustomIterTools;
Expand All @@ -16,6 +19,8 @@ pub mod utf8;

pub use slice::*;

use crate::legacy::prelude::LargeListArray;

macro_rules! iter_to_values {
($iterator:expr, $validity:expr, $offsets:expr, $length_so_far:expr) => {{
$iterator
Expand Down Expand Up @@ -206,3 +211,53 @@ pub trait PolarsArray: Array {
}

impl<A: Array + ?Sized> PolarsArray for A {}

fn is_nested_null(data_type: &ArrowDataType) -> bool {
match data_type {
ArrowDataType::Null => true,
ArrowDataType::LargeList(field) => is_nested_null(field.data_type()),
ArrowDataType::FixedSizeList(field, _) => is_nested_null(field.data_type()),
ArrowDataType::Struct(fields) => {
fields.iter().all(|field| is_nested_null(field.data_type()))
},
_ => false,
}
}

/// Cast null arrays to inner type and ensure that all offsets remain correct
pub fn convert_inner_type(array: &dyn Array, dtype: &ArrowDataType) -> Box<dyn Array> {
match dtype {
ArrowDataType::LargeList(field) => {
let array = array.as_any().downcast_ref::<LargeListArray>().unwrap();
let inner = array.values();
let new_values = convert_inner_type(inner.as_ref(), field.data_type());
let dtype = LargeListArray::default_datatype(new_values.data_type().clone());
LargeListArray::new(
dtype,
array.offsets().clone(),
new_values,
array.validity().cloned(),
)
.boxed()
},
ArrowDataType::FixedSizeList(field, width) => {
let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
let inner = array.values();
let new_values = convert_inner_type(inner.as_ref(), field.data_type());
let dtype =
FixedSizeListArray::default_datatype(new_values.data_type().clone(), *width);
FixedSizeListArray::new(dtype, new_values, array.validity().cloned()).boxed()
},
ArrowDataType::Struct(fields) => {
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
let inner = array.values();
let new_values = inner
.iter()
.zip(fields)
.map(|(arr, field)| convert_inner_type(arr.as_ref(), field.data_type()))
.collect::<Vec<_>>();
StructArray::new(dtype.clone(), new_values, array.validity().cloned()).boxed()
},
_ => new_null_array(dtype.clone(), array.len()),
}
}
25 changes: 22 additions & 3 deletions crates/polars-core/src/chunked_array/builder/fixed_size_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,25 @@ pub(crate) struct FixedSizeListNumericBuilder<T: NativeType> {
inner: Option<MutableFixedSizeListArray<MutablePrimitiveArray<T>>>,
width: usize,
name: SmartString,
logical_dtype: DataType,
}

impl<T: NativeType> FixedSizeListNumericBuilder<T> {
pub(crate) fn new(name: &str, width: usize, capacity: usize) -> Self {
/// SAFETY
/// The caller must ensure that the physical numerical type match logical type.
pub(crate) unsafe fn new(
name: &str,
width: usize,
capacity: usize,
logical_dtype: DataType,
) -> Self {
let mp = MutablePrimitiveArray::<T>::with_capacity(capacity * width);
let inner = Some(MutableFixedSizeListArray::new(mp, width));
Self {
inner,
width,
name: name.into(),
logical_dtype,
}
}
}
Expand Down Expand Up @@ -68,7 +77,14 @@ impl<T: NativeType> FixedSizeListBuilder for FixedSizeListNumericBuilder<T> {

fn finish(&mut self) -> ArrayChunked {
let arr: FixedSizeListArray = self.inner.take().unwrap().into();
ChunkedArray::with_chunk(self.name.as_str(), arr)
// SAFETY: physical type matches the logical
unsafe {
ChunkedArray::from_chunks_and_dtype(
self.name.as_str(),
vec![Box::new(arr)],
DataType::Array(Box::new(self.logical_dtype.clone()), self.width),
)
}
}
}

Expand Down Expand Up @@ -124,7 +140,10 @@ pub(crate) fn get_fixed_size_list_builder(

let builder = if phys_dtype.is_numeric() {
with_match_physical_numeric_type!(phys_dtype, |$T| {
Box::new(FixedSizeListNumericBuilder::<$T>::new(name, width, capacity)) as Box<dyn FixedSizeListBuilder>
// SAFETY: physical type match logical type
unsafe {
Box::new(FixedSizeListNumericBuilder::<$T>::new(name, width, capacity,inner_type_logical.clone())) as Box<dyn FixedSizeListBuilder>
}
})
} else {
Box::new(AnonymousOwnedFixedSizeListBuilder::new(
Expand Down
6 changes: 6 additions & 0 deletions crates/polars-core/src/datatypes/_serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ pub enum SerializableDataType {
/// A 64-bit time representing elapsed time since midnight in the given TimeUnit.
Time,
List(Box<SerializableDataType>),
#[cfg(feature = "dtype-array")]
Array(Box<SerializableDataType>, usize),
Null,
#[cfg(feature = "dtype-struct")]
Struct(Vec<Field>),
Expand Down Expand Up @@ -86,6 +88,8 @@ impl From<&DataType> for SerializableDataType {
Duration(tu) => Self::Duration(*tu),
Time => Self::Time,
List(dt) => Self::List(Box::new(dt.as_ref().into())),
#[cfg(feature = "dtype-array")]
Array(dt, width) => Self::Array(Box::new(dt.as_ref().into()), *width),
Null => Self::Null,
Unknown => Self::Unknown,
#[cfg(feature = "dtype-struct")]
Expand Down Expand Up @@ -120,6 +124,8 @@ impl From<SerializableDataType> for DataType {
Duration(tu) => Self::Duration(tu),
Time => Self::Time,
List(dt) => Self::List(Box::new((*dt).into())),
#[cfg(feature = "dtype-array")]
Array(dt, width) => Self::Array(Box::new((*dt).into()), width),
Null => Self::Null,
Unknown => Self::Unknown,
#[cfg(feature = "dtype-struct")]
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-core/src/datatypes/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ impl DataType {
#[cfg(feature = "dtype-categorical")]
Categorical(_, _) => UInt32,
List(dt) => List(Box::new(dt.to_physical())),
#[cfg(feature = "dtype-array")]
Array(dt, width) => Array(Box::new(dt.to_physical()), *width),
#[cfg(feature = "dtype-struct")]
Struct(fields) => {
let new_fields = fields
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-core/src/serde/chunked_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ impl_serialize!(StringChunked);
impl_serialize!(BooleanChunked);
impl_serialize!(ListChunked);
impl_serialize!(BinaryChunked);
#[cfg(feature = "dtype-array")]
impl_serialize!(ArrayChunked);

#[cfg(feature = "dtype-categorical")]
impl Serialize for CategoricalChunked {
Expand Down
34 changes: 34 additions & 0 deletions crates/polars-core/src/serde/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ use std::fmt::Formatter;
use serde::de::{MapAccess, Visitor};
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};

#[cfg(feature = "dtype-array")]
use crate::chunked_array::builder::get_fixed_size_list_builder;
use crate::chunked_array::builder::AnonymousListBuilder;
use crate::chunked_array::Settings;
use crate::prelude::*;
Expand All @@ -25,6 +27,11 @@ impl Serialize for Series {
let ca = self.list().unwrap();
ca.serialize(serializer)
},
#[cfg(feature = "dtype-array")]
DataType::Array(_, _) => {
let ca = self.array().unwrap();
ca.serialize(serializer)
},
DataType::Boolean => {
let ca = self.bool().unwrap();
ca.serialize(serializer)
Expand Down Expand Up @@ -213,6 +220,33 @@ impl<'de> Deserialize<'de> for Series {
}
Ok(lb.finish().into_series())
},
#[cfg(feature = "dtype-array")]
DataType::Array(inner, width) => {
let values: Vec<Option<Series>> = map.next_value()?;
let mut builder =
get_fixed_size_list_builder(&inner, values.len(), width, &name)
.map_err(|e| {
de::Error::custom(format!(
"could not get supported list builder: {e}"
))
})?;
for value in &values {
if let Some(s) = value {
// we only have one chunk per series as we serialize it in this way.
let arr = &s.chunks()[0];
// safety, we are within bounds
unsafe {
builder.push_unchecked(arr.as_ref(), 0);
}
} else {
// safety, we are within bounds
unsafe {
builder.push_null();
}
}
}
Ok(builder.finish().into_series())
},
DataType::Binary => {
let values: Vec<Option<Cow<[u8]>>> = map.next_value()?;
Ok(Series::new(&name, values))
Expand Down
37 changes: 24 additions & 13 deletions py-polars/src/series/construction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,19 +258,30 @@ impl PySeries {
Ok(series.into())
} else {
let val = vec_extract_wrapped(val);
let series = Series::new(name, &val);
match series.dtype() {
DataType::List(list_inner) => {
let series = series
.cast(&DataType::Array(
Box::new(inner.map(|dt| dt.0).unwrap_or(*list_inner.clone())),
width,
))
.map_err(PyPolarsErr::from)?;
Ok(series.into())
},
_ => Err(PyValueError::new_err("could not create Array from input")),
}
return if let Some(inner) = inner {
let series = Series::from_any_values_and_dtype(
name,
val.as_ref(),
&DataType::Array(Box::new(inner.0), width),
true,
)
.map_err(PyPolarsErr::from)?;
Ok(series.into())
} else {
let series = Series::new(name, &val);
match series.dtype() {
DataType::List(list_inner) => {
let series = series
.cast(&DataType::Array(
Box::new(inner.map(|dt| dt.0).unwrap_or(*list_inner.clone())),
width,
))
.map_err(PyPolarsErr::from)?;
Ok(series.into())
},
_ => Err(PyValueError::new_err("could not create Array from input")),
}
};
}
}

Expand Down
Loading

0 comments on commit 01217d2

Please sign in to comment.