Skip to content

Commit

Permalink
feat: Allow for zero-width fixed size lists
Browse files Browse the repository at this point in the history
This PR properly implements `polars.Array(..., 0)`. This does make a few
fundamental changes to how `FixedSizeListArray` works internally. Mainly, it
now keeps a `length` field that actually determines the length instead of
relying the `values.len() / size`.

Fixes #18921, #16878.
  • Loading branch information
coastalwhite committed Sep 26, 2024
1 parent 71a8b05 commit 0fa59f9
Show file tree
Hide file tree
Showing 32 changed files with 674 additions and 217 deletions.
2 changes: 2 additions & 0 deletions crates/polars-arrow/src/array/fixed_size_list/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ impl Arrow2Arrow for FixedSizeListArray {

fn from_data(data: &ArrayData) -> Self {
let dtype: ArrowDataType = data.data_type().clone().into();
let length = data.len() - data.offset();
let size = match dtype {
ArrowDataType::FixedSizeList(_, size) => size,
_ => unreachable!("must be FixedSizeList type"),
Expand All @@ -28,6 +29,7 @@ impl Arrow2Arrow for FixedSizeListArray {

Self {
size,
length,
dtype,
values,
validity: data.nulls().map(|n| Bitmap::from_null_buffer(n.clone())),
Expand Down
14 changes: 11 additions & 3 deletions crates/polars-arrow/src/array/fixed_size_list/ffi.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use polars_error::PolarsResult;
use polars_error::{polars_ensure, PolarsResult};

use super::FixedSizeListArray;
use crate::array::ffi::{FromFfi, ToFfi};
Expand Down Expand Up @@ -31,11 +31,19 @@ unsafe impl ToFfi for FixedSizeListArray {
impl<A: ffi::ArrowArrayRef> FromFfi<A> for FixedSizeListArray {
unsafe fn try_from_ffi(array: A) -> PolarsResult<Self> {
let dtype = array.dtype().clone();
let (_, width) = FixedSizeListArray::try_child_and_size(&dtype)?;
let validity = unsafe { array.validity() }?;
let child = unsafe { array.child(0)? };
let child = unsafe { array.child(0) }?;
let values = ffi::try_from(child)?;

let mut fsl = Self::try_new(dtype, values, validity)?;
let length = if values.len() == 0 {
0
} else {
polars_ensure!(width > 0, InvalidOperation: "Zero-width array with values");
values.len() / width
};

let mut fsl = Self::try_new(dtype, length, values, validity)?;
fsl.slice(array.offset(), array.length());
Ok(fsl)
}
Expand Down
70 changes: 49 additions & 21 deletions crates/polars-arrow/src/array/fixed_size_list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@ mod iterator;

mod mutable;
pub use mutable::*;
use polars_error::{polars_bail, PolarsResult};
use polars_error::{polars_bail, polars_ensure, PolarsResult};
use polars_utils::pl_str::PlSmallStr;

/// The Arrow's equivalent to an immutable `Vec<Option<[T; size]>>` where `T` is an Arrow type.
/// Cloning and slicing this struct is `O(1)`.
#[derive(Clone)]
pub struct FixedSizeListArray {
size: usize, // this is redundant with `dtype`, but useful to not have to deconstruct the dtype.
length: usize, // invariant: this is values.len() / size if size > 0
dtype: ArrowDataType,
values: Box<dyn Array>,
validity: Option<Bitmap>,
Expand All @@ -34,6 +35,7 @@ impl FixedSizeListArray {
/// * the validity's length is not equal to `values.len() / size`.
pub fn try_new(
dtype: ArrowDataType,
length: usize,
values: Box<dyn Array>,
validity: Option<Bitmap>,
) -> PolarsResult<Self> {
Expand All @@ -45,34 +47,61 @@ impl FixedSizeListArray {
polars_bail!(ComputeError: "FixedSizeListArray's child's DataType must match. However, the expected DataType is {child_dtype:?} while it got {values_dtype:?}.")
}

if values.len() % size != 0 {
polars_bail!(ComputeError:
"values (of len {}) must be a multiple of size ({}) in FixedSizeListArray.",
values.len(),
size
)
}
let len = values.len() / size;
polars_ensure!(size == 0 || values.len() % size == 0, ComputeError:
"values (of len {}) must be a multiple of size ({}) in FixedSizeListArray.",
values.len(),
size
);

polars_ensure!(size == 0 || values.len() / size == length, ComputeError:
"length of values ({}) is not equal to given length ({}) in FixedSizeListArray({size}).",
values.len() / size,
length,
);
polars_ensure!(size != 0 || values.len() == 0, ComputeError:
"zero width FixedSizeListArray has values (length = {}).",
values.len(),
);

if validity
.as_ref()
.map_or(false, |validity| validity.len() != len)
.map_or(false, |validity| validity.len() != length)
{
polars_bail!(ComputeError: "validity mask length must be equal to the number of values divided by size")
}

Ok(Self {
size,
length,
dtype,
values,
validity,
})
}

#[inline]
fn has_invariants(&self) -> bool {
let has_valid_length = (self.size == 0 && self.values().len() == 0)
|| (self.size > 0
&& self.values().len() % self.size() == 0
&& self.values().len() / self.size() == self.length);
let has_valid_validity = self
.validity
.as_ref()
.map_or(true, |v| v.len() == self.length);

has_valid_length && has_valid_validity
}

/// Alias to `Self::try_new(...).unwrap()`
#[track_caller]
pub fn new(dtype: ArrowDataType, values: Box<dyn Array>, validity: Option<Bitmap>) -> Self {
Self::try_new(dtype, values, validity).unwrap()
pub fn new(
dtype: ArrowDataType,
length: usize,
values: Box<dyn Array>,
validity: Option<Bitmap>,
) -> Self {
Self::try_new(dtype, length, values, validity).unwrap()
}

/// Returns the size (number of elements per slot) of this [`FixedSizeListArray`].
Expand All @@ -83,15 +112,15 @@ impl FixedSizeListArray {
/// Returns a new empty [`FixedSizeListArray`].
pub fn new_empty(dtype: ArrowDataType) -> Self {
let values = new_empty_array(Self::get_child_and_size(&dtype).0.dtype().clone());
Self::new(dtype, values, None)
Self::new(dtype, 0, values, None)
}

/// Returns a new null [`FixedSizeListArray`].
pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
let (field, size) = Self::get_child_and_size(&dtype);

let values = new_null_array(field.dtype().clone(), length * size);
Self::new(dtype, values, Some(Bitmap::new_zeroed(length)))
Self::new(dtype, length, values, Some(Bitmap::new_zeroed(length)))
}
}

Expand Down Expand Up @@ -124,6 +153,7 @@ impl FixedSizeListArray {
.filter(|bitmap| bitmap.unset_bits() > 0);
self.values
.slice_unchecked(offset * self.size, length * self.size);
self.length = length;
}

impl_sliced!();
Expand All @@ -136,7 +166,8 @@ impl FixedSizeListArray {
/// Returns the length of this array
#[inline]
pub fn len(&self) -> usize {
self.values.len() / self.size
debug_assert!(self.has_invariants());
self.length
}

/// The optional validity.
Expand Down Expand Up @@ -184,12 +215,7 @@ impl FixedSizeListArray {
impl FixedSizeListArray {
pub(crate) fn try_child_and_size(dtype: &ArrowDataType) -> PolarsResult<(&Field, usize)> {
match dtype.to_logical_type() {
ArrowDataType::FixedSizeList(child, size) => {
if *size == 0 {
polars_bail!(ComputeError: "FixedSizeBinaryArray expects a positive size")
}
Ok((child.as_ref(), *size))
},
ArrowDataType::FixedSizeList(child, size) => Ok((child.as_ref(), *size)),
_ => polars_bail!(ComputeError: "FixedSizeListArray expects DataType::FixedSizeList"),
}
}
Expand Down Expand Up @@ -233,12 +259,14 @@ impl Splitable for FixedSizeListArray {
(
Self {
dtype: self.dtype.clone(),
length: offset,
values: lhs_values,
validity: lhs_validity,
size,
},
Self {
dtype: self.dtype.clone(),
length: self.length - offset,
values: rhs_values,
validity: rhs_validity,
size,
Expand Down
44 changes: 41 additions & 3 deletions crates/polars-arrow/src/array/fixed_size_list/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::datatypes::{ArrowDataType, Field};
pub struct MutableFixedSizeListArray<M: MutableArray> {
dtype: ArrowDataType,
size: usize,
length: usize,
values: M,
validity: Option<MutableBitmap>,
}
Expand All @@ -22,6 +23,7 @@ impl<M: MutableArray> From<MutableFixedSizeListArray<M>> for FixedSizeListArray
fn from(mut other: MutableFixedSizeListArray<M>) -> Self {
FixedSizeListArray::new(
other.dtype,
other.length,
other.values.as_box(),
other.validity.map(|x| x.into()),
)
Expand Down Expand Up @@ -53,20 +55,28 @@ impl<M: MutableArray> MutableFixedSizeListArray<M> {
};
Self {
size,
length: 0,
dtype,
values,
validity: None,
}
}

#[inline]
fn has_valid_invariants(&self) -> bool {
(self.size == 0 && self.values().len() == 0)
|| (self.size > 0 && self.values.len() / self.size == self.length)
}

/// Returns the size (number of elements per slot) of this [`FixedSizeListArray`].
pub const fn size(&self) -> usize {
self.size
}

/// The length of this array
pub fn len(&self) -> usize {
self.values.len() / self.size
debug_assert!(self.has_valid_invariants());
self.length
}

/// The inner values
Expand Down Expand Up @@ -98,6 +108,10 @@ impl<M: MutableArray> MutableFixedSizeListArray<M> {
if let Some(validity) = &mut self.validity {
validity.push(true)
}
self.length += 1;

debug_assert!(self.has_valid_invariants());

Ok(())
}

Expand All @@ -108,6 +122,9 @@ impl<M: MutableArray> MutableFixedSizeListArray<M> {
if let Some(validity) = &mut self.validity {
validity.push(true)
}
self.length += 1;

debug_assert!(self.has_valid_invariants());
}

#[inline]
Expand All @@ -117,6 +134,9 @@ impl<M: MutableArray> MutableFixedSizeListArray<M> {
Some(validity) => validity.push(false),
None => self.init_validity(),
}
self.length += 1;

debug_assert!(self.has_valid_invariants());
}

/// Reserves `additional` slots.
Expand All @@ -138,7 +158,8 @@ impl<M: MutableArray> MutableFixedSizeListArray<M> {

impl<M: MutableArray + 'static> MutableArray for MutableFixedSizeListArray<M> {
fn len(&self) -> usize {
self.values.len() / self.size
debug_assert!(self.has_valid_invariants());
self.length
}

fn validity(&self) -> Option<&MutableBitmap> {
Expand All @@ -148,6 +169,7 @@ impl<M: MutableArray + 'static> MutableArray for MutableFixedSizeListArray<M> {
fn as_box(&mut self) -> Box<dyn Array> {
FixedSizeListArray::new(
self.dtype.clone(),
self.length,
self.values.as_box(),
std::mem::take(&mut self.validity).map(|x| x.into()),
)
Expand All @@ -157,6 +179,7 @@ impl<M: MutableArray + 'static> MutableArray for MutableFixedSizeListArray<M> {
fn as_arc(&mut self) -> Arc<dyn Array> {
FixedSizeListArray::new(
self.dtype.clone(),
self.length,
self.values.as_box(),
std::mem::take(&mut self.validity).map(|x| x.into()),
)
Expand Down Expand Up @@ -185,6 +208,9 @@ impl<M: MutableArray + 'static> MutableArray for MutableFixedSizeListArray<M> {
} else {
self.init_validity()
}
self.length += 1;

debug_assert!(self.has_valid_invariants());
}

fn reserve(&mut self, additional: usize) {
Expand All @@ -206,6 +232,9 @@ where
for items in iter {
self.try_push(items)?;
}

debug_assert!(self.has_valid_invariants());

Ok(())
}
}
Expand All @@ -223,6 +252,9 @@ where
} else {
self.push_null();
}

debug_assert!(self.has_valid_invariants());

Ok(())
}
}
Expand All @@ -243,6 +275,8 @@ where
} else {
self.push_null();
}

debug_assert!(self.has_valid_invariants());
}
}

Expand All @@ -251,8 +285,12 @@ where
M: MutableArray + TryExtendFromSelf,
{
fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
self.length += other.len();
extend_validity(self.len(), &mut self.validity, &other.validity);

self.values.try_extend_from_self(&other.values)
self.values.try_extend_from_self(&other.values)?;
debug_assert!(self.has_valid_invariants());

Ok(())
}
}
Loading

0 comments on commit 0fa59f9

Please sign in to comment.