Skip to content

Commit

Permalink
fix: Don't trigger length check in array construction (#20205)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Dec 7, 2024
1 parent 958d1ef commit d4bab3f
Show file tree
Hide file tree
Showing 22 changed files with 139 additions and 99 deletions.
11 changes: 1 addition & 10 deletions crates/polars-core/src/chunked_array/from.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use polars_error::constants::LENGTH_LIMIT_MSG;

use super::*;

#[allow(clippy::all)]
Expand Down Expand Up @@ -176,14 +174,7 @@ where
})
.collect();

unsafe {
ChunkedArray::new_with_dims(
field,
chunks,
length.try_into().expect(LENGTH_LIMIT_MSG),
null_count as IdxSize,
)
}
unsafe { ChunkedArray::new_with_dims(field, chunks, length, null_count) }
}

/// Create a new [`ChunkedArray`] from existing chunks.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,15 @@ impl CategoricalChunked {
fn set_lengths(&mut self, other: &Self) {
let length_self = &mut self.physical_mut().length;
*length_self = length_self
.checked_add(other.len() as IdxSize)
.checked_add(other.len())
.expect(LENGTH_LIMIT_MSG);
self.physical_mut().null_count += other.null_count() as IdxSize;

assert!(
IdxSize::try_from(*length_self).is_ok(),
"{}",
LENGTH_LIMIT_MSG
);
self.physical_mut().null_count += other.null_count();
}

pub fn append(&mut self, other: &Self) -> PolarsResult<()> {
Expand Down
5 changes: 2 additions & 3 deletions crates/polars-core/src/chunked_array/logical/enum_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use arrow::array::UInt32Vec;
use polars_error::{polars_bail, polars_err, PolarsResult};
use polars_utils::aliases::{InitHashMaps, PlHashMap};
use polars_utils::pl_str::PlSmallStr;
use polars_utils::IdxSize;

use super::{CategoricalChunked, CategoricalOrdering, DataType, Field, RevMapping, UInt32Chunked};

Expand Down Expand Up @@ -81,8 +80,8 @@ impl EnumChunkedBuilder {

pub fn finish(self) -> CategoricalChunked {
let arr = self.enum_builder.freeze();
let null_count = arr.validity().map_or(0, |a| a.unset_bits()) as IdxSize;
let length = arr.len() as IdxSize;
let null_count = arr.validity().map_or(0, |a| a.unset_bits());
let length = arr.len();
let ca = unsafe {
UInt32Chunked::new_with_dims(
Arc::new(Field::new(self.name, DataType::UInt32)),
Expand Down
2 changes: 0 additions & 2 deletions crates/polars-core/src/chunked_array/logical/time.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ impl Int64Chunked {
})
.collect::<Vec<Box<dyn Array>>>();

let null_count = null_count as IdxSize;

debug_assert!(null_count >= self.null_count);

// @TODO: We throw away metadata here. That is mostly not needed.
Expand Down
61 changes: 40 additions & 21 deletions crates/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,8 @@ pub struct ChunkedArray<T: PolarsDataType> {
// combine with the interior mutability that IMMetadata provides.
pub(crate) md: Arc<IMMetadata<T>>,

length: IdxSize,
null_count: IdxSize,
length: usize,
null_count: usize,
}

impl<T: PolarsDataType> ChunkedArray<T>
Expand Down Expand Up @@ -186,6 +186,38 @@ impl<T: PolarsDataType> ChunkedArray<T> {
}
}

/// View this [`ChunkedArray`] as `&dyn Any` so callers can downcast it
/// back to a concrete `ChunkedArray<T>` at runtime (used by the decimal
/// unpacking path below).
pub(crate) fn as_any(&self) -> &dyn std::any::Any {
self
}

/// Series to [`ChunkedArray<T>`]
///
/// Borrows `series` as a `ChunkedArray<T>` whose dtype matches `self`.
///
/// For `Decimal` (behind the `dtype-decimal` feature) the series' physical
/// (integer) array is returned after an `Any` downcast; for every other
/// dtype the series is reinterpreted directly once the dtypes are verified
/// to be equal.
///
/// # Errors
/// Returns a `SchemaMismatch` error when the dtype of `series` does not
/// match the dtype of `self` (non-decimal path).
pub fn unpack_series_matching_type<'a>(
&self,
series: &'a Series,
) -> PolarsResult<&'a ChunkedArray<T>> {
match self.dtype() {
#[cfg(feature = "dtype-decimal")]
DataType::Decimal(_, _) => {
// Go through the logical decimal type's physical array; the
// physical representation is what `ChunkedArray<T>` stores.
// NOTE(review): the downcast `unwrap` presumes `T` is the
// decimal physical type whenever `self` is Decimal.
let logical = series.decimal()?;

let ca = logical.physical();
Ok(ca.as_any().downcast_ref::<ChunkedArray<T>>().unwrap())
},
dt => {
polars_ensure!(
dt == series.dtype(),
SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
series.dtype(),
dt,
);

// SAFETY:
// dtype will be correct.
Ok(unsafe { self.unpack_series_matching_physical_type(series) })
},
}
}

/// Create a new [`ChunkedArray`] and compute its `length` and `null_count`.
///
/// If you want to explicitly the `length` and `null_count`, look at
Expand All @@ -204,8 +236,8 @@ impl<T: PolarsDataType> ChunkedArray<T> {
pub unsafe fn new_with_dims(
field: Arc<Field>,
chunks: Vec<ArrayRef>,
length: IdxSize,
null_count: IdxSize,
length: usize,
null_count: usize,
) -> Self {
Self {
field,
Expand Down Expand Up @@ -487,7 +519,7 @@ impl<T: PolarsDataType> ChunkedArray<T> {
Self::new_with_dims(
self.field.clone(),
chunks,
(self.len() - self.null_count()) as IdxSize,
self.len() - self.null_count(),
0,
)
}
Expand Down Expand Up @@ -535,10 +567,10 @@ impl<T: PolarsDataType> ChunkedArray<T> {
///
/// This is unsafe as the dtype may be incorrect and
/// is assumed to be correct in other safe code.
pub(crate) unsafe fn unpack_series_matching_physical_type(
pub(crate) unsafe fn unpack_series_matching_physical_type<'a>(
&self,
series: &Series,
) -> &ChunkedArray<T> {
series: &'a Series,
) -> &'a ChunkedArray<T> {
let series_trait = &**series;
if self.dtype() == series.dtype() {
&*(series_trait as *const dyn SeriesTrait as *const ChunkedArray<T>)
Expand All @@ -557,19 +589,6 @@ impl<T: PolarsDataType> ChunkedArray<T> {
}
}

/// Series to [`ChunkedArray<T>`]
///
/// Borrows `series` as a `ChunkedArray<T>` after verifying that its dtype
/// matches the dtype of `self`.
///
/// # Errors
/// Returns a `SchemaMismatch` error when the dtypes differ.
pub fn unpack_series_matching_type(&self, series: &Series) -> PolarsResult<&ChunkedArray<T>> {
polars_ensure!(
self.dtype() == series.dtype(),
SchemaMismatch: "cannot unpack series of type `{}` into `{}`",
series.dtype(),
self.dtype(),
);
// SAFETY:
// dtype will be correct.
Ok(unsafe { self.unpack_series_matching_physical_type(series) })
}

/// Returns an iterator over the lengths of the chunks of the array.
pub fn chunk_lengths(&self) -> ChunkLenIter {
self.chunks.iter().map(|chunk| chunk.len())
Expand Down
12 changes: 4 additions & 8 deletions crates/polars-core/src/chunked_array/object/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ where
let null_count = null_bitmap
.as_ref()
.map(|validity| validity.unset_bits())
.unwrap_or(0) as IdxSize;
.unwrap_or(0);

let arr = Box::new(ObjectArray {
values: self.values.into(),
Expand All @@ -67,9 +67,7 @@ where

self.field.dtype = get_object_type::<T>();

unsafe {
ChunkedArray::new_with_dims(Arc::new(self.field), vec![arr], len as IdxSize, null_count)
}
unsafe { ChunkedArray::new_with_dims(Arc::new(self.field), vec![arr], len, null_count) }
}
}

Expand Down Expand Up @@ -142,7 +140,7 @@ where
validity: None,
});

unsafe { ObjectChunked::new_with_dims(field, vec![arr], len as IdxSize, 0) }
unsafe { ObjectChunked::new_with_dims(field, vec![arr], len, 0) }
}

pub fn new_from_vec_and_validity(name: PlSmallStr, v: Vec<T>, validity: Bitmap) -> Self {
Expand All @@ -154,9 +152,7 @@ where
validity: Some(validity),
});

unsafe {
ObjectChunked::new_with_dims(field, vec![arr], len as IdxSize, null_count as IdxSize)
}
unsafe { ObjectChunked::new_with_dims(field, vec![arr], len, null_count) }
}

pub fn new_empty(name: PlSmallStr) -> Self {
Expand Down
32 changes: 25 additions & 7 deletions crates/polars-core/src/chunked_array/ops/chunkops.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::cell::Cell;

use arrow::bitmap::{Bitmap, MutableBitmap};
use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked;
use polars_error::constants::LENGTH_LIMIT_MSG;
Expand Down Expand Up @@ -90,17 +92,31 @@ pub(crate) fn slice(
(new_chunks, new_len)
}

// When we deal with arrays and lists we can easily exceed the limit if
// we take the underlying values array as a Series. This call stack
// is hard to follow, so for this one case we make an exception
// and use a thread local.
// Default is `true`: length checks are on unless explicitly disabled via
// `_set_check_length` below.
thread_local!(static CHECK_LENGTH: Cell<bool> = const { Cell::new(true) });

/// Meant for internal use. In very rare conditions this can be turned off.
///
/// Enables or disables the `IdxSize` length-limit check for the current
/// thread only (the flag is a thread local).
///
/// # Safety
/// The caller must ensure the Series that exceeds the length gets deconstructed
/// into array values or list values before and never is used.
pub unsafe fn _set_check_length(check: bool) {
CHECK_LENGTH.set(check)
}

impl<T: PolarsDataType> ChunkedArray<T> {
/// Get the length of the ChunkedArray
#[inline]
pub fn len(&self) -> usize {
self.length as usize
self.length
}

/// Return the number of null values in the ChunkedArray.
#[inline]
pub fn null_count(&self) -> usize {
self.null_count as usize
self.null_count
}

/// Set the null count directly.
Expand All @@ -111,7 +127,7 @@ impl<T: PolarsDataType> ChunkedArray<T> {
/// # Safety
/// The new null count must match the total null count of the underlying
/// arrays.
pub unsafe fn set_null_count(&mut self, null_count: IdxSize) {
pub unsafe fn set_null_count(&mut self, null_count: usize) {
self.null_count = null_count;
}

Expand All @@ -131,13 +147,15 @@ impl<T: PolarsDataType> ChunkedArray<T> {
}
let len = inner(&self.chunks);
// Length limit is `IdxSize::MAX - 1`. We use `IdxSize::MAX` to indicate `NULL` in indexing.
assert!(len < IdxSize::MAX as usize, "{}", LENGTH_LIMIT_MSG);
self.length = len as IdxSize;
if len >= (IdxSize::MAX as usize) && CHECK_LENGTH.get() {
panic!("{}", LENGTH_LIMIT_MSG);
}
self.length = len;
self.null_count = self
.chunks
.iter()
.map(|arr| arr.null_count())
.sum::<usize>() as IdxSize;
.sum::<usize>();
}

pub fn rechunk(&self) -> Self {
Expand Down Expand Up @@ -311,7 +329,7 @@ impl<T: PolarsDataType> ChunkedArray<T> {
}

out.copy_metadata(self, properties);
out.length = len as IdxSize;
out.length = len;

out
};
Expand Down
1 change: 1 addition & 0 deletions crates/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ pub(crate) mod unique;
#[cfg(feature = "zip_with")]
pub mod zip;

pub use chunkops::_set_check_length;
#[cfg(feature = "serde-lazy")]
use serde::{Deserialize, Serialize};
pub use sort::options::*;
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/rolling_window.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ mod inner_mod {
debug_assert!(offset + window_size <= arr.len());
let arr_window = unsafe { arr.slice_typed_unchecked(offset, window_size) };
// The lengths are cached, so we must update them.
heap_container.length = arr_window.len() as IdxSize;
heap_container.length = arr_window.len();

// SAFETY: ptr is not dropped as we are in scope. We are also the only
// owner of the contents of the Arc (we do this to reduce heap allocs).
Expand Down
6 changes: 2 additions & 4 deletions crates/polars-core/src/chunked_array/ops/zip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,6 @@ impl ChunkZip<StructType> for StructChunked {
debug_assert!(mask.length == 1 || mask.length == length);
debug_assert!(other.length == 1 || other.length == length);

let length = length as usize;

let mut if_true: Cow<ChunkedArray<StructType>> = Cow::Borrowed(self);
let mut if_false: Cow<ChunkedArray<StructType>> = Cow::Borrowed(other);

Expand Down Expand Up @@ -501,7 +499,7 @@ impl ChunkZip<StructType> for StructChunked {
}
}

out.null_count = null_count as IdxSize;
out.null_count = null_count;
} else {
// SAFETY: We do not change the lengths of the chunks and we update the null_count
// afterwards.
Expand All @@ -511,7 +509,7 @@ impl ChunkZip<StructType> for StructChunked {
*chunk = chunk.with_validity(None);
}

out.null_count = 0 as IdxSize;
out.null_count = 0;
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ pub struct FalseT;
/// # Safety
///
/// The StaticArray and dtype return must be correct.
pub unsafe trait PolarsDataType: Send + Sync + Sized {
pub unsafe trait PolarsDataType: Send + Sync + Sized + 'static {
type Physical<'a>: std::fmt::Debug + Clone;
type OwnedPhysical: std::fmt::Debug + Send + Sync + Clone + PartialEq;
type ZeroablePhysical<'a>: Zeroable + From<Self::Physical<'a>>;
Expand Down
4 changes: 3 additions & 1 deletion crates/polars-core/src/series/implementations/decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,11 @@ impl private::PrivateSeries for SeriesWrap<DecimalChunked> {

#[cfg(feature = "zip_with")]
fn zip_with_same_type(&self, mask: &BooleanChunked, other: &Series) -> PolarsResult<Series> {
let other = other.decimal()?;

Ok(self
.0
.zip_with(mask, other.as_ref().as_ref())?
.zip_with(mask, other.physical())?
.into_decimal_unchecked(self.0.precision(), self.0.scale())
.into_series())
}
Expand Down
10 changes: 9 additions & 1 deletion crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,15 @@ where
T: 'static + PolarsDataType,
{
fn as_ref(&self) -> &ChunkedArray<T> {
let eq = equal_outer_type::<T>(self.dtype());
let dtype = self.dtype();

#[cfg(feature = "dtype-decimal")]
if dtype.is_decimal() {
let logical = self.as_any().downcast_ref::<DecimalChunked>().unwrap();
let ca = logical.physical();
return ca.as_any().downcast_ref::<ChunkedArray<T>>().unwrap();
}
let eq = equal_outer_type::<T>(dtype);
assert!(
eq,
"implementation error, cannot get ref {:?} from {:?}",
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/series/ops/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ impl Series {
}
},
DataType::BinaryOffset => {
let length = size as IdxSize;
let length = size;

let offsets = vec![0; size + 1];
let array = BinaryArray::<i64>::new(
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/chunked_array/scatter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ where

// The null count may have changed - make sure to update the ChunkedArray
let new_null_count = arr.null_count();
unsafe { ca.set_null_count(new_null_count.try_into().unwrap()) };
unsafe { ca.set_null_count(new_null_count) };

Ok(ca.into_series())
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/series/ops/is_in.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ where
)
}

fn is_in_helper<'a, T>(ca: &'a ChunkedArray<T>, other: &Series) -> PolarsResult<BooleanChunked>
fn is_in_helper<'a, T>(ca: &'a ChunkedArray<T>, other: &'a Series) -> PolarsResult<BooleanChunked>
where
T: PolarsDataType,
T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd,
Expand Down
Loading

0 comments on commit d4bab3f

Please sign in to comment.