From f441847ff30976f2fc8cffbc30e5074988dc6d70 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Fri, 27 May 2022 17:58:24 +0000 Subject: [PATCH] More docs --- src/array/primitive/from_natural.rs | 58 +---- src/array/primitive/iterator.rs | 11 - src/array/primitive/mod.rs | 324 +++++++++++++++++++--------- src/array/utf8/mod.rs | 2 +- 4 files changed, 225 insertions(+), 170 deletions(-) diff --git a/src/array/primitive/from_natural.rs b/src/array/primitive/from_natural.rs index 3d259f90604..5f131dcab1e 100644 --- a/src/array/primitive/from_natural.rs +++ b/src/array/primitive/from_natural.rs @@ -1,6 +1,6 @@ use std::iter::FromIterator; -use crate::{trusted_len::TrustedLen, types::NativeType}; +use crate::types::NativeType; use super::{MutablePrimitiveArray, PrimitiveArray}; @@ -15,59 +15,3 @@ impl>> FromIterator for P MutablePrimitiveArray::::from_iter(iter).into() } } - -impl PrimitiveArray { - /// Creates a (non-null) [`PrimitiveArray`] from an iterator of values. - /// # Implementation - /// This does not assume that the iterator has a known length. - pub fn from_values>(iter: I) -> Self { - Self::new(T::PRIMITIVE.into(), Vec::::from_iter(iter).into(), None) - } - - /// Creates a (non-null) [`PrimitiveArray`] from a slice of values. - /// # Implementation - /// This is essentially a memcopy - pub fn from_slice>(slice: P) -> Self { - Self::new( - T::PRIMITIVE.into(), - Vec::::from(slice.as_ref()).into(), - None, - ) - } - - /// Creates a (non-null) [`PrimitiveArray`] from a vector of values. - /// This does not have memcopy and is the fastest way to create a [`PrimitiveArray`]. - pub fn from_vec(array: Vec) -> Self { - Self::new(T::PRIMITIVE.into(), array.into(), None) - } -} - -impl PrimitiveArray { - /// Creates a (non-null) [`PrimitiveArray`] from a [`TrustedLen`] of values. - /// # Implementation - /// This does not assume that the iterator has a known length. - pub fn from_trusted_len_values_iter>(iter: I) -> Self { - MutablePrimitiveArray::::from_trusted_len_values_iter(iter).into() - } - - /// Creates a new [`PrimitiveArray`] from an iterator over values - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. - pub unsafe fn from_trusted_len_values_iter_unchecked>(iter: I) -> Self { - MutablePrimitiveArray::::from_trusted_len_values_iter_unchecked(iter).into() - } - - /// Creates a [`PrimitiveArray`] from a [`TrustedLen`] of optional values. - pub fn from_trusted_len_iter>>(iter: I) -> Self { - MutablePrimitiveArray::::from_trusted_len_iter(iter).into() - } - - /// Creates a [`PrimitiveArray`] from an iterator of optional values. - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. - pub unsafe fn from_trusted_len_iter_unchecked>>(iter: I) -> Self { - MutablePrimitiveArray::::from_trusted_len_iter_unchecked(iter).into() - } -} diff --git a/src/array/primitive/iterator.rs b/src/array/primitive/iterator.rs index 5d5049e34fc..42fb758c4da 100644 --- a/src/array/primitive/iterator.rs +++ b/src/array/primitive/iterator.rs @@ -16,17 +16,6 @@ impl<'a, T: NativeType> IntoIterator for &'a PrimitiveArray { } } -impl<'a, T: NativeType> PrimitiveArray { - /// constructs a new iterator - #[inline] - pub fn iter(&'a self) -> ZipValidity<'a, &'a T, std::slice::Iter<'a, T>> { - zip_validity( - self.values().iter(), - self.validity.as_ref().map(|x| x.iter()), - ) - } -} - impl<'a, T: NativeType> MutablePrimitiveArray { /// Returns an iterator over `Option` #[inline] diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index d8ec4182b67..ae753549f75 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -1,8 +1,12 @@ use crate::{ - bitmap::Bitmap, + bitmap::{ + utils::{zip_validity, ZipValidity}, + Bitmap, + }, buffer::Buffer, datatypes::*, error::Error, + trusted_len::TrustedLen, types::{days_ms, months_days_ns, NativeType}, }; @@ -17,19 +21,30 @@ pub use iterator::*; mod mutable; pub use mutable::*; -/// A [`PrimitiveArray`] is arrow's equivalent to `Vec>`, i.e. -/// an array designed for highly performant operations on optionally nullable slots, -/// backed by a physical type of a fixed byte-width, such as `i32` or `f64`. -/// The size of this struct is `O(1)` as all data is stored behind an [`std::sync::Arc`]. +/// A [`PrimitiveArray`] is Arrow's semantically equivalent of an immutable `Vec>` where +/// T is [`NativeType`] (e.g. [`i32`]). It implements [`Array`]. +/// +/// One way to think about a [`PrimitiveArray`] is `(DataType, Arc>, Option>>)` +/// where: +/// * the first item is the array's logical type +/// * the second is the immutable values +/// * the third is the immutable validity (whether a value is null or not as a bitmap). +/// +/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`]. /// # Example /// ``` -/// use arrow2::array::{PrimitiveArray, Array}; +/// use arrow2::array::PrimitiveArray; /// use arrow2::bitmap::Bitmap; -/// # fn main() { -/// let array = PrimitiveArray::from([Some(1), None, Some(10)]); -/// assert_eq!(array.values().as_slice(), &[1, 0, 10]); +/// use arrow2::buffer::Buffer; +/// +/// let array = PrimitiveArray::from([Some(1i32), None, Some(10)]); +/// assert_eq!(array.value(0), 1); +/// assert_eq!(array.iter().collect::>(), vec![Some(&1i32), None, Some(&10)]); +/// assert_eq!(array.values_iter().copied().collect::>(), vec![1, 0, 10]); +/// // the underlying representation +/// assert_eq!(array.values(), &Buffer::from(vec![1i32, 0, 10])); /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); -/// # } +/// /// ``` #[derive(Clone)] pub struct PrimitiveArray { @@ -39,11 +54,14 @@ pub struct PrimitiveArray { } impl PrimitiveArray { - /// The canonical method to create a [`PrimitiveArray`]. + /// The canonical method to create a [`PrimitiveArray`] out of its internal components. + /// # Implementation + /// This function is `O(1)`. + /// /// # Errors /// This function errors iff: /// * The validity is not `None` and its length is different from `values`'s length - /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive`]. + /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive(T::PRIMITIVE)`] pub fn try_new( data_type: DataType, values: Buffer, @@ -71,42 +89,127 @@ impl PrimitiveArray { }) } - /// The canonical method to create a [`PrimitiveArray`] + /// Returns a new [`PrimitiveArray`] with a different logical type. + /// + /// This function is useful to assign a different [`DataType`] to the array. + /// Used to change the arrays' logical type (see example). + /// # Example + /// ``` + /// use arrow2::array::Int32Array; + /// use arrow2::datatypes::DataType; + /// + /// let array = Int32Array::from(&[Some(1), None, Some(2)]).to(DataType::Date32); + /// assert_eq!( + /// format!("{:?}", array), + /// "Date32[1970-01-02, None, 1970-01-03]" + /// ); + /// ``` /// # Panics - /// This function errors iff: - /// * The validity is not `None` and its length is different from `values`'s length - /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive`]. - pub fn new(data_type: DataType, values: Buffer, validity: Option) -> Self { - Self::try_new(data_type, values, validity).unwrap() + /// Panics iff the `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive(T::PRIMITIVE)`] + #[inline] + #[must_use] + pub fn to(self, data_type: DataType) -> Self { + if !data_type.to_physical_type().eq_primitive(T::PRIMITIVE) { + Err(Error::InvalidArgumentError(format!( + "Type {} does not support logical type {:?}", + std::any::type_name::(), + data_type + ))) + .unwrap() + } + Self { + data_type, + values: self.values, + validity: self.validity, + } } - /// Alias for `new` - pub fn from_data(data_type: DataType, values: Buffer, validity: Option) -> Self { - Self::new(data_type, values, validity) + /// Creates a (non-null) [`PrimitiveArray`] from a vector of values. + /// # Examples + /// ``` + /// use arrow2::array::PrimitiveArray; + /// + /// let array = PrimitiveArray::from_vec(vec![1, 2, 3]); + /// assert_eq!(format!("{:?}", array), "Int32[1, 2, 3]"); + /// ``` + pub fn from_vec(values: Vec) -> Self { + Self::new(T::PRIMITIVE.into(), values.into(), None) } - /// Returns a new empty [`PrimitiveArray`]. - pub fn new_empty(data_type: DataType) -> Self { - Self::new(data_type, Buffer::new(), None) + /// Returns an iterator over the values and validity, `Option<&T>`. + #[inline] + pub fn iter(&self) -> ZipValidity<&T, std::slice::Iter> { + zip_validity( + self.values().iter(), + self.validity().as_ref().map(|x| x.iter()), + ) } - /// Returns a new [`PrimitiveArray`] whose all slots are null / `None`. + /// Returns an iterator of the values, `&T`, ignoring the arrays' validity. #[inline] - pub fn new_null(data_type: DataType, length: usize) -> Self { - Self::new( - data_type, - Buffer::new_zeroed(length), - Some(Bitmap::new_zeroed(length)), - ) + pub fn values_iter(&self) -> std::slice::Iter { + self.values().iter() } -} -impl PrimitiveArray { - /// Returns a slice of this [`PrimitiveArray`]. + /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.values.len() + } + + /// The values [`Buffer`]. + /// Values on null slots are undetermined (they can be anything). + #[inline] + pub fn values(&self) -> &Buffer { + &self.values + } + + /// Returns the optional validity. + #[inline] + pub fn validity(&self) -> Option<&Bitmap> { + self.validity.as_ref() + } + + /// Returns the arrays' [`DataType`]. + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the value at slot `i`. + /// + /// Equivalent to `self.values()[i]`. The value of a null slot is undetermined (it can be anything). + /// # Panic + /// This function panics iff `i >= self.len`. + #[inline] + pub fn value(&self, i: usize) -> T { + self.values()[i] + } + + /// Returns the value at index `i`. + /// The value on null slots is undetermined (it can be anything). + /// # Safety + /// Caller must be sure that `i < self.len()` + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> T { + *self.values.get_unchecked(i) + } + + /// Returns a clone of this [`PrimitiveArray`] sliced by an offset and length. /// # Implementation /// This operation is `O(1)` as it amounts to increase two ref counts. + /// # Examples + /// ``` + /// use arrow2::array::PrimitiveArray; + /// + /// let array = PrimitiveArray::from_vec(vec![1, 2, 3]); + /// assert_eq!(format!("{:?}", array), "Int32[1, 2, 3]"); + /// let sliced = array.slice(1, 1); + /// assert_eq!(format!("{:?}", sliced), "Int32[2]"); + /// // note: `sliced` and `array` share the same memory region. + /// ``` /// # Panic - /// This function panics iff `offset + length >= self.len()`. + /// This function panics iff `offset + length > self.len()`. #[inline] #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { @@ -117,7 +220,7 @@ impl PrimitiveArray { unsafe { self.slice_unchecked(offset, length) } } - /// Returns a slice of this [`PrimitiveArray`]. + /// Returns a clone of this [`PrimitiveArray`] sliced by an offset and length. /// # Implementation /// This operation is `O(1)` as it amounts to increase two ref counts. /// # Safety @@ -136,7 +239,7 @@ impl PrimitiveArray { } } - /// Sets the validity bitmap on this [`PrimitiveArray`]. + /// Returns a clone of this [`PrimitiveArray`] with a new validity. /// # Panics /// This function panics iff `validity.len() != self.len()`. #[must_use] @@ -148,71 +251,15 @@ impl PrimitiveArray { arr.validity = validity; arr } -} - -impl PrimitiveArray { - /// Returns the length of this array - #[inline] - pub fn len(&self) -> usize { - self.values.len() - } - /// The optional validity. - #[inline] - pub fn validity(&self) -> Option<&Bitmap> { - self.validity.as_ref() - } - - /// The arrays' logical type - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// The values [`Buffer`]. - /// Values on null slots are undetermined (they can be anything). - #[inline] - pub fn values(&self) -> &Buffer { - &self.values - } - - /// Returns the value at slot `i`. Equivalent to `self.values()[i]`. - /// The value on null slots is undetermined (it can be anything). - #[inline] - pub fn value(&self, i: usize) -> T { - self.values()[i] - } - - /// Returns the element at index `i` as `T`. - /// The value on null slots is undetermined (it can be anything). - /// # Safety - /// Caller must be sure that `i < self.len()` - #[inline] - pub unsafe fn value_unchecked(&self, i: usize) -> T { - *self.values.get_unchecked(i) - } - - /// Returns a new [`PrimitiveArray`] with a different logical type. - /// This is `O(1)`. - /// # Panics - /// Panics iff the data_type is not supported for the physical type. - #[inline] - pub fn to(self, data_type: DataType) -> Self { - if !data_type.to_physical_type().eq_primitive(T::PRIMITIVE) { - Err(Error::InvalidArgumentError(format!( - "Type {} does not support logical type {:?}", - std::any::type_name::(), - data_type - ))) - .unwrap() - } - Self { - data_type, - values: self.values, - validity: self.validity, - } - } - /// Try to convert this `PrimitiveArray` to a `MutablePrimitiveArray` + /// Try to convert this [`PrimitiveArray`] to a [`MutablePrimitiveArray`] via copy-on-write semantics. + /// + /// A [`PrimitiveArray`] is backed by a [`Buffer`] and [`Bitmap`] which are essentially `Arc>`. + /// This function returns a [`MutablePrimitiveArray`] (via [`std::sync::Arc::get_mut`]) iff both values + /// and validity have not been cloned / are unique references to their underlying vectors. + /// + /// This function is primarily used to re-use memory regions. + #[must_use] pub fn into_mut(self) -> Either> { use Either::*; @@ -247,6 +294,81 @@ impl PrimitiveArray { } } } + + /// Returns a new empty (zero-length) [`PrimitiveArray`]. + pub fn new_empty(data_type: DataType) -> Self { + Self::new(data_type, Buffer::new(), None) + } + + /// Returns a new [`PrimitiveArray`] where all slots are null / `None`. + #[inline] + pub fn new_null(data_type: DataType, length: usize) -> Self { + Self::new( + data_type, + Buffer::new_zeroed(length), + Some(Bitmap::new_zeroed(length)), + ) + } + + /// Creates a (non-null) [`PrimitiveArray`] from an iterator of values. + /// # Implementation + /// This does not assume that the iterator has a known length. + pub fn from_values>(iter: I) -> Self { + Self::new(T::PRIMITIVE.into(), Vec::::from_iter(iter).into(), None) + } + + /// Creates a (non-null) [`PrimitiveArray`] from a slice of values. + /// # Implementation + /// This is essentially a memcopy and is thus `O(N)` + pub fn from_slice>(slice: P) -> Self { + Self::new( + T::PRIMITIVE.into(), + Vec::::from(slice.as_ref()).into(), + None, + ) + } + + /// Creates a (non-null) [`PrimitiveArray`] from a [`TrustedLen`] of values. + /// # Implementation + /// This does not assume that the iterator has a known length. + pub fn from_trusted_len_values_iter>(iter: I) -> Self { + MutablePrimitiveArray::::from_trusted_len_values_iter(iter).into() + } + + /// Creates a new [`PrimitiveArray`] from an iterator over values + /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. + pub unsafe fn from_trusted_len_values_iter_unchecked>(iter: I) -> Self { + MutablePrimitiveArray::::from_trusted_len_values_iter_unchecked(iter).into() + } + + /// Creates a [`PrimitiveArray`] from a [`TrustedLen`] of optional values. + pub fn from_trusted_len_iter>>(iter: I) -> Self { + MutablePrimitiveArray::::from_trusted_len_iter(iter).into() + } + + /// Creates a [`PrimitiveArray`] from an iterator of optional values. + /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. + pub unsafe fn from_trusted_len_iter_unchecked>>(iter: I) -> Self { + MutablePrimitiveArray::::from_trusted_len_iter_unchecked(iter).into() + } + + /// Alias for `Self::try_new(..).unwrap()`. + /// # Panics + /// This function errors iff: + /// * The validity is not `None` and its length is different from `values`'s length + /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive`]. + pub fn new(data_type: DataType, values: Buffer, validity: Option) -> Self { + Self::try_new(data_type, values, validity).unwrap() + } + + /// Alias for `Self::try_new(..).unwrap()`. + pub fn from_data(data_type: DataType, values: Buffer, validity: Option) -> Self { + Self::new(data_type, values, validity) + } } impl Array for PrimitiveArray { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index f76b1bce4e8..c216ae0ab59 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -134,7 +134,7 @@ impl Utf8Array { /// Returns the value of the element at index `i`, ignoring the array's validity. /// # Panic - /// This function panics iff `i < self.len`. + /// This function panics iff `i >= self.len`. #[inline] pub fn value(&self, i: usize) -> &str { assert!(i < self.len());