diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index ff5c8e822cc0..d85ee67d062b 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines kernels suitable to perform operations to primitive arrays. +//! Kernels for operating on [`PrimitiveArray`]s use arrow_array::builder::BufferBuilder; use arrow_array::types::ArrowDictionaryKeyType; @@ -162,18 +162,38 @@ where } } +/// Applies a binary infallible function to two [`PrimitiveArray`]s, +/// producing a new [`PrimitiveArray`] +/// +/// # Details +/// /// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, collecting -/// the results in a [`PrimitiveArray`]. If any index is null in either `a` or `b`, the +/// the results in a [`PrimitiveArray`]. +/// +/// If any index is null in either `a` or `b`, the /// corresponding index in the result will also be null /// -/// Like [`unary`] the provided function is evaluated for every index, ignoring validity. This -/// is beneficial when the cost of the operation is low compared to the cost of branching, and -/// especially when the operation can be vectorised, however, requires `op` to be infallible -/// for all possible values of its inputs +/// Like [`unary`], the `op` is evaluated for every element in the two arrays, +/// including those elements which are NULL. This is beneficial when the cost of +/// the operation is low compared to the cost of branching, and especially when +/// the operation can be vectorised, however, requires `op` to be infallible for +/// all possible values of its inputs /// -/// # Error +/// # Errors +/// +/// * if the arrays have different lengths. 
/// -/// This function gives error if the arrays have different lengths +/// # Example +/// ``` +/// # use arrow_arith::arity::binary; +/// # use arrow_array::{Float32Array, Int32Array}; +/// # use arrow_array::types::Int32Type; +/// let a = Float32Array::from(vec![Some(5.1f32), None, Some(6.8), Some(7.2)]); +/// let b = Int32Array::from(vec![1, 2, 4, 9]); +/// // compute int(a) + b for each element +/// let c = binary(&a, &b, |a, b| a as i32 + b).unwrap(); +/// assert_eq!(c, Int32Array::from(vec![Some(6), None, Some(10), Some(16)])); +/// ``` pub fn binary( a: &PrimitiveArray, b: &PrimitiveArray, @@ -207,23 +227,68 @@ where Ok(PrimitiveArray::new(buffer.into(), nulls)) } -/// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, mutating -/// the mutable [`PrimitiveArray`] `a`. If any index is null in either `a` or `b`, the -/// corresponding index in the result will also be null. +/// Applies a binary and infallible function to values in two arrays, replacing +/// the values in the first array in place. /// -/// Mutable primitive array means that the buffer is not shared with other arrays. -/// As a result, this mutates the buffer directly without allocating new buffer. +/// # Details +/// +/// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in +/// `0..len`, modifying the [`PrimitiveArray`] `a` in place, if possible. +/// +/// If any index is null in either `a` or `b`, the corresponding index in the +/// result will also be null. +/// +/// # Buffer Reuse +/// +/// If the underlying buffers in `a` are not shared with other arrays, mutates +/// the underlying buffer in place, without allocating. +/// +/// If the underlying buffers in `a` are shared, returns `Err(a)` /// /// Like [`unary`] the provided function is evaluated for every index, ignoring validity. 
This /// is beneficial when the cost of the operation is low compared to the cost of branching, and /// especially when the operation can be vectorised, however, requires `op` to be infallible /// for all possible values of its inputs /// -/// # Error +/// # Errors /// -/// This function gives error if the arrays have different lengths. -/// This function gives error of original [`PrimitiveArray`] `a` if it is not a mutable -/// primitive array. +/// * if the arrays have different lengths +/// * if the array `a` is not mutable. +/// +/// # See Also +/// +/// * Documentation on [`PrimitiveArray::unary_mut`] for operating on [`ArrayRef`]. +/// +/// # Example +/// ``` +/// # use arrow_arith::arity::binary_mut; +/// # use arrow_array::Float32Array; +/// # use arrow_array::types::Int32Type; +/// let a = Float32Array::from(vec![Some(5.1f32), None, Some(6.8)]); +/// let b = Float32Array::from(vec![Some(1.0f32), None, Some(2.0)]); +/// // compute a + b, updating the value in a in place if possible +/// let a = binary_mut(a, &b, |a, b| a + b).unwrap().unwrap(); +/// assert_eq!(a, Float32Array::from(vec![Some(6.1), None, Some(8.8)])); +/// ``` +/// +/// # Example with shared buffers +/// ``` +/// # use arrow_arith::arity::binary_mut; +/// # use arrow_array::Float32Array; +/// # use arrow_array::types::Int32Type; +/// let a = Float32Array::from(vec![Some(5.1f32), None, Some(6.8)]); +/// let b = Float32Array::from(vec![Some(1.0f32), None, Some(2.0)]); +/// // a_cloned shares the buffer with a +/// let a_cloned = a.clone(); +/// // try to update a in place, but it is shared. 
Returns Err(a) +/// let a = binary_mut(a, &b, |a, b| a + b).unwrap_err(); +/// assert_eq!(a_cloned, a); +/// // drop shared reference +/// drop(a_cloned); +/// // now a is not shared, so we can update it in place +/// let a = binary_mut(a, &b, |a, b| a + b).unwrap().unwrap(); +/// assert_eq!(a, Float32Array::from(vec![Some(6.1), None, Some(8.8)])); +/// ``` pub fn binary_mut( a: PrimitiveArray, b: &PrimitiveArray, diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 919a1010116b..933f19518c65 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -419,7 +419,7 @@ pub type Decimal256Array = PrimitiveArray; pub use crate::types::ArrowPrimitiveType; -/// An array of [primitive values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) +/// An array of primitive values, of type [`ArrowPrimitiveType`] /// /// # Example: From a Vec /// @@ -480,6 +480,19 @@ pub use crate::types::ArrowPrimitiveType; /// assert_eq!(array.values(), &[1, 0, 2]); /// assert!(array.is_null(1)); /// ``` +/// +/// # Example: Get a `PrimitiveArray` from an [`ArrayRef`] +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{Array, cast::AsArray, ArrayRef, Float32Array, PrimitiveArray}; +/// # use arrow_array::types::{Float32Type}; +/// # use arrow_schema::DataType; +/// # let array: ArrayRef = Arc::new(Float32Array::from(vec![1.2, 2.3])); +/// // will panic if the array is not a Float32Array +/// assert_eq!(&DataType::Float32, array.data_type()); +/// let f32_array: Float32Array = array.as_primitive().clone(); +/// assert_eq!(f32_array, Float32Array::from(vec![1.2, 2.3])); +/// ``` pub struct PrimitiveArray { data_type: DataType, /// Values data @@ -732,22 +745,34 @@ impl PrimitiveArray { PrimitiveArray::from(unsafe { d.build_unchecked() }) } - /// Applies an unary and infallible function to a primitive array. 
- /// This is the fastest way to perform an operation on a primitive array when - /// the benefits of a vectorized operation outweigh the cost of branching nulls and non-nulls. + /// Applies a unary infallible function to a primitive array, producing a + /// new array of potentially different type. + /// + /// This is the fastest way to perform an operation on a primitive array + /// when the benefits of a vectorized operation outweigh the cost of + /// branching nulls and non-nulls. /// - /// # Implementation + /// See also + /// * [`Self::unary_mut`] for in place modification. + /// * [`Self::try_unary`] for fallible operations. + /// * [`arrow::compute::binary`] for binary operations + /// + /// [`arrow::compute::binary`]: https://docs.rs/arrow/latest/arrow/compute/fn.binary.html + /// # Null Handling + /// + /// Applies the function for all values, including those on null slots. This + /// will often allow the compiler to generate faster vectorized code, but + /// requires that the operation must be infallible (not error/panic) for any + /// value of the corresponding type or this function may panic. /// - /// This will apply the function for all values, including those on null slots. - /// This implies that the operation must be infallible for any value of the corresponding type - /// or this function may panic. 
/// # Example /// ```rust - /// # use arrow_array::{Int32Array, types::Int32Type}; + /// # use arrow_array::{Int32Array, Float32Array, types::Int32Type}; /// # fn main() { /// let array = Int32Array::from(vec![Some(5), Some(7), None]); - /// let c = array.unary(|x| x * 2 + 1); - /// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); + /// // Create a new array with the value of applying sqrt + /// let c = array.unary(|x| f32::sqrt(x as f32)); + /// assert_eq!(c, Float32Array::from(vec![Some(2.236068), Some(2.6457512), None])); /// # } /// ``` pub fn unary(&self, op: F) -> PrimitiveArray @@ -766,24 +791,50 @@ impl PrimitiveArray { PrimitiveArray::new(buffer.into(), nulls) } - /// Applies an unary and infallible function to a mutable primitive array. - /// Mutable primitive array means that the buffer is not shared with other arrays. - /// As a result, this mutates the buffer directly without allocating new buffer. + /// Applies a unary and infallible function to the array in place if possible. + /// + /// # Buffer Reuse + /// + /// If the underlying buffers are not shared with other arrays, mutates the + /// underlying buffer in place, without allocating. + /// + /// If the underlying buffer is shared, returns Err(self) /// - /// # Implementation + /// # Null Handling + /// + /// See [`Self::unary`] for more information on null handling. /// - /// This will apply the function for all values, including those on null slots. - /// This implies that the operation must be infallible for any value of the corresponding type - /// or this function may panic. 
/// # Example + /// /// ```rust /// # use arrow_array::{Int32Array, types::Int32Type}; - /// # fn main() { /// let array = Int32Array::from(vec![Some(5), Some(7), None]); + /// // Apply x*2+1 to the data in place, no allocations /// let c = array.unary_mut(|x| x * 2 + 1).unwrap(); /// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); - /// # } /// ``` + /// + /// # Example: modify [`ArrayRef`] in place, if not shared + /// + /// It is also possible to modify an [`ArrayRef`] if there are no other + /// references to the underlying buffer. + /// + /// ```rust + /// # use std::sync::Arc; + /// # use arrow_array::{Array, cast::AsArray, ArrayRef, Int32Array, PrimitiveArray, types::Int32Type}; + /// # let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(5), Some(7), None])); + /// // Convert to Int32Array (panics if array.data_type is not Int32) + /// let a = array.as_primitive::<Int32Type>().clone(); + /// // Try to apply x*2+1 to the data in place, fails because array is still shared + /// a.unary_mut(|x| x * 2 + 1).unwrap_err(); + /// // Try again, this time dropping the last remaining reference + /// let a = array.as_primitive::<Int32Type>().clone(); + /// drop(array); + /// // Now we can apply the operation in place + /// let c = a.unary_mut(|x| x * 2 + 1).unwrap(); + /// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); + /// ``` + pub fn unary_mut(self, op: F) -> Result, PrimitiveArray> where F: Fn(T::Native) -> T::Native, @@ -796,11 +847,12 @@ impl PrimitiveArray { Ok(builder.finish()) } - /// Applies a unary and fallible function to all valid values in a primitive array + /// Applies a unary fallible function to all valid values in a primitive + /// array, producing a new array of potentially different type. /// - /// This is unlike [`Self::unary`] which will apply an infallible function to all rows - /// regardless of validity, in many cases this will be significantly faster and should - /// be preferred if `op` is infallible. 
+ /// Applies `op` to only rows that are valid, which is often significantly + /// slower than [`Self::unary`], which should be preferred if `op` is + /// infallible. /// /// Note: LLVM is currently unable to effectively vectorize fallible operations pub fn try_unary(&self, op: F) -> Result, E> @@ -829,13 +881,16 @@ impl PrimitiveArray { Ok(PrimitiveArray::new(values, nulls)) } - /// Applies an unary and fallible function to all valid values in a mutable primitive array. - /// Mutable primitive array means that the buffer is not shared with other arrays. - /// As a result, this mutates the buffer directly without allocating new buffer. + /// Applies a unary fallible function to all valid values in a mutable + /// primitive array. + /// + /// # Null Handling + /// + /// See [`Self::try_unary`] for more information on null handling. + /// + /// # Buffer Reuse /// - /// This is unlike [`Self::unary_mut`] which will apply an infallible function to all rows - /// regardless of validity, in many cases this will be significantly faster and should - /// be preferred if `op` is infallible. + /// See [`Self::unary_mut`] for more information on buffer reuse. /// /// This returns an `Err` when the input array is shared buffer with other /// array. In the case, returned `Err` wraps input array. If the function @@ -870,9 +925,9 @@ impl PrimitiveArray { /// Applies a unary and nullable function to all valid values in a primitive array /// - /// This is unlike [`Self::unary`] which will apply an infallible function to all rows - /// regardless of validity, in many cases this will be significantly faster and should - /// be preferred if `op` is infallible. + /// Applies `op` to only rows that are valid, which is often significantly + /// slower than [`Self::unary`], which should be preferred if `op` is + /// infallible. 
/// /// Note: LLVM is currently unable to effectively vectorize fallible operations pub fn unary_opt(&self, op: F) -> PrimitiveArray @@ -915,8 +970,16 @@ impl PrimitiveArray { PrimitiveArray::new(values, Some(nulls)) } - /// Returns `PrimitiveBuilder` of this primitive array for mutating its values if the underlying - /// data buffer is not shared by others. + /// Returns a `PrimitiveBuilder` for this array, suitable for mutating values + /// in place. + /// + /// # Buffer Reuse + /// + /// If the underlying data buffer has no other outstanding references, the + /// buffer is used without copying. + /// + /// If the underlying data buffer does have outstanding references, returns + /// `Err(self)` pub fn into_builder(self) -> Result, Self> { let len = self.len(); let data = self.into_data(); diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 198a11cb6974..a790fba86fed 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -47,9 +47,11 @@ impl BooleanType { pub const DATA_TYPE: DataType = DataType::Boolean; } -/// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the -/// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. +/// Trait for [primitive values], bridging the dynamic-typed nature of Arrow +/// (via [`DataType`]) with the static-typed nature of rust types +/// ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. /// +/// [primitive values]: https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout /// [`ArrowNativeType`]: arrow_buffer::ArrowNativeType pub trait ArrowPrimitiveType: primitive::PrimitiveTypeSealed + 'static { /// Corresponding Rust native type for the primitive type. 
diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index e05c1311ff3c..c563f73cf5b9 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -22,11 +22,14 @@ mod private { pub trait Sealed {} } -/// Trait expressing a Rust type that has the same in-memory representation -/// as Arrow. This includes `i16`, `f32`, but excludes `bool` (which in arrow is represented in bits). +/// Trait expressing a Rust type that has the same in-memory representation as +/// Arrow. /// -/// In little endian machines, types that implement [`ArrowNativeType`] can be memcopied to arrow buffers -/// as is. +/// This includes `i16`, `f32`, but excludes `bool` (which in arrow is +/// represented in bits). +/// +/// In little endian machines, types that implement [`ArrowNativeType`] can be +/// memcopied to arrow buffers as is. /// /// # Transmute Safety ///