diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 54839604d192..67be3768cc80 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -19,7 +19,6 @@ use crate::types::{ByteArrayType, GenericBinaryType}; use crate::{ Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait, }; -use arrow_buffer::MutableBuffer; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -83,42 +82,6 @@ impl GenericBinaryArray { Self::from(data) } - /// Creates a [`GenericBinaryArray`] based on an iterator of values without nulls - pub fn from_iter_values(iter: I) -> Self - where - Ptr: AsRef<[u8]>, - I: IntoIterator, - { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - - let mut offsets = - MutableBuffer::new((data_len + 1) * std::mem::size_of::()); - let mut values = MutableBuffer::new(0); - - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - for s in iter { - let s = s.as_ref(); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); - values.extend_from_slice(s); - } - - // iterator size hint may not be correct so compute the actual number of offsets - assert!(!offsets.is_empty()); // wrote at least one - let actual_len = (offsets.len() / std::mem::size_of::()) - 1; - - let array_data = ArrayData::builder(Self::DATA_TYPE) - .len(actual_len) - .add_buffer(offsets.into()) - .add_buffer(values.into()); - let array_data = unsafe { array_data.build_unchecked() }; - Self::from(array_data) - } - /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i` pub fn take_iter<'a>( &'a self, diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index be10a4508db1..f694aa32e507 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -182,6 +182,41 @@ impl GenericByteArray { } } + /// Creates a [`GenericByteArray`] based on an iterator of values without nulls + pub fn from_iter_values(iter: I) -> Self + where + Ptr: AsRef, + I: IntoIterator, + { + let iter = iter.into_iter(); + let (_, data_len) = iter.size_hint(); + let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. + + let mut offsets = + MutableBuffer::new((data_len + 1) * std::mem::size_of::()); + offsets.push(T::Offset::usize_as(0)); + + let mut values = MutableBuffer::new(0); + for s in iter { + let s: &[u8] = s.as_ref().as_ref(); + values.extend_from_slice(s); + offsets.push(T::Offset::usize_as(values.len())); + } + + T::Offset::from_usize(values.len()).expect("offset overflow"); + let offsets = Buffer::from(offsets); + + // Safety: valid by construction + let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + Self { + data_type: T::DATA_TYPE, + value_data: values.into(), + value_offsets, + nulls: None, + } + } + /// Deconstruct this array into its constituent parts pub fn into_parts(self) -> (OffsetBuffer, Buffer, Option) { (self.value_offsets, self.value_data, self.nulls) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index f9a3a5fbd095..4c40e8b90ce2 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -17,8 +17,6 @@ use crate::types::GenericStringType; use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait}; -use arrow_buffer::MutableBuffer; -use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; /// A [`GenericByteArray`] for storing `str` @@ -40,42 +38,6 @@ impl GenericStringArray { self.value(i).chars().count() } - /// Creates a [`GenericStringArray`] based on an iterator of values without nulls - pub fn from_iter_values(iter: I) -> Self - where - Ptr: AsRef, - I: IntoIterator, - { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - - let mut offsets = - MutableBuffer::new((data_len + 1) * std::mem::size_of::()); - let mut values = MutableBuffer::new(0); - - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - for i in iter { - let s = i.as_ref(); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); - values.extend_from_slice(s.as_bytes()); - } - - // iterator size hint may not be correct so compute the actual number of offsets - assert!(!offsets.is_empty()); // wrote at least one - let actual_len = (offsets.len() / std::mem::size_of::()) - 1; - - let array_data = ArrayData::builder(Self::DATA_TYPE) - .len(actual_len) - .add_buffer(offsets.into()) - .add_buffer(values.into()); - let array_data = unsafe { array_data.build_unchecked() }; - Self::from(array_data) - } - /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i` pub fn take_iter<'a>( &'a self, @@ -210,6 +172,7 @@ mod tests { use crate::types::UInt8Type; use crate::Array; use arrow_buffer::Buffer; + use arrow_data::ArrayData; use arrow_schema::Field; use std::sync::Arc;