Skip to content

Commit

Permalink
Move from_iter_values to GenericByteArray (#4586)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold authored Jul 30, 2023
1 parent a310056 commit 2adb64d
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 75 deletions.
37 changes: 0 additions & 37 deletions arrow-array/src/array/binary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ use crate::types::{ByteArrayType, GenericBinaryType};
use crate::{
Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait,
};
use arrow_buffer::MutableBuffer;
use arrow_data::ArrayData;
use arrow_schema::DataType;

Expand Down Expand Up @@ -83,42 +82,6 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> {
Self::from(data)
}

/// Creates a [`GenericBinaryArray`] based on an iterator of values without nulls
///
/// Every item is appended to a single contiguous values buffer, and the
/// cumulative byte length after each item is recorded as the next offset.
///
/// # Panics
///
/// Panics if the iterator's `size_hint` reports no upper bound, or if the
/// total number of value bytes cannot be represented as `OffsetSize`.
pub fn from_iter_values<Ptr, I>(iter: I) -> Self
where
    Ptr: AsRef<[u8]>,
    I: IntoIterator<Item = Ptr>,
{
    let iter = iter.into_iter();
    let (_, data_len) = iter.size_hint();
    let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.

    let mut offsets =
        MutableBuffer::new((data_len + 1) * std::mem::size_of::<OffsetSize>());
    let mut values = MutableBuffer::new(0);

    // The first offset is always zero.
    offsets.push(OffsetSize::zero());

    for s in iter {
        let s: &[u8] = s.as_ref();
        values.extend_from_slice(s);
        // Derive the offset from the cumulative byte length with a checked
        // conversion. Checking only the individual item lengths would let the
        // running sum overflow `OffsetSize` and record a corrupt offset.
        let end = OffsetSize::from_usize(values.len()).expect("offset overflow");
        offsets.push(end);
    }

    // iterator size hint may not be correct so compute the actual number of offsets
    assert!(!offsets.is_empty()); // wrote at least one
    let actual_len = (offsets.len() / std::mem::size_of::<OffsetSize>()) - 1;

    let array_data = ArrayData::builder(Self::DATA_TYPE)
        .len(actual_len)
        .add_buffer(offsets.into())
        .add_buffer(values.into());
    // SAFETY: the offsets start at zero, never decrease, and each one was
    // checked to fit in `OffsetSize`; the last offset equals `values.len()`.
    let array_data = unsafe { array_data.build_unchecked() };
    Self::from(array_data)
}

/// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
pub fn take_iter<'a>(
&'a self,
Expand Down
35 changes: 35 additions & 0 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,41 @@ impl<T: ByteArrayType> GenericByteArray<T> {
}
}

/// Creates a [`GenericByteArray`] based on an iterator of values without nulls
///
/// The bytes of every item are concatenated into one values buffer; the
/// length of that buffer after each item becomes the next offset.
///
/// # Panics
///
/// Panics if the iterator's `size_hint` has no upper bound, or if the total
/// number of value bytes does not fit in `T::Offset`.
pub fn from_iter_values<Ptr, I>(iter: I) -> Self
where
    Ptr: AsRef<T::Native>,
    I: IntoIterator<Item = Ptr>,
{
    let items = iter.into_iter();
    // The upper bound only sizes the initial offsets allocation.
    let upper_bound = items.size_hint().1.expect("Iterator must be sized");
    let offset_width = std::mem::size_of::<T::Offset>();

    let mut offset_bytes = MutableBuffer::new((upper_bound + 1) * offset_width);
    offset_bytes.push(T::Offset::usize_as(0));

    let mut value_bytes = MutableBuffer::new(0);
    items.for_each(|item| {
        let bytes: &[u8] = item.as_ref().as_ref();
        value_bytes.extend_from_slice(bytes);
        offset_bytes.push(T::Offset::usize_as(value_bytes.len()));
    });

    // The cumulative length only grows, so checking the final length once
    // covers every offset pushed inside the loop.
    T::Offset::from_usize(value_bytes.len()).expect("offset overflow");

    // Safety: valid by construction. The offsets start at zero and follow the
    // growing length of `value_bytes`, whose final value was checked above.
    let value_offsets =
        unsafe { OffsetBuffer::new_unchecked(Buffer::from(offset_bytes).into()) };

    Self {
        data_type: T::DATA_TYPE,
        value_data: value_bytes.into(),
        value_offsets,
        nulls: None,
    }
}

/// Deconstruct this array into its constituent parts
pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
(self.value_offsets, self.value_data, self.nulls)
Expand Down
39 changes: 1 addition & 38 deletions arrow-array/src/array/string_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

use crate::types::GenericStringType;
use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
use arrow_buffer::MutableBuffer;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};

/// A [`GenericByteArray`] for storing `str`
Expand All @@ -40,42 +38,6 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
self.value(i).chars().count()
}

/// Creates a [`GenericStringArray`] based on an iterator of values without nulls
///
/// The UTF-8 bytes of every item are appended to a single contiguous values
/// buffer, and the cumulative byte length after each item is recorded as the
/// next offset.
///
/// # Panics
///
/// Panics if the iterator's `size_hint` reports no upper bound, or if the
/// total number of value bytes cannot be represented as `OffsetSize`.
pub fn from_iter_values<Ptr, I>(iter: I) -> Self
where
    Ptr: AsRef<str>,
    I: IntoIterator<Item = Ptr>,
{
    let iter = iter.into_iter();
    let (_, data_len) = iter.size_hint();
    let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.

    let mut offsets =
        MutableBuffer::new((data_len + 1) * std::mem::size_of::<OffsetSize>());
    let mut values = MutableBuffer::new(0);

    // The first offset is always zero.
    offsets.push(OffsetSize::zero());

    for i in iter {
        let s: &str = i.as_ref();
        values.extend_from_slice(s.as_bytes());
        // Derive the offset from the cumulative byte length with a checked
        // conversion. Checking only the individual item lengths would let the
        // running sum overflow `OffsetSize` and record a corrupt offset.
        let end = OffsetSize::from_usize(values.len()).expect("offset overflow");
        offsets.push(end);
    }

    // iterator size hint may not be correct so compute the actual number of offsets
    assert!(!offsets.is_empty()); // wrote at least one
    let actual_len = (offsets.len() / std::mem::size_of::<OffsetSize>()) - 1;

    let array_data = ArrayData::builder(Self::DATA_TYPE)
        .len(actual_len)
        .add_buffer(offsets.into())
        .add_buffer(values.into());
    // SAFETY: the offsets start at zero, never decrease, and each one was
    // checked to fit in `OffsetSize`; the values are the bytes of `str`
    // items, so every offset falls on a UTF-8 boundary.
    let array_data = unsafe { array_data.build_unchecked() };
    Self::from(array_data)
}

/// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
pub fn take_iter<'a>(
&'a self,
Expand Down Expand Up @@ -210,6 +172,7 @@ mod tests {
use crate::types::UInt8Type;
use crate::Array;
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::Field;
use std::sync::Arc;

Expand Down

0 comments on commit 2adb64d

Please sign in to comment.