From 3c15bd8412f359d6e84a5c681657484d2d94722d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Sat, 29 Oct 2022 09:03:53 +1300 Subject: [PATCH 1/3] Generalize filter byte array (#2969) --- arrow-select/src/filter.rs | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 71175ca5788d..b989989cf919 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -17,12 +17,11 @@ //! Defines filter kernels -use std::ops::AddAssign; use std::sync::Arc; -use num::Zero; - use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array}; +use arrow_array::types::ByteArrayType; use arrow_array::*; use arrow_buffer::bit_util; use arrow_buffer::{buffer::buffer_bin_and, Buffer, MutableBuffer}; @@ -355,18 +354,16 @@ fn filter_array( Ok(Arc::new(filter_boolean(values, predicate))) } DataType::Utf8 => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(filter_string::(values, predicate))) + Ok(Arc::new(filter_bytes(as_string_array(values), predicate))) } DataType::LargeUtf8 => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(filter_string::(values, predicate))) + Ok(Arc::new(filter_bytes(as_largestring_array(values), predicate))) + } + DataType::Binary => { + Ok(Arc::new(filter_bytes(as_generic_binary_array::(values), predicate))) + } + DataType::LargeBinary => { + Ok(Arc::new(filter_bytes(as_generic_binary_array::(values), predicate))) } DataType::Dictionary(_, _) => downcast_dictionary_array! { values => Ok(Arc::new(filter_dict(values, predicate))), @@ -549,7 +546,7 @@ where /// used to build a new [`GenericStringArray`] by copying values from the source /// /// TODO(raphael): Could this be used for the take kernel as well? -struct FilterString<'a, OffsetSize> { +struct FilterBytes<'a, OffsetSize> { src_offsets: &'a [OffsetSize], src_values: &'a [u8], dst_offsets: MutableBuffer, @@ -557,15 +554,18 @@ struct FilterString<'a, OffsetSize> { cur_offset: OffsetSize, } -impl<'a, OffsetSize> FilterString<'a, OffsetSize> +impl<'a, OffsetSize> FilterBytes<'a, OffsetSize> where - OffsetSize: Zero + AddAssign + OffsetSizeTrait, + OffsetSize: OffsetSizeTrait, { - fn new(capacity: usize, array: &'a GenericStringArray) -> Self { + fn new(capacity: usize, array: &'a GenericByteArray) -> Self + where + T: ByteArrayType, + { let num_offsets_bytes = (capacity + 1) * std::mem::size_of::(); let mut dst_offsets = MutableBuffer::new(num_offsets_bytes); let dst_values = MutableBuffer::new(0); - let cur_offset = OffsetSize::zero(); + let cur_offset = OffsetSize::from_usize(0).unwrap(); dst_offsets.push(cur_offset); Self { @@ -626,17 +626,17 @@ where /// /// Note: NULLs with a non-zero slot length in `array` will have the corresponding /// data copied across. This allows handling the null mask separately from the data -fn filter_string( - array: &GenericStringArray, +fn filter_bytes( + array: &GenericByteArray, predicate: &FilterPredicate, -) -> GenericStringArray +) -> GenericByteArray where - OffsetSize: Zero + AddAssign + OffsetSizeTrait, + T: ByteArrayType, { let data = array.data(); assert_eq!(data.buffers().len(), 2); assert_eq!(data.child_data().len(), 0); - let mut filter = FilterString::new(predicate.count, array); + let mut filter = FilterBytes::new(predicate.count, array); match &predicate.strategy { IterationStrategy::SlicesIterator => { @@ -650,7 +650,7 @@ where IterationStrategy::All | IterationStrategy::None => unreachable!(), } - let mut builder = ArrayDataBuilder::new(data.data_type().clone()) + let mut builder = ArrayDataBuilder::new(T::DATA_TYPE) .len(predicate.count) .add_buffer(filter.dst_offsets.into()) .add_buffer(filter.dst_values.into()); @@ -660,7 +660,7 @@ where } let data = unsafe { builder.build_unchecked() }; - GenericStringArray::from(data) + GenericByteArray::from(data) } /// `filter` implementation for dictionaries From fed1f4e2cf317a86e28b1e207782ebbe2bf32e67 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Sat, 29 Oct 2022 15:10:10 +1300 Subject: [PATCH 2/3] Fix doc --- arrow-select/src/filter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index b989989cf919..1a7c293f1ba2 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -542,8 +542,8 @@ where PrimitiveArray::from(data) } -/// [`FilterString`] is created from a source [`GenericStringArray`] and can be -/// used to build a new [`GenericStringArray`] by copying values from the source +/// [`FilterBytes`] is created from a source [`GenericByteArray`] and can be +/// used to build a new [`GenericByteArray`] by copying values from the source /// /// TODO(raphael): Could this be used for the take kernel as well? struct FilterBytes<'a, OffsetSize> { From b7795e8cdf1b47c68b749206a8f8386ab535d8e8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 2 Nov 2022 07:12:29 +1300 Subject: [PATCH 3/3] Update comment --- arrow-select/src/filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 1a7c293f1ba2..4596afc8791f 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -622,7 +622,7 @@ where } } -/// `filter` implementation for string arrays +/// `filter` implementation for byte arrays /// /// Note: NULLs with a non-zero slot length in `array` will have the corresponding /// data copied across. This allows handling the null mask separately from the data