Skip to content

Commit

Permalink
Support the length kernel on Binary Array (#1465)
Browse files Browse the repository at this point in the history
* support length on binary array (not yet tested)
rewrite unary_offsets using a macro

Signed-off-by: remzi <13716567376yh@gmail.com>

* add tests

Signed-off-by: remzi <13716567376yh@gmail.com>

* add non-utf8 test cases

Signed-off-by: remzi <13716567376yh@gmail.com>

* fix some doc

Signed-off-by: remzi <13716567376yh@gmail.com>

* update doc
simplify the way offsets are obtained, with no performance penalty

Signed-off-by: remzi <13716567376yh@gmail.com>
  • Loading branch information
HaoYang670 authored Mar 24, 2022
1 parent 1c25580 commit 2932da3
Showing 1 changed file with 178 additions and 64 deletions.
242 changes: 178 additions & 64 deletions arrow/src/compute/kernels/length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,59 +15,58 @@
// specific language governing permissions and limitations
// under the License.

//! Defines kernel for length of a string array
//! Defines kernel for length of string arrays and binary arrays

use crate::{
array::*,
buffer::Buffer,
datatypes::{ArrowNativeType, ArrowPrimitiveType},
};
use crate::{array::*, buffer::Buffer, datatypes::ArrowPrimitiveType};
use crate::{
datatypes::{DataType, Int32Type, Int64Type},
error::{ArrowError, Result},
};

fn unary_offsets_string<O, F>(
array: &GenericStringArray<O>,
data_type: DataType,
op: F,
/// Builds an array holding one value per element of `$array`, obtained by applying
/// `$op` to the difference between each pair of consecutive value offsets (i.e. the
/// byte length of that element).
///
/// * `$array` - an array exposing `value_offsets()`, `data_ref()`, `offset()` and `len()`
/// * `$data_type` - the `DataType` recorded on the resulting array
/// * `$op` - a callable mapping an offset difference to the value stored in the output
///
/// The input's validity bitmap, if present, is re-sliced from `$array.offset()` for
/// `$array.len()` bits, so the output is built with an offset of zero.
macro_rules! unary_offsets {
    ($array: expr, $data_type: expr, $op: expr) => {{
        let offsets = $array.value_offsets();

        // Every adjacent pair of offsets delimits exactly one value.
        let mapped = offsets.windows(2).map(|pair| $op(pair[1] - pair[0]));

        // SAFETY: `mapped` is a `map` over `slice::windows`, whose length is known
        // exactly up front, so the trusted-length requirement holds.
        // The original author reports a ~60% speedup from using the trusted-length path.
        let values = unsafe { Buffer::from_trusted_len_iter(mapped) };

        let validity = $array
            .data_ref()
            .null_buffer()
            .map(|bits| bits.bit_slice($array.offset(), $array.len()));

        // NOTE(review): presumably sound because `values` holds `$array.len()` entries,
        // which assumes `value_offsets()` yields `len + 1` offsets -- TODO confirm.
        let data = unsafe {
            ArrayData::new_unchecked(
                $data_type,
                $array.len(),
                None,
                validity,
                0,
                vec![values],
                vec![],
            )
        };
        make_array(data)
    }};
}

fn octet_length_binary<O: BinaryOffsetSizeTrait, T: ArrowPrimitiveType>(
array: &dyn Array,
) -> ArrayRef
where
O: StringOffsetSizeTrait + ArrowNativeType,
F: Fn(O) -> O,
T::Native: BinaryOffsetSizeTrait,
{
// note: offsets are stored as u8, but they can be interpreted as OffsetSize
let offsets = &array.data_ref().buffers()[0];
// this is a 30% improvement over iterating over u8s and building OffsetSize, which
// justifies the usage of `unsafe`.
let slice: &[O] = &unsafe { offsets.typed_data::<O>() }[array.offset()..];

let lengths = slice.windows(2).map(|offset| op(offset[1] - offset[0]));

// JUSTIFICATION
// Benefit
// ~60% speedup
// Soundness
// `values` come from a slice iterator with a known size.
let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };

let null_bit_buffer = array
.data_ref()
.null_buffer()
.map(|b| b.bit_slice(array.offset(), array.len()));

let data = unsafe {
ArrayData::new_unchecked(
data_type,
array.len(),
None,
null_bit_buffer,
0,
vec![buffer],
vec![],
)
};
make_array(data)
let array = array
.as_any()
.downcast_ref::<GenericBinaryArray<O>>()
.unwrap();
unary_offsets!(array, T::DATA_TYPE, |x| x)
}

fn octet_length<O: StringOffsetSizeTrait, T: ArrowPrimitiveType>(
Expand All @@ -80,7 +79,21 @@ where
.as_any()
.downcast_ref::<GenericStringArray<O>>()
.unwrap();
unary_offsets_string::<O, _>(array, T::DATA_TYPE, |x| x)
unary_offsets!(array, T::DATA_TYPE, |x| x)
}

/// Computes the length in bits of every value of a binary array.
///
/// `array` is downcast to `GenericBinaryArray<O>`; the call panics if it is any other
/// array type. The result is built with data type `T::DATA_TYPE`.
fn bit_length_impl_binary<O: BinaryOffsetSizeTrait, T: ArrowPrimitiveType>(
    array: &dyn Array,
) -> ArrayRef
where
    T::Native: BinaryOffsetSizeTrait,
{
    let binary = array
        .as_any()
        .downcast_ref::<GenericBinaryArray<O>>()
        .unwrap();
    // Byte lengths are scaled by 8, expressed in the offset type so the
    // multiplication stays within `O`.
    let bits_per_byte = O::from_usize(8).unwrap();
    unary_offsets!(binary, T::DATA_TYPE, |byte_len| byte_len * bits_per_byte)
}

fn bit_length_impl<O: StringOffsetSizeTrait, T: ArrowPrimitiveType>(
Expand All @@ -94,34 +107,38 @@ where
.downcast_ref::<GenericStringArray<O>>()
.unwrap();
let bits_in_bytes = O::from_usize(8).unwrap();
unary_offsets_string::<O, _>(array, T::DATA_TYPE, |x| x * bits_in_bytes)
unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes)
}

/// Returns an array of Int32/Int64 denoting the number of bytes in each string in the array.
/// Returns an array of Int32/Int64 denoting the number of bytes in each value in the array.
///
/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8
/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray
/// * length of null is null.
/// * length is in number of bytes
pub fn length(array: &dyn Array) -> Result<ArrayRef> {
    let data_type = array.data_type();

    // Dispatch on the logical type: the 32-bit offset variants produce Int32 lengths,
    // the "large" 64-bit offset variants produce Int64 lengths.
    let lengths = match data_type {
        DataType::Utf8 => octet_length::<i32, Int32Type>(array),
        DataType::LargeUtf8 => octet_length::<i64, Int64Type>(array),
        DataType::Binary => octet_length_binary::<i32, Int32Type>(array),
        DataType::LargeBinary => octet_length_binary::<i64, Int64Type>(array),
        _ => {
            // Any other type is reported as a compute error rather than a panic.
            return Err(ArrowError::ComputeError(format!(
                "length not supported for {:?}",
                data_type
            )));
        }
    };

    Ok(lengths)
}

/// Returns an array of Int32/Int64 denoting the number of bits in each string in the array.
/// Returns an array of Int32/Int64 denoting the number of bits in each value in the array.
///
/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8
/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray
/// * bit_length of null is null.
/// * bit_length is in number of bits
pub fn bit_length(array: &dyn Array) -> Result<ArrayRef> {
match array.data_type() {
DataType::Utf8 => Ok(bit_length_impl::<i32, Int32Type>(array)),
DataType::LargeUtf8 => Ok(bit_length_impl::<i64, Int64Type>(array)),
DataType::Binary => Ok(bit_length_impl_binary::<i32, Int32Type>(array)),
DataType::LargeBinary => Ok(bit_length_impl_binary::<i64, Int64Type>(array)),
_ => Err(ArrowError::ComputeError(format!(
"bit_length not supported for {:?}",
array.data_type()
Expand All @@ -133,11 +150,11 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef> {
mod tests {
use super::*;

fn length_cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
[&v[..], &v[..]].concat()
}
/// Returns a new vector holding the elements of `v` followed by a second copy of them.
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
    v.iter().chain(v.iter()).cloned().collect()
}

fn length_cases_string() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
// a large array
let mut values = vec!["one", "on", "o", ""];
let mut expected = vec![3, 2, 1, 0];
Expand All @@ -154,10 +171,21 @@ mod tests {
]
}

/// Runs `$kernel` on a `GenericBinaryArray<$offset_ty>` built from `$value` and asserts
/// that the output, downcast to `$result_ty`, has the same array data as `$expected`.
///
/// The expansion uses `?` and evaluates to `Ok(())`, so it is intended to be the tail
/// expression of a test function returning `Result<()>`.
macro_rules! length_binary_helper {
    ($offset_ty: ty, $result_ty: ty, $kernel: ident, $value: expr, $expected: expr) => {{
        let input = GenericBinaryArray::<$offset_ty>::from($value);
        let output = $kernel(&input)?;
        // Panics if the kernel produced a different primitive type than requested.
        let actual = output.as_any().downcast_ref::<$result_ty>().unwrap();
        let wanted: $result_ty = $expected.into();
        assert_eq!(wanted.data(), actual.data());
        Ok(())
    }};
}

#[test]
#[cfg_attr(miri, ignore)] // running forever
fn length_test_string() -> Result<()> {
length_cases()
length_cases_string()
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = StringArray::from(input);
Expand All @@ -174,7 +202,7 @@ mod tests {
#[test]
#[cfg_attr(miri, ignore)] // running forever
fn length_test_large_string() -> Result<()> {
length_cases()
length_cases_string()
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
Expand All @@ -188,9 +216,23 @@ mod tests {
})
}

/// `length` on a `BinaryArray` without nulls, including a value that is not valid UTF-8.
#[test]
fn length_test_binary() -> Result<()> {
    let input: Vec<&[u8]> = vec![b"zero", b"one", &[0xff, 0xf8]];
    let expected: Vec<i32> = vec![4, 3, 2];
    length_binary_helper!(i32, Int32Array, length, input, expected)
}

/// `length` on a `LargeBinaryArray` without nulls; lengths are reported as `i64`.
#[test]
fn length_test_large_binary() -> Result<()> {
    let input: Vec<&[u8]> = vec![b"zero", &[0xff, 0xf8], b"two"];
    let expected: Vec<i64> = vec![4, 2, 3];
    length_binary_helper!(i64, Int64Array, length, input, expected)
}

type OptionStr = Option<&'static str>;

fn length_null_cases() -> Vec<(Vec<OptionStr>, usize, Vec<Option<i32>>)> {
fn length_null_cases_string() -> Vec<(Vec<OptionStr>, usize, Vec<Option<i32>>)> {
vec![(
vec![Some("one"), None, Some("three"), Some("four")],
4,
Expand All @@ -200,7 +242,7 @@ mod tests {

#[test]
fn length_null_string() -> Result<()> {
length_null_cases()
length_null_cases_string()
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = StringArray::from(input);
Expand All @@ -216,7 +258,7 @@ mod tests {

#[test]
fn length_null_large_string() -> Result<()> {
length_null_cases()
length_null_cases_string()
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
Expand All @@ -235,6 +277,22 @@ mod tests {
})
}

/// `length` on a `BinaryArray` containing a null: the null slot stays null in the output.
#[test]
fn length_null_binary() -> Result<()> {
    let input: Vec<Option<&[u8]>> =
        vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")];
    let expected: Vec<Option<i32>> = vec![Some(4), None, Some(2), Some(5)];
    length_binary_helper!(i32, Int32Array, length, input, expected)
}

/// `length` on a `LargeBinaryArray` containing a null: the null slot stays null.
#[test]
fn length_null_large_binary() -> Result<()> {
    let input: Vec<Option<&[u8]>> =
        vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")];
    let expected: Vec<Option<i64>> = vec![Some(2), None, Some(3), Some(5)];
    length_binary_helper!(i64, Int64Array, length, input, expected)
}

/// Tests that length is not valid for u64.
#[test]
fn length_wrong_type() {
Expand All @@ -257,11 +315,22 @@ mod tests {
Ok(())
}

fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
[&v[..], &v[..]].concat()
}
/// `length` on a sliced `BinaryArray`: only the three elements starting at index 1
/// must be measured, and the trailing null must be preserved.
#[test]
fn binary_length_offsets() -> Result<()> {
    let values: Vec<Option<&[u8]>> =
        vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None];
    let full = BinaryArray::from(values);
    let sliced = full.slice(1, 3);

    let lengths = length(sliced.as_ref())?;
    let actual: &Int32Array = as_primitive_array(&lengths);

    let expected = Int32Array::from(vec![Some(1), Some(2), None]);
    assert_eq!(&expected, actual);

    Ok(())
}

fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
// a large array
let mut values = vec!["one", "on", "o", ""];
let mut expected = vec![24, 16, 8, 0];
Expand Down Expand Up @@ -312,6 +381,20 @@ mod tests {
})
}

/// `bit_length` on a `BinaryArray` without nulls: each result is eight times the byte length.
#[test]
fn bit_length_binary() -> Result<()> {
    let input: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"];
    let bits: Vec<i32> = vec![24, 16, 40];
    length_binary_helper!(i32, Int32Array, bit_length, input, bits)
}

/// `bit_length` on a `LargeBinaryArray` without nulls; results are reported as `i64`.
#[test]
fn bit_length_large_binary() -> Result<()> {
    let input: Vec<&[u8]> = vec![b"zero", b" ", &[0xff, 0xf8]];
    let bits: Vec<i64> = vec![32, 8, 16];
    length_binary_helper!(i64, Int64Array, bit_length, input, bits)
}

fn bit_length_null_cases() -> Vec<(Vec<OptionStr>, usize, Vec<Option<i32>>)> {
vec![(
vec![Some("one"), None, Some("three"), Some("four")],
Expand Down Expand Up @@ -357,6 +440,22 @@ mod tests {
})
}

/// `bit_length` on a `BinaryArray` containing a null: the null slot stays null.
#[test]
fn bit_length_null_binary() -> Result<()> {
    let input: Vec<Option<&[u8]>> =
        vec![Some(b"one"), None, Some(b"three"), Some(&[0xff, 0xf8])];
    let bits: Vec<Option<i32>> = vec![Some(24), None, Some(40), Some(16)];
    length_binary_helper!(i32, Int32Array, bit_length, input, bits)
}

/// `bit_length` on a `LargeBinaryArray` containing a null: the null slot stays null.
#[test]
fn bit_length_null_large_binary() -> Result<()> {
    let input: Vec<Option<&[u8]>> =
        vec![Some(b"one"), None, Some(&[0xff, 0xf8]), Some(b"four")];
    let bits: Vec<Option<i64>> = vec![Some(24), None, Some(16), Some(32)];
    length_binary_helper!(i64, Int64Array, bit_length, input, bits)
}

/// Tests that bit_length is not valid for u64.
#[test]
fn bit_length_wrong_type() {
Expand All @@ -367,7 +466,7 @@ mod tests {

/// Tests with an offset
#[test]
fn bit_length_offsets() -> Result<()> {
fn bit_length_offsets_string() -> Result<()> {
let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]);
let b = a.slice(1, 3);
let result = bit_length(b.as_ref())?;
Expand All @@ -378,4 +477,19 @@ mod tests {

Ok(())
}

/// `bit_length` on a sliced `BinaryArray`: the window starts at an empty value, so the
/// first result is zero, and the trailing null must be preserved.
#[test]
fn bit_length_offsets_binary() -> Result<()> {
    let values: Vec<Option<&[u8]>> =
        vec![Some(b"hello"), Some(&[]), Some(b"world"), None];
    let full = BinaryArray::from(values);
    let sliced = full.slice(1, 3);

    let bits = bit_length(sliced.as_ref())?;
    let actual: &Int32Array = as_primitive_array(&bits);

    let expected = Int32Array::from(vec![Some(0), Some(40), None]);
    assert_eq!(&expected, actual);

    Ok(())
}
}

0 comments on commit 2932da3

Please sign in to comment.