From 391789116d0055c914c8695c3aac1e3a7cbe511b Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 25 Mar 2022 21:15:44 +0800 Subject: [PATCH 1/4] add fn for list length code format Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 53 ++++++++++++++++++----------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 19be3369404b..36f667949c77 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -56,10 +56,23 @@ macro_rules! unary_offsets { }}; } -fn octet_length_binary( - array: &dyn Array, -) -> ArrayRef +fn length_list(array: &dyn Array) -> ArrayRef where + O: OffsetSizeTrait, + T: ArrowPrimitiveType, + T::Native: OffsetSizeTrait, +{ + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + unary_offsets!(array, T::DATA_TYPE, |x| x) +} + +fn length_binary(array: &dyn Array) -> ArrayRef +where + O: BinaryOffsetSizeTrait, + T: ArrowPrimitiveType, T::Native: BinaryOffsetSizeTrait, { let array = array @@ -69,10 +82,10 @@ where unary_offsets!(array, T::DATA_TYPE, |x| x) } -fn octet_length( - array: &dyn Array, -) -> ArrayRef +fn length_string(array: &dyn Array) -> ArrayRef where + O: StringOffsetSizeTrait, + T: ArrowPrimitiveType, T::Native: StringOffsetSizeTrait, { let array = array @@ -82,10 +95,10 @@ where unary_offsets!(array, T::DATA_TYPE, |x| x) } -fn bit_length_impl_binary( - array: &dyn Array, -) -> ArrayRef +fn bit_length_binary(array: &dyn Array) -> ArrayRef where + O: BinaryOffsetSizeTrait, + T: ArrowPrimitiveType, T::Native: BinaryOffsetSizeTrait, { let array = array @@ -96,10 +109,10 @@ where unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) } -fn bit_length_impl( - array: &dyn Array, -) -> ArrayRef +fn bit_length_string(array: &dyn Array) -> ArrayRef where + O: StringOffsetSizeTrait, + T: ArrowPrimitiveType, T::Native: StringOffsetSizeTrait, { let array = array @@ -117,10 +130,10 @@ where /// * length is in number of bytes pub fn length(array: &dyn Array) -> Result { match array.data_type() { - DataType::Utf8 => Ok(octet_length::(array)), - DataType::LargeUtf8 => Ok(octet_length::(array)), - DataType::Binary => Ok(octet_length_binary::(array)), - DataType::LargeBinary => Ok(octet_length_binary::(array)), + DataType::Utf8 => Ok(length_string::(array)), + DataType::LargeUtf8 => Ok(length_string::(array)), + DataType::Binary => Ok(length_binary::(array)), + DataType::LargeBinary => Ok(length_binary::(array)), _ => Err(ArrowError::ComputeError(format!( "length not supported for {:?}", array.data_type() @@ -135,10 +148,10 @@ pub fn length(array: &dyn Array) -> Result { /// * bit_length is in number of bits pub fn bit_length(array: &dyn Array) -> Result { match array.data_type() { - DataType::Utf8 => Ok(bit_length_impl::(array)), - DataType::LargeUtf8 => Ok(bit_length_impl::(array)), - DataType::Binary => Ok(bit_length_impl_binary::(array)), - DataType::LargeBinary => Ok(bit_length_impl_binary::(array)), + DataType::Utf8 => Ok(bit_length_string::(array)), + DataType::LargeUtf8 => Ok(bit_length_string::(array)), + DataType::Binary => Ok(bit_length_binary::(array)), + DataType::LargeBinary => Ok(bit_length_binary::(array)), _ => Err(ArrowError::ComputeError(format!( "bit_length not supported for {:?}", array.data_type() From 1d5221f44022f733ba1246fc9bcfda5167ba89ab Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Fri, 25 Mar 2022 21:44:28 +0800 Subject: [PATCH 2/4] add list support into length function Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 36f667949c77..262f6f0e5bae 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -123,20 +123,24 @@ where unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) } -/// Returns an array of Int32/Int64 denoting the number of bytes in each value in the array. +/// Returns an array of Int32/Int64 denoting the length of each value in the array. +/// For list array, length is the number of elements in each list. +/// For string array and binary array, length is the number of bytes of each value. /// -/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray +/// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray and BinaryArray/LargeBinaryArray /// * length of null is null. /// * length is in number of bytes pub fn length(array: &dyn Array) -> Result { match array.data_type() { + DataType::List(_) => Ok(length_list::(array)), + DataType::LargeList(_) => Ok(length_list::(array)), DataType::Utf8 => Ok(length_string::(array)), DataType::LargeUtf8 => Ok(length_string::(array)), DataType::Binary => Ok(length_binary::(array)), DataType::LargeBinary => Ok(length_binary::(array)), - _ => Err(ArrowError::ComputeError(format!( + other => Err(ArrowError::ComputeError(format!( "length not supported for {:?}", - array.data_type() + other ))), } } From aa6a1388e1d8cc002375b8b7e3916a86287bcbac Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sat, 26 Mar 2022 11:09:27 +0800 Subject: [PATCH 3/4] add tests Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 70 +++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 262f6f0e5bae..ea25e6787ac4 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -156,15 +156,17 @@ pub fn bit_length(array: &dyn Array) -> Result { DataType::LargeUtf8 => Ok(bit_length_string::(array)), DataType::Binary => Ok(bit_length_binary::(array)), DataType::LargeBinary => Ok(bit_length_binary::(array)), - _ => Err(ArrowError::ComputeError(format!( + other => Err(ArrowError::ComputeError(format!( "bit_length not supported for {:?}", - array.data_type() + other ))), } } #[cfg(test)] mod tests { + use crate::datatypes::{Float32Type, Int8Type}; + use super::*; fn double_vec(v: Vec) -> Vec { @@ -199,6 +201,20 @@ mod tests { }}; } + macro_rules! length_list_helper { + ($offset_ty: ty, $result_ty: ty, $element_ty: ty, $value: expr, $expected: expr) => {{ + let array = + GenericListArray::<$offset_ty>::from_iter_primitive::<$element_ty, _, _>( + $value, + ); + let result = length(&array)?; + let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); + let expected: $result_ty = $expected.into(); + assert_eq!(expected.data(), result.data()); + Ok(()) + }}; + } + #[test] #[cfg_attr(miri, ignore)] // running forever fn length_test_string() -> Result<()> { @@ -247,6 +263,28 @@ mod tests { length_binary_helper!(i64, Int64Array, length, value, result) } + #[test] + fn length_test_list() -> Result<()> { + let value = vec![ + Some(vec![]), + Some(vec![Some(1), Some(2), Some(4)]), + Some(vec![Some(0)]), + ]; + let result: Vec = vec![0, 3, 1]; + length_list_helper!(i32, Int32Array, Int32Type, value, result) + } + + #[test] + fn length_test_large_list() -> Result<()> { + let value = vec![ + Some(vec![]), + Some(vec![Some(1.1), Some(2.2), Some(3.3)]), + Some(vec![None]), + ]; + let result: Vec = vec![0, 3, 1]; + length_list_helper!(i64, Int64Array, Float32Type, value, result) + } + type OptionStr = Option<&'static str>; fn length_null_cases_string() -> Vec<(Vec, usize, Vec>)> { @@ -310,6 +348,30 @@ mod tests { length_binary_helper!(i64, Int64Array, length, value, result) } + #[test] + fn length_null_list() -> Result<()> { + let value = vec![ + Some(vec![]), + None, + Some(vec![Some(1), None, Some(2), Some(4)]), + Some(vec![Some(0)]), + ]; + let result: Vec> = vec![Some(0), None, Some(4), Some(1)]; + length_list_helper!(i32, Int32Array, Int8Type, value, result) + } + + #[test] + fn length_null_large_list() -> Result<()> { + let value = vec![ + Some(vec![]), + None, + Some(vec![Some(1.1), None, Some(4.0)]), + Some(vec![Some(0.1)]), + ]; + let result: Vec> = vec![Some(0), None, Some(3), Some(1)]; + length_list_helper!(i64, Int64Array, Float32Type, value, result) + } + /// Tests that length is not valid for u64. #[test] fn length_wrong_type() { @@ -320,7 +382,7 @@ mod tests { /// Tests with an offset #[test] - fn length_offsets() -> Result<()> { + fn length_offsets_string() -> Result<()> { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); let result = length(b.as_ref())?; @@ -333,7 +395,7 @@ mod tests { } #[test] - fn binary_length_offsets() -> Result<()> { + fn length_offsets_binary() -> Result<()> { let value: Vec> = vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None]; let a = BinaryArray::from(value); From ae4bce010cabd230c2f4f64e6e3f734516cfd64d Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sat, 26 Mar 2022 14:18:29 +0800 Subject: [PATCH 4/4] update doc Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index ea25e6787ac4..60a48ef6a270 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -129,7 +129,6 @@ where /// /// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray and BinaryArray/LargeBinaryArray /// * length of null is null. -/// * length is in number of bytes pub fn length(array: &dyn Array) -> Result { match array.data_type() { DataType::List(_) => Ok(length_list::(array)),