diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 187f5b8e6f96..f31bc1c785b9 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -16,19 +16,22 @@ // under the License. use crate::array::print_long_array; -use crate::builder::GenericByteViewBuilder; +use crate::builder::{ArrayBuilder, GenericByteViewBuilder}; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; -use crate::{Array, ArrayAccessor, ArrayRef, Scalar}; -use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer}; +use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar}; +use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder, ByteView}; use arrow_schema::{ArrowError, DataType}; +use num::ToPrimitive; use std::any::Any; use std::fmt::Debug; use std::marker::PhantomData; use std::sync::Arc; +use super::ByteArrayType; + /// [Variable-size Binary View Layout]: An array of variable length bytes view arrays. /// /// Different than [`crate::GenericByteArray`] as it stores both an offset and length @@ -429,6 +432,51 @@ impl From for GenericByteViewArray { } } +/// Convert a [`GenericByteArray`] to a [`GenericByteViewArray`] but in a smart way: +/// If the offsets are all less than u32::MAX, then we directly build the view array on top of existing buffer. 
+impl From<&GenericByteArray> for GenericByteViewArray +where + FROM: ByteArrayType, + FROM::Offset: OffsetSizeTrait + ToPrimitive, + V: ByteViewType, +{ + fn from(byte_array: &GenericByteArray) -> Self { + let offsets = byte_array.offsets(); + + let can_reuse_buffer = match offsets.last() { + Some(offset) => offset.as_usize() < u32::MAX as usize, + None => true, + }; + + if can_reuse_buffer { + let len = byte_array.len(); + let mut views_builder = GenericByteViewBuilder::::with_capacity(len); + let str_values_buf = byte_array.values().clone(); + let block = views_builder.append_block(str_values_buf); + for (i, w) in offsets.windows(2).enumerate() { + let offset = w[0].as_usize(); + let end = w[1].as_usize(); + let length = end - offset; + + if byte_array.is_null(i) { + views_builder.append_null(); + } else { + // Safety: the input was a valid array, so its bytes are valid UTF-8 (if string), and + // all offsets were valid; the last offset was checked above to be below u32::MAX + unsafe { + views_builder.append_view_unchecked(block, offset as u32, length as u32) + } + } + } + assert_eq!(views_builder.len(), len); + views_builder.finish() + } else { + // TODO: the first u32::MAX can still be reused + GenericByteViewArray::::from_iter(byte_array.iter()) + } + } +} + impl From> for ArrayData { fn from(mut array: GenericByteViewArray) -> Self { let len = array.len(); diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 55f2ed72836b..f05708b75569 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -1219,7 +1219,7 @@ pub fn cast_with_options( let binary = BinaryArray::from(array.as_string::().clone()); cast_byte_container::(&binary) } - Utf8View => cast_byte_to_view::(array), + Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::()))), LargeUtf8 => cast_byte_container::(array), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { @@ -1279,7 +1279,7 @@ pub fn cast_with_options( LargeBinary => Ok(Arc::new(LargeBinaryArray::from( 
array.as_string::().clone(), ))), - Utf8View => cast_byte_to_view::(array), + Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::()))), Time32(TimeUnit::Second) => parse_string::(array, cast_options), Time32(TimeUnit::Millisecond) => { parse_string::(array, cast_options) @@ -1327,7 +1327,7 @@ pub fn cast_with_options( FixedSizeBinary(size) => { cast_binary_to_fixed_size_binary::(array, *size, cast_options) } - BinaryView => cast_byte_to_view::(array), + BinaryView => Ok(Arc::new(BinaryViewArray::from(array.as_binary::()))), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -1342,7 +1342,7 @@ pub fn cast_with_options( FixedSizeBinary(size) => { cast_binary_to_fixed_size_binary::(array, *size, cast_options) } - BinaryView => cast_byte_to_view::(array), + BinaryView => Ok(Arc::new(BinaryViewArray::from(array.as_binary::()))), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -2334,38 +2334,6 @@ where Ok(Arc::new(GenericByteArray::::from(array_data))) } -/// Helper function to cast from one `ByteArrayType` array to `ByteViewType` array. -fn cast_byte_to_view(array: &dyn Array) -> Result -where - FROM: ByteArrayType, - FROM::Offset: OffsetSizeTrait + ToPrimitive, - V: ByteViewType, -{ - let byte_array: &GenericByteArray = array.as_bytes(); - let len = array.len(); - let str_values_buf = byte_array.values().clone(); - let offsets = byte_array.offsets(); - - let mut views_builder = GenericByteViewBuilder::::with_capacity(len); - let block = views_builder.append_block(str_values_buf); - for (i, w) in offsets.windows(2).enumerate() { - let offset = w[0].to_u32().unwrap(); - let end = w[1].to_u32().unwrap(); - let length = end - offset; - - if byte_array.is_null(i) { - views_builder.append_null(); - } else { - // Safety: the input was a valid array so it valid UTF8 (if string). 
And - // all offsets were valid and we created the views correctly - unsafe { views_builder.append_view_unchecked(block, offset, length) } - } - } - - assert_eq!(views_builder.len(), len); - Ok(Arc::new(views_builder.finish())) -} - /// Helper function to cast from one `ByteViewType` array to `ByteArrayType` array. fn cast_view_to_byte(array: &dyn Array) -> Result where diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index c300f995283b..18f77a9275ce 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -540,98 +540,32 @@ impl<'a, T: ByteArrayType> ArrayOrd for &'a GenericByteArray { } } -/// Comparing two ByteView types are non-trivial. -/// It takes a bit of patience to understand why we don't just compare two &[u8] directly. -/// -/// ByteView types give us the following two advantages, and we need to be careful not to lose them: -/// (1) For string/byte smaller than 12 bytes, the entire data is inlined in the view. -/// Meaning that reading one array element requires only one memory access -/// (two memory access required for StringArray, one for offset buffer, the other for value buffer). -/// -/// (2) For string/byte larger than 12 bytes, we can still be faster than (for certain operations) StringArray/ByteArray, -/// thanks to the inlined 4 bytes. -/// Consider equality check: -/// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access). -/// If we are unlucky and the first four bytes are the same, we need to fallback to compare two full strings. impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { - /// Item.0 is the array, Item.1 is the index into the array. - /// Why don't we just store Item.0[Item.1] as the item? - /// - Because if we do so, we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary. 
- /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer, - /// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string. + /// This is the item type for the GenericByteViewArray::compare + /// Item.0 is the array, Item.1 is the index type Item = (&'a GenericByteViewArray, usize); - /// # Equality check flow - /// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view. - /// (2) if any of the string is larger than 12 bytes, we need to compare the full string. - /// (2.1) if the inlined 4 bytes are different, we can return false immediately. - /// (2.2) o.w., we need to compare the full string. - /// - /// # Safety - /// (1) Indexing. The Self::Item.1 encodes the index value, which is already checked in `value` function, - /// so it is safe to index into the views. - /// (2) Slice data from view. We know the bytes 4-8 are inlined data (per spec), so it is safe to slice from the view. fn is_eq(l: Self::Item, r: Self::Item) -> bool { + // # Safety + // The index is within bounds as it is checked in value() let l_view = unsafe { l.0.views().get_unchecked(l.1) }; let l_len = *l_view as u32; let r_view = unsafe { r.0.views().get_unchecked(r.1) }; let r_len = *r_view as u32; - + // This is a fast path for equality check. + // We don't need to look at the actual bytes to determine if they are equal. 
if l_len != r_len { return false; } - if l_len <= 12 { - let l_data = unsafe { GenericByteViewArray::::inline_value(l_view, l_len as usize) }; - let r_data = unsafe { GenericByteViewArray::::inline_value(r_view, r_len as usize) }; - l_data == r_data - } else { - let l_inlined_data = unsafe { GenericByteViewArray::::inline_value(l_view, 4) }; - let r_inlined_data = unsafe { GenericByteViewArray::::inline_value(r_view, 4) }; - if l_inlined_data != r_inlined_data { - return false; - } - - let l_full_data: &[u8] = unsafe { l.0.value_unchecked(l.1).as_ref() }; - let r_full_data: &[u8] = unsafe { r.0.value_unchecked(r.1).as_ref() }; - l_full_data == r_full_data - } + unsafe { compare_byte_view_unchecked(l.0, l.1, r.0, r.1).is_eq() } } - /// # Ordering check flow - /// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view. - /// (2) if any of the string is larger than 12 bytes, we need to compare the full string. - /// (2.1) if the inlined 4 bytes are different, we can return the result immediately. - /// (2.2) o.w., we need to compare the full string. - /// - /// # Safety - /// (1) Indexing. The Self::Item.1 encodes the index value, which is already checked in `value` function, - /// so it is safe to index into the views. - /// (2) Slice data from view. We know the bytes 4-8 are inlined data (per spec), so it is safe to slice from the view. 
fn is_lt(l: Self::Item, r: Self::Item) -> bool { - let l_view = l.0.views().get(l.1).unwrap(); - let l_len = *l_view as u32; - - let r_view = r.0.views().get(r.1).unwrap(); - let r_len = *r_view as u32; - - if l_len <= 12 && r_len <= 12 { - let l_data = unsafe { GenericByteViewArray::::inline_value(l_view, l_len as usize) }; - let r_data = unsafe { GenericByteViewArray::::inline_value(r_view, r_len as usize) }; - return l_data < r_data; - } - // one of the string is larger than 12 bytes, - // we then try to compare the inlined data first - let l_inlined_data = unsafe { GenericByteViewArray::::inline_value(l_view, 4) }; - let r_inlined_data = unsafe { GenericByteViewArray::::inline_value(r_view, 4) }; - if r_inlined_data != l_inlined_data { - return l_inlined_data < r_inlined_data; - } - // unfortunately, we need to compare the full data - let l_full_data: &[u8] = unsafe { l.0.value_unchecked(l.1).as_ref() }; - let r_full_data: &[u8] = unsafe { r.0.value_unchecked(r.1).as_ref() }; - l_full_data < r_full_data + // # Safety + // The index is within bounds as it is checked in value() + unsafe { compare_byte_view_unchecked(l.0, l.1, r.0, r.1).is_lt() } } fn len(&self) -> usize { @@ -663,6 +597,78 @@ impl<'a> ArrayOrd for &'a FixedSizeBinaryArray { } } +/// Compares two [`GenericByteViewArray`] at index `left_idx` and `right_idx` +pub fn compare_byte_view( + left: &GenericByteViewArray, + left_idx: usize, + right: &GenericByteViewArray, + right_idx: usize, +) -> std::cmp::Ordering { + assert!(left_idx < left.len()); + assert!(right_idx < right.len()); + unsafe { compare_byte_view_unchecked(left, left_idx, right, right_idx) } +} + +/// Comparing two [`GenericByteViewArray`] at index `left_idx` and `right_idx` +/// +/// Comparing two ByteView types are non-trivial. +/// It takes a bit of patience to understand why we don't just compare two &[u8] directly. 
+/// +/// ByteView types give us the following two advantages, and we need to be careful not to lose them: +/// (1) For strings/bytes of at most 12 bytes, the entire data is inlined in the view. +/// This means that reading one array element requires only one memory access +/// (two memory accesses are required for StringArray: one for the offset buffer, the other for the value buffer). +/// +/// (2) For strings/bytes longer than 12 bytes, we can still be faster than StringArray/ByteArray (for certain operations), +/// thanks to the inlined 4 bytes. +/// Consider an equality check: +/// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access). +/// +/// If we directly compared two &[u8], we would materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary. +/// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer, +/// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string. +/// +/// # Order check flow +/// (1) if both strings are at most 12 bytes long, we can directly compare the data inlined in the views. +/// (2) if either string is longer than 12 bytes, we may need to compare the full strings. +/// (2.1) if the inlined 4 bytes are different, we can return the result immediately. +/// (2.2) otherwise, we need to compare the full strings. 
+/// +/// # Safety +/// `left_idx` and `right_idx` must be within the bounds of `left` and `right` respectively +pub unsafe fn compare_byte_view_unchecked( + left: &GenericByteViewArray, + left_idx: usize, + right: &GenericByteViewArray, + right_idx: usize, +) -> std::cmp::Ordering { + let l_view = left.views().get_unchecked(left_idx); + let l_len = *l_view as u32; + + let r_view = right.views().get_unchecked(right_idx); + let r_len = *r_view as u32; + + if l_len <= 12 && r_len <= 12 { + let l_data = unsafe { GenericByteViewArray::::inline_value(l_view, l_len as usize) }; + let r_data = unsafe { GenericByteViewArray::::inline_value(r_view, r_len as usize) }; + return l_data.cmp(r_data); + } + + // at least one of the strings is longer than 12 bytes, + // so compare the 4 inlined bytes of each view first + let l_inlined_data = unsafe { GenericByteViewArray::::inline_value(l_view, 4) }; + let r_inlined_data = unsafe { GenericByteViewArray::::inline_value(r_view, 4) }; + if r_inlined_data != l_inlined_data { + return l_inlined_data.cmp(r_inlined_data); + } + + // the inlined bytes are equal, so the full values have to be compared + let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() }; + let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() }; + + l_full_data.cmp(r_full_data) +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 3825e5ec66f4..6430c8f0e405 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -135,6 +135,21 @@ fn compare_bytes( }) } +fn compare_byte_view( + left: &dyn Array, + right: &dyn Array, + opts: SortOptions, +) -> DynComparator { + let left = left.as_byte_view::(); + let right = right.as_byte_view::(); + + let l = left.clone(); + let r = right.clone(); + compare(left, right, opts, move |i, j| { + crate::cmp::compare_byte_view(&l, i, &r, j) + }) +} + fn compare_dict( left: &dyn Array, right: &dyn Array, @@ -342,8 +357,10 @@ pub fn make_comparator( (Boolean, Boolean) => Ok(compare_boolean(left, right, 
opts)), (Utf8, Utf8) => Ok(compare_bytes::(left, right, opts)), (LargeUtf8, LargeUtf8) => Ok(compare_bytes::(left, right, opts)), + (Utf8View, Utf8View) => Ok(compare_byte_view::(left, right, opts)), (Binary, Binary) => Ok(compare_bytes::(left, right, opts)), (LargeBinary, LargeBinary) => Ok(compare_bytes::(left, right, opts)), + (BinaryView, BinaryView) => Ok(compare_byte_view::(left, right, opts)), (FixedSizeBinary(_), FixedSizeBinary(_)) => { let left = left.as_fixed_size_binary(); let right = right.as_fixed_size_binary(); diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 8e1285493b0b..5dce771c85a9 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -135,6 +135,7 @@ use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_data::ArrayDataBuilder; use arrow_schema::*; +use variable::{decode_binary_view, decode_string_view}; use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive}; use crate::variable::{decode_binary, decode_string}; @@ -1079,6 +1080,9 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> Vec { .iter() .zip(lengths.iter_mut()) .for_each(|(slice, length)| *length += variable::encoded_len(slice)), + DataType::BinaryView => array.as_binary_view().iter().zip(lengths.iter_mut()).for_each(|(slice, length)| { + *length += variable::encoded_len(slice) + }), DataType::Utf8 => array.as_string::() .iter() .zip(lengths.iter_mut()) @@ -1091,11 +1095,14 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> Vec { .for_each(|(slice, length)| { *length += variable::encoded_len(slice.map(|x| x.as_bytes())) }), + DataType::Utf8View => array.as_string_view().iter().zip(lengths.iter_mut()).for_each(|(slice, length)| { + *length += variable::encoded_len(slice.map(|x| x.as_bytes())) + }), DataType::FixedSizeBinary(len) => { let len = len.to_usize().unwrap(); lengths.iter_mut().for_each(|x| *x += 1 + len) } - _ => unreachable!(), + _ => unimplemented!("unsupported data type: {}", array.data_type()), } } 
Encoder::Dictionary(values, null) => { @@ -1152,6 +1159,9 @@ fn encode_column( DataType::Binary => { variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) } + DataType::BinaryView => { + variable::encode(data, offsets, column.as_binary_view().iter(), opts) + } DataType::LargeBinary => { variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) } @@ -1167,11 +1177,16 @@ fn encode_column( .map(|x| x.map(|x| x.as_bytes())), opts, ), + DataType::Utf8View => variable::encode( + data, offsets, + column.as_string_view().iter().map(|x| x.map(|x| x.as_bytes())), + opts, + ), DataType::FixedSizeBinary(_) => { let array = column.as_any().downcast_ref().unwrap(); fixed::encode_fixed_size_binary(data, offsets, array, opts) } - _ => unreachable!(), + _ => unimplemented!("unsupported data type: {}", column.data_type()), } } Encoder::Dictionary(values, nulls) => { @@ -1255,11 +1270,12 @@ unsafe fn decode_column( DataType::Boolean => Arc::new(decode_bool(rows, options)), DataType::Binary => Arc::new(decode_binary::(rows, options)), DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), + DataType::BinaryView => Arc::new(decode_binary_view(rows, options)), DataType::FixedSizeBinary(size) => Arc::new(decode_fixed_size_binary(rows, size, options)), DataType::Utf8 => Arc::new(decode_string::(rows, options, validate_utf8)), DataType::LargeUtf8 => Arc::new(decode_string::(rows, options, validate_utf8)), - DataType::Dictionary(_, _) => todo!(), - _ => unreachable!() + DataType::Utf8View => Arc::new(decode_string_view(rows, options, validate_utf8)), + _ => return Err(ArrowError::NotYetImplemented(format!("unsupported data type: {}", data_type))) } } Codec::Dictionary(converter, _) => { @@ -2047,6 +2063,32 @@ mod tests { .collect() } + fn generate_string_view(len: usize, valid_percent: f64) -> StringViewArray { + let mut rng = thread_rng(); + (0..len) + .map(|_| { + rng.gen_bool(valid_percent).then(|| { + let len = 
rng.gen_range(0..100); + let bytes = (0..len).map(|_| rng.gen_range(0..128)).collect(); + String::from_utf8(bytes).unwrap() + }) + }) + .collect() + } + + fn generate_byte_view(len: usize, valid_percent: f64) -> BinaryViewArray { + let mut rng = thread_rng(); + (0..len) + .map(|_| { + rng.gen_bool(valid_percent).then(|| { + let len = rng.gen_range(0..100); + let bytes: Vec<_> = (0..len).map(|_| rng.gen_range(0..128)).collect(); + bytes + }) + }) + .collect() + } + fn generate_dictionary( values: ArrayRef, len: usize, @@ -2127,7 +2169,7 @@ mod tests { fn generate_column(len: usize) -> ArrayRef { let mut rng = thread_rng(); - match rng.gen_range(0..14) { + match rng.gen_range(0..16) { 0 => Arc::new(generate_primitive_array::(len, 0.8)), 1 => Arc::new(generate_primitive_array::(len, 0.8)), 2 => Arc::new(generate_primitive_array::(len, 0.8)), @@ -2161,6 +2203,8 @@ mod tests { 13 => Arc::new(generate_list(len, 0.8, |values_len| { Arc::new(generate_struct(values_len, 0.8)) })), + 14 => Arc::new(generate_string_view(len, 0.8)), + 15 => Arc::new(generate_byte_view(len, 0.8)), _ => unreachable!(), } } diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index 45068baf2a32..c5aa7d8ac323 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -268,3 +268,30 @@ pub unsafe fn decode_string( // Row data must have come from a valid UTF-8 array GenericStringArray::from(builder.build_unchecked()) } + +/// Decodes a binary view array from `rows` with the provided `options` +pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray { + let decoded: GenericBinaryArray = decode_binary(rows, options); + + // Better performance might be to directly build the binary view instead of building to BinaryArray and then casting + // I suspect that the overhead is not a big deal. 
+ // If it is, we can reimplement the `decode_binary_view` function to directly build the BinaryViewArray + BinaryViewArray::from(&decoded) +} + +/// Decodes a string view array from `rows` with the provided `options` +/// +/// # Safety +/// +/// The row must contain valid UTF-8 data +pub unsafe fn decode_string_view( + rows: &mut [&[u8]], + options: SortOptions, + validate_utf8: bool, +) -> StringViewArray { + let decoded: GenericStringArray = decode_string(rows, options, validate_utf8); + // Better performance might be to directly build the string view instead of building to StringArray and then casting + // I suspect that the overhead is not a big deal. + // If it is, we can reimplement the `decode_string_view` function to directly build the StringViewArray + StringViewArray::from(&decoded) +} diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 1e081d141a0a..7ec6402c0a97 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -171,6 +171,26 @@ fn add_benchmark(c: &mut Criterion) { }) }); + c.bench_function("lt scalar StringViewArray", |b| { + b.iter(|| { + lt( + &Scalar::new(StringViewArray::from_iter_values(["xxxx"])), + &string_view_left, + ) + .unwrap() + }) + }); + + c.bench_function("lt scalar StringArray", |b| { + b.iter(|| { + lt( + &Scalar::new(StringArray::from_iter_values(["xxxx"])), + &string_left, + ) + .unwrap() + }) + }); + c.bench_function("eq scalar StringViewArray", |b| { b.iter(|| { eq(