diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index 8808c409062f..0592c048c030 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -891,10 +891,18 @@ mod tests { assert!(binary_array.is_valid(i)); assert!(!binary_array.is_null(i)); } + } + + #[test] + fn test_binary_array_with_offsets() { + let values: [u8; 12] = [ + b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', + ]; + let offsets: [i32; 4] = [0, 5, 5, 12]; // Test binary array with offset let array_data = ArrayData::builder(DataType::Binary) - .len(4) + .len(2) .offset(1) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) @@ -947,10 +955,18 @@ mod tests { assert!(binary_array.is_valid(i)); assert!(!binary_array.is_null(i)); } + } + + #[test] + fn test_large_binary_array_with_offsets() { + let values: [u8; 12] = [ + b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', + ]; + let offsets: [i64; 4] = [0, 5, 5, 12]; // Test binary array with offset let array_data = ArrayData::builder(DataType::LargeBinary) - .len(4) + .len(2) .offset(1) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) @@ -1196,26 +1212,25 @@ mod tests { #[test] #[should_panic( - expected = "FixedSizeBinaryArray can only be created from list array of u8 values \ - (i.e. FixedSizeList>)." + expected = "FixedSizeBinaryArray can only be created from FixedSizeList arrays" )] fn test_fixed_size_binary_array_from_incorrect_list_array() { let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; let values_data = ArrayData::builder(DataType::UInt32) .len(12) .add_buffer(Buffer::from_slice_ref(&values)) - .add_child_data(ArrayData::builder(DataType::Boolean).build().unwrap()) .build() .unwrap(); - let array_data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Binary, false)), - 4, - )) - .len(3) - .add_child_data(values_data) - .build() - .unwrap(); + let array_data = unsafe { + ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Binary, false)), + 4, + )) + .len(3) + .add_child_data(values_data) + .build_unchecked() + }; let list_array = FixedSizeListArray::from(array_data); drop(FixedSizeBinaryArray::from(list_array)); } diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs index d98ad5f9d8c2..3b1959431bfc 100644 --- a/arrow/src/array/array_boolean.rs +++ b/arrow/src/array/array_boolean.rs @@ -332,10 +332,11 @@ mod tests { #[should_panic(expected = "BooleanArray data should contain a single buffer only \ (values buffer)")] fn test_boolean_array_invalid_buffer_len() { - let data = ArrayData::builder(DataType::Boolean) - .len(5) - .build() - .unwrap(); + let data = unsafe { + ArrayData::builder(DataType::Boolean) + .len(5) + .build_unchecked() + }; drop(BooleanArray::from(data)); } } diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs index fb8d007daa55..7de357a1dc20 100644 --- a/arrow/src/array/array_list.rs +++ b/arrow/src/array/array_list.rs @@ -552,9 +552,10 @@ mod tests { assert!(!list_array.is_null(i)); } - // Now test with a non-zero offset + // Now test with a non-zero offset (skip first element) + // [[3, 4, 5], [6, 7]] let list_data = ArrayData::builder(list_data_type) - .len(3) + .len(2) .offset(1) .add_buffer(value_offsets) .add_child_data(value_data.clone()) @@ -565,7 +566,7 @@ mod tests { let values = list_array.values(); assert_eq!(&value_data, 
values.data()); assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); + assert_eq!(2, list_array.len()); assert_eq!(0, list_array.null_count()); assert_eq!(6, list_array.value_offsets()[1]); assert_eq!(2, list_array.value_length(1)); @@ -642,8 +643,9 @@ mod tests { } // Now test with a non-zero offset + // [[3, 4, 5], [6, 7]] let list_data = ArrayData::builder(list_data_type) - .len(3) + .len(2) .offset(1) .add_buffer(value_offsets) .add_child_data(value_data.clone()) @@ -654,7 +656,7 @@ mod tests { let values = list_array.values(); assert_eq!(&value_data, values.data()); assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); + assert_eq!(2, list_array.len()); assert_eq!(0, list_array.null_count()); assert_eq!(6, list_array.value_offsets()[1]); assert_eq!(2, list_array.value_length(1)); @@ -763,11 +765,12 @@ mod tests { Box::new(Field::new("item", DataType::Int32, false)), 3, ); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_child_data(value_data) - .build() - .unwrap(); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build_unchecked() + }; drop(FixedSizeListArray::from(list_data)); } @@ -1038,18 +1041,20 @@ mod tests { expected = "ListArray data should contain a single buffer only (value offsets)" )] fn test_list_array_invalid_buffer_len() { - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build() - .unwrap(); + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .build_unchecked() + }; let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_child_data(value_data) - .build() - .unwrap(); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build_unchecked() + }; drop(ListArray::from(list_data)); } @@ -1061,11 +1066,12 @@ mod tests { let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]); let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .build() - .unwrap(); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .build_unchecked() + }; drop(ListArray::from(list_data)); } @@ -1112,18 +1118,20 @@ mod tests { let buf2 = buf.slice(1); let values: [i32; 8] = [0; 8]; - let value_data = ArrayData::builder(DataType::Int32) - .add_buffer(Buffer::from_slice_ref(&values)) - .build() - .unwrap(); + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(Buffer::from_slice_ref(&values)) + .build_unchecked() + }; let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .add_buffer(buf2) - .add_child_data(value_data) - .build() - .unwrap(); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .add_buffer(buf2) + .add_child_data(value_data) + .build_unchecked() + }; drop(ListArray::from(list_data)); } diff --git a/arrow/src/array/array_map.rs b/arrow/src/array/array_map.rs index bd888ff83e9b..caccebcc56e3 100644 --- a/arrow/src/array/array_map.rs +++ b/arrow/src/array/array_map.rs @@ -320,7 +320,7 @@ mod tests { // Now test 
with a non-zero offset let map_data = ArrayData::builder(map_array.data_type().clone()) - .len(3) + .len(2) .offset(1) .add_buffer(map_array.data().buffers()[0].clone()) .add_child_data(map_array.data().child_data()[0].clone()) @@ -331,7 +331,7 @@ mod tests { let values = map_array.values(); assert_eq!(&value_data, values.data()); assert_eq!(DataType::UInt32, map_array.value_type()); - assert_eq!(3, map_array.len()); + assert_eq!(2, map_array.len()); assert_eq!(0, map_array.null_count()); assert_eq!(6, map_array.value_offsets()[1]); assert_eq!(2, map_array.value_length(1)); diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 0b88ce40c7ff..ac56b4a97cc7 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -894,7 +894,7 @@ mod tests { #[test] fn test_primitive_array_builder() { // Test building a primitive array with ArrayData builder and offset - let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 4]); + let buf = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4, 5, 6]); let buf2 = buf.clone(); let data = ArrayData::builder(DataType::Int32) .len(5) @@ -950,7 +950,15 @@ mod tests { #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \ (values buffer)")] fn test_primitive_array_invalid_buffer_len() { - let data = ArrayData::builder(DataType::Int32).len(5).build().unwrap(); + let buffer = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4]); + let data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(buffer.clone()) + .add_buffer(buffer) + .len(5) + .build_unchecked() + }; + drop(Int32Array::from(data)); } diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs index 6460e072646b..56efcfb30c75 100644 --- a/arrow/src/array/array_union.rs +++ b/arrow/src/array/array_union.rs @@ -137,7 +137,10 @@ impl UnionArray { } } - Ok(Self::new(type_ids, value_offsets, child_arrays, bitmap)) + let new_self = Self::new(type_ids, value_offsets, child_arrays, bitmap); + new_self.data().validate()?; + + Ok(new_self) } /// Accesses the child array for `type_id`. diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index dbc54342b034..3ca14d85b9db 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -18,11 +18,12 @@ //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates //! common attributes and operations for Arrow array. +use std::convert::TryInto; use std::mem; use std::sync::Arc; use crate::datatypes::{DataType, IntervalUnit}; -use crate::error::Result; +use crate::error::{ArrowError, Result}; use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; use crate::{ buffer::{Buffer, MutableBuffer}, @@ -189,6 +190,28 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff } } +/// Ensures that at least `min_size` elements of type `data_type` can +/// be stored in a buffer of `buffer_size`. +/// +/// `buffer_index` is used in error messages to identify which buffer +/// had the invalid index +fn ensure_size( + data_type: &DataType, + min_size: usize, + buffer_size: usize, + buffer_index: usize, +) -> Result<()> { + // if min_size is zero, may not have buffers (e.g. 
NullArray) + if min_size > 0 && buffer_size < min_size { + Err(ArrowError::InvalidArgumentError(format!( + "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}", + buffer_size, buffer_index, data_type, min_size + ))) + } else { + Ok(()) + } +} + /// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`. #[inline] pub(crate) fn into_buffers( @@ -313,18 +336,6 @@ impl ArrayData { Ok(new_self) } - /// Validates that buffers in this ArrayData are sufficiently - /// sized, to store `len` + `offset` total elements of - /// `data_type`. - /// - /// This check is "cheap" in the sense that it does not validate the - /// contents of the buffers (e.g. that string offsets for UTF8 arrays - /// are within the length of the buffer). - pub fn validate(&self) -> Result<()> { - // will be filled in a subsequent PR - Ok(()) - } - /// Returns a builder to construct a `ArrayData` instance. #[inline] pub const fn builder(data_type: DataType) -> ArrayDataBuilder { @@ -559,6 +570,437 @@ impl ArrayData { ) } } + + /// "cheap" validation of an `ArrayData`. Ensures buffers are + /// sufficiently sized to store `len` + `offset` total elements of + /// `data_type` and performs other inexpensive consistency checks. + /// + /// This check is "cheap" in the sense that it does not validate the + /// contents of the buffers (e.g. that all offsets for UTF8 arrays + /// are within the bounds of the values buffer). + /// + /// TODO: add a validate_full that validates the offsets and valid utf8 data + pub fn validate(&self) -> Result<()> { + // Need at least this mich space in each buffer + let len_plus_offset = self.len + self.offset; + + // Check that the data layout conforms to the spec + let layout = layout(&self.data_type); + + // Will validate Union when conforms to new spec: + // https://github.com/apache/arrow-rs/issues/85 + if matches!(&self.data_type, DataType::Union(_)) { + return Ok(()); + } + if self.buffers.len() != layout.buffers.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected {} buffers in array of type {:?}, got {}", + layout.buffers.len(), + self.data_type, + self.buffers.len(), + ))); + } + + for (i, (buffer, spec)) in + self.buffers.iter().zip(layout.buffers.iter()).enumerate() + { + match spec { + BufferSpec::FixedWidth { byte_width } => { + let min_buffer_size = len_plus_offset + .checked_mul(*byte_width) + .expect("integer overflow computing min buffer size"); + + if buffer.len() < min_buffer_size { + return Err(ArrowError::InvalidArgumentError(format!( + "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}", + min_buffer_size, i, self.data_type, buffer.len() + ))); + } + } + BufferSpec::VariableWidth => { + // not cheap to validate (need to look at the + // data). Partially checked in validate_offsets + // called below. 
Can check with `validate_full` + } + BufferSpec::BitMap => { + let min_buffer_size = bit_util::ceil(len_plus_offset, 8); + if buffer.len() < min_buffer_size { + return Err(ArrowError::InvalidArgumentError(format!( + "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}", + min_buffer_size, i, self.data_type, buffer.len() + ))); + } + } + BufferSpec::AlwaysNull => { + // Nothing to validate + } + } + } + + if self.null_count > self.len { + return Err(ArrowError::InvalidArgumentError(format!( + "null_count {} for an array exceeds length of {} elements", + self.null_count, self.len + ))); + } + + // check null bit buffer size + if let Some(null_bit_buffer) = self.null_bitmap.as_ref() { + let needed_len = bit_util::ceil(len_plus_offset, 8); + if null_bit_buffer.len() < needed_len { + return Err(ArrowError::InvalidArgumentError(format!( + "null_bit_buffer size too small. got {} needed {}", + null_bit_buffer.len(), + needed_len + ))); + } + } else if self.null_count > 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "Array of type {} has {} nulls but no null bitmap", + self.data_type, self.null_count + ))); + } + + self.validate_child_data()?; + + // Additional Type specific checks + match &self.data_type { + DataType::Utf8 | DataType::Binary => { + self.validate_offsets::(&self.buffers[0], self.buffers[1].len())?; + } + DataType::LargeUtf8 | DataType::LargeBinary => { + self.validate_offsets::(&self.buffers[0], self.buffers[1].len())?; + } + DataType::Dictionary(key_type, _value_type) => { + // At the moment, constructing a DictionaryArray will also check this + if !DataType::is_dictionary_key_type(key_type) { + return Err(ArrowError::InvalidArgumentError(format!( + "Dictionary values must be integer, but was {}", + key_type + ))); + } + } + _ => {} + }; + + Ok(()) + } + + /// Does a cheap sanity check that the `self.len` values in `buffer` are valid + /// offsets (of type T> into some other buffer of `values_length` bytes long + fn validate_offsets( + &self, + buffer: &Buffer, + values_length: usize, + ) -> Result<()> { + // Validate that there are the correct number of offsets for this array's length + let required_offsets = self.len + self.offset + 1; + + // An empty list-like array can have 0 offsets + if buffer.is_empty() { + return Ok(()); + } + + if (buffer.len() / std::mem::size_of::()) < required_offsets { + return Err(ArrowError::InvalidArgumentError(format!( + "Offsets buffer size (bytes): {} isn't large enough for {}. 
Length {} needs {}", + buffer.len(), self.data_type, self.len, required_offsets + ))); + } + + // Justification: buffer size was validated above + let offsets = unsafe { &(buffer.typed_data::()[self.offset..]) }; + + let first_offset = offsets[0].to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Error converting offset[0] ({}) to usize for {}", + offsets[0], self.data_type + )) + })?; + + let last_offset = offsets[self.len].to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Error converting offset[{}] ({}) to usize for {}", + self.len, offsets[self.len], self.data_type + )) + })?; + + if first_offset > values_length { + return Err(ArrowError::InvalidArgumentError(format!( + "First offset {} of {} is larger than values length {}", + first_offset, self.data_type, values_length, + ))); + } + + if last_offset > values_length { + return Err(ArrowError::InvalidArgumentError(format!( + "Last offset {} of {} is larger than values length {}", + last_offset, self.data_type, values_length, + ))); + } + + if first_offset > last_offset { + return Err(ArrowError::InvalidArgumentError(format!( + "First offset {} in {} is smaller than last offset {}", + first_offset, self.data_type, last_offset, + ))); + } + + Ok(()) + } + + /// Validates the layout of `child_data` ArrayData structures + fn validate_child_data(&self) -> Result<()> { + match &self.data_type { + DataType::List(field) | DataType::Map(field, _) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets::(&self.buffers[0], values_data.len)?; + Ok(()) + } + DataType::LargeList(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets::(&self.buffers[0], values_data.len)?; + Ok(()) + } + DataType::FixedSizeList(field, list_size) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + + let list_size: usize = (*list_size).try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "{} has a negative list_size {}", + self.data_type, list_size + )) + })?; + + let expected_values_len = self.len + .checked_mul(list_size) + .expect("integer overflow computing expected number of expected values in FixedListSize"); + + if values_data.len < expected_values_len { + return Err(ArrowError::InvalidArgumentError(format!( + "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}", + values_data.len, list_size, list_size, self.data_type + ))); + } + + Ok(()) + } + DataType::Struct(fields) => { + self.validate_num_child_data(fields.len())?; + for (i, field) in fields.iter().enumerate() { + let field_data = self.get_valid_child_data(i, field.data_type())?; + + // C++ does this check, but it is not clear why + // field_data checks only len, but self checks len+offset + if field_data.len < (self.len + self.offset) { + return Err(ArrowError::InvalidArgumentError(format!( + "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})", + self.data_type, i, field.name(), field_data.len, self.len + self.offset + ))); + } + } + Ok(()) + } + DataType::Union(_fields) => { + // Validate Union Array as part of implementing new Union semantics + // See comments in `ArrayData::validate()` + // https://github.com/apache/arrow-rs/issues/85 + Ok(()) + } + DataType::Dictionary(_key_type, value_type) => { + self.get_single_valid_child_data(value_type)?; + Ok(()) + } + _ => { + // other types do not have child data + if 
!self.child_data.is_empty() { + return Err(ArrowError::InvalidArgumentError(format!( + "Expected no child arrays for type {} but got {}", + self.data_type, + self.child_data.len() + ))); + } + Ok(()) + } + } + } + + /// Ensures that this array data has a single child_data with the + /// expected type, and calls `validate()` on it. Returns a + /// reference to that child_data + fn get_single_valid_child_data( + &self, + expected_type: &DataType, + ) -> Result<&ArrayData> { + self.validate_num_child_data(1)?; + self.get_valid_child_data(0, expected_type) + } + + /// Returns `Err` if self.child_data does not have exactly `expected_len` elements + fn validate_num_child_data(&self, expected_len: usize) -> Result<()> { + if self.child_data().len() != expected_len { + Err(ArrowError::InvalidArgumentError(format!( + "Value data for {} should contain {} child data array(s), had {}", + self.data_type(), + expected_len, + self.child_data.len() + ))) + } else { + Ok(()) + } + } + + /// Ensures that `child_data[i]` has the expected type, calls + /// `validate()` on it, and returns a reference to that child_data + fn get_valid_child_data( + &self, + i: usize, + expected_type: &DataType, + ) -> Result<&ArrayData> { + let values_data = self.child_data + .get(i) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "{} did not have enough child arrays. Expected at least {} but had only {}", + self.data_type, i+1, self.child_data.len() + )) + })?; + + if expected_type != &values_data.data_type { + return Err(ArrowError::InvalidArgumentError(format!( + "Child type mismatch for {}. Expected {} but child data had {}", + self.data_type, expected_type, values_data.data_type + ))); + } + + values_data.validate()?; + Ok(values_data) + } +} + +/// Return the expected [`DataTypeLayout`] Arrays of this data +/// type are expected to have +fn layout(data_type: &DataType) -> DataTypeLayout { + // based on C/C++ implementation in + // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) + use std::mem::size_of; + match data_type { + DataType::Null => DataTypeLayout::new_empty(), + DataType::Boolean => DataTypeLayout { + buffers: vec![BufferSpec::BitMap], + }, + DataType::Int8 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Int16 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Int32 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Int64 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::UInt8 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Float16 => unimplemented!(), + DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Float64 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Date32 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Date64 => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Time32(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Time64(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Interval(IntervalUnit::YearMonth) => { + DataTypeLayout::new_fixed_width(size_of::()) + } + DataType::Interval(IntervalUnit::DayTime) => { + DataTypeLayout::new_fixed_width(size_of::()) + } + DataType::Duration(_) => 
DataTypeLayout::new_fixed_width(size_of::()), + DataType::Binary => DataTypeLayout::new_binary(size_of::()), + DataType::FixedSizeBinary(bytes_per_value) => { + let bytes_per_value: usize = (*bytes_per_value) + .try_into() + .expect("negative size for fixed size binary"); + DataTypeLayout::new_fixed_width(bytes_per_value) + } + DataType::LargeBinary => DataTypeLayout::new_binary(size_of::()), + DataType::Utf8 => DataTypeLayout::new_binary(size_of::()), + DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::()), + DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data + DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, + DataType::Union(_) => { + DataTypeLayout::new_fixed_width(size_of::()) + // Note sparse unions only have one buffer (u8) type_ids, + // and dense unions have 2 (type_ids as well as offsets). + // https://github.com/apache/arrow-rs/issues/85 + } + DataType::Dictionary(key_type, _value_type) => layout(key_type), + DataType::Decimal(_, _) => { + // Decimals are always some fixed width; The rust implemenation + // always uses 16 bytes / size of i128 + DataTypeLayout::new_fixed_width(size_of::()) + } + DataType::Map(_, _) => { + // same as ListType + DataTypeLayout::new_fixed_width(size_of::()) + } + } +} + +/// Layout specification for a data type +#[derive(Debug, PartialEq)] +// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91 +struct DataTypeLayout { + /// A vector of buffer layout specifications, one for each expected buffer + pub buffers: Vec, +} + +impl DataTypeLayout { + /// Describes a basic numeric array where each element has a fixed width + pub fn new_fixed_width(byte_width: usize) -> Self { + Self { + buffers: vec![BufferSpec::FixedWidth { byte_width }], + } + } + + /// Describes arrays which have no data of their own + /// (e.g. FixedSizeList). Note such arrays may still have a Null + /// Bitmap + pub fn new_empty() -> Self { + Self { buffers: vec![] } + } + + /// Describes a basic numeric array where each element has a fixed + /// with offset buffer of `offset_byte_width` bytes, followed by a + /// variable width data buffer + pub fn new_binary(offset_byte_width: usize) -> Self { + Self { + buffers: vec![ + // offsets + BufferSpec::FixedWidth { + byte_width: offset_byte_width, + }, + // values + BufferSpec::VariableWidth, + ], + } + } +} + +/// Layout specification for a single data type buffer +#[derive(Debug, PartialEq)] +enum BufferSpec { + /// each element has a fixed width + FixedWidth { byte_width: usize }, + /// Variable width, such as string data for utf8 data + VariableWidth, + /// Buffer holds a bitmap. + /// + /// Note: Unlike the C++ implementation, the null/validity buffer + /// is handled specially rather than as another of the buffers in + /// the spec, so this variant is only used for the Boolean type. + BitMap, + /// Buffer is always null. 
Unused currently in Rust implementation, + /// (used in C++ for Union type) + AlwaysNull, } impl PartialEq for ArrayData { @@ -672,23 +1114,38 @@ impl ArrayDataBuilder { mod tests { use super::*; + use crate::array::{Array, Int32Array, StringArray}; use crate::buffer::Buffer; + use crate::datatypes::Field; use crate::util::bit_util; #[test] - fn test_new() { - let arr_data = - ArrayData::try_new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]) - .unwrap(); - assert_eq!(10, arr_data.len()); - assert_eq!(1, arr_data.null_count()); - assert_eq!(2, arr_data.offset()); - assert_eq!(0, arr_data.buffers().len()); - assert_eq!(0, arr_data.child_data().len()); + fn test_builder() { + // Buffer needs to be at least 25 long + let v = (0..25).collect::>(); + let b1 = Buffer::from_slice_ref(&v); + let arr_data = ArrayData::builder(DataType::Int32) + .len(20) + .offset(5) + .add_buffer(b1) + .null_bit_buffer(Buffer::from(vec![ + 0b01011111, 0b10110101, 0b01100011, 0b00011110, + ])) + .build() + .unwrap(); + + assert_eq!(20, arr_data.len()); + assert_eq!(10, arr_data.null_count()); + assert_eq!(5, arr_data.offset()); + assert_eq!(1, arr_data.buffers().len()); + assert_eq!( + Buffer::from_slice_ref(&v).as_slice(), + arr_data.buffers()[0].as_slice() + ); } #[test] - fn test_builder() { + fn test_builder_with_child_data() { let child_arr_data = ArrayData::try_new( DataType::Int32, 5, @@ -699,24 +1156,17 @@ mod tests { vec![], ) .unwrap(); - let v = vec![0, 1, 2, 3]; - let b1 = Buffer::from(&v[..]); - let arr_data = ArrayData::builder(DataType::Int32) - .len(20) - .offset(5) - .add_buffer(b1) - .null_bit_buffer(Buffer::from(vec![ - 0b01011111, 0b10110101, 0b01100011, 0b00011110, - ])) + + let data_type = DataType::Struct(vec![Field::new("x", DataType::Int32, true)]); + + let arr_data = ArrayData::builder(data_type) + .len(5) + .offset(0) .add_child_data(child_arr_data.clone()) .build() .unwrap(); - assert_eq!(20, arr_data.len()); - assert_eq!(10, arr_data.null_count()); - assert_eq!(5, arr_data.offset()); - assert_eq!(1, arr_data.buffers().len()); - assert_eq!(&[0, 1, 2, 3], arr_data.buffers()[0].as_slice()); + assert_eq!(5, arr_data.len()); assert_eq!(1, arr_data.child_data().len()); assert_eq!(child_arr_data, arr_data.child_data()[0]); } @@ -729,6 +1179,7 @@ mod tests { bit_util::set_bit(&mut bit_v, 10); let arr_data = ArrayData::builder(DataType::Int32) .len(16) + .add_buffer(make_i32_buffer(16)) .null_bit_buffer(Buffer::from(bit_v)) .build() .unwrap(); @@ -742,6 +1193,7 @@ mod tests { let arr_data = ArrayData::builder(DataType::Int32) .len(12) .offset(2) + .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space, .null_bit_buffer(Buffer::from(bit_v)) .build() .unwrap(); @@ -756,6 +1208,7 @@ mod tests { bit_util::set_bit(&mut bit_v, 10); let arr_data = ArrayData::builder(DataType::Int32) .len(16) + .add_buffer(make_i32_buffer(16)) .null_bit_buffer(Buffer::from(bit_v)) .build() .unwrap(); @@ -771,6 +1224,7 @@ mod tests { bit_util::set_bit(&mut bit_v, 10); let data = ArrayData::builder(DataType::Int32) .len(16) + .add_buffer(make_i32_buffer(16)) .null_bit_buffer(Buffer::from(bit_v)) .build() .unwrap(); @@ -788,8 +1242,16 @@ mod tests { #[test] fn test_equality() { - let int_data = ArrayData::builder(DataType::Int32).build().unwrap(); - let float_data = ArrayData::builder(DataType::Float32).build().unwrap(); + let int_data = ArrayData::builder(DataType::Int32) + .len(1) + .add_buffer(make_i32_buffer(1)) + .build() + .unwrap(); + let float_data = ArrayData::builder(DataType::Float32) + 
.len(1) + .add_buffer(make_f32_buffer(1)) + .build() + .unwrap(); assert_ne!(int_data, float_data); } @@ -802,4 +1264,380 @@ mod tests { let count = count_nulls(null_buffer.as_ref(), 4, 8); assert_eq!(count, 3); } + + #[test] + #[should_panic( + expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8" + )] + fn test_buffer_too_small() { + let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + // should fail as the declared size (10*8 = 80) is larger than the underlying bfufer (8) + ArrayData::try_new(DataType::Int64, 10, Some(0), None, 0, vec![buffer], vec![]) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8" + )] + fn test_buffer_too_small_offset() { + let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + // should fail -- size is ok, but also has offset + ArrayData::try_new(DataType::Int64, 1, Some(0), None, 1, vec![buffer], vec![]) + .unwrap(); + } + + #[test] + #[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")] + fn test_bad_number_of_buffers() { + let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]); + let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]); + ArrayData::try_new( + DataType::Int64, + 1, + Some(0), + None, + 0, + vec![buffer1, buffer2], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "integer overflow computing min buffer size")] + fn test_fixed_width_overflow() { + let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + ArrayData::try_new( + DataType::Int64, + usize::MAX, + Some(0), + None, + 0, + vec![buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "null_bit_buffer size too small. got 8 needed 13")] + fn test_bitmap_too_small() { + let buffer = make_i32_buffer(100); + let null_bit_buffer = Buffer::from(vec![0b11111111]); + + ArrayData::try_new( + DataType::Int32, + 100, + Some(0), + Some(null_bit_buffer), + 0, + vec![buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "null_count 3 for an array exceeds length of 2 elements")] + fn test_bad_null_count() { + let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + ArrayData::try_new(DataType::Int32, 2, Some(3), None, 0, vec![buffer], vec![]) + .unwrap(); + } + + // Test creating a dictionary with a non integer type + #[test] + #[should_panic(expected = "Dictionary values must be integer, but was Utf8")] + fn test_non_int_dictionary() { + let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + let data_type = + DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32)); + let child_data = ArrayData::try_new( + DataType::Int32, + 1, + Some(0), + None, + 0, + vec![i32_buffer.clone()], + vec![], + ) + .unwrap(); + ArrayData::try_new( + data_type, + 1, + Some(0), + None, + 0, + vec![i32_buffer.clone(), i32_buffer], + vec![child_data], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")] + fn test_mismatched_dictionary_types() { + // test w/ dictionary created with a child array data that has type different than declared + let string_array: StringArray = + vec![Some("foo"), Some("bar")].into_iter().collect(); + let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]); + // Dict says LargeUtf8 but array is Utf8 + let data_type = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::LargeUtf8), + ); + let child_data = string_array.data().clone(); + ArrayData::try_new( + data_type, + 1, + Some(0), + None, + 0, + vec![i32_buffer], + 
vec![child_data], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3" + )] + fn test_validate_offsets_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Offsets buffer size (bytes): 16 isn't large enough for LargeUtf8. Length 2 needs 3" + )] + fn test_validate_offsets_i64() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]); + ArrayData::try_new( + DataType::LargeUtf8, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")] + fn test_validate_offsets_negative_first_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")] + fn test_validate_offsets_negative_last_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")] + fn test_validate_offsets_range_too_small() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // start offset is larger than end + let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")] + fn test_validate_offsets_range_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")] + fn test_validate_offsets_first_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + fn test_validate_offsets_first_too_large_skipped() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer, but offset starts at 1 so it is skipped + let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]); + let data = ArrayData::try_new( + DataType::Utf8, + 2, + None, + None, + 1, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + let array: StringArray = data.into(); + let 
expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect(); + assert_eq!(array, expected); + } + + #[test] + #[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")] + fn test_validate_offsets_last_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList" + )] + fn test_validate_fixed_size_list() { + // child has 4 elements, + let child_array = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + // but claim we have 3 elements for a fixed size of 2 + // 10 is off the end of the buffer + let field = Field::new("field", DataType::Int32, true); + ArrayData::try_new( + DataType::FixedSizeList(Box::new(field), 2), + 3, + None, + None, + 0, + vec![], + vec![child_array.data().clone()], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "Child type mismatch for Struct")] + fn test_validate_struct_child_type() { + let field1 = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + // validate the the type of struct fields matches child fields + ArrayData::try_new( + DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]), + 3, + None, + None, + 0, + vec![], + vec![field1.data().clone()], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)" + )] + fn test_validate_struct_child_length() { + // field length only has 4 items, but array claims to have 6 + let field1 = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + ArrayData::try_new( + DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]), + 6, + None, + None, + 0, + vec![], + vec![field1.data().clone()], + ) + .unwrap(); + } + + /// returns a buffer initialized with some constant value for tests + fn make_i32_buffer(n: usize) -> Buffer { + Buffer::from_slice_ref(&vec![42i32; n]) + } + + /// returns a buffer initialized with some constant value for tests + fn make_f32_buffer(n: usize) -> Buffer { + Buffer::from_slice_ref(&vec![42f32; n]) + } } diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index dbfa1a99f0ad..8b09df06aa89 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -2089,6 +2089,7 @@ mod tests { assert_eq!(4, values.null_count()); let u16arr = values.as_any().downcast_ref::().unwrap(); + // expect 4 nulls: negative numbers and overflow let expected: UInt16Array = vec![Some(0), Some(0), Some(0), None, None, None, Some(2), None] .into_iter() diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index f4ddbaf56d1e..e4808e25199c 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -184,7 +184,8 @@ pub(super) mod tests { offset: usize, null_bit_buffer: Option, ) -> Arc { - // empty vec for buffers and children is not really correct, but for these tests we only care about the null bitmap + let buffer = Buffer::from(&vec![11; len]); + Arc::new( ArrayData::try_new( DataType::UInt8, @@ -192,7 +193,7 @@ pub(super) mod tests { None, null_bit_buffer, offset, - vec![], + vec![buffer], vec![], ) .unwrap(), diff --git 
a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 1cbec341cf37..96fb18be6ec9 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -477,6 +477,16 @@ impl DataType { ) } + /// Returns true if this type is valid as a dictionary key + /// (e.g. [`super::ArrowDictionaryKeyType`] + pub fn is_dictionary_key_type(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 + ) + } + /// Compares the datatype with another, ignoring nested field names /// and metadata. pub(crate) fn equals_datatype(&self, other: &DataType) -> bool { diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index e925e2af8f4b..82cd1010a73e 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -968,6 +968,22 @@ mod tests { FileReader::try_new(file).unwrap(); } + #[test] + #[should_panic( + expected = "Last offset 687865856 of Utf8 is larger than values length 41" + )] + fn read_dictionary_be_not_implemented() { + // The offsets are not translated for big-endian files + // https://github.com/apache/arrow-rs/issues/859 + let testdata = crate::util::test_util::arrow_test_data(); + let file = File::open(format!( + "{}/arrow-ipc-stream/integration/1.0.0-bigendian/generated_dictionary.arrow_file", + testdata + )) + .unwrap(); + FileReader::try_new(file).unwrap(); + } + #[test] fn read_generated_be_files_should_work() { // complementary to the previous test @@ -975,7 +991,6 @@ mod tests { let paths = vec![ "generated_interval", "generated_datetime", - "generated_dictionary", "generated_map", "generated_nested", "generated_null_trivial",
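
The net effect of this patch is that the safe construction paths (`ArrayData::try_new` and `ArrayDataBuilder::build`) now run the cheap `validate()` checks, while code that deliberately builds malformed data must opt in via the `unsafe` `build_unchecked()` escape hatch, as the updated tests do. The following standalone sketch is illustrative only and not part of the patch; the `main` wrapper, buffer contents, and the error text quoted in the comments are assumptions based on the messages added above.

```rust
use arrow::array::ArrayData;
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

fn main() {
    // Two i32 values = 8 bytes, but we declare a length of 10 Int32 elements,
    // which needs at least 40 bytes in the values buffer.
    let buffer = Buffer::from_slice_ref(&[0i32, 1i32]);

    // The safe builder path now runs the cheap validation and should return an
    // error along the lines of
    // "Need at least 40 bytes in buffers[0] in array of type Int32, but got 8"
    // instead of producing an invalid ArrayData.
    let result = ArrayData::builder(DataType::Int32)
        .len(10)
        .add_buffer(buffer.clone())
        .build();
    assert!(result.is_err());

    // Callers that knowingly construct invalid data (e.g. tests exercising
    // downstream panics) must now use the unsafe, unvalidated path.
    let _unvalidated = unsafe {
        ArrayData::builder(DataType::Int32)
            .len(10)
            .add_buffer(buffer)
            .build_unchecked()
    };
}
```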
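The offset checks in `validate_offsets` can be exercised the same way. This hypothetical snippet mirrors the `test_validate_offsets_range_too_large` test above, rewritten to inspect the returned `Result` rather than rely on `#[should_panic]`; it assumes only the `try_new` signature and error wording shown in this diff.

```rust
use arrow::array::ArrayData;
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

fn main() {
    // Values buffer holds 6 bytes ("abcdef").
    let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
    // Offsets claim the second string ends at byte 10, past the end of the values buffer.
    let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]);

    let result = ArrayData::try_new(
        DataType::Utf8,
        2,    // len
        None, // null_count (computed)
        None, // null bitmap
        0,    // offset
        vec![offsets_buffer, data_buffer],
        vec![],
    );

    // The cheap validation rejects the out-of-bounds last offset; the patch's
    // test expects an error like
    // "Last offset 10 of Utf8 is larger than values length 6".
    assert!(result.is_err());
}
```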