Skip to content

Commit

Permalink
Merge c5ba200 into 8f1fd12
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold authored Jun 9, 2022
2 parents 8f1fd12 + c5ba200 commit 7a40745
Show file tree
Hide file tree
Showing 10 changed files with 153 additions and 249 deletions.
4 changes: 2 additions & 2 deletions arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@ mod tests {
let array = new_empty_array(&DataType::Utf8);
let a = array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!(a.len(), 0);
assert_eq!(a.value_offsets()[0], 0i32);
assert_eq!(a.value_offsets().len(), 0);
}

#[test]
Expand All @@ -741,7 +741,7 @@ mod tests {
let array = new_empty_array(&data_type);
let a = array.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(a.len(), 0);
assert_eq!(a.value_offsets()[0], 0i32);
assert_eq!(a.value_offsets().len(), 0);
}

#[test]
Expand Down
93 changes: 36 additions & 57 deletions arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ use std::{any::Any, iter::FromIterator};

use super::BooleanBufferBuilder;
use super::{
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData,
FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait,
array::print_long_array, Array, ArrayData, FixedSizeListArray, GenericBinaryIter,
GenericListArray, OffsetSizeTrait,
};
pub use crate::array::DecimalIter;
use crate::buffer::Buffer;
use crate::buffer::{Buffer, ScalarBuffer};
use crate::datatypes::{
validate_decimal_precision, DECIMAL_DEFAULT_SCALE, DECIMAL_MAX_PRECISION,
DECIMAL_MAX_SCALE,
Expand All @@ -39,8 +39,8 @@ use crate::{buffer::MutableBuffer, datatypes::DataType};
/// binary data.
pub struct GenericBinaryArray<OffsetSize: OffsetSizeTrait> {
data: ArrayData,
value_offsets: RawPtrBox<OffsetSize>,
value_data: RawPtrBox<u8>,
value_offsets: ScalarBuffer<OffsetSize>,
value_data: Buffer,
}

impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> {
Expand All @@ -64,67 +64,40 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> {

/// Returns a clone of the value data buffer
pub fn value_data(&self) -> Buffer {
self.data.buffers()[1].clone()
self.value_data.clone()
}

/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[OffsetSize] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the element at index `i` as bytes slice
/// # Safety
/// Caller is responsible for ensuring that the index is within the bounds of the array
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
let end = *self.value_offsets().get_unchecked(i + 1);
let start = *self.value_offsets().get_unchecked(i);

// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the value_offset invariants

// Safety of `to_isize().unwrap()`
// `start` and `end` are &OffsetSize, which is a generic type that implements the
// OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait,
// both of which should cleanly cast to isize on an architecture that supports
// 32/64-bit offsets
std::slice::from_raw_parts(
self.value_data.as_ptr().offset(start.to_isize().unwrap()),
(end - start).to_usize().unwrap(),
)
let start = self.value_offsets.get_unchecked(i).to_isize().unwrap();
let end = self.value_offsets.get_unchecked(i + 1).to_isize().unwrap();

// buffer bounds/offset is ensured by the value_offset invariants
self.value_data.get_unchecked(start as usize..end as usize)
}

/// Returns the element at index `i` as bytes slice
pub fn value(&self, i: usize) -> &[u8] {
assert!(i < self.data.len(), "BinaryArray out of bounds access");
//Soundness: length checked above, offset buffer length is 1 larger than logical array length
let end = unsafe { self.value_offsets().get_unchecked(i + 1) };
let start = unsafe { self.value_offsets().get_unchecked(i) };

// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the value_offset invariants

// Safety of `to_isize().unwrap()`
// `start` and `end` are &OffsetSize, which is a generic type that implements the
// OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait,
// both of which should cleanly cast to isize on an architecture that supports
// 32/64-bit offsets
unsafe {
std::slice::from_raw_parts(
self.value_data.as_ptr().offset(start.to_isize().unwrap()),
(*end - *start).to_usize().unwrap(),
)
}
assert!(
i < self.data.len(),
"BinaryArray index out of bounds: the len is {} but the index is {}",
self.data.len(),
i
);
unsafe { self.value_unchecked(i) }
}

/// Creates a [GenericBinaryArray] from a vector of byte slices
Expand Down Expand Up @@ -259,12 +232,14 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericBinaryArray<OffsetS
2,
"BinaryArray data should contain 2 buffers only (offsets and values)"
);
let offsets = data.buffers()[0].as_ptr();
let values = data.buffers()[1].as_ptr();
let offsets = data.buffers()[0].clone();
let value_data = data.buffers()[1].clone();
let offsets_len = data.is_empty().then(|| 0).unwrap_or(data.len() + 1);
let value_offsets = ScalarBuffer::new(offsets, data.offset(), offsets_len);
Self {
data,
value_offsets: unsafe { RawPtrBox::new(offsets) },
value_data: unsafe { RawPtrBox::new(values) },
value_offsets,
value_data,
}
}
}
Expand Down Expand Up @@ -447,7 +422,7 @@ impl<T: OffsetSizeTrait> From<GenericListArray<T>> for GenericBinaryArray<T> {
///
pub struct FixedSizeBinaryArray {
data: ArrayData,
value_data: RawPtrBox<u8>,
value_data: Buffer,
length: i32,
}

Expand Down Expand Up @@ -662,14 +637,15 @@ impl From<ArrayData> for FixedSizeBinaryArray {
1,
"FixedSizeBinaryArray data should contain 1 buffer only (values)"
);
let value_data = data.buffers()[0].as_ptr();
// TODO: Materialize offset into value_data
let value_data = data.buffers()[0].clone();
let length = match data.data_type() {
DataType::FixedSizeBinary(len) => *len,
_ => panic!("Expected data type to be FixedSizeBinary"),
};
Self {
data,
value_data: unsafe { RawPtrBox::new(value_data) },
value_data,
length,
}
}
Expand Down Expand Up @@ -756,7 +732,7 @@ impl Array for FixedSizeBinaryArray {
///
pub struct DecimalArray {
data: ArrayData,
value_data: RawPtrBox<u8>,
value_data: Buffer,
precision: usize,
scale: usize,
length: i32,
Expand Down Expand Up @@ -949,15 +925,16 @@ impl From<ArrayData> for DecimalArray {
1,
"DecimalArray data should contain 1 buffer only (values)"
);
let values = data.buffers()[0].as_ptr();
let (precision, scale) = match data.data_type() {
DataType::Decimal(precision, scale) => (*precision, *scale),
_ => panic!("Expected data type to be Decimal"),
};
// TODO: Materialize offset into value_data
let value_data = data.buffers()[0].clone();
let length = 16;
Self {
data,
value_data: unsafe { RawPtrBox::new(values) },
value_data,
precision,
scale,
length,
Expand Down Expand Up @@ -1441,7 +1418,9 @@ mod tests {
}

#[test]
#[should_panic(expected = "BinaryArray out of bounds access")]
#[should_panic(
expected = "BinaryArray index out of bounds: the len is 3 but the index is 4"
)]
fn test_binary_array_get_value_index_out_of_bound() {
let values: [u8; 12] =
[104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116];
Expand Down
16 changes: 6 additions & 10 deletions arrow/src/array/array_boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ use std::convert::From;
use std::iter::{FromIterator, IntoIterator};
use std::{any::Any, fmt};

use super::array::print_long_array;
use super::*;
use super::{array::print_long_array, raw_pointer::RawPtrBox};
use crate::buffer::{Buffer, MutableBuffer};
use crate::util::bit_util;

Expand Down Expand Up @@ -66,9 +66,7 @@ use crate::util::bit_util;
/// ```
pub struct BooleanArray {
data: ArrayData,
/// Pointer to the value array. The lifetime of this must be <= to the value buffer
/// stored in `data`, so it's safe to store.
raw_values: RawPtrBox<u8>,
raw_values: Buffer,
}

impl fmt::Debug for BooleanArray {
Expand Down Expand Up @@ -101,7 +99,7 @@ impl BooleanArray {
///
/// Note this doesn't take the offset of this array into account.
pub fn values(&self) -> &Buffer {
&self.data.buffers()[0]
&self.raw_values
}

/// Returns the boolean value at index `i`.
Expand Down Expand Up @@ -186,11 +184,9 @@ impl From<ArrayData> for BooleanArray {
1,
"BooleanArray data should contain a single buffer only (values buffer)"
);
let ptr = data.buffers()[0].as_ptr();
Self {
data,
raw_values: unsafe { RawPtrBox::new(ptr) },
}
// TODO: Hydrate offset into BitMap
let raw_values = data.buffers()[0].clone();
Self { data, raw_values }
}
}

Expand Down
53 changes: 27 additions & 26 deletions arrow/src/array/array_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ use std::fmt;
use num::Integer;

use super::{
array::print_long_array, make_array, raw_pointer::RawPtrBox, Array, ArrayData,
ArrayRef, BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray,
array::print_long_array, make_array, Array, ArrayData, ArrayRef,
BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray,
};
use crate::buffer::ScalarBuffer;
use crate::{
buffer::MutableBuffer,
datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType, Field},
Expand All @@ -49,10 +50,10 @@ impl OffsetSizeTrait for i64 {
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout>
///
/// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]`
pub struct GenericListArray<OffsetSize> {
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
data: ArrayData,
values: ArrayRef,
value_offsets: RawPtrBox<OffsetSize>,
value_offsets: ScalarBuffer<OffsetSize>,
}

impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
Expand All @@ -70,32 +71,27 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
/// # Safety
/// Caller must ensure that the index is within the array bounds
pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef {
let end = *self.value_offsets().get_unchecked(i + 1);
let start = *self.value_offsets().get_unchecked(i);
let end = *self.value_offsets.get_unchecked(i + 1);
let start = *self.value_offsets.get_unchecked(i);
self.values
.slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap())
}

/// Returns ith value of this list array.
pub fn value(&self, i: usize) -> ArrayRef {
let end = self.value_offsets()[i + 1];
let start = self.value_offsets()[i];
self.values
.slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap())
assert!(
i < self.data.len(),
"GenericListArray index out of bounds: the len is {} but the index is {}",
self.data.len(),
i
);
unsafe { self.value_unchecked(i) }
}

/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[OffsetSize] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the length for value at index `i`.
Expand Down Expand Up @@ -227,8 +223,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
}

let values = make_array(values);
let value_offsets = data.buffers()[0].as_ptr();
let value_offsets = unsafe { RawPtrBox::<OffsetSize>::new(value_offsets) };
let offsets = data.buffers()[0].clone();
let offsets_len = data.is_empty().then(|| 0).unwrap_or(data.len() + 1);
let value_offsets = ScalarBuffer::new(offsets, data.offset(), offsets_len);

Ok(Self {
data,
values,
Expand Down Expand Up @@ -940,7 +938,7 @@ mod tests {
}

#[test]
#[should_panic(expected = "index out of bounds: the len is 10 but the index is 11")]
#[should_panic(expected = "index out of bounds: the len is 9 but the index is 10")]
fn test_list_array_index_out_of_bound() {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int32)
Expand Down Expand Up @@ -1142,31 +1140,33 @@ mod tests {
}

#[test]
#[should_panic(expected = "memory is not aligned")]
#[should_panic(expected = "buffer is not aligned to 4 byte boundary")]
fn test_primitive_array_alignment() {
let ptr = alloc::allocate_aligned::<u8>(8);
let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) };
let buf = unsafe { Buffer::from_raw_parts(ptr, 9, 9) };
let buf2 = buf.slice(1);
let array_data = ArrayData::builder(DataType::Int32)
.len(2)
.add_buffer(buf2)
.build()
.unwrap();
drop(Int32Array::from(array_data));
}

#[test]
#[should_panic(expected = "memory is not aligned")]
#[should_panic(expected = "buffer is not aligned to 4 byte boundary")]
// Different error messages, so skip for now
// https://github.com/apache/arrow-rs/issues/1545
#[cfg(not(feature = "force_validate"))]
fn test_list_array_alignment() {
let ptr = alloc::allocate_aligned::<u8>(8);
let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) };
let buf = unsafe { Buffer::from_raw_parts(ptr, 9, 9) };
let buf2 = buf.slice(1);

let values: [i32; 8] = [0; 8];
let value_data = unsafe {
ArrayData::builder(DataType::Int32)
.len(1)
.add_buffer(Buffer::from_slice_ref(&values))
.build_unchecked()
};
Expand All @@ -1175,6 +1175,7 @@ mod tests {
DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = unsafe {
ArrayData::builder(list_data_type)
.len(1)
.add_buffer(buf2)
.add_child_data(value_data)
.build_unchecked()
Expand Down
Loading

0 comments on commit 7a40745

Please sign in to comment.