Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace RawPtrBox with ScalarBuffer, reduce unsafe usage (#1811) #1825

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@ mod tests {
let array = new_empty_array(&DataType::Utf8);
let a = array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!(a.len(), 0);
assert_eq!(a.value_offsets()[0], 0i32);
assert_eq!(a.value_offsets().len(), 0);
}

#[test]
Expand All @@ -741,7 +741,7 @@ mod tests {
let array = new_empty_array(&data_type);
let a = array.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(a.len(), 0);
assert_eq!(a.value_offsets()[0], 0i32);
assert_eq!(a.value_offsets().len(), 0);
}

#[test]
Expand Down
93 changes: 36 additions & 57 deletions arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ use std::{any::Any, iter::FromIterator};

use super::BooleanBufferBuilder;
use super::{
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData,
FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait,
array::print_long_array, Array, ArrayData, FixedSizeListArray, GenericBinaryIter,
GenericListArray, OffsetSizeTrait,
};
pub use crate::array::DecimalIter;
use crate::buffer::Buffer;
use crate::buffer::{Buffer, ScalarBuffer};
use crate::datatypes::{
validate_decimal_precision, DECIMAL_DEFAULT_SCALE, DECIMAL_MAX_PRECISION,
DECIMAL_MAX_SCALE,
Expand All @@ -39,8 +39,8 @@ use crate::{buffer::MutableBuffer, datatypes::DataType};
/// binary data.
pub struct GenericBinaryArray<OffsetSize: OffsetSizeTrait> {
data: ArrayData,
value_offsets: RawPtrBox<OffsetSize>,
value_data: RawPtrBox<u8>,
value_offsets: ScalarBuffer<OffsetSize>,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if calling ScalarBuffer something like TypedBuffer would make its purpose slightly clearer 🤔

value_data: Buffer,
}

impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> {
Expand All @@ -64,67 +64,40 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> {

/// Returns a clone of the value data buffer
pub fn value_data(&self) -> Buffer {
self.data.buffers()[1].clone()
self.value_data.clone()
}

/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[OffsetSize] {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️

// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the element at index `i` as bytes slice
/// # Safety
/// Caller is responsible for ensuring that the index is within the bounds of the array
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
let end = *self.value_offsets().get_unchecked(i + 1);
let start = *self.value_offsets().get_unchecked(i);

// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the value_offset invariants

// Safety of `to_isize().unwrap()`
// `start` and `end` are &OffsetSize, which is a generic type that implements the
// OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait,
// both of which should cleanly cast to isize on an architecture that supports
// 32/64-bit offsets
std::slice::from_raw_parts(
self.value_data.as_ptr().offset(start.to_isize().unwrap()),
(end - start).to_usize().unwrap(),
)
let start = self.value_offsets.get_unchecked(i).to_isize().unwrap();
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like this to_isize() dance, really I just want ArrowNativeType to provide AsPrimitive so that we can do an unchecked cast. Will file a follow up PR

let end = self.value_offsets.get_unchecked(i + 1).to_isize().unwrap();

// buffer bounds/offset is ensured by the value_offset invariants
self.value_data.get_unchecked(start as usize..end as usize)
}

/// Returns the element at index `i` as bytes slice
pub fn value(&self, i: usize) -> &[u8] {
assert!(i < self.data.len(), "BinaryArray out of bounds access");
//Soundness: length checked above, offset buffer length is 1 larger than logical array length
let end = unsafe { self.value_offsets().get_unchecked(i + 1) };
let start = unsafe { self.value_offsets().get_unchecked(i) };

// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the value_offset invariants

// Safety of `to_isize().unwrap()`
// `start` and `end` are &OffsetSize, which is a generic type that implements the
// OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait,
// both of which should cleanly cast to isize on an architecture that supports
// 32/64-bit offsets
unsafe {
std::slice::from_raw_parts(
self.value_data.as_ptr().offset(start.to_isize().unwrap()),
(*end - *start).to_usize().unwrap(),
)
}
assert!(
i < self.data.len(),
"BinaryArray index out of bounds: the len is {} but the index is {}",
self.data.len(),
i
);
unsafe { self.value_unchecked(i) }
}

/// Creates a [GenericBinaryArray] from a vector of byte slices
Expand Down Expand Up @@ -259,12 +232,14 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericBinaryArray<OffsetS
2,
"BinaryArray data should contain 2 buffers only (offsets and values)"
);
let offsets = data.buffers()[0].as_ptr();
let values = data.buffers()[1].as_ptr();
let offsets = data.buffers()[0].clone();
let value_data = data.buffers()[1].clone();
let offsets_len = data.is_empty().then(|| 0).unwrap_or(data.len() + 1);
let value_offsets = ScalarBuffer::new(offsets, data.offset(), offsets_len);
Self {
data,
value_offsets: unsafe { RawPtrBox::new(offsets) },
value_data: unsafe { RawPtrBox::new(values) },
value_offsets,
value_data,
}
}
}
Expand Down Expand Up @@ -447,7 +422,7 @@ impl<T: OffsetSizeTrait> From<GenericListArray<T>> for GenericBinaryArray<T> {
///
pub struct FixedSizeBinaryArray {
data: ArrayData,
value_data: RawPtrBox<u8>,
value_data: Buffer,
length: i32,
}

Expand Down Expand Up @@ -662,14 +637,15 @@ impl From<ArrayData> for FixedSizeBinaryArray {
1,
"FixedSizeBinaryArray data should contain 1 buffer only (values)"
);
let value_data = data.buffers()[0].as_ptr();
// TODO: Materialize offset into value_data
let value_data = data.buffers()[0].clone();
let length = match data.data_type() {
DataType::FixedSizeBinary(len) => *len,
_ => panic!("Expected data type to be FixedSizeBinary"),
};
Self {
data,
value_data: unsafe { RawPtrBox::new(value_data) },
value_data,
length,
}
}
Expand Down Expand Up @@ -756,7 +732,7 @@ impl Array for FixedSizeBinaryArray {
///
pub struct DecimalArray {
data: ArrayData,
value_data: RawPtrBox<u8>,
value_data: Buffer,
precision: usize,
scale: usize,
length: i32,
Expand Down Expand Up @@ -949,15 +925,16 @@ impl From<ArrayData> for DecimalArray {
1,
"DecimalArray data should contain 1 buffer only (values)"
);
let values = data.buffers()[0].as_ptr();
let (precision, scale) = match data.data_type() {
DataType::Decimal(precision, scale) => (*precision, *scale),
_ => panic!("Expected data type to be Decimal"),
};
// TODO: Materialize offset into value_data
let value_data = data.buffers()[0].clone();
let length = 16;
Self {
data,
value_data: unsafe { RawPtrBox::new(values) },
value_data,
precision,
scale,
length,
Expand Down Expand Up @@ -1441,7 +1418,9 @@ mod tests {
}

#[test]
#[should_panic(expected = "BinaryArray out of bounds access")]
#[should_panic(
expected = "BinaryArray index out of bounds: the len is 3 but the index is 4"
)]
fn test_binary_array_get_value_index_out_of_bound() {
let values: [u8; 12] =
[104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116];
Expand Down
16 changes: 6 additions & 10 deletions arrow/src/array/array_boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ use std::convert::From;
use std::iter::{FromIterator, IntoIterator};
use std::{any::Any, fmt};

use super::array::print_long_array;
use super::*;
use super::{array::print_long_array, raw_pointer::RawPtrBox};
use crate::buffer::{Buffer, MutableBuffer};
use crate::util::bit_util;

Expand Down Expand Up @@ -66,9 +66,7 @@ use crate::util::bit_util;
/// ```
pub struct BooleanArray {
data: ArrayData,
/// Pointer to the value array. The lifetime of this must be <= to the value buffer
/// stored in `data`, so it's safe to store.
raw_values: RawPtrBox<u8>,
raw_values: Buffer,
}

impl fmt::Debug for BooleanArray {
Expand Down Expand Up @@ -101,7 +99,7 @@ impl BooleanArray {
///
/// Note this doesn't take the offset of this array into account.
pub fn values(&self) -> &Buffer {
&self.data.buffers()[0]
&self.raw_values
}

/// Returns the boolean value at index `i`.
Expand Down Expand Up @@ -186,11 +184,9 @@ impl From<ArrayData> for BooleanArray {
1,
"BooleanArray data should contain a single buffer only (values buffer)"
);
let ptr = data.buffers()[0].as_ptr();
Self {
data,
raw_values: unsafe { RawPtrBox::new(ptr) },
}
// TODO: Hydrate offset into BitMap
let raw_values = data.buffers()[0].clone();
Self { data, raw_values }
}
}

Expand Down
53 changes: 27 additions & 26 deletions arrow/src/array/array_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@ use std::fmt;
use num::Integer;

use super::{
array::print_long_array, make_array, raw_pointer::RawPtrBox, Array, ArrayData,
ArrayRef, BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray,
array::print_long_array, make_array, Array, ArrayData, ArrayRef,
BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray,
};
use crate::buffer::ScalarBuffer;
use crate::{
buffer::MutableBuffer,
datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType, Field},
Expand All @@ -49,10 +50,10 @@ impl OffsetSizeTrait for i64 {
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout>
///
/// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]`
pub struct GenericListArray<OffsetSize> {
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
data: ArrayData,
values: ArrayRef,
value_offsets: RawPtrBox<OffsetSize>,
value_offsets: ScalarBuffer<OffsetSize>,
}

impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
Expand All @@ -70,32 +71,27 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
/// # Safety
/// Caller must ensure that the index is within the array bounds
pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef {
let end = *self.value_offsets().get_unchecked(i + 1);
let start = *self.value_offsets().get_unchecked(i);
let end = *self.value_offsets.get_unchecked(i + 1);
let start = *self.value_offsets.get_unchecked(i);
self.values
.slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap())
}

/// Returns ith value of this list array.
pub fn value(&self, i: usize) -> ArrayRef {
let end = self.value_offsets()[i + 1];
let start = self.value_offsets()[i];
self.values
.slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap())
assert!(
i < self.data.len(),
"GenericListArray index out of bounds: the len is {} but the index is {}",
self.data.len(),
i
);
unsafe { self.value_unchecked(i) }
}

/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[OffsetSize] {
// Soundness
// pointer alignment & location is ensured by RawPtrBox
// buffer bounds/offset is ensured by the ArrayData instance.
unsafe {
std::slice::from_raw_parts(
self.value_offsets.as_ptr().add(self.data.offset()),
self.len() + 1,
)
}
&self.value_offsets
}

/// Returns the length for value at index `i`.
Expand Down Expand Up @@ -227,8 +223,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
}

let values = make_array(values);
let value_offsets = data.buffers()[0].as_ptr();
let value_offsets = unsafe { RawPtrBox::<OffsetSize>::new(value_offsets) };
let offsets = data.buffers()[0].clone();
let offsets_len = data.is_empty().then(|| 0).unwrap_or(data.len() + 1);
let value_offsets = ScalarBuffer::new(offsets, data.offset(), offsets_len);

Ok(Self {
data,
values,
Expand Down Expand Up @@ -940,7 +938,7 @@ mod tests {
}

#[test]
#[should_panic(expected = "index out of bounds: the len is 10 but the index is 11")]
#[should_panic(expected = "index out of bounds: the len is 9 but the index is 10")]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The index changes because we check the length of the array, not the end offset first

fn test_list_array_index_out_of_bound() {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int32)
Expand Down Expand Up @@ -1142,31 +1140,33 @@ mod tests {
}

#[test]
#[should_panic(expected = "memory is not aligned")]
#[should_panic(expected = "buffer is not aligned to 4 byte boundary")]
fn test_primitive_array_alignment() {
let ptr = alloc::allocate_aligned::<u8>(8);
let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) };
let buf = unsafe { Buffer::from_raw_parts(ptr, 9, 9) };
let buf2 = buf.slice(1);
let array_data = ArrayData::builder(DataType::Int32)
.len(2)
.add_buffer(buf2)
.build()
.unwrap();
drop(Int32Array::from(array_data));
}

#[test]
#[should_panic(expected = "memory is not aligned")]
#[should_panic(expected = "buffer is not aligned to 4 byte boundary")]
// Different error messages, so skip for now
// https://github.com/apache/arrow-rs/issues/1545
#[cfg(not(feature = "force_validate"))]
fn test_list_array_alignment() {
let ptr = alloc::allocate_aligned::<u8>(8);
let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) };
let buf = unsafe { Buffer::from_raw_parts(ptr, 9, 9) };
let buf2 = buf.slice(1);

let values: [i32; 8] = [0; 8];
let value_data = unsafe {
ArrayData::builder(DataType::Int32)
.len(1)
.add_buffer(Buffer::from_slice_ref(&values))
.build_unchecked()
};
Expand All @@ -1175,6 +1175,7 @@ mod tests {
DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = unsafe {
ArrayData::builder(list_data_type)
.len(1)
.add_buffer(buf2)
.add_child_data(value_data)
.build_unchecked()
Expand Down
Loading