-
Notifications
You must be signed in to change notification settings - Fork 750
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Replace RawPtrBox with ScalarBuffer, reduce unsafe
usage (#1811)
#1825
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,11 +22,11 @@ use std::{any::Any, iter::FromIterator}; | |
|
||
use super::BooleanBufferBuilder; | ||
use super::{ | ||
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, | ||
FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait, | ||
array::print_long_array, Array, ArrayData, FixedSizeListArray, GenericBinaryIter, | ||
GenericListArray, OffsetSizeTrait, | ||
}; | ||
pub use crate::array::DecimalIter; | ||
use crate::buffer::Buffer; | ||
use crate::buffer::{Buffer, ScalarBuffer}; | ||
use crate::datatypes::{ | ||
validate_decimal_precision, DECIMAL_DEFAULT_SCALE, DECIMAL_MAX_PRECISION, | ||
DECIMAL_MAX_SCALE, | ||
|
@@ -39,8 +39,8 @@ use crate::{buffer::MutableBuffer, datatypes::DataType}; | |
/// binary data. | ||
pub struct GenericBinaryArray<OffsetSize: OffsetSizeTrait> { | ||
data: ArrayData, | ||
value_offsets: RawPtrBox<OffsetSize>, | ||
value_data: RawPtrBox<u8>, | ||
value_offsets: ScalarBuffer<OffsetSize>, | ||
value_data: Buffer, | ||
} | ||
|
||
impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> { | ||
|
@@ -64,67 +64,40 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> { | |
|
||
/// Returns a clone of the value data buffer | ||
pub fn value_data(&self) -> Buffer { | ||
self.data.buffers()[1].clone() | ||
self.value_data.clone() | ||
} | ||
|
||
/// Returns the offset values in the offsets buffer | ||
#[inline] | ||
pub fn value_offsets(&self) -> &[OffsetSize] { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❤️ |
||
// Soundness | ||
// pointer alignment & location is ensured by RawPtrBox | ||
// buffer bounds/offset is ensured by the ArrayData instance. | ||
unsafe { | ||
std::slice::from_raw_parts( | ||
self.value_offsets.as_ptr().add(self.data.offset()), | ||
self.len() + 1, | ||
) | ||
} | ||
&self.value_offsets | ||
} | ||
|
||
/// Returns the element at index `i` as bytes slice | ||
/// # Safety | ||
/// Caller is responsible for ensuring that the index is within the bounds of the array | ||
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { | ||
let end = *self.value_offsets().get_unchecked(i + 1); | ||
let start = *self.value_offsets().get_unchecked(i); | ||
|
||
// Soundness | ||
// pointer alignment & location is ensured by RawPtrBox | ||
// buffer bounds/offset is ensured by the value_offset invariants | ||
|
||
// Safety of `to_isize().unwrap()` | ||
// `start` and `end` are &OffsetSize, which is a generic type that implements the | ||
// OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, | ||
// both of which should cleanly cast to isize on an architecture that supports | ||
// 32/64-bit offsets | ||
std::slice::from_raw_parts( | ||
self.value_data.as_ptr().offset(start.to_isize().unwrap()), | ||
(end - start).to_usize().unwrap(), | ||
) | ||
let start = self.value_offsets.get_unchecked(i).to_isize().unwrap(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't like this |
||
let end = self.value_offsets.get_unchecked(i + 1).to_isize().unwrap(); | ||
|
||
// buffer bounds/offset is ensured by the value_offset invariants | ||
self.value_data.get_unchecked(start as usize..end as usize) | ||
} | ||
|
||
/// Returns the element at index `i` as bytes slice | ||
pub fn value(&self, i: usize) -> &[u8] { | ||
assert!(i < self.data.len(), "BinaryArray out of bounds access"); | ||
//Soundness: length checked above, offset buffer length is 1 larger than logical array length | ||
let end = unsafe { self.value_offsets().get_unchecked(i + 1) }; | ||
let start = unsafe { self.value_offsets().get_unchecked(i) }; | ||
|
||
// Soundness | ||
// pointer alignment & location is ensured by RawPtrBox | ||
// buffer bounds/offset is ensured by the value_offset invariants | ||
|
||
// Safety of `to_isize().unwrap()` | ||
// `start` and `end` are &OffsetSize, which is a generic type that implements the | ||
// OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, | ||
// both of which should cleanly cast to isize on an architecture that supports | ||
// 32/64-bit offsets | ||
unsafe { | ||
std::slice::from_raw_parts( | ||
self.value_data.as_ptr().offset(start.to_isize().unwrap()), | ||
(*end - *start).to_usize().unwrap(), | ||
) | ||
} | ||
assert!( | ||
i < self.data.len(), | ||
"BinaryArray index out of bounds: the len is {} but the index is {}", | ||
self.data.len(), | ||
i | ||
); | ||
unsafe { self.value_unchecked(i) } | ||
} | ||
|
||
/// Creates a [GenericBinaryArray] from a vector of byte slices | ||
|
@@ -259,12 +232,14 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericBinaryArray<OffsetS | |
2, | ||
"BinaryArray data should contain 2 buffers only (offsets and values)" | ||
); | ||
let offsets = data.buffers()[0].as_ptr(); | ||
let values = data.buffers()[1].as_ptr(); | ||
let offsets = data.buffers()[0].clone(); | ||
let value_data = data.buffers()[1].clone(); | ||
let offsets_len = data.is_empty().then(|| 0).unwrap_or(data.len() + 1); | ||
let value_offsets = ScalarBuffer::new(offsets, data.offset(), offsets_len); | ||
Self { | ||
data, | ||
value_offsets: unsafe { RawPtrBox::new(offsets) }, | ||
value_data: unsafe { RawPtrBox::new(values) }, | ||
value_offsets, | ||
value_data, | ||
} | ||
} | ||
} | ||
|
@@ -447,7 +422,7 @@ impl<T: OffsetSizeTrait> From<GenericListArray<T>> for GenericBinaryArray<T> { | |
/// | ||
pub struct FixedSizeBinaryArray { | ||
data: ArrayData, | ||
value_data: RawPtrBox<u8>, | ||
value_data: Buffer, | ||
length: i32, | ||
} | ||
|
||
|
@@ -662,14 +637,15 @@ impl From<ArrayData> for FixedSizeBinaryArray { | |
1, | ||
"FixedSizeBinaryArray data should contain 1 buffer only (values)" | ||
); | ||
let value_data = data.buffers()[0].as_ptr(); | ||
// TODO: Materialize offset into value_data | ||
let value_data = data.buffers()[0].clone(); | ||
let length = match data.data_type() { | ||
DataType::FixedSizeBinary(len) => *len, | ||
_ => panic!("Expected data type to be FixedSizeBinary"), | ||
}; | ||
Self { | ||
data, | ||
value_data: unsafe { RawPtrBox::new(value_data) }, | ||
value_data, | ||
length, | ||
} | ||
} | ||
|
@@ -756,7 +732,7 @@ impl Array for FixedSizeBinaryArray { | |
/// | ||
pub struct DecimalArray { | ||
data: ArrayData, | ||
value_data: RawPtrBox<u8>, | ||
value_data: Buffer, | ||
precision: usize, | ||
scale: usize, | ||
length: i32, | ||
|
@@ -949,15 +925,16 @@ impl From<ArrayData> for DecimalArray { | |
1, | ||
"DecimalArray data should contain 1 buffer only (values)" | ||
); | ||
let values = data.buffers()[0].as_ptr(); | ||
let (precision, scale) = match data.data_type() { | ||
DataType::Decimal(precision, scale) => (*precision, *scale), | ||
_ => panic!("Expected data type to be Decimal"), | ||
}; | ||
// TODO: Materialize offset into value_data | ||
let value_data = data.buffers()[0].clone(); | ||
let length = 16; | ||
Self { | ||
data, | ||
value_data: unsafe { RawPtrBox::new(values) }, | ||
value_data, | ||
precision, | ||
scale, | ||
length, | ||
|
@@ -1441,7 +1418,9 @@ mod tests { | |
} | ||
|
||
#[test] | ||
#[should_panic(expected = "BinaryArray out of bounds access")] | ||
#[should_panic( | ||
expected = "BinaryArray index out of bounds: the len is 3 but the index is 4" | ||
)] | ||
fn test_binary_array_get_value_index_out_of_bound() { | ||
let values: [u8; 12] = | ||
[104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116]; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,9 +21,10 @@ use std::fmt; | |
use num::Integer; | ||
|
||
use super::{ | ||
array::print_long_array, make_array, raw_pointer::RawPtrBox, Array, ArrayData, | ||
ArrayRef, BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray, | ||
array::print_long_array, make_array, Array, ArrayData, ArrayRef, | ||
BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray, | ||
}; | ||
use crate::buffer::ScalarBuffer; | ||
use crate::{ | ||
buffer::MutableBuffer, | ||
datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType, Field}, | ||
|
@@ -49,10 +50,10 @@ impl OffsetSizeTrait for i64 { | |
/// <https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout> | ||
/// | ||
/// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]` | ||
pub struct GenericListArray<OffsetSize> { | ||
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> { | ||
data: ArrayData, | ||
values: ArrayRef, | ||
value_offsets: RawPtrBox<OffsetSize>, | ||
value_offsets: ScalarBuffer<OffsetSize>, | ||
} | ||
|
||
impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> { | ||
|
@@ -70,32 +71,27 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> { | |
/// # Safety | ||
/// Caller must ensure that the index is within the array bounds | ||
pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { | ||
let end = *self.value_offsets().get_unchecked(i + 1); | ||
let start = *self.value_offsets().get_unchecked(i); | ||
let end = *self.value_offsets.get_unchecked(i + 1); | ||
let start = *self.value_offsets.get_unchecked(i); | ||
self.values | ||
.slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) | ||
} | ||
|
||
/// Returns ith value of this list array. | ||
pub fn value(&self, i: usize) -> ArrayRef { | ||
let end = self.value_offsets()[i + 1]; | ||
let start = self.value_offsets()[i]; | ||
self.values | ||
.slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) | ||
assert!( | ||
i < self.data.len(), | ||
"GenericListArray index out of bounds: the len is {} but the index is {}", | ||
self.data.len(), | ||
i | ||
); | ||
unsafe { self.value_unchecked(i) } | ||
} | ||
|
||
/// Returns the offset values in the offsets buffer | ||
#[inline] | ||
pub fn value_offsets(&self) -> &[OffsetSize] { | ||
// Soundness | ||
// pointer alignment & location is ensured by RawPtrBox | ||
// buffer bounds/offset is ensured by the ArrayData instance. | ||
unsafe { | ||
std::slice::from_raw_parts( | ||
self.value_offsets.as_ptr().add(self.data.offset()), | ||
self.len() + 1, | ||
) | ||
} | ||
&self.value_offsets | ||
} | ||
|
||
/// Returns the length for value at index `i`. | ||
|
@@ -227,8 +223,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> { | |
} | ||
|
||
let values = make_array(values); | ||
let value_offsets = data.buffers()[0].as_ptr(); | ||
let value_offsets = unsafe { RawPtrBox::<OffsetSize>::new(value_offsets) }; | ||
let offsets = data.buffers()[0].clone(); | ||
let offsets_len = data.is_empty().then(|| 0).unwrap_or(data.len() + 1); | ||
let value_offsets = ScalarBuffer::new(offsets, data.offset(), offsets_len); | ||
|
||
Ok(Self { | ||
data, | ||
values, | ||
|
@@ -940,7 +938,7 @@ mod tests { | |
} | ||
|
||
#[test] | ||
#[should_panic(expected = "index out of bounds: the len is 10 but the index is 11")] | ||
#[should_panic(expected = "index out of bounds: the len is 9 but the index is 10")] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The index changes because we check the length of the array, not the end offset first |
||
fn test_list_array_index_out_of_bound() { | ||
// Construct a value array | ||
let value_data = ArrayData::builder(DataType::Int32) | ||
|
@@ -1142,31 +1140,33 @@ mod tests { | |
} | ||
|
||
#[test] | ||
#[should_panic(expected = "memory is not aligned")] | ||
#[should_panic(expected = "buffer is not aligned to 4 byte boundary")] | ||
fn test_primitive_array_alignment() { | ||
let ptr = alloc::allocate_aligned::<u8>(8); | ||
let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; | ||
let buf = unsafe { Buffer::from_raw_parts(ptr, 9, 9) }; | ||
let buf2 = buf.slice(1); | ||
let array_data = ArrayData::builder(DataType::Int32) | ||
.len(2) | ||
.add_buffer(buf2) | ||
.build() | ||
.unwrap(); | ||
drop(Int32Array::from(array_data)); | ||
} | ||
|
||
#[test] | ||
#[should_panic(expected = "memory is not aligned")] | ||
#[should_panic(expected = "buffer is not aligned to 4 byte boundary")] | ||
// Different error messages, so skip for now | ||
// https://github.com/apache/arrow-rs/issues/1545 | ||
#[cfg(not(feature = "force_validate"))] | ||
fn test_list_array_alignment() { | ||
let ptr = alloc::allocate_aligned::<u8>(8); | ||
let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; | ||
let buf = unsafe { Buffer::from_raw_parts(ptr, 9, 9) }; | ||
let buf2 = buf.slice(1); | ||
|
||
let values: [i32; 8] = [0; 8]; | ||
let value_data = unsafe { | ||
ArrayData::builder(DataType::Int32) | ||
.len(1) | ||
.add_buffer(Buffer::from_slice_ref(&values)) | ||
.build_unchecked() | ||
}; | ||
|
@@ -1175,6 +1175,7 @@ mod tests { | |
DataType::List(Box::new(Field::new("item", DataType::Int32, false))); | ||
let list_data = unsafe { | ||
ArrayData::builder(list_data_type) | ||
.len(1) | ||
.add_buffer(buf2) | ||
.add_child_data(value_data) | ||
.build_unchecked() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if calling
ScalarBuffer
something likeTypedBuffer
would make its purpose slightly clearer 🤔