Skip to content

Commit

Permalink
update comments
Browse files Browse the repository at this point in the history
  • Loading branch information
XiangpengHao committed Jul 8, 2024
1 parent c8bbc84 commit df486d1
Showing 1 changed file with 44 additions and 33 deletions.
77 changes: 44 additions & 33 deletions parquet/src/arrow/array_reader/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ impl ByteViewArrayDecoderDeltaLength {
// # Safety
// The length is from the delta length decoder, so it is valid
// The start_offset is calculated from the lengths, so it is valid
// `start_offset` + *length is guaranteed to be within the bounds of `data`, as checked in `new`
// `start_offset + length` is guaranteed to be within the bounds of `data`, as checked in `new`
unsafe { output.append_view_unchecked(block_id, current_offset as u32, *length as u32) }

current_offset += *length as usize;
Expand Down Expand Up @@ -562,51 +562,62 @@ impl ByteViewArrayDecoderDelta {

// Unlike other encodings, we need to copy the data.
//
// DeltaByteArray data is stored using shared prefixes/suffixes,
// DeltaByteArray data is stored using shared prefixes/suffixes,
// which results in potentially non-contiguous
// strings, while Arrow encodings require contiguous strings
//
// <https://parquet.apache.org/docs/file-format/data-pages/encodings/#delta-strings-delta_byte_array--7>

fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result<usize> {
output.views.reserve(len.min(self.decoder.remaining()));

// The array buffer only holds long strings (more than 12 bytes); shorter strings are inlined in the view.
let mut array_buffer: Vec<u8> = Vec::with_capacity(4096);

// The UTF-8 validation buffer holds all strings; we batch them in one buffer to accelerate validation.
let mut utf8_validation_buffer = if self.validate_utf8 {
Some(Vec::with_capacity(4096))
} else {
None
};

let buffer_id = output.buffers.len() as u32;

let read = self.decoder.read(len, |bytes| {
let offset = array_buffer.len();
let view = make_view(bytes, buffer_id, offset as u32);
if bytes.len() > 12 {
// Only copy the data to the buffer if the string cannot be inlined.
array_buffer.extend_from_slice(bytes);
}
if let Some(v) = utf8_validation_buffer.as_mut() {
v.extend_from_slice(bytes);
}

// # Safety
// The buffer_id is the last buffer in the output buffers
// The offset is calculated from the buffer, so it is valid
// Utf-8 validation is done later
unsafe {
output.append_raw_view_unchecked(&view);
}
Ok(())
})?;
let read = if !self.validate_utf8 {
self.decoder.read(len, |bytes| {
let offset = array_buffer.len();
let view = make_view(bytes, buffer_id, offset as u32);
if bytes.len() > 12 {
// Only copy the data to the buffer if the string cannot be inlined.
array_buffer.extend_from_slice(bytes);
}

utf8_validation_buffer
.map(|v| check_valid_utf8(&v))
.transpose()?;
// # Safety
// The buffer_id is the last buffer in the output buffers
// The offset is calculated from the buffer, so it is valid
unsafe {
output.append_raw_view_unchecked(&view);
}
Ok(())
})?
} else {
// The UTF-8 validation buffer holds all strings; we batch them in one buffer to accelerate validation.
let mut utf8_validation_buffer = Vec::with_capacity(4096);

let v = self.decoder.read(len, |bytes| {
let offset = array_buffer.len();
let view = make_view(bytes, buffer_id, offset as u32);
if bytes.len() > 12 {
// Only copy the data to the buffer if the string cannot be inlined.
array_buffer.extend_from_slice(bytes);
}
utf8_validation_buffer.extend_from_slice(bytes);

// # Safety
// The buffer_id is the last buffer in the output buffers
// The offset is calculated from the buffer, so it is valid
// Utf-8 validation is done later
unsafe {
output.append_raw_view_unchecked(&view);
}
Ok(())
})?;
check_valid_utf8(&utf8_validation_buffer)?;
v
};

let actual_block_id = output.append_block(array_buffer.into());
assert_eq!(actual_block_id, buffer_id);
Expand Down

0 comments on commit df486d1

Please sign in to comment.