From 9acc9fa0b84755c5caffb8acf806fa4fd21928bb Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 15 Jul 2024 05:52:37 -0400 Subject: [PATCH] Minor API adjustments for StringViewBuilder (#6047) * minor update * add memory accounting * Update arrow-buffer/src/builder/null.rs Co-authored-by: Andrew Lamb * Update arrow-array/src/builder/generic_bytes_view_builder.rs Co-authored-by: Andrew Lamb * update comments --------- Co-authored-by: Andrew Lamb --- arrow-array/src/array/byte_view_array.rs | 6 ++++-- .../src/builder/generic_bytes_view_builder.rs | 16 +++++++++++++++- arrow-buffer/src/builder/null.rs | 8 ++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index a00bf7271bba..5ce150a7a347 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -324,9 +324,11 @@ impl GenericByteViewArray { /// Note that it will copy the array regardless of whether the original array is compact. /// Use with caution as this can be an expensive operation, only use it when you are sure that the view /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing. + /// + /// Note: this function does not attempt to canonicalize / deduplicate values. For this + /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`]. pub fn gc(&self) -> Self { - let mut builder = - GenericByteViewBuilder::::with_capacity(self.len()).with_deduplicate_strings(); + let mut builder = GenericByteViewBuilder::::with_capacity(self.len()); for v in self.iter() { builder.append_option(v); diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index dda553545640..587255cc6b6a 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -201,7 +201,8 @@ impl GenericByteViewBuilder { /// Returns the value at the given index /// Useful if we want to know what value has been inserted to the builder - fn get_value(&self, index: usize) -> &[u8] { + /// The index has to be smaller than `self.len()`, otherwise it will panic + pub fn get_value(&self, index: usize) -> &[u8] { let view = self.views_builder.as_slice().get(index).unwrap(); let len = *view as u32; if len <= 12 { @@ -337,6 +338,19 @@ impl GenericByteViewBuilder { pub fn validity_slice(&self) -> Option<&[u8]> { self.null_buffer_builder.as_slice() } + + /// Return the allocated size of this builder in bytes, useful for memory accounting. + pub fn allocated_size(&self) -> usize { + let views = self.views_builder.capacity() * std::mem::size_of::(); + let null = self.null_buffer_builder.allocated_size(); + let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::(); + let in_progress = self.in_progress.capacity(); + let tracker = match &self.string_tracker { + Some((ht, _)) => ht.capacity() * std::mem::size_of::(), + None => 0, + }; + buffer_size + in_progress + tracker + views + null + } } impl Default for GenericByteViewBuilder { diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs index 55b3303c9e34..a1cea6ef2cca 100644 --- a/arrow-buffer/src/builder/null.rs +++ b/arrow-buffer/src/builder/null.rs @@ -161,6 +161,14 @@ impl NullBufferBuilder { pub fn as_slice_mut(&mut self) -> Option<&mut [u8]> { self.bitmap_builder.as_mut().map(|b| b.as_slice_mut()) } + + /// Return the allocated size of this builder, in bytes, useful for memory accounting. + pub fn allocated_size(&self) -> usize { + self.bitmap_builder + .as_ref() + .map(|b| b.capacity()) + .unwrap_or(0) + } } impl NullBufferBuilder {