diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index de87c3db1fab..76b29d37239f 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -197,11 +197,21 @@ pub trait Array: fmt::Debug + Send + Sync + JsonEqual { self.data_ref().null_count() } - /// Returns the total number of bytes of memory occupied by the buffers owned by this array. - fn get_buffer_memory_size(&self) -> usize; + /// Returns the total number of bytes of memory pointed to by this array. + /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. + fn get_buffer_memory_size(&self) -> usize { + self.data_ref().get_buffer_memory_size() + } /// Returns the total number of bytes of memory occupied physically by this array. - fn get_array_memory_size(&self) -> usize; + /// This value will always be greater than returned by `get_buffer_memory_size()` and + /// includes the overhead of the data structures that contain the pointers to the various buffers. + fn get_array_memory_size(&self) -> usize { + // both data.get_array_memory_size and size_of_val(self) include ArrayData fields, + // to only count additional fields of this array substract size_of(ArrayData) + self.data_ref().get_array_memory_size() + std::mem::size_of_val(self) + - std::mem::size_of::() + } /// returns two pointers that represent this array in the C Data Interface (FFI) fn to_raw( @@ -575,6 +585,7 @@ where #[cfg(test)] mod tests { use super::*; + #[test] fn test_empty_primitive() { let array = new_empty_array(&DataType::Int32); @@ -661,4 +672,105 @@ mod tests { null_array.data().buffers()[0].len() ); } + + #[test] + fn test_memory_size_null() { + let null_arr = NullArray::new(32); + + assert_eq!(0, null_arr.get_buffer_memory_size()); + assert_eq!( + std::mem::size_of::(), + null_arr.get_array_memory_size() + ); + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::(), + ); + } + + #[test] + fn test_memory_size_primitive() { + let arr = PrimitiveArray::::from_iter_values(0..128); + let empty = + PrimitiveArray::::from(ArrayData::new_empty(arr.data_type())); + + // substract empty array to avoid magic numbers for the size of additional fields + assert_eq!( + arr.get_array_memory_size() - empty.get_array_memory_size(), + 128 * std::mem::size_of::() + ); + } + + #[test] + fn test_memory_size_primitive_nullable() { + let arr: PrimitiveArray = (0..128).map(Some).collect(); + let empty_with_bitmap = PrimitiveArray::::from( + ArrayData::builder(arr.data_type().clone()) + .add_buffer(MutableBuffer::new(0).into()) + .null_bit_buffer(MutableBuffer::new_null(0).into()) + .build(), + ); + + // expected size is the size of the PrimitiveArray struct, + // which includes the optional validity buffer + // plus one buffer on the heap + assert_eq!( + std::mem::size_of::>() + + std::mem::size_of::(), + empty_with_bitmap.get_array_memory_size() + ); + + // substract empty array to avoid magic numbers for the size of additional fields + // the size of the validity bitmap is rounded up to 64 bytes + assert_eq!( + arr.get_array_memory_size() - empty_with_bitmap.get_array_memory_size(), + 128 * std::mem::size_of::() + 64 + ); + } + + #[test] + fn test_memory_size_dictionary() { + let values = PrimitiveArray::::from_iter_values(0..16); + let keys = PrimitiveArray::::from_iter_values( + (0..256).map(|i| (i % values.len()) as i16), + ); + + let dict_data = ArrayData::builder(DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + )) + .len(keys.len()) + .buffers(keys.data_ref().buffers().to_vec()) + .child_data(vec![ArrayData::builder(DataType::Int64) + .len(values.len()) + .buffers(values.data_ref().buffers().to_vec()) + .build()]) + .build(); + + let empty_data = ArrayData::new_empty(&DataType::Dictionary( + Box::new(DataType::Int16), + Box::new(DataType::Int64), + )); + + let arr = DictionaryArray::::from(dict_data); + let empty = DictionaryArray::::from(empty_data); + + let expected_keys_size = 256 * std::mem::size_of::(); + assert_eq!( + arr.keys().get_array_memory_size() - empty.keys().get_array_memory_size(), + expected_keys_size + ); + + let expected_values_size = 16 * std::mem::size_of::(); + assert_eq!( + arr.values().get_array_memory_size() - empty.values().get_array_memory_size(), + expected_values_size + ); + + let expected_size = expected_keys_size + expected_values_size; + assert_eq!( + arr.get_array_memory_size() - empty.get_array_memory_size(), + expected_size + ); + } } diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index e397e0d302bf..18f7f7c15e69 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -17,7 +17,6 @@ use std::convert::{From, TryInto}; use std::fmt; -use std::mem; use std::{any::Any, iter::FromIterator}; use super::{ @@ -199,16 +198,6 @@ impl Array for GenericBinaryArray fn data(&self) -> &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [$name]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } } impl From @@ -600,16 +589,6 @@ impl Array for FixedSizeBinaryArray { fn data(&self) -> &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } } /// A type of `DecimalArray` whose elements are binaries. @@ -756,16 +735,6 @@ impl Array for DecimalArray { fn data(&self) -> &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [DecimalArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [DecimalArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } } #[cfg(test)] diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs index de24c44d0f24..37080fe5a8c7 100644 --- a/arrow/src/array/array_boolean.rs +++ b/arrow/src/array/array_boolean.rs @@ -18,7 +18,6 @@ use std::borrow::Borrow; use std::convert::From; use std::iter::{FromIterator, IntoIterator}; -use std::mem; use std::{any::Any, fmt}; use super::*; @@ -114,16 +113,6 @@ impl Array for BooleanArray { fn data(&self) -> &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [BooleanArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [BooleanArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } } impl From> for BooleanArray { diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index a2419a1f271c..cb5f51125425 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -18,7 +18,6 @@ use std::any::Any; use std::fmt; use std::iter::IntoIterator; -use std::mem; use std::{convert::From, iter::FromIterator}; use super::{ @@ -224,18 +223,6 @@ impl Array for DictionaryArray { fn data(&self) -> &ArrayData { &self.data } - - fn get_buffer_memory_size(&self) -> usize { - // Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`. - self.data.get_buffer_memory_size() - } - - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() - + self.keys.get_array_memory_size() - + self.values.get_array_memory_size() - + mem::size_of_val(self) - } } impl fmt::Debug for DictionaryArray { diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs index f794dbefbc99..0efb65c99d5a 100644 --- a/arrow/src/array/array_list.rs +++ b/arrow/src/array/array_list.rs @@ -17,7 +17,6 @@ use std::any::Any; use std::fmt; -use std::mem; use num::Num; @@ -261,16 +260,6 @@ impl Array for GenericListArray &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [ListArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [ListArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } } impl fmt::Debug for GenericListArray { @@ -444,18 +433,6 @@ impl Array for FixedSizeListArray { fn data(&self) -> &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeListArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [FixedSizeListArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() - + self.values().get_array_memory_size() - + mem::size_of_val(self) - } } impl fmt::Debug for FixedSizeListArray { diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 5e852608b089..fa950a93fa96 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -150,16 +150,6 @@ impl Array for PrimitiveArray { fn data(&self) -> &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [PrimitiveArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [PrimitiveArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of::>() - } } fn as_datetime(v: i64) -> Option { @@ -508,9 +498,6 @@ mod tests { assert!(arr.is_valid(i)); assert_eq!(i as i32, arr.value(i)); } - - assert_eq!(64, arr.get_buffer_memory_size()); - assert_eq!(136, arr.get_array_memory_size()); } #[test] @@ -530,9 +517,6 @@ mod tests { assert!(!arr.is_valid(i)); } } - - assert_eq!(128, arr.get_buffer_memory_size()); - assert_eq!(216, arr.get_array_memory_size()); } #[test] diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 8b0ad1041761..ba285078369b 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -17,7 +17,6 @@ use std::convert::From; use std::fmt; -use std::mem; use std::{any::Any, iter::FromIterator}; use super::{ @@ -286,16 +285,6 @@ impl Array for GenericStringArray fn data(&self) -> &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [$name]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } } impl From diff --git a/arrow/src/array/array_struct.rs b/arrow/src/array/array_struct.rs index f721d35ce9fe..0e7304e80a46 100644 --- a/arrow/src/array/array_struct.rs +++ b/arrow/src/array/array_struct.rs @@ -19,7 +19,6 @@ use std::any::Any; use std::convert::{From, TryFrom}; use std::fmt; use std::iter::IntoIterator; -use std::mem; use super::{make_array, Array, ArrayData, ArrayRef}; use crate::datatypes::DataType; @@ -178,16 +177,6 @@ impl Array for StructArray { fn len(&self) -> usize { self.data_ref().len() } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [StructArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [StructArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } } impl From> for StructArray { diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs index e4a4dfb41eed..e369037244ac 100644 --- a/arrow/src/array/array_union.rs +++ b/arrow/src/array/array_union.rs @@ -80,7 +80,6 @@ use crate::error::{ArrowError, Result}; use core::fmt; use std::any::Any; -use std::mem; use std::mem::size_of; /// An Array that can represent slots of varying types. @@ -276,25 +275,6 @@ impl Array for UnionArray { fn data(&self) -> &ArrayData { &self.data } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [UnionArray]. - fn get_buffer_memory_size(&self) -> usize { - let mut size = self.data.get_buffer_memory_size(); - for field in &self.boxed_fields { - size += field.get_buffer_memory_size(); - } - size - } - - /// Returns the total number of bytes of memory occupied physically by this [UnionArray]. - fn get_array_memory_size(&self) -> usize { - let mut size = self.data.get_array_memory_size(); - size += mem::size_of_val(self) - mem::size_of_val(&self.boxed_fields); - for field in &self.boxed_fields { - size += field.get_array_memory_size(); - } - size - } } impl fmt::Debug for UnionArray { diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 172bdaac9eb6..00c8e150daa8 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -354,20 +354,18 @@ impl ArrayData { /// Returns the total number of bytes of memory occupied physically by this [ArrayData]. pub fn get_array_memory_size(&self) -> usize { - let mut size = 0; - // Calculate size of the fields that don't have [get_array_memory_size] method internally. - size += mem::size_of_val(self) - - mem::size_of_val(&self.buffers) - - mem::size_of_val(&self.null_bitmap) - - mem::size_of_val(&self.child_data); + let mut size = mem::size_of_val(self); // Calculate rest of the fields top down which contain actual data for buffer in &self.buffers { - size += mem::size_of_val(&buffer); + size += mem::size_of::(); size += buffer.capacity(); } if let Some(bitmap) = &self.null_bitmap { - size += bitmap.get_array_memory_size() + // this includes the size of the bitmap struct itself, since it is stored directly in + // this struct we already counted those bytes in the size_of_val(self) above + size += bitmap.get_array_memory_size(); + size -= mem::size_of::(); } for child in &self.child_data { size += child.get_array_memory_size(); diff --git a/arrow/src/array/null.rs b/arrow/src/array/null.rs index da472c7c330b..6728a407fde0 100644 --- a/arrow/src/array/null.rs +++ b/arrow/src/array/null.rs @@ -19,7 +19,6 @@ use std::any::Any; use std::fmt; -use std::mem; use crate::array::{Array, ArrayData}; use crate::datatypes::*; @@ -84,16 +83,6 @@ impl Array for NullArray { fn null_count(&self) -> usize { self.data_ref().len() } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [NullArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [NullArray]. - fn get_array_memory_size(&self) -> usize { - mem::size_of_val(self) - } } impl From for NullArray { @@ -133,12 +122,6 @@ mod tests { assert_eq!(null_arr.len(), 32); assert_eq!(null_arr.null_count(), 32); assert!(!null_arr.is_valid(0)); - - assert_eq!(0, null_arr.get_buffer_memory_size()); - assert_eq!( - null_arr.get_buffer_memory_size() + std::mem::size_of::(), - null_arr.get_array_memory_size() - ); } #[test]