Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct array memory usage calculation for dictionary arrays #505

Merged
merged 7 commits into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 66 additions & 2 deletions arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,14 @@ pub trait Array: fmt::Debug + Send + Sync + JsonEqual {
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this array.
jhorstmann marked this conversation as resolved.
Show resolved Hide resolved
fn get_buffer_memory_size(&self) -> usize;
fn get_buffer_memory_size(&self) -> usize {
self.data_ref().get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this array.
jhorstmann marked this conversation as resolved.
Show resolved Hide resolved
fn get_array_memory_size(&self) -> usize;
fn get_array_memory_size(&self) -> usize {
self.data_ref().get_array_memory_size() + std::mem::size_of_val(self)
}

/// returns two pointers that represent this array in the C Data Interface (FFI)
fn to_raw(
Expand Down Expand Up @@ -575,6 +579,7 @@ where
#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_empty_primitive() {
let array = new_empty_array(&DataType::Int32);
Expand Down Expand Up @@ -661,4 +666,63 @@ mod tests {
null_array.data().buffers()[0].len()
);
}

#[test]
fn test_memory_size_primitive() {
let arr = PrimitiveArray::<Int64Type>::from_iter_values(0..128);
let empty =
PrimitiveArray::<Int64Type>::from(ArrayData::new_empty(arr.data_type()));

// substract empty array to avoid magic numbers for the size of additional fields
assert_eq!(
arr.get_array_memory_size() - empty.get_array_memory_size(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a cool calculation 👍

128 * std::mem::size_of::<i64>()
);
}

#[test]
fn test_memory_size_dictionary() {
let values = PrimitiveArray::<Int64Type>::from_iter_values(0..16);
let keys = PrimitiveArray::<Int16Type>::from_iter_values(
(0..256).map(|i| (i % values.len()) as i16),
);

let dict_data = ArrayData::builder(DataType::Dictionary(
Box::new(keys.data_type().clone()),
Box::new(values.data_type().clone()),
))
.len(keys.len())
.buffers(keys.data_ref().buffers().to_vec())
.child_data(vec![ArrayData::builder(DataType::Int64)
.len(values.len())
.buffers(values.data_ref().buffers().to_vec())
.build()])
.build();

let empty_data = ArrayData::new_empty(&DataType::Dictionary(
Box::new(DataType::Int16),
Box::new(DataType::Int64),
));

let arr = DictionaryArray::<Int16Type>::from(dict_data);
let empty = DictionaryArray::<Int16Type>::from(empty_data);

let expected_keys_size = 256 * std::mem::size_of::<i16>();
assert_eq!(
arr.keys().get_array_memory_size() - empty.keys().get_array_memory_size(),
expected_keys_size
);

let expected_values_size = 16 * std::mem::size_of::<i64>();
assert_eq!(
arr.values().get_array_memory_size() - empty.values().get_array_memory_size(),
expected_values_size
);

let expected_size = expected_keys_size + expected_values_size;
assert_eq!(
arr.get_array_memory_size() - empty.get_array_memory_size(),
expected_size
);
}
}
31 changes: 0 additions & 31 deletions arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::convert::{From, TryInto};
use std::fmt;
use std::mem;
use std::{any::Any, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -199,16 +198,6 @@ impl<OffsetSize: BinaryOffsetSizeTrait> Array for GenericBinaryArray<OffsetSize>
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [$name].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: BinaryOffsetSizeTrait> From<ArrayData>
Expand Down Expand Up @@ -600,16 +589,6 @@ impl Array for FixedSizeBinaryArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

/// A type of `DecimalArray` whose elements are binaries.
Expand Down Expand Up @@ -780,16 +759,6 @@ impl Array for DecimalArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [DecimalArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [DecimalArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

#[cfg(test)]
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
use std::borrow::Borrow;
use std::convert::From;
use std::iter::{FromIterator, IntoIterator};
use std::mem;
use std::{any::Any, fmt};

use super::*;
Expand Down Expand Up @@ -114,16 +113,6 @@ impl Array for BooleanArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [BooleanArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [BooleanArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl From<Vec<bool>> for BooleanArray {
Expand Down
13 changes: 0 additions & 13 deletions arrow/src/array/array_dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
use std::any::Any;
use std::fmt;
use std::iter::IntoIterator;
use std::mem;
use std::{convert::From, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -209,18 +208,6 @@ impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> {
fn data(&self) -> &ArrayData {
&self.data
}

fn get_buffer_memory_size(&self) -> usize {
// Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`.
self.data.get_buffer_memory_size()
}

fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size()
+ self.keys.get_array_memory_size()
+ self.values.get_array_memory_size()
+ mem::size_of_val(self)
}
}

impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
Expand Down
23 changes: 0 additions & 23 deletions arrow/src/array/array_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::any::Any;
use std::fmt;
use std::mem;

use num::Num;

Expand Down Expand Up @@ -261,16 +260,6 @@ impl<OffsetSize: 'static + OffsetSizeTrait> Array for GenericListArray<OffsetSiz
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [ListArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [ListArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericListArray<OffsetSize> {
Expand Down Expand Up @@ -444,18 +433,6 @@ impl Array for FixedSizeListArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeListArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [FixedSizeListArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size()
+ self.values().get_array_memory_size()
+ mem::size_of_val(self)
}
}

impl fmt::Debug for FixedSizeListArray {
Expand Down
10 changes: 0 additions & 10 deletions arrow/src/array/array_primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,6 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [PrimitiveArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [PrimitiveArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of::<RawPtrBox<T::Native>>()
}
}

fn as_datetime<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveDateTime> {
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::convert::From;
use std::fmt;
use std::mem;
use std::{any::Any, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -286,16 +285,6 @@ impl<OffsetSize: StringOffsetSizeTrait> Array for GenericStringArray<OffsetSize>
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [$name].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: StringOffsetSizeTrait> From<ArrayData>
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ use std::any::Any;
use std::convert::{From, TryFrom};
use std::fmt;
use std::iter::IntoIterator;
use std::mem;

use super::{make_array, Array, ArrayData, ArrayRef};
use crate::datatypes::DataType;
Expand Down Expand Up @@ -178,16 +177,6 @@ impl Array for StructArray {
fn len(&self) -> usize {
self.data_ref().len()
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [StructArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [StructArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl From<Vec<(Field, ArrayRef)>> for StructArray {
Expand Down
20 changes: 0 additions & 20 deletions arrow/src/array/array_union.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ use crate::error::{ArrowError, Result};

use core::fmt;
use std::any::Any;
use std::mem;
use std::mem::size_of;

/// An Array that can represent slots of varying types.
Expand Down Expand Up @@ -276,25 +275,6 @@ impl Array for UnionArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [UnionArray].
fn get_buffer_memory_size(&self) -> usize {
let mut size = self.data.get_buffer_memory_size();
for field in &self.boxed_fields {
size += field.get_buffer_memory_size();
}
size
}

/// Returns the total number of bytes of memory occupied physically by this [UnionArray].
fn get_array_memory_size(&self) -> usize {
let mut size = self.data.get_array_memory_size();
size += mem::size_of_val(self) - mem::size_of_val(&self.boxed_fields);
for field in &self.boxed_fields {
size += field.get_array_memory_size();
}
size
}
}

impl fmt::Debug for UnionArray {
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

use std::any::Any;
use std::fmt;
use std::mem;

use crate::array::{Array, ArrayData};
use crate::datatypes::*;
Expand Down Expand Up @@ -84,16 +83,6 @@ impl Array for NullArray {
fn null_count(&self) -> usize {
self.data_ref().len()
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [NullArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [NullArray].
fn get_array_memory_size(&self) -> usize {
mem::size_of_val(self)
}
}

impl From<ArrayData> for NullArray {
Expand Down