diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 282b9ff2c9fcc..c73d4b386cf54 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -36,4 +36,7 @@ #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + #endif // ARROW_API_H diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index fafee91f92831..8cc689c3e81ee 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -32,7 +32,7 @@ class Array; class MemoryPool; class PoolBuffer; -static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8; +static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5; // Base class for all data array builders class ArrayBuilder { diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index e25729dfb67dc..f35a258e2cb57 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -235,6 +235,29 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { } } +TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { + DECL_T(); + + int size = 10000; + + vector& draws = this->draws_; + vector& nulls = this->nulls_; + + int64_t memory_before = this->pool_->bytes_allocated(); + + this->RandomData(size); + + int i; + for (i = 0; i < size; ++i) { + ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0)); + } + + do { + std::shared_ptr result = this->builder_->Finish(); + } while (false); + + ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); +} TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { DECL_T(); @@ -332,11 +355,11 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { - int n = 100; - ASSERT_OK(this->builder_->Reserve(n)); + ASSERT_OK(this->builder_->Reserve(10)); ASSERT_EQ(0, this->builder_->length()); ASSERT_EQ(MIN_BUILDER_CAPACITY, this->builder_->capacity()); + ASSERT_OK(this->builder_->Reserve(90)); ASSERT_OK(this->builder_->Advance(100)); ASSERT_OK(this->builder_->Reserve(MIN_BUILDER_CAPACITY)); diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index a55ac068a3b46..1073bb6e1c340 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -64,6 +64,8 @@ class PrimitiveArrayImpl : public PrimitiveArray { PrimitiveArrayImpl() : PrimitiveArray() {} + virtual ~PrimitiveArrayImpl() {} + PrimitiveArrayImpl(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 3f3807d4e2094..50f4716769d70 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -31,6 +31,8 @@ Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, parent_ = parent; } +Buffer::~Buffer() {} + std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } @@ -43,6 +45,12 @@ PoolBuffer::PoolBuffer(MemoryPool* pool) : pool_ = pool; } +PoolBuffer::~PoolBuffer() { + if (mutable_data_ != nullptr) { + pool_->Free(mutable_data_, capacity_); + } +} + Status PoolBuffer::Reserve(int64_t new_capacity) { if (!mutable_data_ || new_capacity > capacity_) { uint8_t* new_data; diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 8704723eb0a89..0c3e210abd910 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -39,6 +39,7 @@ class Buffer : public std::enable_shared_from_this { Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} + virtual ~Buffer(); // An offset into data that is owned by another buffer, but we want to be // able to retain a valid pointer to it even after other shared_ptr's to the @@ -136,6 +137,7 @@ class ResizableBuffer : public MutableBuffer { class PoolBuffer : public ResizableBuffer { public: explicit PoolBuffer(MemoryPool* pool = nullptr); + virtual ~PoolBuffer(); virtual Status Resize(int64_t new_size); virtual Status Reserve(int64_t new_capacity); diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index e59c6fda40b44..f3b8f4659af18 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -17,7 +17,7 @@ # flake8: noqa -from arrow.array import Array, from_list +from arrow.array import Array, from_list, total_allocated_bytes from arrow.schema import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 3f1efe79be164..3d6df4965d0e8 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -25,6 +25,12 @@ cimport arrow.includes.pyarrow as pyarrow from arrow.compat import frombytes, tobytes from arrow.error cimport check_status + +def total_allocated_bytes(): + cdef MemoryPool* pool = pyarrow.GetMemoryPool() + return pool.bytes_allocated() + + cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array): diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index a67c3bf5e0abc..fde5de910915a 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -51,6 +51,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string ToString() + cdef cppclass MemoryPool" arrow::MemoryPool": + int64_t bytes_allocated() + cdef cppclass CListType" arrow::ListType"(CDataType): CListType(const shared_ptr[CDataType]& value_type, c_bool nullable) diff --git a/python/arrow/includes/pyarrow.pxd b/python/arrow/includes/pyarrow.pxd index 165d1e7f63e82..3eed5b8542493 100644 --- a/python/arrow/includes/pyarrow.pxd +++ b/python/arrow/includes/pyarrow.pxd @@ -18,7 +18,8 @@ # distutils: language = c++ from arrow.includes.common cimport * -from arrow.includes.arrow cimport CArray, CDataType, LogicalType +from arrow.includes.arrow cimport (CArray, CDataType, LogicalType, + MemoryPool) cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed @@ -40,3 +41,5 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable) Status ConvertPySequence(object obj, shared_ptr[CArray]* out) + + MemoryPool* GetMemoryPool() diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py index 1a926acf24d6c..731ed2c0eb77e 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/arrow/tests/test_convert_builtin.py @@ -42,6 +42,13 @@ def test_integer(self): assert arr.null_count == 2 assert arr.type == arrow.int64() + def test_garbage_collection(self): + import gc + bytes_before = arrow.total_allocated_bytes() + arrow.from_list([1, None, 3, None]) + gc.collect() + assert arrow.total_allocated_bytes() == bytes_before + def test_double(self): pass