Skip to content

Commit

Permalink
apacheGH-40316: [Python] only allocate the ScalarMemoTable when used
Browse files (browse the repository at this point in the history)
  • Loading branch information
anjakefala committed Mar 14, 2024
1 parent 0ce7267 commit 1e90930
Showing 1 changed file with 33 additions and 28 deletions.
61 changes: 33 additions & 28 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number | Diff line number | Diff line change
Expand Up @@ -620,37 +620,42 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArra
using ArrayType = typename TypeTraits<Type>::ArrayType;
using Scalar = typename MemoizationTraits<Type>::Scalar;

::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
std::vector<PyObject*> unique_values;
int32_t memo_size = 0;

auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) {
int32_t memo_index;
RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index));
if (memo_index == memo_size) {
// New entry
RETURN_NOT_OK(wrap_func(value, out_values));
unique_values.push_back(*out_values);
++memo_size;
} else {
// Duplicate entry
Py_INCREF(unique_values[memo_index]);
*out_values = unique_values[memo_index];
}
return Status::OK();
};

auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) {
return wrap_func(value, out_values);
};
std::shared_ptr<::arrow::internal::ScalarMemoTable<Scalar>> memo_table = nullptr;
std::shared_ptr<std::vector<PyObject*>> unique_values = nullptr;
std::shared_ptr<int32_t> memo_size = std::make_shared<int32_t>(0);

std::function<Status(const typename MemoizationTraits<Type>::Scalar&, PyObject**)>
WrapFunc;

if (options.deduplicate_objects) {
memo_table =
std::make_shared<::arrow::internal::ScalarMemoTable<Scalar>>(options.pool);
unique_values = std::make_shared<std::vector<PyObject*>>();

WrapFunc = [&](const Scalar& value, PyObject** out_values) {
int32_t memo_index;
RETURN_NOT_OK(memo_table->GetOrInsert(value, &memo_index));
if (memo_index == *memo_size) {
// New entry
RETURN_NOT_OK(wrap_func(value, out_values));
unique_values->push_back(*out_values);
++(*memo_size);
} else {
// Duplicate entry
Py_INCREF((*unique_values)[memo_index]);
*out_values = (*unique_values)[memo_index];
}
return Status::OK();
};
} else {
WrapFunc = [&](const Scalar& value, PyObject** out_values) {
return wrap_func(value, out_values);
};
}

for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
if (options.deduplicate_objects) {
RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values));
} else {
RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values));
}
RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapFunc, out_values));
out_values += arr.length();
}
return Status::OK();
Expand Down

0 comments on commit 1e90930

Please sign in to comment.