Make InsertTupleIntoChunk more efficient #416
Changes from all commits: 47abdef, 09c9b53, 12a6835, b547f77
@@ -1072,6 +1072,13 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptr<PostgresScanG
     auto &values = scan_local_state->values;
     auto &nulls = scan_local_state->nulls;
 
+    // Track detoasted values and their cleanup status
+    struct DetoastedValue {
+        Datum value;
+        bool should_free;
+    };
+    duckdb::map<idx_t, DetoastedValue> detoasted_values;
+
     /* First we are fetching all required columns ordered by column id
      * and than we need to write this tuple into output vector. Output column id list
      * could be out of order so we need to match column values from ordered list.

Review comment on the detoasted_values line: We don't want to allocate any data structures in this function, because it's extremely performance sensitive. We should pre-allocate this one like we do for the others, so it is allocated only once. Also, as described in my other comment, we don't want to use a map here.
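To make the pre-allocation suggestion concrete, here is a minimal, self-contained sketch of one way the detoast bookkeeping could live in the local scan state and be reset per tuple instead of being built on every call. This is only an illustration: ScanLocalStateSketch, DetoastedSlot, and ResetDetoastedSlots are hypothetical names, Datum is stubbed as uintptr_t, and none of this is the actual pg_duckdb API.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using Datum = std::uintptr_t; // stand-in for PostgreSQL's Datum

// One reusable slot per scanned column.
struct DetoastedSlot {
	Datum value = 0;
	bool should_free = false;
	bool in_use = false;
};

struct ScanLocalStateSketch {
	std::vector<Datum> values;
	std::vector<bool> nulls;
	std::vector<DetoastedSlot> detoasted; // sized once, when the scan starts

	explicit ScanLocalStateSketch(std::size_t column_count)
	    : values(column_count), nulls(column_count), detoasted(column_count) {
	}

	// Called at the top of per-tuple processing: clears the bookkeeping flags
	// without freeing or reallocating anything.
	void ResetDetoastedSlots() {
		for (auto &slot : detoasted) {
			slot.in_use = false;
			slot.should_free = false;
		}
	}
};
```

Resetting flags only touches memory that is already allocated, so the per-tuple cost is a linear pass over the column slots rather than a heap allocation per call.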
@@ -1080,18 +1087,41 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptr<PostgresScanG
     /* Read heap tuple with all required columns. */
     for (auto const &[attr_num, duckdb_scanned_index] : scan_global_state->m_columns_to_scan) {
         bool is_null = false;
-        values[duckdb_scanned_index] =
+        Datum value =
             HeapTupleFetchNextColumnDatum(scan_global_state->m_tuple_desc, tuple, heap_tuple_read_state, attr_num,
                                           &is_null, scan_global_state->m_relation_missing_attrs);
-        nulls[duckdb_scanned_index] = is_null;
+
+        bool needs_output = scan_global_state->attr_to_output_set.find(attr_num) != scan_global_state->attr_to_output_set.end();
+        if (needs_output) {
+            values[duckdb_scanned_index] = value;
+            nulls[duckdb_scanned_index] = is_null;
+        }
 
         auto filter = scan_global_state->m_column_filters[duckdb_scanned_index];
         if (!filter) {
             continue;
         }
-        const auto valid_tuple = ApplyValueFilter(*filter, values[duckdb_scanned_index], is_null,
+
+        // If this is a varlena type and needs output later, detoast it now
+        auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1];
+        if (!is_null && attr.attlen == -1 && needs_output) {
+            bool should_free = false;
+            Datum detoasted = DetoastPostgresDatum(reinterpret_cast<varlena *>(value), &should_free);
+            detoasted_values[attr_num] = {detoasted, should_free};
+            value = detoasted; // Use detoasted value for filtering
+        }
+
+        // It's safe to use ApplyValueFilter directly for the detoasted values,
+        // because DetoastPostgresDatum handles already detoasted values correctly
+        const auto valid_tuple = ApplyValueFilter(*filter, value, is_null,
                                                   scan_global_state->m_tuple_desc->attrs[attr_num - 1].atttypid);
         if (!valid_tuple) {
+            // Clean up any detoasted values we've created
+            for (auto &[_, dv] : detoasted_values) {
+                if (dv.should_free) {
+                    duckdb_free(reinterpret_cast<void *>(dv.value));
+                }
+            }
             return;
         }
     }
@@ -1100,28 +1130,45 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptr<PostgresScanG
     int duckdb_output_index = 0;
     for (auto const &[duckdb_scanned_index, attr_num] : scan_global_state->m_output_columns) {
         auto &result = output.data[duckdb_output_index];
 
-        if (nulls[duckdb_scanned_index]) {
-            auto &array_mask = duckdb::FlatVector::Validity(result);
-            array_mask.SetInvalid(scan_local_state->m_output_vector_size);
+        // Check if we have a pre-detoasted value
+        auto it = detoasted_values.find(attr_num);
+        if (it != detoasted_values.end()) {
+            // Use existing detoasted value
+            ConvertPostgresToDuckValue(scan_global_state->m_tuple_desc->attrs[attr_num - 1].atttypid, it->second.value,
+                                       result, scan_local_state->m_output_vector_size);
         } else {
-            auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1];
-            if (attr.attlen == -1) {
-                bool should_free = false;
-                values[duckdb_scanned_index] =
-                    DetoastPostgresDatum(reinterpret_cast<varlena *>(values[duckdb_scanned_index]), &should_free);
-                ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result,
-                                           scan_local_state->m_output_vector_size);
-                if (should_free) {
-                    duckdb_free(reinterpret_cast<void *>(values[duckdb_scanned_index]));
-                }
+            // Process new value
+            if (nulls[duckdb_scanned_index]) {
+                auto &array_mask = duckdb::FlatVector::Validity(result);
+                array_mask.SetInvalid(scan_local_state->m_output_vector_size);
             } else {
-                ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result,
-                                           scan_local_state->m_output_vector_size);
+                auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1];
+                if (attr.attlen == -1) {
+                    bool should_free = false;
+                    values[duckdb_scanned_index] =
+                        DetoastPostgresDatum(reinterpret_cast<varlena *>(values[duckdb_scanned_index]), &should_free);
+                    ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result,
+                                               scan_local_state->m_output_vector_size);
+                    if (should_free) {
+                        duckdb_free(reinterpret_cast<void *>(values[duckdb_scanned_index]));
+                    }
+                } else {
+                    ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result,
+                                               scan_local_state->m_output_vector_size);
+                }
+            }
         }
         duckdb_output_index++;
     }
+
+    // Cleanup detoasted values
+    for (auto &[_, dv] : detoasted_values) {
+        if (dv.should_free) {
+            duckdb_free(reinterpret_cast<void *>(dv.value));
+        }
+    }
 
     scan_local_state->m_output_vector_size++;
     scan_global_state->m_total_row_count++;
 }
@mkaruza converted all the maps that we had to vectors in #366, because the hashing of the keys had significant overhead. Since AttrNumber is bound to be small, I think it's better to use a vector of booleans (or a bitmap or something) to avoid the hashing.
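As an illustration of that suggestion, the sketch below replaces a hash-keyed map with a flat vector indexed directly by attribute number, so lookup becomes an array access plus a presence flag instead of a hash probe. The names DetoastedByAttr and DetoastedEntry are hypothetical, Datum and AttrNumber are stubbed locally, and the code is not taken from pg_duckdb or #366.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using Datum = std::uintptr_t;    // stand-in for PostgreSQL's Datum
using AttrNumber = std::int16_t; // PostgreSQL's AttrNumber is a 16-bit integer

struct DetoastedEntry {
	Datum value = 0;
	bool should_free = false;
	bool present = false; // membership flag replaces map.find()
};

// Flat lookup table indexed directly by attribute number: no hashing involved.
class DetoastedByAttr {
public:
	explicit DetoastedByAttr(AttrNumber max_attr) : entries_(static_cast<std::size_t>(max_attr) + 1) {
	}

	void Set(AttrNumber attr, Datum value, bool should_free) {
		auto &entry = entries_[static_cast<std::size_t>(attr)];
		entry.value = value;
		entry.should_free = should_free;
		entry.present = true;
	}

	// Returns nullptr when no detoasted value was recorded for this attribute.
	const DetoastedEntry *Find(AttrNumber attr) const {
		const auto &entry = entries_[static_cast<std::size_t>(attr)];
		return entry.present ? &entry : nullptr;
	}

	// Reset all slots for the next tuple without deallocating.
	void Clear() {
		for (auto &entry : entries_) {
			entry.present = false;
		}
	}

private:
	std::vector<DetoastedEntry> entries_;
};
```

Indexing by AttrNumber trades a small amount of memory (one entry per possible attribute) for constant-time lookups with no hash computation, which mirrors the map-to-vector conversion the comment attributes to #366.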