diff --git a/include/pgduckdb/scan/postgres_scan.hpp b/include/pgduckdb/scan/postgres_scan.hpp index 4e842eb4..0c1e83d5 100644 --- a/include/pgduckdb/scan/postgres_scan.hpp +++ b/include/pgduckdb/scan/postgres_scan.hpp @@ -34,6 +34,8 @@ class PostgresScanGlobalState { std::vector m_column_filters; /* Duckdb output vector idx with information about postgres column id */ duckdb::vector> m_output_columns; + /* Store the column ID which needs to output in the set for quick lookup */ + duckdb::set attr_to_output_set; std::atomic m_total_row_count; duckdb::map m_relation_missing_attrs; }; diff --git a/src/pgduckdb_types.cpp b/src/pgduckdb_types.cpp index 8ab97743..933a0851 100644 --- a/src/pgduckdb_types.cpp +++ b/src/pgduckdb_types.cpp @@ -1072,6 +1072,13 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptrvalues; auto &nulls = scan_local_state->nulls; + // Track detoasted values and their cleanup status + struct DetoastedValue { + Datum value; + bool should_free; + }; + duckdb::map detoasted_values; + /* First we are fetching all required columns ordered by column id * and than we need to write this tuple into output vector. Output column id list * could be out of order so we need to match column values from ordered list. @@ -1080,18 +1087,41 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptrm_columns_to_scan) { bool is_null = false; - values[duckdb_scanned_index] = + Datum value = HeapTupleFetchNextColumnDatum(scan_global_state->m_tuple_desc, tuple, heap_tuple_read_state, attr_num, &is_null, scan_global_state->m_relation_missing_attrs); - nulls[duckdb_scanned_index] = is_null; + + bool needs_output = scan_global_state->attr_to_output_set.find(attr_num) != scan_global_state->attr_to_output_set.end(); + if (needs_output) { + values[duckdb_scanned_index] = value; + nulls[duckdb_scanned_index] = is_null; + } + auto filter = scan_global_state->m_column_filters[duckdb_scanned_index]; if (!filter) { continue; } - const auto valid_tuple = ApplyValueFilter(*filter, values[duckdb_scanned_index], is_null, + // If this is a varlena type and needs output later, detoast it now + auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1]; + if (!is_null && attr.attlen == -1 && needs_output) { + bool should_free = false; + Datum detoasted = DetoastPostgresDatum(reinterpret_cast(value), &should_free); + detoasted_values[attr_num] = {detoasted, should_free}; + value = detoasted; // Use detoasted value for filtering + } + + // It's safe to use ApplyValueFilter directly for the detoasted values. + // Because DetoastPostgresDatum handles already detoasted values correctly + const auto valid_tuple = ApplyValueFilter(*filter, value, is_null, scan_global_state->m_tuple_desc->attrs[attr_num - 1].atttypid); if (!valid_tuple) { + // Clean up any detoasted values we've created + for (auto &[_, dv] : detoasted_values) { + if (dv.should_free) { + duckdb_free(reinterpret_cast(dv.value)); + } + } return; } } @@ -1100,28 +1130,45 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptrm_output_columns) { auto &result = output.data[duckdb_output_index]; - if (nulls[duckdb_scanned_index]) { - auto &array_mask = duckdb::FlatVector::Validity(result); - array_mask.SetInvalid(scan_local_state->m_output_vector_size); + + // Check if we have a pre-detoasted value + auto it = detoasted_values.find(attr_num); + if (it != detoasted_values.end()) { + // Use existing detoasted value + ConvertPostgresToDuckValue(scan_global_state->m_tuple_desc->attrs[attr_num - 1].atttypid, it->second.value, + result, scan_local_state->m_output_vector_size); } else { - auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1]; - if (attr.attlen == -1) { - bool should_free = false; - values[duckdb_scanned_index] = - DetoastPostgresDatum(reinterpret_cast(values[duckdb_scanned_index]), &should_free); - ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result, - scan_local_state->m_output_vector_size); - if (should_free) { - duckdb_free(reinterpret_cast(values[duckdb_scanned_index])); - } + // Process new value + if (nulls[duckdb_scanned_index]) { + auto &array_mask = duckdb::FlatVector::Validity(result); + array_mask.SetInvalid(scan_local_state->m_output_vector_size); } else { - ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result, - scan_local_state->m_output_vector_size); + auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1]; + if (attr.attlen == -1) { + bool should_free = false; + values[duckdb_scanned_index] = + DetoastPostgresDatum(reinterpret_cast(values[duckdb_scanned_index]), &should_free); + ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result, + scan_local_state->m_output_vector_size); + if (should_free) { + duckdb_free(reinterpret_cast(values[duckdb_scanned_index])); + } + } else { + ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result, + scan_local_state->m_output_vector_size); + } } } duckdb_output_index++; } + // Cleanup detoasted values + for (auto &[_, dv] : detoasted_values) { + if (dv.should_free) { + duckdb_free(reinterpret_cast(dv.value)); + } + } + scan_local_state->m_output_vector_size++; scan_global_state->m_total_row_count++; } diff --git a/src/scan/postgres_scan.cpp b/src/scan/postgres_scan.cpp index 848db7d9..7125be04 100644 --- a/src/scan/postgres_scan.cpp +++ b/src/scan/postgres_scan.cpp @@ -84,6 +84,10 @@ PostgresScanGlobalState::InitGlobalState(duckdb::TableFunctionInitInput &input) m_output_columns.emplace_back(output_index++, column_id + 1); } } + + for (const auto &[_, attr_num] : m_output_columns) { + attr_to_output_set.emplace(attr_num); + } } void