duckdb · Reminiscent · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/include/pgduckdb/scan/postgres_scan.hpp b/include/pgduckdb/scan/postgres_scan.hpp
@@ -34,6 +34,8 @@ class PostgresScanGlobalState {
 	std::vector<duckdb::TableFilter *> m_column_filters;
 	/* Duckdb output vector idx with information about postgres column id */
 	duckdb::vector<duckdb::pair<duckdb::idx_t, AttrNumber>> m_output_columns;
+	/* Store the column ID which needs to output in the set for quick lookup */
+    duckdb::set<AttrNumber> attr_to_output_set;
 	std::atomic<std::uint32_t> m_total_row_count;
 	duckdb::map<int, Datum> m_relation_missing_attrs;
 };

diff --git a/src/pgduckdb_types.cpp b/src/pgduckdb_types.cpp
@@ -1072,6 +1072,13 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptr<PostgresScanG
 	auto &values = scan_local_state->values;
 	auto &nulls = scan_local_state->nulls;
 
+	// Track detoasted values and their cleanup status
+	struct DetoastedValue {
+		Datum value;
+		bool should_free;
+	};
+	duckdb::map<idx_t, DetoastedValue> detoasted_values;
+
 	/* First we are fetching all required columns ordered by column id
 	 * and than we need to write this tuple into output vector. Output column id list
 	 * could be out of order so we need to match column values from ordered list.
@@ -1080,18 +1087,41 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptr<PostgresScanG
 	/* Read heap tuple with all required columns. */
 	for (auto const &[attr_num, duckdb_scanned_index] : scan_global_state->m_columns_to_scan) {
 		bool is_null = false;
-		values[duckdb_scanned_index] =
+		Datum value =
 		    HeapTupleFetchNextColumnDatum(scan_global_state->m_tuple_desc, tuple, heap_tuple_read_state, attr_num,
 		                                  &is_null, scan_global_state->m_relation_missing_attrs);
-		nulls[duckdb_scanned_index] = is_null;
+
+		bool needs_output = scan_global_state->attr_to_output_set.find(attr_num) != scan_global_state->attr_to_output_set.end();
+		if (needs_output) {
+			values[duckdb_scanned_index] = value;
+			nulls[duckdb_scanned_index] = is_null;
+		}
+
 		auto filter = scan_global_state->m_column_filters[duckdb_scanned_index];
 		if (!filter) {
 			continue;
 		}
 
-		const auto valid_tuple = ApplyValueFilter(*filter, values[duckdb_scanned_index], is_null,
+		// If this is a varlena type and needs output later, detoast it now
+		auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1];
+		if (!is_null && attr.attlen == -1 && needs_output) {
+			bool should_free = false;
+			Datum detoasted = DetoastPostgresDatum(reinterpret_cast<varlena *>(value), &should_free);
+			detoasted_values[attr_num] = {detoasted, should_free};
+			value = detoasted; // Use detoasted value for filtering
+		}
+
+		// It's safe to use ApplyValueFilter directly for the detoasted values.
+		// Because DetoastPostgresDatum handles already detoasted values correctly
+		const auto valid_tuple = ApplyValueFilter(*filter, value, is_null,
 		                                          scan_global_state->m_tuple_desc->attrs[attr_num - 1].atttypid);
 		if (!valid_tuple) {
+			// Clean up any detoasted values we've created
+			for (auto &[_, dv] : detoasted_values) {
+				if (dv.should_free) {
+					duckdb_free(reinterpret_cast<void *>(dv.value));
+				}
+			}
 			return;
 		}
 	}
@@ -1100,28 +1130,45 @@ InsertTupleIntoChunk(duckdb::DataChunk &output, duckdb::shared_ptr<PostgresScanG
 	int duckdb_output_index = 0;
 	for (auto const &[duckdb_scanned_index, attr_num] : scan_global_state->m_output_columns) {
 		auto &result = output.data[duckdb_output_index];
-		if (nulls[duckdb_scanned_index]) {
-			auto &array_mask = duckdb::FlatVector::Validity(result);
-			array_mask.SetInvalid(scan_local_state->m_output_vector_size);
+
+		// Check if we have a pre-detoasted value
+		auto it = detoasted_values.find(attr_num);
+		if (it != detoasted_values.end()) {
+			// Use existing detoasted value
+			ConvertPostgresToDuckValue(scan_global_state->m_tuple_desc->attrs[attr_num - 1].atttypid, it->second.value,
+			                           result, scan_local_state->m_output_vector_size);
 		} else {
-			auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1];
-			if (attr.attlen == -1) {
-				bool should_free = false;
-				values[duckdb_scanned_index] =
-				    DetoastPostgresDatum(reinterpret_cast<varlena *>(values[duckdb_scanned_index]), &should_free);
-				ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result,
-				                           scan_local_state->m_output_vector_size);
-				if (should_free) {
-					duckdb_free(reinterpret_cast<void *>(values[duckdb_scanned_index]));
-				}
+			// Process new value
+			if (nulls[duckdb_scanned_index]) {
+				auto &array_mask = duckdb::FlatVector::Validity(result);
+				array_mask.SetInvalid(scan_local_state->m_output_vector_size);
 			} else {
-				ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result,
-				                           scan_local_state->m_output_vector_size);
+				auto attr = scan_global_state->m_tuple_desc->attrs[attr_num - 1];
+				if (attr.attlen == -1) {
+					bool should_free = false;
+					values[duckdb_scanned_index] =
+						DetoastPostgresDatum(reinterpret_cast<varlena *>(values[duckdb_scanned_index]), &should_free);
+					ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result,
+											   scan_local_state->m_output_vector_size);
+					if (should_free) {
+						duckdb_free(reinterpret_cast<void *>(values[duckdb_scanned_index]));
+					}
+				} else {
+					ConvertPostgresToDuckValue(attr.atttypid, values[duckdb_scanned_index], result,
+											   scan_local_state->m_output_vector_size);
+				}
 			}
 		}
 		duckdb_output_index++;
 	}
 
+	// Cleanup detoasted values
+	for (auto &[_, dv] : detoasted_values) {
+		if (dv.should_free) {
+			duckdb_free(reinterpret_cast<void *>(dv.value));
+		}
+	}
+
 	scan_local_state->m_output_vector_size++;
 	scan_global_state->m_total_row_count++;
 }

diff --git a/src/scan/postgres_scan.cpp b/src/scan/postgres_scan.cpp
@@ -84,6 +84,10 @@ PostgresScanGlobalState::InitGlobalState(duckdb::TableFunctionInitInput &input)
 			m_output_columns.emplace_back(output_index++, column_id + 1);
 		}
 	}
+
+	for (const auto &[_, attr_num] : m_output_columns) {
+		attr_to_output_set.emplace(attr_num);
+	}
 }
 
 void
-Original file line number
+Diff line change
@@ Expand Up @@
     			m_output_columns.emplace_back(output_index++, column_id + 1);
     		}
     	}
+    	for (const auto &[_, attr_num] : m_output_columns) {
+    		attr_to_output_set.emplace(attr_num);
+    	}
     }
     void
@@ Expand Down @@