Skip to content

Commit

Permalink
Fix stats updates (kuzudb#3582)
Browse files Browse the repository at this point in the history
Make use of the data offset when calculating the minimum and maximum values of column updates
Also make use of null data to avoid including stats for placeholder data
used for null values

(cherry picked from commit 35eee92)
  • Loading branch information
benjaminwinger authored and wangqiang committed Jun 4, 2024
1 parent 3f07c62 commit dbf22e0
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 17 deletions.
2 changes: 1 addition & 1 deletion src/include/storage/store/column.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ class Column {

static ChunkCollection getNullChunkCollection(const ChunkCollection& chunkCollection);
void updateStatistics(ColumnChunkMetadata& metadata, common::offset_t maxIndex,
std::optional<StorageValue> min, std::optional<StorageValue> max);
const std::optional<StorageValue>& min, const std::optional<StorageValue>& max);

private:
bool isInsertionsOutOfPagesCapacity(const ColumnChunkMetadata& metadata,
Expand Down
47 changes: 31 additions & 16 deletions src/storage/store/column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ void Column::write(ChunkState& state, offset_t offsetInChunk, ValueVector* vecto
}

void Column::updateStatistics(ColumnChunkMetadata& metadata, offset_t maxIndex,
std::optional<StorageValue> min, std::optional<StorageValue> max) {
const std::optional<StorageValue>& min, const std::optional<StorageValue>& max) {
if (maxIndex >= metadata.numValues) {
metadata.numValues = maxIndex + 1;
KU_ASSERT(sanityCheckForWrites(metadata, dataType));
Expand Down Expand Up @@ -555,35 +555,49 @@ void Column::writeValue(ChunkState& state, offset_t offsetInChunk, ValueVector*
}

inline std::pair<std::optional<StorageValue>, std::optional<StorageValue>> getMinMax(
const uint8_t* data, uint64_t numValues, PhysicalTypeID physicalType) {
const uint8_t* data, uint64_t offset, uint64_t numValues, PhysicalTypeID physicalType,
const NullMask* nullMask) {
// TODO(bmwinger): STRING and maybe LIST columns should store their offsets in separate columns
// with actual integer types so that we can store statistics about the values in the main column
// metadata. This should also simplify some code as we no longer need to sometimes treat those
// columns as integers.
std::optional<StorageValue> min, max;

auto minmax_element = [&]<typename T>(std::span<T> data) {
if (!nullMask || nullMask->hasNoNullsGuarantee()) {
auto [minRaw, maxRaw] = std::minmax_element(data.begin(), data.end());
min = StorageValue(*minRaw);
max = StorageValue(*maxRaw);
} else {
for (uint64_t i = 0; i < numValues; i++) {
if (!nullMask->isNull(offset + i)) {
if (!min || data[i] < min->get<T>()) {
min = StorageValue(data[i]);
}
if (!max || data[i] > max->get<T>()) {
max = StorageValue(data[i]);
}
}
}
}
};
TypeUtils::visit(
physicalType,
[&]<typename T>(T)
requires(std::integral<T> || std::floating_point<T>)
{
auto typedData = std::span(reinterpret_cast<const T*>(data), numValues);
auto [minRaw, maxRaw] = std::minmax_element(typedData.begin(), typedData.end());
min = StorageValue(*minRaw);
max = StorageValue(*maxRaw);
auto typedData = std::span(reinterpret_cast<const T*>(data) + offset, numValues);
minmax_element(typedData);
},
[&](ku_string_t) {
auto typedData = std::span(reinterpret_cast<const uint32_t*>(data), numValues);
auto [minRaw, maxRaw] = std::minmax_element(typedData.begin(), typedData.end());
min = StorageValue(*minRaw);
max = StorageValue(*maxRaw);
auto typedData = std::span(reinterpret_cast<const uint32_t*>(data) + offset, numValues);
minmax_element(typedData);
},
[&]<typename T>(T)
requires(std::same_as<T, list_entry_t> || std::same_as<T, internalID_t>)
{
auto typedData = std::span(reinterpret_cast<const uint64_t*>(data), numValues);
auto [minRaw, maxRaw] = std::minmax_element(typedData.begin(), typedData.end());
min = StorageValue(*minRaw);
max = StorageValue(*maxRaw);
auto typedData = std::span(reinterpret_cast<const uint64_t*>(data) + offset, numValues);
minmax_element(typedData);
},
[&](auto) {});
return std::make_pair(min, max);
Expand All @@ -600,7 +614,7 @@ void Column::write(ChunkState& state, offset_t offsetInChunk, ColumnChunk* data,
writeValues(state, offsetInChunk, data->getData(), nullMaskPtr, dataOffset, numValues);

auto [minWritten, maxWritten] =
getMinMax(data->getData(), numValues, dataType.getPhysicalType());
getMinMax(data->getData(), dataOffset, numValues, dataType.getPhysicalType(), nullMaskPtr);
updateStatistics(state.metadata, offsetInChunk + numValues - 1, minWritten, maxWritten);
}

Expand Down Expand Up @@ -630,7 +644,8 @@ offset_t Column::appendValues(ChunkState& state, const uint8_t* data, const Null
auto newNumPages = dataFH->getNumPages();
state.metadata.numPages += (newNumPages - numPages);

auto [minWritten, maxWritten] = getMinMax(data, numValues, dataType.getPhysicalType());
auto [minWritten, maxWritten] =
getMinMax(data, 0 /*offset*/, numValues, dataType.getPhysicalType(), nullChunkData);
updateStatistics(state.metadata, startOffset + numValues - 1, minWritten, maxWritten);
// TODO(bmwinger): it shouldn't be necessary to do this here; it should be handled in
// prepareCommit
Expand Down
17 changes: 17 additions & 0 deletions test/test_files/copy/copy_rel_stats.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-DATASET CSV tinysnb

--
-CASE CopyRelStatistics
-STATEMENT CALL storage_info('knows') WHERE column_name = "bwd_date" RETURN min, max;
---- 1
1905-12-12|2021-06-30
-STATEMENT CALL storage_info('knows') WHERE column_name = "fwd_date" RETURN min, max;
---- 1
1905-12-12|2021-06-30

-STATEMENT CALL storage_info('knows') WHERE column_name = "bwd_meetTime" RETURN min, max;
---- 1
1936-11-02 11:02:01|2025-01-01 11:22:33.52
-STATEMENT CALL storage_info('knows') WHERE column_name = "fwd_meetTime" RETURN min, max;
---- 1
1936-11-02 11:02:01|2025-01-01 11:22:33.52
10 changes: 10 additions & 0 deletions test/test_files/update_node/stats.test
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ True
-STATEMENT CALL storage_info('test') WHERE column_name = "value" RETURN to_int8(min) <= 2;
---- 1
True
-STATEMENT MATCH (a:test) WHERE a.value = 2 SET a.value = -2
---- ok
-STATEMENT CALL storage_info('test') WHERE column_name = "value" RETURN min;
---- 1
-2
-STATEMENT MATCH (a:test) WHERE a.value = -2 SET a.value = -3
---- ok
-STATEMENT CALL storage_info('test') WHERE column_name = "value" RETURN min;
---- 1
-3

-CASE UpdateDoubleStats
-STATEMENT CREATE NODE TABLE test(id SERIAL, value DOUBLE, PRIMARY KEY(id));
Expand Down

0 comments on commit dbf22e0

Please sign in to comment.