From db92b2a0e842ffd5d0a63099ee2fa95f1b46b7bd Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 15 Jun 2024 13:40:47 +0300 Subject: [PATCH 01/13] implement rolling hyper-log-log algorithm --- pgxn/neon/Makefile | 3 +- pgxn/neon/file_cache.c | 54 +++-- pgxn/neon/hll.c | 201 ++++++++++++++++++ pgxn/neon/hll.h | 85 ++++++++ pgxn/neon/neon--1.3--1.4.sql | 9 + pgxn/neon/neon.control | 2 +- .../test_lfc_working_set_approximation.py | 44 ++++ 7 files changed, 382 insertions(+), 16 deletions(-) create mode 100644 pgxn/neon/hll.c create mode 100644 pgxn/neon/hll.h create mode 100644 pgxn/neon/neon--1.3--1.4.sql diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index cd316dbb9141..16a00523b05d 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -6,6 +6,7 @@ OBJS = \ $(WIN32RES) \ extension_server.o \ file_cache.o \ + hll.o \ libpagestore.o \ neon.o \ neon_utils.o \ @@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 25275ef31fe9..45f42dc213af 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -26,7 +26,6 @@ #include "miscadmin.h" #include "pagestore_client.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -40,6 +39,8 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#include "hll.h" + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. @@ -62,7 +63,6 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) -#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -87,8 +87,7 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ - hyperLogLogState wss_estimation; /* estimation of wroking set size */ - uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; + HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; static HTAB *lfc_hash; @@ -96,6 +95,7 @@ static int lfc_desc = 0; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; +static int wss_max_duration; static char *lfc_path; static FileCacheControl *lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; @@ -238,12 +238,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Initialize hyper-log-log structure for estimating working set size */ - initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); - - /* We need hashes in shared memory */ - pfree(lfc_ctl->wss_estimation.hashesArr); - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); - lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + initHyperLogLog(&lfc_ctl->wss_estimation, wss_max_duration); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -383,6 +378,19 @@ lfc_init(void) NULL, NULL); + DefineCustomIntVariable("neon.wss_max_duration", + "Maximal duration for estimating working set size", + NULL, + &wss_max_duration, + 3600, /* one hour */ + 0, + INT_MAX, + PGC_SIGHUP, + GUC_UNIT_S, + NULL, + NULL, + NULL); + if (lfc_max_size == 0) return; @@ -986,20 +994,38 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); + +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? wss_max_duration : PG_GETARG_UINT32(0); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation, duration); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} + PG_FUNCTION_INFO_V1(approximate_working_set_size); Datum approximate_working_set_size(PG_FUNCTION_ARGS) { - int32 dc = -1; if (lfc_size_limit != 0) { + int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation, wss_max_duration); if (reset) - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); } - PG_RETURN_INT32(dc); + PG_RETURN_NULL(); } diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c new file mode 100644 index 000000000000..597594e5fafa --- /dev/null +++ b/pgxn/neon/hll.c @@ -0,0 +1,201 @@ +/*------------------------------------------------------------------------- + * + * hll.c + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "postgres.h" +#include "funcapi.h" +#include "port/pg_bitutils.h" +#include "utils/timestamp.h" +#include "hll.h" + + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS) + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering fist 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + if (x == 0) + return b + 1; + + j = 32 - pg_leftmost_one_pos32(x); + + if (j > b) + return b + 1; + + return j; +} + +/* + * Initialize HyperLogLog track state + */ +void +initHyperLogLog(HyperLogLogState *cState, time_t max_duration) +{ + cState->window = max_duration * USECS_PER_SEC; + memset(cState->regs, 0, sizeof(cState->regs)); +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. + */ +void +addHyperLogLog(HyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + size_t i; + size_t j; + + TimestampTz now = GetCurrentTimestamp(); + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> HLL_C_BITS; + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); + + for (i = 0, j = 0; i < cState->regs[index].size; i++) + { + if (cState->regs[index].fpm[i].ts >= now - cState->window + && cState->regs[index].fpm[i].R > count) + { + cState->regs[index].fpm[j++] = cState->regs[index].fpm[i]; + } + } + cState->regs[index].fpm[j].ts = now; + cState->regs[index].fpm[j].R = count; + cState->regs[index].size = j + 1; +} + +static uint8 +getMaximum(LFPM* lfpm, TimestampTz since) +{ + uint8 max = 0; + for (size_t i = 0; i < lfpm->size; i++) + { + if (lfpm->fpm[i].ts >= since && lfpm->fpm[i].R > max) + { + max = lfpm->fpm[i].R; + } + } + return max; +} + + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateHyperLogLog(HyperLogLogState *cState, time_t duration) +{ + double result; + double sum = 0.0; + size_t i; + uint8 R[HLL_N_REGISTERS]; + TimestampTz since = GetCurrentTimestamp() - duration * USECS_PER_SEC; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + R[i] = getMaximum(&cState->regs[i], since); + sum += 1.0 / pow(2.0, R[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = ALPHA_MM / sum; + + if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + zero_count += R[i] == 0; + } + + if (zero_count != 0) + result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h new file mode 100644 index 000000000000..e5d77dfa7727 --- /dev/null +++ b/pgxn/neon/hll.h @@ -0,0 +1,85 @@ +/*------------------------------------------------------------------------- + * + * hll.h + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HLL_H +#define HLL_H + +#define HLL_BIT_WIDTH 10 +#define HLL_C_BITS (32 - HLL_BIT_WIDTH) +#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) + +/* Future possible maximum */ +typedef struct FPM +{ + uint8 R; + TimestampTz ts; +} FPM; + +typedef struct LFPM +{ + FPM fpm[HLL_C_BITS]; + size_t size; +} LFPM; + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + */ +typedef struct HyperLogLogState +{ + time_t window; /* window size in microseconds */ + LFPM regs[HLL_N_REGISTERS]; +} HyperLogLogState; + +extern void initHyperLogLog(HyperLogLogState *cState, time_t max_duration); +extern void addHyperLogLog(HyperLogLogState *cState, uint32 hash); +extern double estimateHyperLogLog(HyperLogLogState *cState, time_t dutration); + +#endif diff --git a/pgxn/neon/neon--1.3--1.4.sql b/pgxn/neon/neon--1.3--1.4.sql new file mode 100644 index 000000000000..042effe3461c --- /dev/null +++ b/pgxn/neon/neon--1.3--1.4.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. \quit + +CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor; + diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index cee2f336f27f..03bdb9a0b41c 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,6 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index a6f05fe0f712..edbb7f376dec 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -1,3 +1,4 @@ +import time from pathlib import Path from fixtures.log_helper import log @@ -72,3 +73,46 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") assert blocks < 10 + + +def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "shared_buffers=1MB", + "neon.wss_max_duration=100s", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon") + cur.execute( + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000000))") + time.sleep(2) + before_10k = time.monotonic() + cur.execute("select sum(count) from t where pk between 10000 and 20000") + time.sleep(2) + before_1k = time.monotonic() + cur.execute("select sum(count) from t where pk between 1000 and 2000") + after = time.monotonic() + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_1k + 1)})") + estimation_1k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 1k records {estimation_1k}") + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_10k + 1)})") + estimation_10k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 10k records {estimation_10k}") + + cur.execute("select pg_table_size('t')") + size = cur.fetchall()[0][0] // 8192 + log.info(f"Table size {size} blocks") + + assert estimation_1k >= 20 and estimation_1k <= 40 + assert estimation_10k >= 200 and estimation_10k <= 400 From 280de5d1796e62c7556d3e307e331357e6cc9439 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 15 Jun 2024 15:30:29 +0300 Subject: [PATCH 02/13] Fix test_neon_extension to support 1.4 --- pgxn/neon/neon--1.4--1.3.sql | 1 + test_runner/regress/test_neon_extension.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 pgxn/neon/neon--1.4--1.3.sql diff --git a/pgxn/neon/neon--1.4--1.3.sql b/pgxn/neon/neon--1.4--1.3.sql new file mode 100644 index 000000000000..bea72d1a6b17 --- /dev/null +++ b/pgxn/neon/neon--1.4--1.3.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE; diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 39b486502672..bb844244e329 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) @@ -48,10 +48,10 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.3",) + assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.3", "1.2", "1.1", "1.0"] - current_version = "1.3" + all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] + current_version = "1.4" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: if current_version != begin_version: From 6f3d17625f2dc91e8427a51f14f334e3dd0dfcd2 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 15 Jun 2024 17:15:49 +0300 Subject: [PATCH 03/13] Fix size of HLL registers array --- pgxn/neon/Makefile | 2 +- pgxn/neon/hll.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 16a00523b05d..3b755bb0420c 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -23,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h index e5d77dfa7727..7462609ced71 100644 --- a/pgxn/neon/hll.h +++ b/pgxn/neon/hll.h @@ -75,7 +75,7 @@ typedef struct LFPM typedef struct HyperLogLogState { time_t window; /* window size in microseconds */ - LFPM regs[HLL_N_REGISTERS]; + LFPM regs[HLL_N_REGISTERS+1]; } HyperLogLogState; extern void initHyperLogLog(HyperLogLogState *cState, time_t max_duration); From fa2a462c5ca2d32f9c29fe6a30abf90c256e3dea Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 15 Jun 2024 23:26:22 +0300 Subject: [PATCH 04/13] Rename sliding HyperLogLog functions to avoid conflict name conflicts with hyperloglog.c --- pgxn/neon/file_cache.c | 8 ++++---- pgxn/neon/hll.c | 6 +++--- pgxn/neon/hll.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 45f42dc213af..5e5bad22f57d 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -238,7 +238,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Initialize hyper-log-log structure for estimating working set size */ - initHyperLogLog(&lfc_ctl->wss_estimation, wss_max_duration); + initSHLL(&lfc_ctl->wss_estimation, wss_max_duration); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -553,7 +553,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Approximate working set */ tag.blockNum = blkno; - addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { @@ -1004,7 +1004,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS) int32 dc; time_t duration = PG_ARGISNULL(0) ? wss_max_duration : PG_GETARG_UINT32(0); LWLockAcquire(lfc_lock, LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation, duration); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); LWLockRelease(lfc_lock); PG_RETURN_INT32(dc); } @@ -1021,7 +1021,7 @@ approximate_working_set_size(PG_FUNCTION_ARGS) int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation, wss_max_duration); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, wss_max_duration); if (reset) memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 597594e5fafa..b7abf02a8f88 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -97,7 +97,7 @@ rho(uint32 x, uint8 b) * Initialize HyperLogLog track state */ void -initHyperLogLog(HyperLogLogState *cState, time_t max_duration) +initSHLL(HyperLogLogState *cState, time_t max_duration) { cState->window = max_duration * USECS_PER_SEC; memset(cState->regs, 0, sizeof(cState->regs)); @@ -113,7 +113,7 @@ initHyperLogLog(HyperLogLogState *cState, time_t max_duration) * observed. */ void -addHyperLogLog(HyperLogLogState *cState, uint32 hash) +addSHLL(HyperLogLogState *cState, uint32 hash) { uint8 count; uint32 index; @@ -159,7 +159,7 @@ getMaximum(LFPM* lfpm, TimestampTz since) * Estimates cardinality, based on elements added so far */ double -estimateHyperLogLog(HyperLogLogState *cState, time_t duration) +estimateSHLL(HyperLogLogState *cState, time_t duration) { double result; double sum = 0.0; diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h index 7462609ced71..9fe680e2ebd2 100644 --- a/pgxn/neon/hll.h +++ b/pgxn/neon/hll.h @@ -78,8 +78,8 @@ typedef struct HyperLogLogState LFPM regs[HLL_N_REGISTERS+1]; } HyperLogLogState; -extern void initHyperLogLog(HyperLogLogState *cState, time_t max_duration); -extern void addHyperLogLog(HyperLogLogState *cState, uint32 hash); -extern double estimateHyperLogLog(HyperLogLogState *cState, time_t dutration); +extern void initSHLL(HyperLogLogState *cState, time_t max_duration); +extern void addSHLL(HyperLogLogState *cState, uint32 hash); +extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); #endif From 069ba7e1a1f4b525cd1570536bb43f50fde895ec Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 18 Jun 2024 18:50:40 +0200 Subject: [PATCH 05/13] Improve the memory usage of our own HLL implementation We don't use sliding windows, so we can just drop the historical snapshot requirement from the implementation, thus removing some tracking overhead. --- pgxn/neon/file_cache.c | 2 +- pgxn/neon/hll.c | 26 ++++++++++---------------- pgxn/neon/hll.h | 33 +++++++++++++++++---------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 5e5bad22f57d..a0d4eb31c74c 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -238,7 +238,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Initialize hyper-log-log structure for estimating working set size */ - initSHLL(&lfc_ctl->wss_estimation, wss_max_duration); + initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index b7abf02a8f88..4ce188a46397 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -97,9 +97,8 @@ rho(uint32 x, uint8 b) * Initialize HyperLogLog track state */ void -initSHLL(HyperLogLogState *cState, time_t max_duration) +initSHLL(HyperLogLogState *cState) { - cState->window = max_duration * USECS_PER_SEC; memset(cState->regs, 0, sizeof(cState->regs)); } @@ -127,30 +126,25 @@ addSHLL(HyperLogLogState *cState, uint32 hash) /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - for (i = 0, j = 0; i < cState->regs[index].size; i++) + for (i = 0; i < count + 1; i++) { - if (cState->regs[index].fpm[i].ts >= now - cState->window - && cState->regs[index].fpm[i].R > count) - { - cState->regs[index].fpm[j++] = cState->regs[index].fpm[i]; - } + cState->regs[index][i] = now; } - cState->regs[index].fpm[j].ts = now; - cState->regs[index].fpm[j].R = count; - cState->regs[index].size = j + 1; } static uint8 -getMaximum(LFPM* lfpm, TimestampTz since) +getMaximum(const TimestampTz* reg, TimestampTz since) { uint8 max = 0; - for (size_t i = 0; i < lfpm->size; i++) + + for (size_t i = 0; i < HLL_C_BITS + 1; i++) { - if (lfpm->fpm[i].ts >= since && lfpm->fpm[i].R > max) + if (reg[i] >= since) { - max = lfpm->fpm[i].R; + max = i; } } + return max; } @@ -169,7 +163,7 @@ estimateSHLL(HyperLogLogState *cState, time_t duration) for (i = 0; i < HLL_N_REGISTERS; i++) { - R[i] = getMaximum(&cState->regs[i], since); + R[i] = getMaximum(cState->regs[i], since); sum += 1.0 / pow(2.0, R[i]); } diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h index 9fe680e2ebd2..9256cb9afa2f 100644 --- a/pgxn/neon/hll.h +++ b/pgxn/neon/hll.h @@ -53,32 +53,33 @@ #define HLL_C_BITS (32 - HLL_BIT_WIDTH) #define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) -/* Future possible maximum */ -typedef struct FPM -{ - uint8 R; - TimestampTz ts; -} FPM; - -typedef struct LFPM -{ - FPM fpm[HLL_C_BITS]; - size_t size; -} LFPM; - /* * HyperLogLog is an approximate technique for computing the number of distinct * entries in a set. Importantly, it does this by using a fixed amount of * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal * cardinality estimation algorithm" for more. + * + * Instead of a single counter for every bits register, we have a timestamp + * for every valid number of bits we can encounter. Every time we encounter + * a certain number of bits, we update the timestamp in those registers to + * the current timestamp. + * + * We can query the sketch's stored cardinality for the range of some timestamp + * up to now: For each register, we return the highest bits bucket that has a + * modified timestamp >= the query timestamp. This value is the number of bits + * for this register in the normal HLL calculation. + * + * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB. + * Usage could be halved if we decide to reduce the required time dimension + * precision; as 32 bits in second precision should be enough for statistics. + * However, that is not yet implemented. */ typedef struct HyperLogLogState { - time_t window; /* window size in microseconds */ - LFPM regs[HLL_N_REGISTERS+1]; + TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; } HyperLogLogState; -extern void initSHLL(HyperLogLogState *cState, time_t max_duration); +extern void initSHLL(HyperLogLogState *cState); extern void addSHLL(HyperLogLogState *cState, uint32 hash); extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); From 901e88d90db5d7c81cf063f362804475d8776e8e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Jun 2024 10:20:06 +0300 Subject: [PATCH 06/13] Remove sliding window --- pgxn/neon/hll.c | 5 +---- test_runner/regress/test_lfc_working_set_approximation.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 4ce188a46397..55df59171e1b 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -126,10 +126,7 @@ addSHLL(HyperLogLogState *cState, uint32 hash) /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - for (i = 0; i < count + 1; i++) - { - cState->regs[index][i] = now; - } + cState->regs[index][count] = now; } static uint8 diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index edbb7f376dec..45849e87ca19 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -82,7 +82,6 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): branch_name="main", config_lines=[ "shared_buffers=1MB", - "neon.wss_max_duration=100s", "neon.max_file_cache_size=256MB", "neon.file_cache_size_limit=245MB", ], From 02b30d320d5d8bb2e5a285e37174194ad6f06e78 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Jun 2024 17:27:48 +0300 Subject: [PATCH 07/13] Didable autovacuum for working set size estimation test --- test_runner/regress/test_lfc_working_set_approximation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 45849e87ca19..4c53e4e2fd35 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -81,6 +81,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( branch_name="main", config_lines=[ + "autovacuum = off", "shared_buffers=1MB", "neon.max_file_cache_size=256MB", "neon.file_cache_size_limit=245MB", From fdea1e20f1b7741d3d9083ebf6233be33a89e248 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 3 Jul 2024 16:34:33 +0300 Subject: [PATCH 08/13] RTemporary switch default version to 1.3 to ve able to revert deploy --- pgxn/neon/neon.control | 2 +- test_runner/regress/test_lfc_working_set_approximation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 03bdb9a0b41c..cee2f336f27f 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,6 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.4' +default_version = '1.3' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 4c53e4e2fd35..6465bdfd217d 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -89,7 +89,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): ) conn = endpoint.connect() cur = conn.cursor() - cur.execute("create extension neon") + cur.execute("create extension neon version '1.4'") cur.execute( "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" ) From 753da983040c403f7eaef0332d7e199e279aca74 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Jul 2024 09:34:40 +0300 Subject: [PATCH 09/13] Use 1.3 as cxurrent version in test_neon_extension --- test_runner/regress/test_neon_extension.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index bb844244e329..57be7df313cb 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -51,7 +51,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] - current_version = "1.4" + current_version = "1.3" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: if current_version != begin_version: From 3510f35324e3fdff2eb90efa985491bc836466f8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Jul 2024 12:09:19 +0300 Subject: [PATCH 10/13] Use 1.3 as current version in test_neon_extension --- test_runner/regress/test_neon_extension.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 57be7df313cb..4961eefc7d40 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -48,7 +48,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.4",) + assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] current_version = "1.3" From 93630a0164afca3af0b4183ad0e4b4b350f006ca Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Jul 2024 14:32:27 +0300 Subject: [PATCH 11/13] Use 1.3 as current version in test_neon_extension --- test_runner/regress/test_neon_extension.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 4961eefc7d40..e83aaf91c60f 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.4",) + assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) From 3948ad9a84c5db672a829d068112137f8ae4d360 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Jul 2024 16:15:07 +0300 Subject: [PATCH 12/13] Remove neon.wss_max_duration GUC --- pgxn/neon/file_cache.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index a0d4eb31c74c..944dd9e257fe 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -95,7 +95,6 @@ static int lfc_desc = 0; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; -static int wss_max_duration; static char *lfc_path; static FileCacheControl *lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; @@ -378,19 +377,6 @@ lfc_init(void) NULL, NULL); - DefineCustomIntVariable("neon.wss_max_duration", - "Maximal duration for estimating working set size", - NULL, - &wss_max_duration, - 3600, /* one hour */ - 0, - INT_MAX, - PGC_SIGHUP, - GUC_UNIT_S, - NULL, - NULL, - NULL); - if (lfc_max_size == 0) return; @@ -1002,7 +988,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS) if (lfc_size_limit != 0) { int32 dc; - time_t duration = PG_ARGISNULL(0) ? wss_max_duration : PG_GETARG_UINT32(0); + time_t duration = PG_ARGISNULL(0) ? INT_MAX : PG_GETARG_UINT32(0); LWLockAcquire(lfc_lock, LW_SHARED); dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); LWLockRelease(lfc_lock); @@ -1021,7 +1007,7 @@ approximate_working_set_size(PG_FUNCTION_ARGS) int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, wss_max_duration); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, INT_MAX); if (reset) memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); From 7df9051c6eb4c1e093fdb8c2396f1033059f0343 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Jul 2024 18:51:11 +0300 Subject: [PATCH 13/13] Fix specifying infinite interval for working set estimation --- pgxn/neon/file_cache.c | 4 ++-- pgxn/neon/hll.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 944dd9e257fe..1894e8c72a5c 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -988,7 +988,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS) if (lfc_size_limit != 0) { int32 dc; - time_t duration = PG_ARGISNULL(0) ? INT_MAX : PG_GETARG_UINT32(0); + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); LWLockAcquire(lfc_lock, LW_SHARED); dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); LWLockRelease(lfc_lock); @@ -1007,7 +1007,7 @@ approximate_working_set_size(PG_FUNCTION_ARGS) int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, INT_MAX); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); if (reset) memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 55df59171e1b..f8496b31259d 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -156,7 +156,8 @@ estimateSHLL(HyperLogLogState *cState, time_t duration) double sum = 0.0; size_t i; uint8 R[HLL_N_REGISTERS]; - TimestampTz since = GetCurrentTimestamp() - duration * USECS_PER_SEC; + /* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ + TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; for (i = 0; i < HLL_N_REGISTERS; i++) {