From 7d70bb92095c8edc6d9fd2bcb0147a93c49bf59a Mon Sep 17 00:00:00 2001 From: Minghui Yang Date: Fri, 10 May 2024 23:34:34 +0000 Subject: [PATCH] [BACKPORT 2024.1][#22262] YSQL: Fix memory leak in catalog cache refresh Summary: In a newly conducted upgrade stress testing, we found that when upgrading 2.18.4.0 to 2.20.3.0, we can successfully complete the upgrade of a 12-DB, 50 connection per DB, 7-node cluster configuration. But when upgrading 2.18.4.0 to 2024.1-b123, the upgrade failed. We noticed that the PG memory spike in case of 2024.1-b123 was higher. After debugging, the upgrade to 2024.1-b123 involved running more DDLs that cause catalog version to increment. Each time after doing a catalog cache refresh, the memory size of `CacheMemoryContext` goes up which suggests a memory leak. An experiment below demonstrated the memory leak (between each `select`, run `alter user yugabyte superuser` on another session to trigger catalog cache refresh on the next execution of the `select` statement) ``` yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 524288 | 409160 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 10254152 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 10507912 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 10761672 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 11015432 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 11269192 (1 row) ``` Each catalog cache refresh we see `used_bytes` increased by an average of (11269192 - 10254152) / 4 = 253760 Because of the memory leak, the more catalog-version-bumping-DDLs to execute, the more memory used by `CacheMemoryContext`. Therefore the upgrade to 2024.1-b123 saw higher PG memory spike. This diff fixes two identified memory leaks in `CacheMemoryContext` that account for most of the leaks. * ybctid memory needs to be released in `CatCacheRemoveCTup` * yb_table_properties needs to be released in `RelationDestroyRelation`. Jira: DB-11181 Original commit: 166ef51df1 / D34969 Test Plan: Manual test. After the fix ``` yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 524288 | 409000 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 10253160 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 10256456 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 10259752 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 10263048 (1 row) yugabyte=# select total_bytes, used_bytes from pg_get_backend_memory_contexts() where name = 'CacheMemoryContext'; total_bytes | used_bytes -------------+------------ 12879448 | 10266344 (1 row) ``` Each catalog cache refresh we see `used_bytes` increased by an average of (10266344 - 10253160) / 4 = 3296 Reviewers: kfranz, fizaa Reviewed By: fizaa Subscribers: yql Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D35139 --- src/postgres/src/backend/utils/cache/catcache.c | 16 ++++++++++++++++ src/postgres/src/backend/utils/cache/relcache.c | 2 ++ 2 files changed, 18 insertions(+) diff --git a/src/postgres/src/backend/utils/cache/catcache.c b/src/postgres/src/backend/utils/cache/catcache.c index fb3660a39b4a..d14201843a14 100644 --- a/src/postgres/src/backend/utils/cache/catcache.c +++ b/src/postgres/src/backend/utils/cache/catcache.c @@ -505,13 +505,17 @@ CatCacheRemoveCTup(CatCache *cache, CatCTup *ct) /* delink from linked list */ dlist_delete(&ct->cache_elem); + bool need_to_free_ybctid = false; /* * Free keys when we're dealing with a negative entry, normal entries just * point into tuple, allocated together with the CatCTup. + * YB Note: for normal entries we may need to free ybctid. */ if (ct->negative) CatCacheFreeKeys(cache->cc_tupdesc, cache->cc_nkeys, cache->cc_keyno, ct->keys); + else if (IsYugaByteEnabled() && ct->tuple.t_ybctid) + need_to_free_ybctid = true; #ifdef CATCACHE_STATS /* @@ -521,10 +525,16 @@ CatCacheRemoveCTup(CatCache *cache, CatCTup *ct) if (ct->negative) cache->yb_cc_size_bytes -= sizeof(CatCTup); else + { cache->yb_cc_size_bytes -= sizeof(CatCTup) + MAXIMUM_ALIGNOF + ct->tuple.t_len; + if (need_to_free_ybctid) + cache->yb_cc_size_bytes -= VARSIZE(ct->tuple.t_ybctid); + } #endif + if (need_to_free_ybctid) + pfree(DatumGetPointer(ct->tuple.t_ybctid)); pfree(ct); --cache->cc_ntup; @@ -2362,6 +2372,12 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, Datum *arguments, (const char *) dtp->t_data, dtp->t_len); HEAPTUPLE_COPY_YBCTID(dtp->t_ybctid, ct->tuple.t_ybctid); +#ifdef CATCACHE_STATS + /* HEAPTUPLE_COPY_YBCTID makes allocation for t_ybctid. */ + bool allocated_ybctid = IsYugaByteEnabled() && ct->tuple.t_ybctid; + if (allocated_ybctid) + cache->yb_cc_size_bytes += VARSIZE(ct->tuple.t_ybctid); +#endif MemoryContextSwitchTo(oldcxt); if (dtp != ntp) diff --git a/src/postgres/src/backend/utils/cache/relcache.c b/src/postgres/src/backend/utils/cache/relcache.c index c8ba1fadcc84..781e8f9ea3b3 100644 --- a/src/postgres/src/backend/utils/cache/relcache.c +++ b/src/postgres/src/backend/utils/cache/relcache.c @@ -4265,6 +4265,8 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) pfree(relation->rd_partcheck); if (relation->rd_fdwroutine) pfree(relation->rd_fdwroutine); + if (relation->yb_table_properties) + pfree(relation->yb_table_properties); pfree(relation); }