From 4a0bedeb3831fcfb1c177fe9ddc56a11a08887a5 Mon Sep 17 00:00:00 2001
From: Sherin T George <sherin-t.george@hpe.com>
Date: Mon, 16 Oct 2023 13:05:39 +0530
Subject: [PATCH] DAOS-14491: Retain support for phase-1 DAV heap

The phase-2 DAV allocator is placed under the subdirectory
src/common/dav_v2. This allocator is built as a standalone shared
library and linked to the libdaos_common_pmem library.

umem now supports one additional mode, DAOS_MD_BMEM_V2. Setting this
mode on a umem instance will result in the phase-2 DAV allocator
interfaces being used.

Signed-off-by: Sherin T George <sherin-t.george@hpe.com>
---
 debian/changelog | 7 +
 debian/daos-server.install | 1 +
 src/common/SConscript | 3 +-
 src/common/dav/bucket.c | 25 +-
 src/common/dav/bucket.h | 8 +-
 src/common/dav/dav.h | 238 ++-
 src/common/dav/dav_iface.c | 7 +
 src/common/dav/dav_internal.h | 5 +
 src/common/dav/heap.c | 880 +++++++++--
 src/common/dav/heap.h | 108 +-
 src/common/dav/obj.h | 5 +-
 src/common/dav/palloc.c | 70 +-
 src/common/dav/palloc.h | 105 +-
 src/common/dav/recycler.c | 26 +-
 src/common/dav/recycler.h | 9 +-
 src/common/dav/tx.c | 350 +++--
 src/common/dav_v2/README.md | 6 +
 src/common/dav_v2/SConscript | 30 +
 src/common/dav_v2/alloc_class.c | 647 +++++++++
 src/common/dav_v2/alloc_class.h | 71 +
 src/common/dav_v2/bucket.c | 275 ++++
 src/common/dav_v2/bucket.h | 47 +
 src/common/dav_v2/container.h | 44 +
 src/common/dav_v2/container_ravl.c | 194 +++
 src/common/dav_v2/container_seglists.c | 175 +++
 src/common/dav_v2/critnib.c | 678 +++++++++
 src/common/dav_v2/critnib.h | 23 +
 src/common/dav_v2/dav_clogs.c | 104 ++
 src/common/dav_v2/dav_clogs.h | 56 +
 src/common/dav_v2/dav_iface.c | 434 ++++++
 src/common/dav_v2/dav_internal.h | 82 ++
 src/common/dav_v2/dav_v2.h | 307 ++++
 src/common/dav_v2/heap.c | 1398 ++++++++++++++++++
 src/common/dav_v2/heap.h | 98 ++
 src/common/dav_v2/heap_layout.h | 198 +++
 src/common/dav_v2/memblock.c | 1615 +++++++++++++++++++++
 src/common/dav_v2/memblock.h | 297 ++++
 src/common/dav_v2/memops.c | 677 +++++++++
 src/common/dav_v2/memops.h | 66 +
 src/common/dav_v2/mo_wal.h | 95 ++
 src/common/dav_v2/obj.h | 50 +
 src/common/dav_v2/out.h | 104 ++
 src/common/dav_v2/palloc.c | 977 +++++++++++++
 src/common/dav_v2/palloc.h | 105 ++
 src/common/dav_v2/queue.h | 112 ++
 src/common/dav_v2/ravl.c | 613 ++++++++
 src/common/dav_v2/ravl.h | 48 +
 src/common/dav_v2/ravl_interval.c | 344 +++++
 src/common/dav_v2/ravl_interval.h | 37 +
 src/common/dav_v2/recycler.c | 323 +++++
 src/common/dav_v2/recycler.h | 46 +
 src/common/dav_v2/stats.c | 78 +
 src/common/dav_v2/stats.h | 61 +
 src/common/dav_v2/sys_util.h | 83 ++
 src/common/dav_v2/tx.c | 1855 ++++++++++++++++++++++++
 src/common/dav_v2/tx.h | 22 +
 src/common/dav_v2/ulog.c | 695 +++++++++
 src/common/dav_v2/ulog.h | 167 +++
 src/common/dav_v2/util.c | 223 +++
 src/common/dav_v2/util.h | 202 +++
 src/common/dav_v2/valgrind_internal.h | 293 ++++
 src/common/dav_v2/vec.h | 145 ++
 src/common/dav_v2/vecq.h | 121 ++
 src/common/dav_v2/wal_tx.c | 509 +++++++
 src/common/dav_v2/wal_tx.h | 44 +
 src/common/mem.c | 321 +++-
 src/common/tests/umem_test_bmem.c | 1 +
 src/include/daos/mem.h | 3 +
 src/include/gurt/common.h | 1 +
 utils/rpms/daos.rpmlintrc | 2 +-
 utils/rpms/daos.spec | 7 +-
 utils/utest.yaml | 10 +
 72 files changed, 16468 insertions(+), 598 deletions(-)
 create mode 100644 src/common/dav_v2/README.md
 create mode 100644 src/common/dav_v2/SConscript
 create mode 100644 src/common/dav_v2/alloc_class.c
 create mode 100644 src/common/dav_v2/alloc_class.h
 create mode 100644 src/common/dav_v2/bucket.c
create mode 100644 src/common/dav_v2/bucket.h create mode 100644 src/common/dav_v2/container.h create mode 100644 src/common/dav_v2/container_ravl.c create mode 100644 src/common/dav_v2/container_seglists.c create mode 100644 src/common/dav_v2/critnib.c create mode 100644 src/common/dav_v2/critnib.h create mode 100644 src/common/dav_v2/dav_clogs.c create mode 100644 src/common/dav_v2/dav_clogs.h create mode 100644 src/common/dav_v2/dav_iface.c create mode 100644 src/common/dav_v2/dav_internal.h create mode 100644 src/common/dav_v2/dav_v2.h create mode 100644 src/common/dav_v2/heap.c create mode 100644 src/common/dav_v2/heap.h create mode 100644 src/common/dav_v2/heap_layout.h create mode 100644 src/common/dav_v2/memblock.c create mode 100644 src/common/dav_v2/memblock.h create mode 100644 src/common/dav_v2/memops.c create mode 100644 src/common/dav_v2/memops.h create mode 100644 src/common/dav_v2/mo_wal.h create mode 100644 src/common/dav_v2/obj.h create mode 100644 src/common/dav_v2/out.h create mode 100644 src/common/dav_v2/palloc.c create mode 100644 src/common/dav_v2/palloc.h create mode 100644 src/common/dav_v2/queue.h create mode 100644 src/common/dav_v2/ravl.c create mode 100644 src/common/dav_v2/ravl.h create mode 100644 src/common/dav_v2/ravl_interval.c create mode 100644 src/common/dav_v2/ravl_interval.h create mode 100644 src/common/dav_v2/recycler.c create mode 100644 src/common/dav_v2/recycler.h create mode 100644 src/common/dav_v2/stats.c create mode 100644 src/common/dav_v2/stats.h create mode 100644 src/common/dav_v2/sys_util.h create mode 100644 src/common/dav_v2/tx.c create mode 100644 src/common/dav_v2/tx.h create mode 100644 src/common/dav_v2/ulog.c create mode 100644 src/common/dav_v2/ulog.h create mode 100644 src/common/dav_v2/util.c create mode 100644 src/common/dav_v2/util.h create mode 100644 src/common/dav_v2/valgrind_internal.h create mode 100644 src/common/dav_v2/vec.h create mode 100644 src/common/dav_v2/vecq.h create mode 100644 src/common/dav_v2/wal_tx.c create mode 100644 src/common/dav_v2/wal_tx.h diff --git a/debian/changelog b/debian/changelog index 44c317daedd..806a855fbe4 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +daos (2.5.100-10) unstable; urgency=medium + + [ Sherin T George ] + * Add DAV v2 lib + + -- Sherin T George <sherin-t.george@hpe.com> Mon, 16 Oct 2023 11:54:00 +0530 + daos (2.5.100-9) unstable; urgency=medium [ Brian J. 
Murrell ] diff --git a/debian/daos-server.install b/debian/daos-server.install index e34d8e92ae8..77ad815788a 100644 --- a/debian/daos-server.install +++ b/debian/daos-server.install @@ -25,6 +25,7 @@ usr/lib64/daos_srv/libbio.so usr/lib64/daos_srv/libplacement.so usr/lib64/daos_srv/libpipeline.so usr/lib64/libdaos_common_pmem.so +usr/lib64/libdav_v2.so usr/share/daos/control/setup_spdk.sh usr/lib/systemd/system/daos_server.service usr/lib/sysctl.d/10-daos_server.conf diff --git a/src/common/SConscript b/src/common/SConscript index c61ecdeebe3..ca19d27e94a 100644 --- a/src/common/SConscript +++ b/src/common/SConscript @@ -30,7 +30,7 @@ def build_daos_common(denv, client): 'dav/ravl_interval.c', 'dav/recycler.c', 'dav/stats.c', 'dav/tx.c', 'dav/ulog.c', 'dav/util.c', 'dav/wal_tx.c'] ad_mem_files = ['ad_mem.c', 'ad_tx.c'] - common_libs.extend(['pmemobj', 'abt']) + common_libs.extend(['pmemobj', 'abt', 'dav_v2']) benv.AppendUnique(RPATH_FULL=['$PREFIX/lib64/daos_srv']) benv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD']) benv.Append(OBJPREFIX="v_") @@ -53,6 +53,7 @@ def scons(): """Execute build""" Import('env', 'base_env', 'prereqs') + SConscript('dav_v2/SConscript') env.AppendUnique(LIBPATH=[Dir('.')]) base_env.AppendUnique(LIBPATH=[Dir('.')]) base_env.d_add_build_rpath() diff --git a/src/common/dav/bucket.c b/src/common/dav/bucket.c index d3a975a5f26..8df41288a13 100644 --- a/src/common/dav/bucket.c +++ b/src/common/dav/bucket.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2023, Intel Corporation */ +/* Copyright 2015-2022, Intel Corporation */ /* * bucket.c -- bucket implementation @@ -23,13 +23,15 @@ struct bucket { /* this struct is both the lock guard and the locked state */ - struct bucket_locked *locked; - struct alloc_class *aclass; - struct block_container *container; + struct bucket_locked *locked; + + struct alloc_class *aclass; + + struct block_container *container; const struct block_container_ops *c_ops; - struct memory_block_reserved *active_memory_block; - struct zoneset *zset; - int is_active; + + struct memory_block_reserved *active_memory_block; + int is_active; }; struct bucket_locked { @@ -75,7 +77,7 @@ bucket_fini(struct bucket *b) * bucket_locked_new -- creates a new locked bucket instance */ struct bucket_locked * -bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct zoneset *zset) +bucket_locked_new(struct block_container *c, struct alloc_class *aclass) { ASSERTne(c, NULL); @@ -90,7 +92,6 @@ bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct util_mutex_init(&b->lock); b->bucket.locked = b; - b->bucket.zset = zset; return b; @@ -267,9 +268,3 @@ bucket_active_block(struct bucket *b) { return b->is_active ? 
b->active_memory_block : NULL; } - -struct zoneset * -bucket_get_zoneset(struct bucket *b) -{ - return b->zset; -} diff --git a/src/common/dav/bucket.h b/src/common/dav/bucket.h index b0d92b66995..aadc6e714fc 100644 --- a/src/common/dav/bucket.h +++ b/src/common/dav/bucket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2023, Intel Corporation */ +/* Copyright 2015-2021, Intel Corporation */ /* * bucket.h -- internal definitions for bucket @@ -21,8 +21,8 @@ struct bucket_locked; struct bucket; -struct bucket_locked * -bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct zoneset *zset); +struct bucket_locked *bucket_locked_new(struct block_container *c, + struct alloc_class *aclass); struct bucket *bucket_acquire(struct bucket_locked *b); void bucket_release(struct bucket *b); @@ -41,7 +41,5 @@ int bucket_detach_run(struct bucket *b, struct memory_block_reserved *bucket_active_block(struct bucket *b); void bucket_locked_delete(struct bucket_locked *b); -struct zoneset * -bucket_get_zoneset(struct bucket *b); #endif /* __DAOS_COMMON_BUCKET_H */ diff --git a/src/common/dav/dav.h b/src/common/dav/dav.h index b505d739f8a..40af0351af3 100644 --- a/src/common/dav/dav.h +++ b/src/common/dav/dav.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2023, Intel Corporation */ +/* Copyright 2015-2022, Intel Corporation */ /* * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) @@ -16,42 +16,52 @@ /* * allocation functions flags */ -#define DAV_FLAG_ZERO (((uint64_t)1) << 0) -#define DAV_FLAG_NO_FLUSH (((uint64_t)1) << 1) -#define DAV_FLAG_NO_SNAPSHOT (((uint64_t)1) << 2) -#define DAV_FLAG_ASSUME_INITIALIZED (((uint64_t)1) << 3) -#define DAV_FLAG_TX_NO_ABORT (((uint64_t)1) << 4) - -#define DAV_CLASS_ID(id) (((uint64_t)(id)) << 48) -#define DAV_EZONE_ID(id) (((uint64_t)(id)) << 16) - -#define DAV_XALLOC_CLASS_MASK ((((uint64_t)1 << 16) - 1) << 48) -#define DAV_XALLOC_EZONE_MASK ((((uint64_t)1 << 32) - 1) << 16) -#define DAV_XALLOC_ZERO DAV_FLAG_ZERO -#define DAV_XALLOC_NO_FLUSH DAV_FLAG_NO_FLUSH -#define DAV_XALLOC_NO_ABORT DAV_FLAG_TX_NO_ABORT - -#define DAV_TX_XALLOC_VALID_FLAGS \ - (DAV_XALLOC_ZERO | DAV_XALLOC_NO_FLUSH | DAV_XALLOC_EZONE_MASK | DAV_XALLOC_CLASS_MASK | \ - DAV_XALLOC_NO_ABORT) - -#define DAV_XADD_NO_FLUSH DAV_FLAG_NO_FLUSH -#define DAV_XADD_NO_SNAPSHOT DAV_FLAG_NO_SNAPSHOT -#define DAV_XADD_ASSUME_INITIALIZED DAV_FLAG_ASSUME_INITIALIZED -#define DAV_XADD_NO_ABORT DAV_FLAG_TX_NO_ABORT -#define DAV_XADD_VALID_FLAGS \ - (DAV_XADD_NO_FLUSH | DAV_XADD_NO_SNAPSHOT | DAV_XADD_ASSUME_INITIALIZED | DAV_XADD_NO_ABORT) +#define DAV_FLAG_ZERO (((uint64_t)1) << 0) +#define DAV_FLAG_NO_FLUSH (((uint64_t)1) << 1) +#define DAV_FLAG_NO_SNAPSHOT (((uint64_t)1) << 2) +#define DAV_FLAG_ASSUME_INITIALIZED (((uint64_t)1) << 3) +#define DAV_FLAG_TX_NO_ABORT (((uint64_t)1) << 4) + +#define DAV_CLASS_ID(id) (((uint64_t)(id)) << 48) +#ifdef DAV_V2_BUILD +#define DAV_EZONE_ID(id) (((uint64_t)(id)) << 16) +#endif /* DAV_V2_BUILD */ + +#define DAV_XALLOC_CLASS_MASK ((((uint64_t)1 << 16) - 1) << 48) +#ifdef DAV_V2_BUILD +#define DAV_XALLOC_EZONE_MASK ((((uint64_t)1 << 16) - 1) << 32) +#else /* DAV_V2_BUILD */ +#define DAV_XALLOC_EZONE_MASK 0 +#endif /* DAV_V2_BUILD */ +#define DAV_XALLOC_ZERO DAV_FLAG_ZERO +#define DAV_XALLOC_NO_FLUSH DAV_FLAG_NO_FLUSH +#define DAV_XALLOC_NO_ABORT DAV_FLAG_TX_NO_ABORT + +#define DAV_TX_XALLOC_VALID_FLAGS (DAV_XALLOC_ZERO |\ + DAV_XALLOC_NO_FLUSH |\ + 
DAV_XALLOC_EZONE_MASK |\ + DAV_XALLOC_CLASS_MASK |\ + DAV_XALLOC_NO_ABORT) + +#define DAV_XADD_NO_FLUSH DAV_FLAG_NO_FLUSH +#define DAV_XADD_NO_SNAPSHOT DAV_FLAG_NO_SNAPSHOT +#define DAV_XADD_ASSUME_INITIALIZED DAV_FLAG_ASSUME_INITIALIZED +#define DAV_XADD_NO_ABORT DAV_FLAG_TX_NO_ABORT +#define DAV_XADD_VALID_FLAGS (DAV_XADD_NO_FLUSH |\ + DAV_XADD_NO_SNAPSHOT |\ + DAV_XADD_ASSUME_INITIALIZED |\ + DAV_XADD_NO_ABORT) /* * WAL Redo hints. */ -#define DAV_XADD_WAL_CPTR (((uint64_t)1) << 5) +#define DAV_XADD_WAL_CPTR (((uint64_t)1) << 5) -#define DAV_XLOCK_NO_ABORT DAV_FLAG_TX_NO_ABORT -#define DAV_XLOCK_VALID_FLAGS (DAV_XLOCK_NO_ABORT) +#define DAV_XLOCK_NO_ABORT DAV_FLAG_TX_NO_ABORT +#define DAV_XLOCK_VALID_FLAGS (DAV_XLOCK_NO_ABORT) -#define DAV_XFREE_NO_ABORT DAV_FLAG_TX_NO_ABORT -#define DAV_XFREE_VALID_FLAGS (DAV_XFREE_NO_ABORT) +#define DAV_XFREE_NO_ABORT DAV_FLAG_TX_NO_ABORT +#define DAV_XFREE_VALID_FLAGS (DAV_XFREE_NO_ABORT) typedef struct dav_obj dav_obj_t; struct umem_store; @@ -73,7 +83,8 @@ struct umem_store; * it returns NULL with errno set appropriately. */ dav_obj_t * -dav_obj_create(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store); +dav_obj_create(const char *path, int flags, size_t sz, mode_t mode, + struct umem_store *store); /** * Open and initialize a DAV object and return its handle. @@ -117,9 +128,8 @@ typedef int (*dav_constr)(dav_obj_t *pop, void *ptr, void *arg); * initialized, or if it's interrupted before the constructor completes, the * memory reserved for the object is automatically reclaimed. */ -int -dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags, - dav_constr constructor, void *arg); +int dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size, + uint64_t type_num, dav_constr constructor, void *arg); /** * Frees the memory at specified offset within the DAV object pointed to by hdl. @@ -135,13 +145,13 @@ dav_free(dav_obj_t *pop, uint64_t off); /* * DAV version of memcpy. Data copied is made persistent in blob. */ -void * -dav_memcpy_persist(dav_obj_t *pop, void *dest, const void *src, size_t len); +void *dav_memcpy_persist(dav_obj_t *pop, void *dest, const void *src, + size_t len); /* * DAV version of memcpy with deferred commit to blob. */ -void * -dav_memcpy_persist_relaxed(dav_obj_t *pop, void *dest, const void *src, size_t len); +void *dav_memcpy_persist_relaxed(dav_obj_t *pop, void *dest, const void *src, + size_t len); /* * If called for the first time on a newly created dav heap, the root object @@ -152,8 +162,8 @@ dav_memcpy_persist_relaxed(dav_obj_t *pop, void *dest, const void *src, size_t l * * This function is currently *not* thread-safe. */ -uint64_t -dav_root(dav_obj_t *pop, size_t size); +uint64_t dav_root(dav_obj_t *pop, size_t size); + /* * Transactions @@ -163,22 +173,23 @@ dav_root(dav_obj_t *pop, size_t size); * the dav_tx_begin function. 
*/ enum dav_tx_stage { - DAV_TX_STAGE_NONE, /* no transaction in this thread */ - DAV_TX_STAGE_WORK, /* transaction in progress */ - DAV_TX_STAGE_ONCOMMIT, /* successfully committed */ - DAV_TX_STAGE_ONABORT, /* tx_begin failed or transaction aborted */ - DAV_TX_STAGE_FINALLY, /* always called */ + DAV_TX_STAGE_NONE, /* no transaction in this thread */ + DAV_TX_STAGE_WORK, /* transaction in progress */ + DAV_TX_STAGE_ONCOMMIT, /* successfully committed */ + DAV_TX_STAGE_ONABORT, /* tx_begin failed or transaction aborted */ + DAV_TX_STAGE_FINALLY, /* always called */ DAV_MAX_TX_STAGE }; -typedef void (*dav_tx_callback)(dav_obj_t *pop, enum dav_tx_stage stage, void *); +typedef void (*dav_tx_callback)(dav_obj_t *pop, enum dav_tx_stage stage, + void *); enum dav_tx_param { DAV_TX_PARAM_NONE, - DAV_TX_PARAM_UNUSED1, /* For parity with libpmemobj */ - DAV_TX_PARAM_UNUSED2, /* For parity with libpmemobj */ - DAV_TX_PARAM_CB, /* dav_tx_callback cb, void *arg */ + DAV_TX_PARAM_UNUSED1, /* For parity with libpmemobj */ + DAV_TX_PARAM_UNUSED2, /* For parity with libpmemobj */ + DAV_TX_PARAM_CB, /* dav_tx_callback cb, void *arg */ }; /* @@ -189,8 +200,7 @@ enum dav_tx_param { * returns zero. Otherwise, stage changes to TX_STAGE_ONABORT and an error * number is returned. */ -int -dav_tx_begin(dav_obj_t *pop, jmp_buf env, ...); +int dav_tx_begin(dav_obj_t *pop, jmp_buf env, ...); /* * Aborts current transaction @@ -199,16 +209,14 @@ dav_tx_begin(dav_obj_t *pop, jmp_buf env, ...); * * This function must be called during TX_STAGE_WORK. */ -void -dav_tx_abort(int errnum); +void dav_tx_abort(int errnum); /* * Commits current transaction * * This function must be called during TX_STAGE_WORK. */ -void -dav_tx_commit(void); +void dav_tx_commit(void); /* * Cleanups current transaction. Must always be called after dav_tx_begin, @@ -223,38 +231,52 @@ dav_tx_commit(void); * * This function must *not* be called during TX_STAGE_WORK. */ -int -dav_tx_end(void *data); +int dav_tx_end(void *data); /* * Returns the current stage of the transaction. */ -enum dav_tx_stage -dav_tx_stage(void); +enum dav_tx_stage dav_tx_stage(void); /* * Returns last transaction error code. */ -int -dav_tx_errno(void); +int dav_tx_errno(void); + +/* + * Transactionally allocates a new object. + * + * If successful, returns PMEMoid. + * Otherwise, stage changes to TX_STAGE_ONABORT and an OID_NULL is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +uint64_t dav_tx_alloc(size_t size, uint64_t type_num); /* * Transactionally allocates a new object. * - * If successful, returns offset of the object in the heap. - * Otherwise, stage changes to TX_STAGE_ONABORT and an zero is returned. + * If successful, returns PMEMoid. + * Otherwise, stage changes to TX_STAGE_ONABORT and an OID_NULL is returned. * 'Flags' is a bitmask of the following values: * - POBJ_XALLOC_ZERO - zero the allocated object * - POBJ_XALLOC_NO_FLUSH - skip flush on commit * - POBJ_XALLOC_NO_ABORT - if the function does not end successfully, - * - DAV_CLASS_ID(id) - id of allocation class to use. - * - DAV_EZONE_ID(id) - id of zone to use. * do not abort the transaction and return the error number. * * This function must be called during TX_STAGE_WORK. */ -uint64_t -dav_tx_alloc(size_t size, uint64_t type_num, uint64_t flags); +uint64_t dav_tx_xalloc(size_t size, uint64_t type_num, uint64_t flags); + +/* + * Transactionally allocates new zeroed object. + * + * If successful, returns PMEMoid. 
+ * Otherwise, stage changes to TX_STAGE_ONABORT and an OID_NULL is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +uint64_t dav_tx_zalloc(size_t size, uint64_t type_num); /* * Transactionally frees an existing object. @@ -264,8 +286,7 @@ dav_tx_alloc(size_t size, uint64_t type_num, uint64_t flags); * * This function must be called during TX_STAGE_WORK. */ -int -dav_tx_free(uint64_t off); +int dav_tx_free(uint64_t off); /* * Takes a "snapshot" of the memory block of given size and located at given @@ -279,8 +300,7 @@ dav_tx_free(uint64_t off); * * This function must be called during TX_STAGE_WORK. */ -int -dav_tx_add_range(uint64_t off, size_t size); +int dav_tx_add_range(uint64_t off, size_t size); /* * Takes a "snapshot" of the given memory region and saves it in the undo log. @@ -294,8 +314,7 @@ dav_tx_add_range(uint64_t off, size_t size); * * This function must be called during TX_STAGE_WORK. */ -int -dav_tx_add_range_direct(const void *ptr, size_t size); +int dav_tx_add_range_direct(const void *ptr, size_t size); /* * Behaves exactly the same as dav_tx_add_range when 'flags' equals 0. @@ -306,8 +325,7 @@ dav_tx_add_range_direct(const void *ptr, size_t size); * - POBJ_XADD_NO_ABORT - if the function does not end successfully, * do not abort the transaction and return the error number. */ -int -dav_tx_xadd_range(uint64_t off, size_t size, uint64_t flags); +int dav_tx_xadd_range(uint64_t off, size_t size, uint64_t flags); /* * Behaves exactly the same as dav_tx_add_range_direct when 'flags' equals @@ -318,15 +336,13 @@ dav_tx_xadd_range(uint64_t off, size_t size, uint64_t flags); * - POBJ_XADD_NO_ABORT - if the function does not end successfully, * do not abort the transaction and return the error number. */ -int -dav_tx_xadd_range_direct(const void *ptr, size_t size, uint64_t flags); +int dav_tx_xadd_range_direct(const void *ptr, size_t size, uint64_t flags); /* * Converts the offset to a pointer in the context of heap associated with * current transaction. */ -void * -dav_tx_off2ptr(uint64_t off); +void *dav_tx_off2ptr(uint64_t off); enum dav_action_type { /* a heap action (e.g., alloc) */ @@ -353,27 +369,18 @@ struct dav_action { * This structure should NEVER be stored on persistent memory! */ enum dav_action_type type; - uint32_t data[3]; + uint32_t data[3]; union { struct dav_action_heap heap; - uint64_t data2[14]; + uint64_t data2[14]; }; }; -#define DAV_ACTION_XRESERVE_VALID_FLAGS \ - (DAV_XALLOC_CLASS_MASK | DAV_XALLOC_EZONE_MASK | DAV_XALLOC_ZERO) - -uint64_t -dav_reserve(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, - uint64_t flags); -void -dav_defer_free(dav_obj_t *pop, uint64_t off, struct dav_action *act); -int -dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt); -void -dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt); -int -dav_tx_publish(struct dav_action *actv, size_t actvcnt); +uint64_t dav_reserve(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num); +void dav_defer_free(dav_obj_t *pop, uint64_t off, struct dav_action *act); +int dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt); +void dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt); +int dav_tx_publish(struct dav_action *actv, size_t actvcnt); /* * Allocation class interface @@ -456,7 +463,7 @@ struct dav_alloc_class_desc { * containing 256 bytes that spans two units. The usable size of that * allocation will be 240 bytes: 2 * 128 - 16 (header). 
*/ - size_t unit_size; + size_t unit_size; /* * Desired alignment of objects from the allocation class. @@ -467,7 +474,7 @@ struct dav_alloc_class_desc { * compact one this means that the alignment is 48 bytes. * */ - size_t alignment; + size_t alignment; /* * The minimum number of units that must be present in a @@ -480,7 +487,7 @@ struct dav_alloc_class_desc { * allocate, but blocks do go back to the global heap if they are no * longer actively used for allocation. */ - unsigned units_per_block; + unsigned units_per_block; /* * The header of allocations that originate from this allocation class. @@ -490,14 +497,14 @@ struct dav_alloc_class_desc { /* * The identifier of this allocation class. */ - unsigned class_id; + unsigned class_id; }; /* * Registers an allocation class handle with the DAV object. */ -int -dav_class_register(dav_obj_t *pop, struct dav_alloc_class_desc *p); +int dav_class_register(dav_obj_t *pop, struct dav_alloc_class_desc *p); + struct dav_heap_stats { uint64_t curr_allocated; @@ -508,30 +515,13 @@ struct dav_heap_stats { * Returns the heap allocation statistics associated with the * DAV object. */ -int -dav_get_heap_stats(dav_obj_t *pop, struct dav_heap_stats *st); +int dav_get_heap_stats(dav_obj_t *pop, struct dav_heap_stats *st); struct umem_wal_tx; -uint32_t -wal_tx_act_nr(struct umem_wal_tx *tx); -uint32_t -wal_tx_payload_len(struct umem_wal_tx *tx); -struct umem_action * -wal_tx_act_first(struct umem_wal_tx *tx); -struct umem_action * -wal_tx_act_next(struct umem_wal_tx *tx); - -/** - * Get an evictable zone with sufficient free space within. - * - * \param[in] pop pool handle - * \param[in] flags zone selection criteria. - * - * \return id >= 0. Zero indicates non-evictable zone and will be - * returned if no evictable zone can be chosen. - */ -uint32_t -dav_get_zone_evictable(dav_obj_t *pop, int flags); +uint32_t wal_tx_act_nr(struct umem_wal_tx *tx); +uint32_t wal_tx_payload_len(struct umem_wal_tx *tx); +struct umem_action *wal_tx_act_first(struct umem_wal_tx *tx); +struct umem_action *wal_tx_act_next(struct umem_wal_tx *tx); #endif /* __DAOS_COMMON_DAV_H */ diff --git a/src/common/dav/dav_iface.c b/src/common/dav/dav_iface.c index c1686570390..36d3c17a162 100644 --- a/src/common/dav/dav_iface.c +++ b/src/common/dav/dav_iface.c @@ -180,6 +180,13 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume palloc_heap_vg_open(hdl->do_heap, 1); #endif + rc = heap_buckets_init(hdl->do_heap); + if (rc) { + err = rc; + heap_cleanup(hdl->do_heap); + goto out2; + } + rc = dav_create_clogs(hdl); if (rc) { err = rc; diff --git a/src/common/dav/dav_internal.h b/src/common/dav/dav_internal.h index 9c9b263c494..0f8ddff5916 100644 --- a/src/common/dav/dav_internal.h +++ b/src/common/dav/dav_internal.h @@ -28,6 +28,11 @@ enum dav_stats_enabled { DAV_STATS_DISABLED, }; +enum dav_arenas_assignment_type { + DAV_ARENAS_ASSIGNMENT_THREAD_KEY, + DAV_ARENAS_ASSIGNMENT_GLOBAL, +}; + #define DAV_PHDR_SIZE 4096 /* DAV header data that will be persisted */ diff --git a/src/common/dav/heap.c b/src/common/dav/heap.c index 6e4aec10edf..4384fe40f8c 100644 --- a/src/common/dav/heap.c +++ b/src/common/dav/heap.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2023, Intel Corporation */ +/* Copyright 2015-2022, Intel Corporation */ /* * heap.c -- heap implementation @@ -31,70 +31,126 @@ * This is the value by which the heap might grow once we hit an OOM. 
*/ #define HEAP_DEFAULT_GROW_SIZE (1 << 27) /* 128 megabytes */ +#define MAX_DEFAULT_ARENAS (1 << 10) /* 1024 arenas */ -/* - * zoneset stores the collection of buckets and recyclers for allocation classes. - * Each evictable zone is assigned a zoneset during first allocation. - */ -struct zoneset { - uint32_t zset_id; - uint32_t padding; - struct bucket_locked *default_bucket; /* bucket for free chunks */ - struct bucket_locked *buckets[MAX_ALLOCATION_CLASSES]; /* one bucket per allocation class */ - struct recycler *recyclers[MAX_ALLOCATION_CLASSES]; +enum dav_arenas_assignment_type Default_arenas_assignment_type = + DAV_ARENAS_ASSIGNMENT_GLOBAL; + +size_t Default_arenas_max; + +struct arenas_thread_assignment { + enum dav_arenas_assignment_type type; + union { + os_tls_key_t thread; + struct arena *global; + }; }; -struct heap_rt { - struct alloc_class_collection *alloc_classes; - struct zoneset *default_zset; - struct zoneset **evictable_zsets; - os_mutex_t run_locks[MAX_RUN_LOCKS]; - unsigned nlocks; - unsigned nzones; - unsigned zones_exhausted; +struct arenas { + VEC(, struct arena *) vec; + size_t nactive; + + /* + * When nesting with other locks, this one must be acquired first, + * prior to locking any buckets or memory blocks. + */ + os_mutex_t lock; + + /* stores a pointer to one of the arenas */ + struct arenas_thread_assignment assignment; }; /* - * heap_get_zoneset - returns the reference to the zoneset given - * zone or zoneset id. + * Arenas store the collection of buckets for allocation classes. + * Each thread is assigned an arena on its first allocator operation + * if arena is set to auto. */ -struct zoneset * -heap_get_zoneset(struct palloc_heap *heap, uint32_t zone_id) -{ - /* REVISIT: - * Implement the code for evictable zonesets. +struct arena { + /* one bucket per allocation class */ + struct bucket_locked *buckets[MAX_ALLOCATION_CLASSES]; + + /* + * Decides whether the arena can be + * automatically assigned to a thread. */ - return heap->rt->default_zset; -} + int automatic; + size_t nthreads; + struct arenas *arenas; +}; + +struct heap_rt { + struct alloc_class_collection *alloc_classes; + + struct bucket_locked *default_bucket; + + struct arenas arenas; + + struct recycler *recyclers[MAX_ALLOCATION_CLASSES]; + + os_mutex_t run_locks[MAX_RUN_LOCKS]; + unsigned nlocks; + + unsigned nzones; + unsigned zones_exhausted; +}; /* - * heap_get_recycler - (internal) retrieves the recycler instance from the zoneset with + * heap_get_recycler - (internal) retrieves the recycler instance with * the corresponding class id. Initializes the recycler if needed. + * */ static struct recycler * -heap_get_recycler(struct palloc_heap *heap, struct zoneset *zset, size_t id, size_t nallocs) +heap_get_recycler(struct palloc_heap *heap, size_t id, size_t nallocs) { struct recycler *r; - D_ASSERT(zset != NULL); - util_atomic_load_explicit64(&zset->recyclers[id], &r, memory_order_acquire); + util_atomic_load_explicit64(&heap->rt->recyclers[id], &r, + memory_order_acquire); + if (r != NULL) return r; - r = recycler_new(heap, nallocs, zset); - if (r && !util_bool_compare_and_swap64(&zset->recyclers[id], NULL, r)) { + r = recycler_new(heap, nallocs, + &heap->rt->arenas.nactive); + if (r && !util_bool_compare_and_swap64(&heap->rt->recyclers[id], + NULL, r)) { /* * If a different thread succeeded in assigning the recycler * first, the recycler this thread created needs to be deleted. 
*/ recycler_delete(r); - return heap_get_recycler(heap, zset, id, nallocs); + return heap_get_recycler(heap, id, nallocs); } return r; } +/* + * heap_arenas_init - (internal) initialize generic arenas info + */ +static int +heap_arenas_init(struct arenas *arenas) +{ + util_mutex_init(&arenas->lock); + VEC_INIT(&arenas->vec); + arenas->nactive = 0; + + if (VEC_RESERVE(&arenas->vec, MAX_DEFAULT_ARENAS) == -1) + return -1; + return 0; +} + +/* + * heap_arenas_fini - (internal) destroy generic arenas info + */ +static void +heap_arenas_fini(struct arenas *arenas) +{ + util_mutex_destroy(&arenas->lock); + VEC_DELETE(&arenas->vec); +} + /* * heap_alloc_classes -- returns the allocation classes collection */ @@ -104,6 +160,58 @@ heap_alloc_classes(struct palloc_heap *heap) return heap->rt ? heap->rt->alloc_classes : NULL; } +/* + * heap_arena_delete -- (internal) destroys arena instance + */ +static void +heap_arena_delete(struct arena *arena) +{ + for (int i = 0; i < MAX_ALLOCATION_CLASSES; ++i) + if (arena->buckets[i] != NULL) + bucket_locked_delete(arena->buckets[i]); + D_FREE(arena); +} + +/* + * heap_arena_new -- (internal) initializes arena instance + */ +static struct arena * +heap_arena_new(struct palloc_heap *heap, int automatic) +{ + struct heap_rt *rt = heap->rt; + struct arena *arena; + + D_ALLOC_PTR(arena); + if (arena == NULL) { + D_CRIT("!heap: arena malloc error\n"); + return NULL; + } + arena->nthreads = 0; + arena->automatic = automatic; + arena->arenas = &heap->rt->arenas; + + COMPILE_ERROR_ON(MAX_ALLOCATION_CLASSES > UINT8_MAX); + for (uint8_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *ac = + alloc_class_by_id(rt->alloc_classes, i); + if (ac != NULL) { + arena->buckets[i] = + bucket_locked_new(container_new_seglists(heap), + ac); + if (arena->buckets[i] == NULL) + goto error_bucket_create; + } else { + arena->buckets[i] = NULL; + } + } + + return arena; + +error_bucket_create: + heap_arena_delete(arena); + return NULL; +} + /* * heap_get_best_class -- returns the alloc class that best fits the * requested size @@ -115,29 +223,283 @@ heap_get_best_class(struct palloc_heap *heap, size_t size) } /* - * zoneset_bucket_acquire -- fetches by zoneset or by id a bucket exclusive - * for the thread until zoneset_bucket_release is called + * heap_arena_thread_detach -- detaches arena from the current thread + * + * Must be called with arenas lock taken. + */ +static void +heap_arena_thread_detach(struct arena *a) +{ + /* + * Even though this is under a lock, nactive variable can also be read + * concurrently from the recycler (without the arenas lock). + * That's why we are using an atomic operation. + */ + if ((--a->nthreads) == 0) + util_fetch_and_sub64(&a->arenas->nactive, 1); +} + +/* + * heap_arena_thread_attach -- assign arena to the current thread + * + * Must be called with arenas lock taken. + */ +static void +heap_arena_thread_attach(struct palloc_heap *heap, struct arena *a) +{ + struct heap_rt *h = heap->rt; + + struct arenas_thread_assignment *assignment = &h->arenas.assignment; + + ASSERTeq(assignment->type, DAV_ARENAS_ASSIGNMENT_THREAD_KEY); + + struct arena *thread_arena = os_tls_get(assignment->thread); + + if (thread_arena) + heap_arena_thread_detach(thread_arena); + + ASSERTne(a, NULL); + + /* + * Even though this is under a lock, nactive variable can also be read + * concurrently from the recycler (without the arenas lock). + * That's why we are using an atomic operation. 
+ */ + if ((a->nthreads++) == 0) + util_fetch_and_add64(&a->arenas->nactive, 1); + + os_tls_set(assignment->thread, a); +} + +/* + * heap_thread_arena_destructor -- (internal) removes arena thread assignment + */ +static void +heap_thread_arena_destructor(void *arg) +{ + struct arena *a = arg; + + os_mutex_lock(&a->arenas->lock); + heap_arena_thread_detach(a); + os_mutex_unlock(&a->arenas->lock); +} + +/* + * arena_thread_assignment_init -- (internal) initializes thread assignment + * type for arenas. + */ +static int +arena_thread_assignment_init(struct arenas_thread_assignment *assignment, + enum dav_arenas_assignment_type type) +{ + int ret = 0; + + assignment->type = type; + + switch (type) { + case DAV_ARENAS_ASSIGNMENT_THREAD_KEY: + ret = os_tls_key_create(&assignment->thread, + heap_thread_arena_destructor); + break; + case DAV_ARENAS_ASSIGNMENT_GLOBAL: + assignment->global = NULL; + break; + default: + ASSERT(0); /* unreachable */ + } + + return ret; +} + +/* + * arena_thread_assignment_fini -- (internal) destroys thread assignment + * type for arenas. + */ +static void +arena_thread_assignment_fini(struct arenas_thread_assignment *assignment) +{ + switch (assignment->type) { + case DAV_ARENAS_ASSIGNMENT_THREAD_KEY: + os_tls_key_delete(assignment->thread); + break; + case DAV_ARENAS_ASSIGNMENT_GLOBAL: + break; + default: + ASSERT(0); /* unreachable */ + } +} + +/* + * heap_get_arena_by_id -- returns arena by id + * + * Must be called with arenas lock taken. + */ +static struct arena * +heap_get_arena_by_id(struct palloc_heap *heap, unsigned arena_id) +{ + return VEC_ARR(&heap->rt->arenas.vec)[arena_id - 1]; +} + +/* + * heap_global_arena_assign -- (internal) assigns the first automatic arena + * as the heaps' global arena assignment. + */ +static struct arena * +heap_global_arena_assign(struct palloc_heap *heap) +{ + util_mutex_lock(&heap->rt->arenas.lock); + + ASSERTne(VEC_SIZE(&heap->rt->arenas.vec), 0); + + struct arena *a = NULL; + + VEC_FOREACH(a, &heap->rt->arenas.vec) { + if (a->automatic) + break; + } + + DAV_DBG("assigning %p arena to current thread", a); + + /* at least one automatic arena must exist */ + ASSERTne(a, NULL); + heap->rt->arenas.assignment.global = a; + + util_mutex_unlock(&heap->rt->arenas.lock); + + return a; +} + +/* + * heap_thread_arena_assign -- (internal) assigns the least used arena + * to current thread + * + * To avoid complexities with regards to races in the search for the least + * used arena, a lock is used, but the nthreads counter of the arena is still + * bumped using atomic instruction because it can happen in parallel to a + * destructor of a thread, which also touches that variable. 
+ */ +static struct arena * +heap_thread_arena_assign(struct palloc_heap *heap) +{ + util_mutex_lock(&heap->rt->arenas.lock); + + struct arena *least_used = NULL; + + ASSERTne(VEC_SIZE(&heap->rt->arenas.vec), 0); + + struct arena *a; + + VEC_FOREACH(a, &heap->rt->arenas.vec) { + if (!a->automatic) + continue; + if (least_used == NULL || + a->nthreads < least_used->nthreads) + least_used = a; + } + + DAV_DBG("assigning %p arena to current thread", least_used); + + /* at least one automatic arena must exist */ + ASSERTne(least_used, NULL); + heap_arena_thread_attach(heap, least_used); + + util_mutex_unlock(&heap->rt->arenas.lock); + + return least_used; +} + +/* + * heap_thread_arena -- (internal) returns the arena assigned to the current + * thread + */ +static struct arena * +heap_thread_arena(struct palloc_heap *heap) +{ + struct arenas_thread_assignment *assignment = + &heap->rt->arenas.assignment; + struct arena *arena = NULL; + + switch (assignment->type) { + case DAV_ARENAS_ASSIGNMENT_THREAD_KEY: + arena = os_tls_get(assignment->thread); + if (arena == NULL) + arena = heap_thread_arena_assign(heap); + break; + case DAV_ARENAS_ASSIGNMENT_GLOBAL: + arena = assignment->global; + if (arena == NULL) + arena = heap_global_arena_assign(heap); + break; + default: + ASSERT(0); /* unreachable */ + } + + ASSERTne(arena, NULL); + + return arena; +} + +/* + * heap_get_thread_arena_id -- returns the arena id assigned to the current + * thread + */ +unsigned +heap_get_thread_arena_id(struct palloc_heap *heap) +{ + unsigned arena_id = 1; + struct arena *arenap = heap_thread_arena(heap); + struct arena *arenav; + struct heap_rt *rt = heap->rt; + + util_mutex_lock(&rt->arenas.lock); + VEC_FOREACH(arenav, &heap->rt->arenas.vec) { + if (arenav == arenap) { + util_mutex_unlock(&rt->arenas.lock); + return arena_id; + } + arena_id++; + } + + util_mutex_unlock(&rt->arenas.lock); + ASSERT(0); + return arena_id; +} + +/* + * heap_bucket_acquire -- fetches by arena or by id a bucket exclusive + * for the thread until heap_bucket_release is called */ struct bucket * -zoneset_bucket_acquire(struct zoneset *zset, uint8_t class_id) +heap_bucket_acquire(struct palloc_heap *heap, uint8_t class_id, + uint16_t arena_id) { + struct heap_rt *rt = heap->rt; struct bucket_locked *b; - D_ASSERT(zset != NULL); + if (class_id == DEFAULT_ALLOC_CLASS_ID) { + b = rt->default_bucket; + goto out; + } + + if (arena_id == HEAP_ARENA_PER_THREAD) { + struct arena *arena = heap_thread_arena(heap); - if (class_id == DEFAULT_ALLOC_CLASS_ID) - b = zset->default_bucket; - else - b = zset->buckets[class_id]; + ASSERTne(arena->buckets, NULL); + b = arena->buckets[class_id]; + } else { + b = (VEC_ARR(&heap->rt->arenas.vec) + [arena_id - 1])->buckets[class_id]; + } +out: return bucket_acquire(b); } /* - * zoneset_bucket_release -- puts the bucket back into the heap + * heap_bucket_release -- puts the bucket back into the heap */ void -zoneset_bucket_release(struct bucket *b) +heap_bucket_release(struct bucket *b) { bucket_release(b); } @@ -382,9 +744,8 @@ heap_run_into_free_chunk(struct palloc_heap *heap, static int heap_reclaim_run(struct palloc_heap *heap, struct memory_block *m, int startup) { - struct chunk_run *run = heap_get_chunk_run(heap, m); + struct chunk_run *run = heap_get_chunk_run(heap, m); struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); - struct zoneset *zset = heap_get_zoneset(heap, m->zone_id); struct alloc_class *c = alloc_class_by_run( heap->rt->alloc_classes, @@ -412,7 +773,8 @@ heap_reclaim_run(struct palloc_heap 
*heap, struct memory_block *m, int startup) STATS_INC(heap->stats, transient, heap_run_allocated, (c->rdsc.nallocs - e.free_space) * run->hdr.block_size); } - struct recycler *recycler = heap_get_recycler(heap, zset, c->id, c->rdsc.nallocs); + struct recycler *recycler = heap_get_recycler(heap, c->id, + c->rdsc.nallocs); if (recycler == NULL || recycler_put(recycler, e) < 0) ERR("lost runtime tracking info of %u run due to OOM", c->id); @@ -504,27 +866,24 @@ static int heap_recycle_unused(struct palloc_heap *heap, struct recycler *recycler, struct bucket *defb, int force) { - struct zoneset *zset; - struct memory_block *nm; - struct empty_runs r = recycler_recalc(recycler, force); - struct bucket *nb; + struct empty_runs r = recycler_recalc(recycler, force); if (VEC_SIZE(&r) == 0) return ENOMEM; - zset = recycler_get_zoneset(recycler); - D_ASSERT(zset != NULL); - - nb = defb == NULL ? zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID) : NULL; + struct bucket *nb = defb == NULL ? heap_bucket_acquire(heap, + DEFAULT_ALLOC_CLASS_ID, HEAP_ARENA_PER_THREAD) : NULL; ASSERT(defb != NULL || nb != NULL); + struct memory_block *nm; + VEC_FOREACH_BY_PTR(nm, &r) { heap_run_into_free_chunk(heap, defb ? defb : nb, nm); } if (nb != NULL) - zoneset_bucket_release(nb); + heap_bucket_release(nb); VEC_DELETE(&r); @@ -537,12 +896,11 @@ heap_recycle_unused(struct palloc_heap *heap, struct recycler *recycler, static int heap_reclaim_garbage(struct palloc_heap *heap, struct bucket *bucket) { - int ret = ENOMEM; + int ret = ENOMEM; struct recycler *r; - struct zoneset *zset = bucket_get_zoneset(bucket); for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { - r = zset->recyclers[i]; + r = heap->rt->recyclers[i]; if (r == NULL) continue; @@ -595,15 +953,14 @@ heap_ensure_huge_bucket_filled(struct palloc_heap *heap, void heap_discard_run(struct palloc_heap *heap, struct memory_block *m) { - struct zoneset *zset = heap_get_zoneset(heap, m->zone_id); - - D_ASSERT(zset != NULL); if (heap_reclaim_run(heap, m, 0)) { - struct bucket *b = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID); + struct bucket *b = + heap_bucket_acquire(heap, + DEFAULT_ALLOC_CLASS_ID, 0); heap_run_into_free_chunk(heap, b, m); - zoneset_bucket_release(b); + heap_bucket_release(b); } } @@ -626,6 +983,34 @@ heap_detach_and_try_discard_run(struct palloc_heap *heap, struct bucket *b) return 0; } +/* + * heap_force_recycle -- detaches all memory from arenas, and forces global + * recycling of all memory blocks + */ +void +heap_force_recycle(struct palloc_heap *heap) +{ + util_mutex_lock(&heap->rt->arenas.lock); + struct arena *arenap; + + VEC_FOREACH(arenap, &heap->rt->arenas.vec) { + for (int i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct bucket_locked *locked = arenap->buckets[i]; + + if (locked == NULL) + continue; + + struct bucket *b = bucket_acquire(locked); + + heap_detach_and_try_discard_run(heap, b); + + bucket_release(b); + } + } + util_mutex_unlock(&heap->rt->arenas.lock); + heap_reclaim_garbage(heap, NULL); +} + /* * heap_reuse_from_recycler -- (internal) try reusing runs that are currently * in the recycler @@ -634,14 +1019,14 @@ static int heap_reuse_from_recycler(struct palloc_heap *heap, struct bucket *b, uint32_t units, int force) { - struct zoneset *zset = bucket_get_zoneset(b); struct memory_block m = MEMORY_BLOCK_NONE; m.size_idx = units; struct alloc_class *aclass = bucket_alloc_class(b); - struct recycler *recycler = heap_get_recycler(heap, zset, aclass->id, aclass->rdsc.nallocs); + struct recycler *recycler = 
heap_get_recycler(heap, aclass->id, + aclass->rdsc.nallocs); if (recycler == NULL) { ERR("lost runtime tracking info of %u run due to OOM", @@ -687,9 +1072,7 @@ heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b, { int ret = 0; struct alloc_class *aclass = bucket_alloc_class(b); - struct zoneset *zset = bucket_get_zoneset(b); - D_ASSERT(zset != NULL); ASSERTeq(aclass->type, CLASS_RUN); if (heap_detach_and_try_discard_run(heap, b) != 0) @@ -699,10 +1082,12 @@ heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b, goto out; /* search in the next zone before attempting to create a new run */ - struct bucket *defb = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID); + struct bucket *defb = heap_bucket_acquire(heap, + DEFAULT_ALLOC_CLASS_ID, + HEAP_ARENA_PER_THREAD); heap_populate_bucket(heap, defb); - zoneset_bucket_release(defb); + heap_bucket_release(defb); if (heap_reuse_from_recycler(heap, b, units, 0) == 0) goto out; @@ -711,21 +1096,23 @@ heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b, m.size_idx = aclass->rdsc.size_idx; - defb = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID); + defb = heap_bucket_acquire(heap, + DEFAULT_ALLOC_CLASS_ID, + HEAP_ARENA_PER_THREAD); /* cannot reuse an existing run, create a new one */ if (heap_get_bestfit_block(heap, defb, &m) == 0) { ASSERTeq(m.block_off, 0); if (heap_run_create(heap, b, &m) != 0) { - zoneset_bucket_release(defb); + heap_bucket_release(defb); return ENOMEM; } - zoneset_bucket_release(defb); + heap_bucket_release(defb); goto out; } - zoneset_bucket_release(defb); + heap_bucket_release(defb); if (heap_reuse_from_recycler(heap, b, units, 0) == 0) goto out; @@ -743,8 +1130,6 @@ heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b, void heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m) { - struct zoneset *zset = heap_get_zoneset(heap, m->zone_id); - if (m->type != MEMORY_BLOCK_RUN) return; @@ -760,7 +1145,8 @@ heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m) if (c == NULL) return; - struct recycler *recycler = heap_get_recycler(heap, zset, c->id, c->rdsc.nallocs); + struct recycler *recycler = heap_get_recycler(heap, c->id, + c->rdsc.nallocs); if (recycler == NULL) { ERR("lost runtime tracking info of %u run due to OOM", @@ -852,74 +1238,210 @@ heap_end(struct palloc_heap *h) } /* - * heap_default_zoneset_init -- (internal) initializes default zone + * heap_arena_create -- create a new arena, push it to the vector + * and return new arena id or -1 on failure */ -static int -heap_default_zoneset_init(struct palloc_heap *heap) +int +heap_arena_create(struct palloc_heap *heap) { - struct heap_rt *h = heap->rt; - struct zoneset *default_zset; - struct alloc_class *c; - uint8_t i; + struct heap_rt *h = heap->rt; + struct arena *arena = heap_arena_new(heap, 0); - D_ALLOC_PTR(default_zset); - if (default_zset == NULL) + if (arena == NULL) return -1; - for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { - c = alloc_class_by_id(h->alloc_classes, i); + util_mutex_lock(&h->arenas.lock); - if (c == NULL) - continue; + if (VEC_PUSH_BACK(&h->arenas.vec, arena)) + goto err_push_back; - default_zset->buckets[c->id] = - bucket_locked_new(container_new_seglists(heap), c, default_zset); - if (default_zset->buckets[c->id] == NULL) - goto error_bucket_create; - } + int ret = (int)VEC_SIZE(&h->arenas.vec); - default_zset->default_bucket = bucket_locked_new( - container_new_ravl(heap), alloc_class_by_id(h->alloc_classes, 
DEFAULT_ALLOC_CLASS_ID), - default_zset); + util_mutex_unlock(&h->arenas.lock); - if (default_zset->default_bucket == NULL) - goto error_bucket_create; + return ret; - heap->rt->default_zset = default_zset; - return 0; +err_push_back: + util_mutex_unlock(&h->arenas.lock); + heap_arena_delete(arena); + return -1; +} -error_bucket_create: - for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { - c = alloc_class_by_id(h->alloc_classes, i); - if (c != NULL) { - if (default_zset->buckets[c->id] != NULL) - bucket_locked_delete(default_zset->buckets[c->id]); - } +/* + * heap_get_narenas_total -- returns the number of all arenas in the heap + */ +unsigned +heap_get_narenas_total(struct palloc_heap *heap) +{ + struct heap_rt *h = heap->rt; + + util_mutex_lock(&h->arenas.lock); + + unsigned total = (unsigned)VEC_SIZE(&h->arenas.vec); + + util_mutex_unlock(&h->arenas.lock); + + return total; +} + +/* + * heap_get_narenas_max -- returns the max number of arenas + */ +unsigned +heap_get_narenas_max(struct palloc_heap *heap) +{ + struct heap_rt *h = heap->rt; + + util_mutex_lock(&h->arenas.lock); + + unsigned max = (unsigned)VEC_CAPACITY(&h->arenas.vec); + + util_mutex_unlock(&h->arenas.lock); + + return max; +} + +/* + * heap_set_narenas_max -- change the max number of arenas + */ +int +heap_set_narenas_max(struct palloc_heap *heap, unsigned size) +{ + struct heap_rt *h = heap->rt; + int ret = -1; + + util_mutex_lock(&h->arenas.lock); + + unsigned capacity = (unsigned)VEC_CAPACITY(&h->arenas.vec); + + if (size < capacity) { + ERR("cannot decrease max number of arenas"); + goto out; + } else if (size == capacity) { + ret = 0; + goto out; } - D_FREE(default_zset); - return -1; + + ret = VEC_RESERVE(&h->arenas.vec, size); + +out: + util_mutex_unlock(&h->arenas.lock); + return ret; } -static void -heap_default_zoneset_cleanup(struct palloc_heap *heap) +/* + * heap_get_narenas_auto -- returns the number of all automatic arenas + */ +unsigned +heap_get_narenas_auto(struct palloc_heap *heap) { - struct zoneset *default_zset = heap->rt->default_zset; - uint8_t i; + struct heap_rt *h = heap->rt; + struct arena *arena; + unsigned narenas = 0; - for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { - if (default_zset->buckets[i] == NULL) - continue; - bucket_locked_delete(default_zset->buckets[i]); + util_mutex_lock(&h->arenas.lock); + + VEC_FOREACH(arena, &h->arenas.vec) { + if (arena->automatic) + narenas++; } - bucket_locked_delete(default_zset->default_bucket); - for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { - if (default_zset->recyclers[i] == NULL) - continue; - recycler_delete(default_zset->recyclers[i]); + util_mutex_unlock(&h->arenas.lock); + + return narenas; +} + +/* + * heap_get_arena_buckets -- returns a pointer to buckets from the arena + */ +struct bucket_locked ** +heap_get_arena_buckets(struct palloc_heap *heap, unsigned arena_id) +{ + util_mutex_lock(&heap->rt->arenas.lock); + struct arena *a = heap_get_arena_by_id(heap, arena_id); + + util_mutex_unlock(&heap->rt->arenas.lock); + + return a->buckets; +} + +/* + * heap_get_arena_auto -- returns arena automatic value + */ +int +heap_get_arena_auto(struct palloc_heap *heap, unsigned arena_id) +{ + int value; + + util_mutex_lock(&heap->rt->arenas.lock); + + struct arena *a = heap_get_arena_by_id(heap, arena_id); + value = a->automatic; + + util_mutex_unlock(&heap->rt->arenas.lock); + + return value; +} + +/* + * heap_set_arena_auto -- sets arena automatic value + */ +int +heap_set_arena_auto(struct palloc_heap *heap, unsigned arena_id, + int automatic) +{ + 
unsigned nautomatic = 0; + struct arena *a; + struct heap_rt *h = heap->rt; + int ret = 0; + + util_mutex_lock(&h->arenas.lock); + VEC_FOREACH(a, &h->arenas.vec) + if (a->automatic) + nautomatic++; + + a = VEC_ARR(&heap->rt->arenas.vec)[arena_id - 1]; + + if (!automatic && nautomatic <= 1 && a->automatic) { + D_CRIT("at least one automatic arena must exist\n"); + ret = -1; + goto out; } - D_FREE(default_zset); - heap->rt->default_zset = NULL; + a->automatic = automatic; + +out: + util_mutex_unlock(&h->arenas.lock); + return ret; + +} + +/* + * heap_set_arena_thread -- assign arena with given id to the current thread + */ +void +heap_set_arena_thread(struct palloc_heap *heap, unsigned arena_id) +{ + os_mutex_lock(&heap->rt->arenas.lock); + heap_arena_thread_attach(heap, heap_get_arena_by_id(heap, arena_id)); + os_mutex_unlock(&heap->rt->arenas.lock); +} + +/* + * heap_get_procs -- returns the number of arenas to create + */ +unsigned +heap_get_procs(void) +{ + long cpus = sysconf(_SC_NPROCESSORS_ONLN); + + if (cpus < 1) + cpus = 1; + + unsigned arenas = (unsigned)cpus; + + DAV_DBG("creating %u arenas", arenas); + + return arenas; } /* @@ -929,16 +1451,61 @@ heap_default_zoneset_cleanup(struct palloc_heap *heap) int heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c) { - struct zoneset *default_zset = heap->rt->default_zset; + struct heap_rt *h = heap->rt; + size_t i; + struct arena *arena; + + VEC_FOREACH_BY_POS(i, &h->arenas.vec) { + arena = VEC_ARR(&h->arenas.vec)[i]; + if (arena->buckets[c->id] == NULL) + arena->buckets[c->id] = bucket_locked_new( + container_new_seglists(heap), c); + if (arena->buckets[c->id] == NULL) + goto error_cache_bucket_new; + } - if (default_zset->buckets[c->id] == NULL) { - default_zset->buckets[c->id] = - bucket_locked_new(container_new_seglists(heap), c, default_zset); - if (default_zset->buckets[c->id] == NULL) - return -1; + return 0; + +error_cache_bucket_new: + for (; i != 0; --i) + bucket_locked_delete( + VEC_ARR(&h->arenas.vec)[i - 1]->buckets[c->id]); + + return -1; +} + +/* + * heap_buckets_init -- (internal) initializes bucket instances + */ +int +heap_buckets_init(struct palloc_heap *heap) +{ + struct heap_rt *h = heap->rt; + + for (uint8_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *c = alloc_class_by_id(h->alloc_classes, i); + + if (c != NULL) { + if (heap_create_alloc_class_buckets(heap, c) != 0) + goto error_bucket_create; + } } + h->default_bucket = bucket_locked_new(container_new_ravl(heap), + alloc_class_by_id(h->alloc_classes, DEFAULT_ALLOC_CLASS_ID)); + + if (h->default_bucket == NULL) + goto error_bucket_create; + return 0; + +error_bucket_create: { + struct arena *arena; + + VEC_FOREACH(arena, &h->arenas.vec) + heap_arena_delete(arena); + } + return -1; } /* @@ -999,12 +1566,25 @@ heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, goto error_heap_malloc; } + err = arena_thread_assignment_init(&h->arenas.assignment, + Default_arenas_assignment_type); + if (err != 0) + goto error_assignment_init; + h->alloc_classes = alloc_class_collection_new(); if (h->alloc_classes == NULL) { err = ENOMEM; goto error_alloc_classes_new; } + unsigned narenas_default = Default_arenas_max == 0 ? 
+ heap_get_procs() : (unsigned)Default_arenas_max; + + if (heap_arenas_init(&h->arenas) != 0) { + err = ENOMEM; + goto error_arenas_malloc; + } + h->nzones = heap_max_zone(heap_size); h->zones_exhausted = 0; @@ -1024,18 +1604,27 @@ heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, heap->alloc_pattern = PALLOC_CTL_DEBUG_NO_PATTERN; VALGRIND_DO_CREATE_MEMPOOL(heap->layout, 0, 0); - if (heap_default_zoneset_init(heap) != 0) { - err = ENOMEM; - goto error_zoneset_init; + for (unsigned i = 0; i < narenas_default; ++i) { + if (VEC_PUSH_BACK(&h->arenas.vec, heap_arena_new(heap, 1))) { + err = ENOMEM; + goto error_vec_reserve; + } } + for (unsigned i = 0; i < MAX_ALLOCATION_CLASSES; ++i) + h->recyclers[i] = NULL; + heap_zone_update_if_needed(heap); return 0; -error_zoneset_init: +error_vec_reserve: + heap_arenas_fini(&h->arenas); +error_arenas_malloc: alloc_class_collection_delete(h->alloc_classes); error_alloc_classes_new: + arena_thread_assignment_fini(&h->arenas.assignment); +error_assignment_init: D_FREE(h); heap->rt = NULL; error_heap_malloc: @@ -1112,11 +1701,26 @@ heap_cleanup(struct palloc_heap *heap) alloc_class_collection_delete(rt->alloc_classes); - heap_default_zoneset_cleanup(heap); + arena_thread_assignment_fini(&rt->arenas.assignment); + bucket_locked_delete(rt->default_bucket); + + struct arena *arena; + + VEC_FOREACH(arena, &rt->arenas.vec) + heap_arena_delete(arena); for (unsigned i = 0; i < rt->nlocks; ++i) util_mutex_destroy(&rt->run_locks[i]); + heap_arenas_fini(&rt->arenas); + + for (int i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + if (heap->rt->recyclers[i] == NULL) + continue; + + recycler_delete(rt->recyclers[i]); + } + VALGRIND_DO_DESTROY_MEMPOOL(heap->layout); D_FREE(rt); diff --git a/src/common/dav/heap.h b/src/common/dav/heap.h index e1e205d076d..d3e2bba4cdf 100644 --- a/src/common/dav/heap.h +++ b/src/common/dav/heap.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2023, Intel Corporation */ +/* Copyright 2015-2021, Intel Corporation */ /* * heap.h -- internal definitions for heap @@ -18,62 +18,81 @@ #include "os_thread.h" #include "dav_internal.h" +extern enum dav_arenas_assignment_type Default_arenas_assignment_type; +extern size_t Default_arenas_max; + #define HEAP_OFF_TO_PTR(heap, off) ((void *)((char *)((heap)->base) + (off))) -#define HEAP_PTR_TO_OFF(heap, ptr) ((uintptr_t)(ptr) - (uintptr_t)((heap)->base)) - -#define BIT_IS_CLR(a, i) (!((a) & (1ULL << (i)))) -#define HEAP_ARENA_PER_THREAD (0) - -int -heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, uint64_t *sizep, - void *base, struct mo_ops *p_ops, struct stats *stats, struct pool_set *set); -int -heap_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops); -void -heap_cleanup(struct palloc_heap *heap); -int -heap_check(void *heap_start, uint64_t heap_size); -int -heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops); -int -heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c); -int -heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size); +#define HEAP_PTR_TO_OFF(heap, ptr)\ + ((uintptr_t)(ptr) - (uintptr_t)((heap)->base)) + +#define BIT_IS_CLR(a, i) (!((a) & (1ULL << (i)))) +#define HEAP_ARENA_PER_THREAD (0) + +int heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, + uint64_t *sizep, void *base, struct mo_ops *p_ops, + struct stats *stats, struct pool_set *set); +int heap_init(void *heap_start, uint64_t 
heap_size, uint64_t *sizep, struct mo_ops *p_ops); +void heap_cleanup(struct palloc_heap *heap); +int heap_check(void *heap_start, uint64_t heap_size); +int heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops); +int heap_buckets_init(struct palloc_heap *heap); +int heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c); +int heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size); struct alloc_class * heap_get_best_class(struct palloc_heap *heap, size_t size); struct bucket * -zoneset_bucket_acquire(struct zoneset *zset, uint8_t class_id); -void -zoneset_bucket_release(struct bucket *b); +heap_bucket_acquire(struct palloc_heap *heap, uint8_t class_id, uint16_t arena_id); +void heap_bucket_release(struct bucket *b); + +int heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m); +os_mutex_t *heap_get_run_lock(struct palloc_heap *heap, + uint32_t chunk_id); + +void heap_force_recycle(struct palloc_heap *heap); + +void heap_discard_run(struct palloc_heap *heap, struct memory_block *m); + +void heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m); + +int heap_free_chunk_reuse(struct palloc_heap *heap, + struct bucket *bucket, struct memory_block *m); + +void heap_foreach_object(struct palloc_heap *heap, object_callback cb, + void *arg, struct memory_block start); -int -heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, struct memory_block *m); -os_mutex_t * -heap_get_run_lock(struct palloc_heap *heap, uint32_t chunk_id); +struct alloc_class_collection *heap_alloc_classes(struct palloc_heap *heap); -void -heap_discard_run(struct palloc_heap *heap, struct memory_block *m); +void *heap_end(struct palloc_heap *heap); -void -heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m); +unsigned heap_get_narenas_total(struct palloc_heap *heap); -int -heap_free_chunk_reuse(struct palloc_heap *heap, struct bucket *bucket, struct memory_block *m); +unsigned heap_get_narenas_max(struct palloc_heap *heap); -void -heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg, - struct memory_block start); +int heap_set_narenas_max(struct palloc_heap *heap, unsigned size); -struct alloc_class_collection * -heap_alloc_classes(struct palloc_heap *heap); +unsigned heap_get_narenas_auto(struct palloc_heap *heap); -void * -heap_end(struct palloc_heap *heap); +unsigned heap_get_thread_arena_id(struct palloc_heap *heap); -void -heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int objects); +int heap_arena_create(struct palloc_heap *heap); + +struct bucket_locked ** +heap_get_arena_buckets(struct palloc_heap *heap, unsigned arena_id); + +int heap_get_arena_auto(struct palloc_heap *heap, unsigned arena_id); + +int heap_set_arena_auto(struct palloc_heap *heap, unsigned arena_id, + int automatic); + +void heap_set_arena_thread(struct palloc_heap *heap, unsigned arena_id); + +unsigned heap_get_procs(void); + +void heap_vg_open(struct palloc_heap *heap, object_callback cb, + void *arg, int objects); static inline struct chunk_header * heap_get_chunk_hdr(struct palloc_heap *heap, const struct memory_block *m) @@ -93,7 +112,4 @@ heap_get_chunk_run(struct palloc_heap *heap, const struct memory_block *m) return GET_CHUNK_RUN(heap->layout, m->zone_id, m->chunk_id); } -struct zoneset * -heap_get_zoneset(struct palloc_heap *heap, uint32_t zone_id); - #endif /* __DAOS_COMMON_HEAP_H */ diff --git a/src/common/dav/obj.h 
b/src/common/dav/obj.h index 470323da1ef..3140235d105 100644 --- a/src/common/dav/obj.h +++ b/src/common/dav/obj.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2014-2023, Intel Corporation */ +/* Copyright 2014-2021, Intel Corporation */ /* * obj.h -- internal definitions for obj module @@ -45,6 +45,7 @@ typedef uint64_t type_num_t; #define CLASS_ID_FROM_FLAG(flag)\ ((uint16_t)((flag) >> 48)) -#define EZONE_ID_FROM_FLAG(flag) ((uint32_t)((flag) >> 16)) +#define ARENA_ID_FROM_FLAG(flag)\ +((uint16_t)((flag) >> 32)) #endif /* __DAOS_COMMON_OBJ_H */ diff --git a/src/common/dav/palloc.c b/src/common/dav/palloc.c index 255303de4a2..a7b5424576f 100644 --- a/src/common/dav/palloc.c +++ b/src/common/dav/palloc.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2023, Intel Corporation */ +/* Copyright 2015-2022, Intel Corporation */ /* * palloc.c -- implementation of pmalloc POSIX-like API @@ -178,13 +178,15 @@ alloc_prep_block(struct palloc_heap *heap, const struct memory_block *m, * (best-fit, next-fit, ...) varies depending on the bucket container. */ static int -palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr constructor, - void *arg, uint64_t extra_field, uint16_t object_flags, uint16_t class_id, - uint32_t zset_id, struct dav_action_internal *out) +palloc_reservation_create(struct palloc_heap *heap, size_t size, + palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint16_t arena_id, + struct dav_action_internal *out) { - int err = 0; + int err = 0; + struct memory_block *new_block = &out->m; - struct zoneset *zset; out->type = DAV_ACTION_TYPE_HEAP; @@ -200,12 +202,6 @@ palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr c return -1; } - zset = heap_get_zoneset(heap, zset_id); - if (zset == NULL) { - errno = EINVAL; - return -1; - } - /* * The caller provided size in bytes, but buckets operate in * 'size indexes' which are multiples of the block size in the @@ -226,7 +222,7 @@ palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr c *new_block = MEMORY_BLOCK_NONE; new_block->size_idx = (uint32_t)size_idx; - struct bucket *b = zoneset_bucket_acquire(zset, c->id); + struct bucket *b = heap_bucket_acquire(heap, c->id, arena_id); err = heap_get_bestfit_block(heap, b, new_block); if (err != 0) @@ -258,7 +254,7 @@ palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr c out->new_state = MEMBLOCK_ALLOCATED; out: - zoneset_bucket_release(b); + heap_bucket_release(b); if (err == 0) return 0; @@ -300,17 +296,17 @@ static void palloc_restore_free_chunk_state(struct palloc_heap *heap, struct memory_block *m) { - struct zoneset *zset = heap_get_zoneset(heap, m->zone_id); - if (m->type == MEMORY_BLOCK_HUGE) { - struct bucket *b = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID); + struct bucket *b = heap_bucket_acquire(heap, + DEFAULT_ALLOC_CLASS_ID, + HEAP_ARENA_PER_THREAD); if (heap_free_chunk_reuse(heap, b, m) != 0) { if (errno == EEXIST) FATAL("duplicate runtime chunk state, possible double free"); else D_CRIT("unable to track runtime chunk state\n"); } - zoneset_bucket_release(b); + heap_bucket_release(b); } } @@ -577,15 +573,18 @@ palloc_exec_actions(struct palloc_heap *heap, * palloc_reserve -- creates a single reservation */ int -palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg, - uint64_t extra_field, uint16_t object_flags, uint16_t 
class_id, uint32_t zset_id, - struct dav_action *act) +palloc_reserve(struct palloc_heap *heap, size_t size, + palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint16_t arena_id, + struct dav_action *act) { COMPILE_ERROR_ON(sizeof(struct dav_action) != sizeof(struct dav_action_internal)); - return palloc_reservation_create(heap, size, constructor, arg, extra_field, object_flags, - class_id, zset_id, (struct dav_action_internal *)act); + return palloc_reservation_create(heap, size, constructor, arg, + extra_field, object_flags, class_id, arena_id, + (struct dav_action_internal *)act); } /* @@ -729,7 +728,7 @@ palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt int palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size, palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags, - uint16_t class_id, uint32_t zset_id, struct operation_context *ctx) + uint16_t class_id, uint16_t arena_id, struct operation_context *ctx) { size_t user_size = 0; @@ -759,8 +758,9 @@ palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, siz /* alloc or realloc */ if (size != 0) { alloc = &ops[nops++]; - if (palloc_reservation_create(heap, size, constructor, arg, extra_field, - object_flags, class_id, zset_id, alloc) != 0) { + if (palloc_reservation_create(heap, size, constructor, arg, + extra_field, object_flags, + class_id, arena_id, alloc) != 0) { operation_cancel(ctx); return -1; } @@ -907,6 +907,15 @@ palloc_boot(struct palloc_heap *heap, void *heap_start, base, p_ops, stats, set); } +/* + * palloc_buckets_init -- initialize buckets + */ +int +palloc_buckets_init(struct palloc_heap *heap) +{ + return heap_buckets_init(heap); +} + /* * palloc_init -- initializes palloc heap */ @@ -944,6 +953,15 @@ palloc_heap_check_remote(void *heap_start, uint64_t heap_size, return heap_check_remote(heap_start, heap_size, ops); } +/* + * palloc_heap_cleanup -- cleanups the volatile heap state + */ +void +palloc_heap_cleanup(struct palloc_heap *heap) +{ + heap_cleanup(heap); +} + #if VG_MEMCHECK_ENABLED /* * palloc_vg_register_alloc -- (internal) registers allocation header diff --git a/src/common/dav/palloc.h b/src/common/dav/palloc.h index 8c630e999e6..9c7560f1aaa 100644 --- a/src/common/dav/palloc.h +++ b/src/common/dav/palloc.h @@ -20,86 +20,85 @@ #define PALLOC_CTL_DEBUG_NO_PATTERN (-1) struct palloc_heap { - struct mo_ops p_ops; + struct mo_ops p_ops; struct heap_layout *layout; - struct heap_rt *rt; - uint64_t *sizep; - uint64_t growsize; - struct stats *stats; - struct pool_set *set; - void *base; - int alloc_pattern; + struct heap_rt *rt; + uint64_t *sizep; + uint64_t growsize; + + struct stats *stats; + struct pool_set *set; + + void *base; + + int alloc_pattern; }; struct memory_block; -struct zoneset; -typedef int (*palloc_constr)(void *base, void *ptr, size_t usable_size, void *arg); +typedef int (*palloc_constr)(void *base, void *ptr, + size_t usable_size, void *arg); -int -palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size, - palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags, - uint16_t class_id, uint32_t zset_id, struct operation_context *ctx); +int palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, + size_t size, palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint16_t arena_id, + struct 
operation_context *ctx); int -palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg, - uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id, +palloc_reserve(struct palloc_heap *heap, size_t size, + palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint16_t arena_id, struct dav_action *act); -int -palloc_action_isalloc(struct dav_action *act); -void -palloc_get_prange(struct dav_action *act, uint64_t *const off, uint64_t *const size, - int persist_udata); -uint64_t -palloc_get_realoffset(struct palloc_heap *heap, uint64_t off); +int palloc_action_isalloc(struct dav_action *act); +void palloc_get_prange(struct dav_action *act, uint64_t *const off, uint64_t *const size, + int persist_udata); +uint64_t palloc_get_realoffset(struct palloc_heap *heap, uint64_t off); void -palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act); +palloc_defer_free(struct palloc_heap *heap, uint64_t off, + struct dav_action *act); void -palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt); +palloc_cancel(struct palloc_heap *heap, + struct dav_action *actv, size_t actvcnt); void -palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt, - struct operation_context *ctx); +palloc_publish(struct palloc_heap *heap, + struct dav_action *actv, size_t actvcnt, + struct operation_context *ctx); void -palloc_set_value(struct palloc_heap *heap, struct dav_action *act, uint64_t *ptr, uint64_t value); +palloc_set_value(struct palloc_heap *heap, struct dav_action *act, + uint64_t *ptr, uint64_t value); -uint64_t -palloc_first(struct palloc_heap *heap); -uint64_t -palloc_next(struct palloc_heap *heap, uint64_t off); +uint64_t palloc_first(struct palloc_heap *heap); +uint64_t palloc_next(struct palloc_heap *heap, uint64_t off); -size_t -palloc_usable_size(struct palloc_heap *heap, uint64_t off); -uint64_t -palloc_extra(struct palloc_heap *heap, uint64_t off); -uint16_t -palloc_flags(struct palloc_heap *heap, uint64_t off); +size_t palloc_usable_size(struct palloc_heap *heap, uint64_t off); +uint64_t palloc_extra(struct palloc_heap *heap, uint64_t off); +uint16_t palloc_flags(struct palloc_heap *heap, uint64_t off); -int -palloc_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, uint64_t *sizep, - void *base, struct mo_ops *p_ops, struct stats *stats, struct pool_set *set); +int palloc_boot(struct palloc_heap *heap, void *heap_start, + uint64_t heap_size, uint64_t *sizep, + void *base, struct mo_ops *p_ops, + struct stats *stats, struct pool_set *set); -int -palloc_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops); -void * -palloc_heap_end(struct palloc_heap *h); -int -palloc_heap_check(void *heap_start, uint64_t heap_size); -int -palloc_heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops); -size_t -palloc_heap(void *heap_start); +int palloc_buckets_init(struct palloc_heap *heap); +int palloc_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops); +void *palloc_heap_end(struct palloc_heap *h); +int palloc_heap_check(void *heap_start, uint64_t heap_size); +int palloc_heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops); +void palloc_heap_cleanup(struct palloc_heap *heap); +size_t palloc_heap(void *heap_start); /* foreach callback, terminates iteration if return value is non-zero */ typedef int 
(*object_callback)(const struct memory_block *m, void *arg); #if VG_MEMCHECK_ENABLED -void -palloc_heap_vg_open(struct palloc_heap *heap, int objects); +void palloc_heap_vg_open(struct palloc_heap *heap, int objects); #endif #endif /* __DAOS_COMMON_PALLOC_H */ diff --git a/src/common/dav/recycler.c b/src/common/dav/recycler.c index be26d9d7114..07537a44bd4 100644 --- a/src/common/dav/recycler.c +++ b/src/common/dav/recycler.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2016-2023, Intel Corporation */ +/* Copyright 2016-2022, Intel Corporation */ /* * recycler.c -- implementation of run recycler @@ -49,7 +49,6 @@ recycler_element_cmp(const void *lhs, const void *rhs) struct recycler { struct ravl *runs; struct palloc_heap *heap; - struct zoneset *zset; /* * How many unaccounted units there *might* be inside of the memory @@ -62,7 +61,8 @@ struct recycler { */ size_t unaccounted_units[MAX_CHUNK]; size_t unaccounted_total; - size_t nallocs; + size_t nallocs; + size_t *peak_arenas; VEC(, struct recycler_element) recalc; @@ -73,7 +73,7 @@ struct recycler { * recycler_new -- creates new recycler instance */ struct recycler * -recycler_new(struct palloc_heap *heap, size_t nallocs, struct zoneset *zset) +recycler_new(struct palloc_heap *heap, size_t nallocs, size_t *peak_arenas) { struct recycler *r; @@ -88,7 +88,7 @@ recycler_new(struct palloc_heap *heap, size_t nallocs, struct zoneset *zset) r->heap = heap; r->nallocs = nallocs; - r->zset = zset; + r->peak_arenas = peak_arenas; r->unaccounted_total = 0; memset(&r->unaccounted_units, 0, sizeof(r->unaccounted_units)); @@ -219,7 +219,12 @@ recycler_recalc(struct recycler *r, int force) uint64_t units = r->unaccounted_total; - uint64_t recalc_threshold = THRESHOLD_MUL * r->nallocs; + size_t peak_arenas; + + util_atomic_load64(r->peak_arenas, &peak_arenas); + + uint64_t recalc_threshold = + THRESHOLD_MUL * peak_arenas * r->nallocs; if (!force && units < recalc_threshold) return runs; @@ -313,12 +318,3 @@ recycler_inc_unaccounted(struct recycler *r, const struct memory_block *m) util_fetch_and_add64(&r->unaccounted_units[m->chunk_id], m->size_idx); } - -/* - * Return the zoneset associated with the recycler. 
- */ -struct zoneset * -recycler_get_zoneset(struct recycler *r) -{ - return r->zset; -} diff --git a/src/common/dav/recycler.h b/src/common/dav/recycler.h index 7904289937d..2d68d8d70fc 100644 --- a/src/common/dav/recycler.h +++ b/src/common/dav/recycler.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2016-2023, Intel Corporation */ +/* Copyright 2016-2021, Intel Corporation */ /* * recycler.h -- internal definitions of run recycler @@ -25,8 +25,8 @@ struct recycler_element { uint32_t zone_id; }; -struct recycler * -recycler_new(struct palloc_heap *layout, size_t nallocs, struct zoneset *zset); +struct recycler *recycler_new(struct palloc_heap *layout, + size_t nallocs, size_t *peak_arenas); void recycler_delete(struct recycler *r); struct recycler_element recycler_element_new(struct palloc_heap *heap, const struct memory_block *m); @@ -40,7 +40,4 @@ struct empty_runs recycler_recalc(struct recycler *r, int force); void recycler_inc_unaccounted(struct recycler *r, const struct memory_block *m); -struct zoneset * -recycler_get_zoneset(struct recycler *r); - #endif /* __DAOS_COMMON_RECYCLER_H */ diff --git a/src/common/dav/tx.c b/src/common/dav/tx.c index c3bef536451..189dd073036 100644 --- a/src/common/dav/tx.c +++ b/src/common/dav/tx.c @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause */ -/* Copyright 2015-2023, Intel Corporation */ +/* Copyright 2015-2022, Intel Corporation */ /* * tx.c -- transactions implementation @@ -354,8 +354,8 @@ tx_alloc_common(struct tx *tx, size_t size, type_num_t type_num, return obj_tx_fail_null(ENOMEM, args.flags); if (palloc_reserve(pop->do_heap, size, constructor, &args, type_num, 0, - CLASS_ID_FROM_FLAG(args.flags), EZONE_ID_FROM_FLAG(args.flags), - action) != 0) + CLASS_ID_FROM_FLAG(args.flags), + ARENA_ID_FROM_FLAG(args.flags), action) != 0) goto err_oom; palloc_get_prange(action, &off, &size, 1); @@ -1265,7 +1265,68 @@ dav_tx_xadd_range(uint64_t hoff, size_t size, uint64_t flags) * dav_tx_alloc -- allocates a new object */ uint64_t -dav_tx_alloc(size_t size, uint64_t type_num, uint64_t flags) +dav_tx_alloc(size_t size, uint64_t type_num) +{ + uint64_t off; + + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + uint64_t flags = tx_abort_on_failure_flag(tx); + + if (size == 0) { + ERR("allocation with size 0"); + off = obj_tx_fail_null(EINVAL, flags); + DAV_API_END(); + return off; + } + + off = tx_alloc_common(tx, size, (type_num_t)type_num, + constructor_tx_alloc, ALLOC_ARGS(flags)); + + DAV_API_END(); + return off; +} + +/* + * dav_tx_zalloc -- allocates a new zeroed object + */ +uint64_t +dav_tx_zalloc(size_t size, uint64_t type_num) +{ + uint64_t off; + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + uint64_t flags = DAV_FLAG_ZERO; + + flags |= tx_abort_on_failure_flag(tx); + + DAV_API_START(); + if (size == 0) { + ERR("allocation with size 0"); + off = obj_tx_fail_null(EINVAL, flags); + DAV_API_END(); + return off; + } + + off = tx_alloc_common(tx, size, (type_num_t)type_num, + constructor_tx_alloc, ALLOC_ARGS(flags)); + + DAV_API_END(); + return off; +} + +/* + * dav_tx_xalloc -- allocates a new object + */ +uint64_t +dav_tx_xalloc(size_t size, uint64_t type_num, uint64_t flags) { uint64_t off; struct tx *tx = get_tx(); @@ -1424,6 +1485,121 @@ dav_tx_off2ptr(uint64_t off) return (void *)OBJ_OFF_TO_PTR(tx->pop, off); } +/* + * dav_reserve -- reserves a single object + */ +uint64_t +dav_reserve(dav_obj_t *pop, struct dav_action *act, 
size_t size, uint64_t type_num) +{ + DAV_DBG("pop %p act %p size %zu type_num %llx", + pop, act, size, + (unsigned long long)type_num); + + DAV_API_START(); + if (pop->do_utx == NULL && dav_umem_wtx_new(pop) == NULL) + return 0; + + if (palloc_reserve(pop->do_heap, size, NULL, NULL, type_num, + 0, 0, 0, act) != 0) { + DAV_API_END(); + return 0; + } + + DAV_API_END(); + return act->heap.offset; +} + +/* + * dav_defer_free -- creates a deferred free action + */ +void +dav_defer_free(dav_obj_t *pop, uint64_t off, struct dav_action *act) +{ + ASSERT(off != 0); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + palloc_defer_free(pop->do_heap, off, act); +} + +#if 0 +/* + * dav_publish -- publishes a collection of actions + */ +int +dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) +{ + DAV_API_START(); + struct operation_context *ctx = pmalloc_operation_hold(pop); + + size_t entries_size = actvcnt * sizeof(struct ulog_entry_val); + + if (operation_reserve(ctx, entries_size) != 0) { + DAV_API_END(); + return -1; + } + + palloc_publish(&pop->do_heap, actv, actvcnt, ctx); + + pmalloc_operation_release(pop); + + DAV_API_END(); + return 0; +} +#endif + +/* + * dav_cancel -- cancels collection of actions + */ +void +dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) +{ + DAV_DBG("actvcnt=%zu", actvcnt); + DAV_API_START(); + palloc_cancel(pop->do_heap, actv, actvcnt); + DAV_API_END(); +} + + +/* + * dav_tx_publish -- publishes actions inside of a transaction, + * with no_abort option + */ +int +dav_tx_publish(struct dav_action *actv, size_t actvcnt) +{ + struct tx *tx = get_tx(); + uint64_t flags = 0; + uint64_t off, size; + int ret; + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + DAV_API_START(); + + if (tx_action_reserve(tx, actvcnt) != 0) { + ret = obj_tx_fail_err(ENOMEM, flags); + + DAV_API_END(); + return ret; + } + + for (size_t i = 0; i < actvcnt; ++i) { + VEC_PUSH_BACK(&tx->actions, actv[i]); + if (palloc_action_isalloc(&actv[i])) { + palloc_get_prange(&actv[i], &off, &size, 1); + struct tx_range_def r = {off, size, DAV_XADD_NO_SNAPSHOT|DAV_XADD_WAL_CPTR}; + + ret = dav_tx_add_common(tx, &r); + D_ASSERT(ret == 0); + } + } + + DAV_API_END(); + return 0; +} + /* arguments for constructor_alloc */ struct constr_args { int zero_init; @@ -1431,6 +1607,7 @@ struct constr_args { void *arg; }; + /* arguments for constructor_alloc_root */ struct carg_root { size_t size; @@ -1608,8 +1785,10 @@ obj_alloc_construct(dav_obj_t *pop, uint64_t *offp, size_t size, ctx = pop->external; operation_start(ctx); - int ret = palloc_operation(pop->do_heap, 0, offp, size, constructor_alloc, &carg, type_num, - 0, CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), ctx); + int ret = palloc_operation(pop->do_heap, 0, offp, size, + constructor_alloc, &carg, type_num, 0, + CLASS_ID_FROM_FLAG(flags), ARENA_ID_FROM_FLAG(flags), + ctx); lw_tx_end(pop, NULL); return ret; @@ -1619,12 +1798,11 @@ obj_alloc_construct(dav_obj_t *pop, uint64_t *offp, size_t size, * dav_alloc -- allocates a new object */ int -dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags, - dav_constr constructor, void *arg) +dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size, + uint64_t type_num, dav_constr constructor, void *arg) { - DAV_DBG(3, "pop %p offp %p size %zu type_num %llx flags %llx constructor %p arg %p", pop, - offp, size, (unsigned long long)type_num, (unsigned long long)flags, constructor, - arg); + DAV_DBG("pop %p offp %p size 
%zu type_num %llx constructor %p arg %p", + pop, offp, size, (unsigned long long)type_num, constructor, arg); if (size == 0) { ERR("allocation with size 0"); @@ -1632,14 +1810,15 @@ dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64 return -1; } - if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) { - ERR("unknown flags 0x%" PRIx64, flags & ~DAV_TX_XALLOC_VALID_FLAGS); + if (offp == NULL) { + ERR("allocation offp is NULL"); errno = EINVAL; return -1; } DAV_API_START(); - int ret = obj_alloc_construct(pop, offp, size, type_num, flags, constructor, arg); + int ret = obj_alloc_construct(pop, offp, size, type_num, + 0, constructor, arg); DAV_API_END(); return ret; @@ -1710,146 +1889,3 @@ dav_memcpy_persist_relaxed(dav_obj_t *pop, void *dest, const void *src, DAV_API_END(); return ptr; } - -/* - * dav_reserve -- reserves a single object - */ -uint64_t -dav_reserve(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, uint64_t flags) -{ - struct constr_args carg; - - DAV_DBG(3, "pop %p act %p size %zu type_num %llx flags %llx", pop, act, size, - (unsigned long long)type_num, (unsigned long long)flags); - - if (flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS) { - ERR("unknown flags 0x%" PRIx64, flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS); - errno = EINVAL; - return 0; - } - - DAV_API_START(); - - if (pop->do_utx == NULL && dav_umem_wtx_new(pop) == NULL) - return 0; - - carg.zero_init = flags & DAV_FLAG_ZERO; - carg.constructor = NULL; - carg.arg = NULL; - - if (palloc_reserve(pop->do_heap, size, constructor_alloc, &carg, type_num, 0, - CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), act) != 0) { - DAV_API_END(); - return 0; - } - - DAV_API_END(); - return act->heap.offset; -} - -/* - * dav_defer_free -- creates a deferred free action - */ -void -dav_defer_free(dav_obj_t *pop, uint64_t off, struct dav_action *act) -{ - ASSERT(off != 0); - ASSERT(OBJ_OFF_IS_VALID(pop, off)); - palloc_defer_free(pop->do_heap, off, act); -} - -#if 0 -/* - * dav_publish -- publishes a collection of actions - */ -int -dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) -{ - DAV_API_START(); - struct operation_context *ctx = pmalloc_operation_hold(pop); - - size_t entries_size = actvcnt * sizeof(struct ulog_entry_val); - - if (operation_reserve(ctx, entries_size) != 0) { - DAV_API_END(); - return -1; - } - - palloc_publish(&pop->do_heap, actv, actvcnt, ctx); - - pmalloc_operation_release(pop); - - DAV_API_END(); - return 0; -} -#endif - -/* - * dav_cancel -- cancels collection of actions - */ -void -dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) -{ - DAV_DBG("actvcnt=%zu", actvcnt); - DAV_API_START(); - palloc_cancel(pop->do_heap, actv, actvcnt); - DAV_API_END(); -} - -/* - * dav_tx_publish -- publishes actions inside of a transaction, - * with no_abort option - */ -int -dav_tx_publish(struct dav_action *actv, size_t actvcnt) -{ - struct tx *tx = get_tx(); - uint64_t flags = 0; - uint64_t off, size; - int ret; - - ASSERT_IN_TX(tx); - ASSERT_TX_STAGE_WORK(tx); - - flags |= tx_abort_on_failure_flag(tx); - - DAV_API_START(); - - if (tx_action_reserve(tx, actvcnt) != 0) { - ret = obj_tx_fail_err(ENOMEM, flags); - - DAV_API_END(); - return ret; - } - - for (size_t i = 0; i < actvcnt; ++i) { - VEC_PUSH_BACK(&tx->actions, actv[i]); - if (palloc_action_isalloc(&actv[i])) { - palloc_get_prange(&actv[i], &off, &size, 1); - struct tx_range_def r = {off, size, - DAV_XADD_NO_SNAPSHOT | DAV_XADD_WAL_CPTR}; - - ret = dav_tx_add_common(tx, &r); - D_ASSERT(ret == 
0); - } - } - - DAV_API_END(); - return 0; -} - -/* - * dav_get_zone_evictable -- Returns an evictable zone id that can be used for - * allocations. If there are no evictable zone with sufficient free space then - * zero is returned which maps to non-evictable zone. - */ -uint32_t -dav_get_zone_evictable(dav_obj_t *pop, int flags) -{ - D_ASSERT(flags == 0); - /* REVISIT: TBD - * Return evictable zone that is currently marked as in-use and has sufficient free space. - * Else, find an evictable zone that has more that x% of free memory and mark it as in-use. - */ - return 0; -} diff --git a/src/common/dav_v2/README.md b/src/common/dav_v2/README.md new file mode 100644 index 00000000000..42616df7e6a --- /dev/null +++ b/src/common/dav_v2/README.md @@ -0,0 +1,6 @@ +# DAOS Allocator for VOS + +The DAV allocator for md_on_ssd phase 2 now supports evictable zones. This introduces a change in +the heap layout and is not compatible with the phase-1 DAV allocator. In order to support both +layouts, the new allocator is packaged as a separate library and linked to the daos_common_pmem +library. diff --git a/src/common/dav_v2/SConscript b/src/common/dav_v2/SConscript new file mode 100644 index 00000000000..8fd6c05ecd0 --- /dev/null +++ b/src/common/dav_v2/SConscript @@ -0,0 +1,30 @@ +"""Build dav_v2 libraries""" + + +SRC = ['alloc_class.c', 'bucket.c', 'container_ravl.c', 'container_seglists.c', 'critnib.c', + 'dav_clogs.c', 'dav_iface.c', 'heap.c', 'memblock.c', 'memops.c', 'palloc.c', 'ravl.c', + 'ravl_interval.c', 'recycler.c', 'stats.c', 'tx.c', 'ulog.c', 'util.c', 'wal_tx.c'] + + +def scons(): + """Scons function""" + + Import('env', 'base_env') + + env.AppendUnique(LIBPATH=[Dir('.')]) + base_env.AppendUnique(LIBPATH=[Dir('.')]) + base_env.d_add_build_rpath() + env.d_add_build_rpath() + + denv = env.Clone() + + denv.AppendUnique(LIBS=['pthread', 'gurt']) + denv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD', '-DDAV_V2_BUILD']) + denv.AppendUnique(CFLAGS=['-fvisibility=hidden']) + + dav_v2 = denv.d_library('dav_v2', SRC) + denv.Install('$PREFIX/lib64/', dav_v2) + + +if __name__ == "SCons.Script": + scons() diff --git a/src/common/dav_v2/alloc_class.c b/src/common/dav_v2/alloc_class.c new file mode 100644 index 00000000000..3dc5745db6a --- /dev/null +++ b/src/common/dav_v2/alloc_class.c @@ -0,0 +1,647 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2022, Intel Corporation */ + +/* + * alloc_class.c -- implementation of allocation classes + */ + +#include <float.h> +#include <string.h> + +#include "alloc_class.h" +#include "heap_layout.h" +#include "util.h" +#include "out.h" +#include "bucket.h" +#include "critnib.h" + +#define RUN_CLASS_KEY_PACK(map_idx_s, flags_s, size_idx_s)\ +((uint64_t)(map_idx_s) << 32 |\ +(uint64_t)(flags_s) << 16 |\ +(uint64_t)(size_idx_s)) + +/* + * Value used to mark a reserved spot in the bucket array. + */ +#define ACLASS_RESERVED ((void *)0xFFFFFFFFULL) + +/* + * The last size that is handled by runs. + */ +#define MAX_RUN_SIZE (CHUNKSIZE * 10) + +/* + * Maximum number of bytes the allocation class generation algorithm can decide + * to waste in a single run chunk. + */ +#define MAX_RUN_WASTED_BYTES 1024 + +/* + * Allocation categories are used for allocation classes generation. Each one + * defines the biggest handled size (in bytes) and step pct of the generation + * process. The step percentage defines maximum allowed external fragmentation + * for the category. 
+ */ +#define MAX_ALLOC_CATEGORIES 9 + +/* + * The first size (in byes) which is actually used in the allocation + * class generation algorithm. All smaller sizes use the first predefined bucket + * with the smallest run unit size. + */ +#define FIRST_GENERATED_CLASS_SIZE 128 + +/* + * The granularity of the allocation class generation algorithm. + */ +#define ALLOC_BLOCK_SIZE_GEN 64 + +/* + * The first predefined allocation class size + */ +#define MIN_UNIT_SIZE 128 + +static const struct { + size_t size; + float step; +} categories[MAX_ALLOC_CATEGORIES] = { + /* dummy category - the first allocation class is predefined */ + {FIRST_GENERATED_CLASS_SIZE, 0.05f}, + {1024, 0.05f}, + {2048, 0.05f}, + {4096, 0.05f}, + {8192, 0.05f}, + {16384, 0.05f}, + {32768, 0.05f}, + {131072, 0.05f}, + {393216, 0.05f}, +}; + +#define RUN_UNIT_MAX_ALLOC 8U + +/* + * Every allocation has to be a multiple of at least 8 because we need to + * ensure proper alignment of every persistent structure. + */ +#define ALLOC_BLOCK_SIZE 16 + +/* + * Converts size (in bytes) to number of allocation blocks. + */ +#define SIZE_TO_CLASS_MAP_INDEX(_s, _g) (1 + (((_s) - 1) / (_g))) + +/* + * Target number of allocations per run instance. + */ +#define RUN_MIN_NALLOCS 200 + +/* + * Hard limit of chunks per single run. + */ +#define RUN_SIZE_IDX_CAP (16) + +#define ALLOC_CLASS_DEFAULT_FLAGS CHUNK_FLAG_FLEX_BITMAP + +struct alloc_class_collection { + size_t granularity; + + struct alloc_class *aclasses[MAX_ALLOCATION_CLASSES]; + + /* + * The last size (in bytes) that is handled by runs, everything bigger + * uses the default class. + */ + size_t last_run_max_size; + + /* maps allocation classes to allocation sizes, excluding the header! */ + uint8_t *class_map_by_alloc_size; + + /* maps allocation classes to run unit sizes */ + struct critnib *class_map_by_unit_size; + + int fail_on_missing_class; + int autogenerate_on_missing_class; +}; + +/* + * alloc_class_find_first_free_slot -- searches for the + * first available allocation class slot + * + * This function must be thread-safe because allocation classes can be created + * at runtime. + */ +int +alloc_class_find_first_free_slot(struct alloc_class_collection *ac, + uint8_t *slot) +{ + for (int n = 0; n < MAX_ALLOCATION_CLASSES; ++n) { + if (util_bool_compare_and_swap64(&ac->aclasses[n], + NULL, ACLASS_RESERVED)) { + *slot = (uint8_t)n; + return 0; + } + } + + return -1; +} + +/* + * alloc_class_reserve -- reserve the specified class id + */ +int +alloc_class_reserve(struct alloc_class_collection *ac, uint8_t id) +{ + return util_bool_compare_and_swap64(&ac->aclasses[id], + NULL, ACLASS_RESERVED) ? 0 : -1; +} + +/* + * alloc_class_reservation_clear -- removes the reservation on class id + */ +static void +alloc_class_reservation_clear(struct alloc_class_collection *ac, int id) +{ + int ret = util_bool_compare_and_swap64(&ac->aclasses[id], + ACLASS_RESERVED, NULL); + ASSERT(ret); +} + +/* + * alloc_class_new -- creates a new allocation class + */ +struct alloc_class * +alloc_class_new(int id, struct alloc_class_collection *ac, + enum alloc_class_type type, enum header_type htype, + size_t unit_size, size_t alignment, + uint32_t size_idx) +{ + DAV_DBG("alloc_class_new id:%d\n", + (type == CLASS_HUGE) ? DEFAULT_ALLOC_CLASS_ID : id); + + struct alloc_class *c; + + D_ALLOC_PTR_NZ(c); + + if (c == NULL) + goto error_class_alloc; + + c->unit_size = unit_size; + c->header_type = htype; + c->type = type; + c->flags = (uint16_t) + (header_type_to_flag[c->header_type] | + (alignment ? 
CHUNK_FLAG_ALIGNED : 0)) | + ALLOC_CLASS_DEFAULT_FLAGS; + + switch (type) { + case CLASS_HUGE: + id = DEFAULT_ALLOC_CLASS_ID; + break; + case CLASS_RUN: + c->rdsc.alignment = alignment; + memblock_run_bitmap(&size_idx, c->flags, unit_size, + alignment, NULL, &c->rdsc.bitmap); + c->rdsc.nallocs = c->rdsc.bitmap.nbits; + c->rdsc.size_idx = size_idx; + + /* these two fields are duplicated from class */ + c->rdsc.unit_size = c->unit_size; + c->rdsc.flags = c->flags; + + uint8_t slot = (uint8_t)id; + + if (id < 0 && alloc_class_find_first_free_slot(ac, + &slot) != 0) + goto error_map_insert; + id = slot; + + size_t map_idx = SIZE_TO_CLASS_MAP_INDEX(c->unit_size, + ac->granularity); + ASSERT(map_idx <= UINT32_MAX); + uint32_t map_idx_s = (uint32_t)map_idx; + uint16_t size_idx_s = (uint16_t)size_idx; + uint16_t flags_s = (uint16_t)c->flags; + uint64_t k = RUN_CLASS_KEY_PACK(map_idx_s, + flags_s, size_idx_s); + + if (critnib_insert(ac->class_map_by_unit_size, + k, c) != 0) { + ERR("unable to register allocation class"); + goto error_map_insert; + } + + break; + default: + ASSERT(0); + } + + c->id = (uint8_t)id; + ac->aclasses[c->id] = c; + return c; + +error_map_insert: + D_FREE(c); +error_class_alloc: + if (id >= 0) + alloc_class_reservation_clear(ac, id); + + D_CRIT("alloc_class_new failed\n"); + return NULL; +} + +/* + * alloc_class_delete -- (internal) deletes an allocation class + */ +void +alloc_class_delete(struct alloc_class_collection *ac, + struct alloc_class *c) +{ + DAV_DBG("alloc_class_delete: %d\n", c->id); + + ac->aclasses[c->id] = NULL; + D_FREE(c); +} + +/* + * alloc_class_find_or_create -- (internal) searches for the + * biggest allocation class for which unit_size is evenly divisible by n. + * If no such class exists, create one. + */ +static struct alloc_class * +alloc_class_find_or_create(struct alloc_class_collection *ac, size_t n) +{ + COMPILE_ERROR_ON(MAX_ALLOCATION_CLASSES > UINT8_MAX); + uint64_t required_size_bytes = n * RUN_MIN_NALLOCS; + uint32_t required_size_idx = 1; + + if (required_size_bytes > RUN_DEFAULT_SIZE) { + required_size_bytes -= RUN_DEFAULT_SIZE; + required_size_idx += + CALC_SIZE_IDX(CHUNKSIZE, required_size_bytes); + if (required_size_idx > RUN_SIZE_IDX_CAP) + required_size_idx = RUN_SIZE_IDX_CAP; + } + + for (int i = MAX_ALLOCATION_CLASSES - 1; i >= 0; --i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c == NULL || c->type == CLASS_HUGE || + c->rdsc.size_idx < required_size_idx) + continue; + + if (n % c->unit_size == 0 && + n / c->unit_size <= RUN_UNIT_MAX_ALLOC) + return c; + } + + /* + * In order to minimize the wasted space at the end of the run the + * run data size must be divisible by the allocation class unit size + * with the smallest possible remainder, preferably 0. + */ + struct run_bitmap b; + size_t runsize_bytes = 0; + + do { + if (runsize_bytes != 0) /* don't increase on first iteration */ + n += ALLOC_BLOCK_SIZE_GEN; + + uint32_t size_idx = required_size_idx; + + memblock_run_bitmap(&size_idx, ALLOC_CLASS_DEFAULT_FLAGS, n, 0, + NULL, &b); + + runsize_bytes = RUN_CONTENT_SIZE_BYTES(size_idx) - b.size; + } while ((runsize_bytes % n) > MAX_RUN_WASTED_BYTES); + + /* + * Now that the desired unit size is found the existing classes need + * to be searched for possible duplicates. If a class that can handle + * the calculated size already exists, simply return that. 
+ */ + for (int i = 1; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c == NULL || c->type == CLASS_HUGE) + continue; + if (n / c->unit_size <= RUN_UNIT_MAX_ALLOC && + n % c->unit_size == 0) + return c; + if (c->unit_size == n) + return c; + } + + return alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT, n, 0, + required_size_idx); +} + +/* + * alloc_class_find_min_frag -- searches for an existing allocation + * class that will provide the smallest internal fragmentation for the given + * size. + */ +static struct alloc_class * +alloc_class_find_min_frag(struct alloc_class_collection *ac, size_t n) +{ + struct alloc_class *best_c = NULL; + size_t lowest_waste = SIZE_MAX; + + ASSERTne(n, 0); + + /* + * Start from the largest buckets in order to minimize unit size of + * allocated memory blocks. + */ + for (int i = MAX_ALLOCATION_CLASSES - 1; i >= 0; --i) { + struct alloc_class *c = ac->aclasses[i]; + + /* can't use alloc classes /w no headers by default */ + if (c == NULL || c->header_type == HEADER_NONE) + continue; + + size_t real_size = n + header_type_to_size[c->header_type]; + + size_t units = CALC_SIZE_IDX(c->unit_size, real_size); + + /* can't exceed the maximum allowed run unit max */ + if (c->type == CLASS_RUN && units > RUN_UNIT_MAX_ALLOC) + continue; + + if (c->unit_size * units == real_size) + return c; + + size_t waste = (c->unit_size * units) - real_size; + + /* + * If we assume that the allocation class is only ever going to + * be used with exactly one size, the effective internal + * fragmentation would be increased by the leftover + * memory at the end of the run. + */ + if (c->type == CLASS_RUN) { + size_t wasted_units = c->rdsc.nallocs % units; + size_t wasted_bytes = wasted_units * c->unit_size; + size_t waste_avg_per_unit = wasted_bytes / + c->rdsc.nallocs; + + waste += waste_avg_per_unit; + } + + if (best_c == NULL || lowest_waste > waste) { + best_c = c; + lowest_waste = waste; + } + } + + ASSERTne(best_c, NULL); + return best_c; +} + +/* + * alloc_class_collection_new -- creates a new collection of allocation classes + */ +struct alloc_class_collection * +alloc_class_collection_new() +{ + struct alloc_class_collection *ac; + + D_ALLOC_PTR(ac); + if (ac == NULL) + return NULL; + + ac->granularity = ALLOC_BLOCK_SIZE; + ac->last_run_max_size = MAX_RUN_SIZE; + ac->fail_on_missing_class = 0; + ac->autogenerate_on_missing_class = 1; + + size_t maps_size = (MAX_RUN_SIZE / ac->granularity) + 1; + + D_ALLOC_NZ(ac->class_map_by_alloc_size, maps_size); + if (ac->class_map_by_alloc_size == NULL) + goto error; + ac->class_map_by_unit_size = critnib_new(); + if (ac->class_map_by_unit_size == NULL) + goto error; + + memset(ac->class_map_by_alloc_size, 0xFF, maps_size); + + if (alloc_class_new(-1, ac, CLASS_HUGE, HEADER_COMPACT, + CHUNKSIZE, 0, 1) == NULL) + goto error; + + struct alloc_class *predefined_class = + alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT, + MIN_UNIT_SIZE, 0, 1); + if (predefined_class == NULL) + goto error; + + for (size_t i = 0; i < FIRST_GENERATED_CLASS_SIZE / ac->granularity; + ++i) { + ac->class_map_by_alloc_size[i] = predefined_class->id; + } + + /* + * Based on the defined categories, a set of allocation classes is + * created. The unit size of those classes is depended on the category + * initial size and step. 
+ */ + size_t granularity_mask = ALLOC_BLOCK_SIZE_GEN - 1; + + for (int c = 1; c < MAX_ALLOC_CATEGORIES; ++c) { + size_t n = categories[c - 1].size + ALLOC_BLOCK_SIZE_GEN; + + do { + if (alloc_class_find_or_create(ac, n) == NULL) + goto error; + + float stepf = (float)n * categories[c].step; + size_t stepi = (size_t)stepf; + + stepi = (stepf - (float)stepi < FLT_EPSILON) ? + stepi : stepi + 1; + + n += (stepi + (granularity_mask)) & ~granularity_mask; + } while (n <= categories[c].size); + } + + /* + * Find the largest alloc class and use it's unit size as run allocation + * threshold. + */ + uint8_t largest_aclass_slot; + + for (largest_aclass_slot = MAX_ALLOCATION_CLASSES - 1; + largest_aclass_slot > 0 && + ac->aclasses[largest_aclass_slot] == NULL; + --largest_aclass_slot) { + /* intentional NOP */ + } + + struct alloc_class *c = ac->aclasses[largest_aclass_slot]; + + /* + * The actual run might contain less unit blocks than the theoretical + * unit max variable. This may be the case for very large unit sizes. + */ + size_t real_unit_max = (c->rdsc.nallocs < RUN_UNIT_MAX_ALLOC) ? + c->rdsc.nallocs : RUN_UNIT_MAX_ALLOC; + + size_t theoretical_run_max_size = c->unit_size * real_unit_max; + + ac->last_run_max_size = theoretical_run_max_size <= MAX_RUN_SIZE ? + theoretical_run_max_size : MAX_RUN_SIZE; + +#ifdef DAV_EXTRA_DEBUG + /* + * Verify that each bucket's unit size points back to the bucket by the + * bucket map. This must be true for the default allocation classes, + * otherwise duplicate buckets will be created. + */ + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *cl = ac->aclasses[i]; + + if (cl != NULL && cl->type == CLASS_RUN) { + ASSERTeq(i, cl->id); + ASSERTeq(alloc_class_by_run(ac, cl->unit_size, + cl->flags, cl->rdsc.size_idx), cl); + } + } +#endif + + return ac; + +error: + alloc_class_collection_delete(ac); + + return NULL; +} + +/* + * alloc_class_collection_delete -- deletes the allocation class collection and + * all of the classes within it + */ +void +alloc_class_collection_delete(struct alloc_class_collection *ac) +{ + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + struct alloc_class *c = ac->aclasses[i]; + + if (c != NULL) + alloc_class_delete(ac, c); + } + + if (ac->class_map_by_unit_size) + critnib_delete(ac->class_map_by_unit_size); + D_FREE(ac->class_map_by_alloc_size); + D_FREE(ac); +} + +/* + * alloc_class_assign_by_size -- (internal) chooses the allocation class that + * best approximates the provided size + */ +static struct alloc_class * +alloc_class_assign_by_size(struct alloc_class_collection *ac, + size_t size) +{ + size_t class_map_index = SIZE_TO_CLASS_MAP_INDEX(size, + ac->granularity); + struct alloc_class *c = alloc_class_find_min_frag(ac, + class_map_index * ac->granularity); + + ASSERTne(c, NULL); + + /* + * We don't lock this array because locking this section here and then + * bailing out if someone else was faster would be still slower than + * just calculating the class and failing to assign the variable. + * We are using a compare and swap so that helgrind/drd don't complain. 
+ */ + util_bool_compare_and_swap64( + &ac->class_map_by_alloc_size[class_map_index], + MAX_ALLOCATION_CLASSES, c->id); + + DAV_DBG("alloc_class_assign_by_size: %zu id:%d", + size, c->id); + + return c; +} + +/* + * alloc_class_by_alloc_size -- returns allocation class that is assigned + * to handle an allocation of the provided size + */ +struct alloc_class * +alloc_class_by_alloc_size(struct alloc_class_collection *ac, size_t size) +{ + if (size < ac->last_run_max_size) { + uint8_t class_id = ac->class_map_by_alloc_size[ + SIZE_TO_CLASS_MAP_INDEX(size, ac->granularity)]; + + if (class_id == MAX_ALLOCATION_CLASSES) { + if (ac->fail_on_missing_class) + return NULL; + else if (ac->autogenerate_on_missing_class) + return alloc_class_assign_by_size(ac, size); + else + return ac->aclasses[DEFAULT_ALLOC_CLASS_ID]; + } + + return ac->aclasses[class_id]; + } else { + return ac->aclasses[DEFAULT_ALLOC_CLASS_ID]; + } +} + +/* + * alloc_class_by_run -- returns the allocation class that has the given + * unit size + */ +struct alloc_class * +alloc_class_by_run(struct alloc_class_collection *ac, + size_t unit_size, uint16_t flags, uint32_t size_idx) +{ + size_t map_idx = SIZE_TO_CLASS_MAP_INDEX(unit_size, ac->granularity); + + ASSERT(map_idx <= UINT32_MAX); + + uint32_t map_idx_s = (uint32_t)map_idx; + + ASSERT(size_idx <= UINT16_MAX); + + uint16_t size_idx_s = (uint16_t)size_idx; + uint16_t flags_s = (uint16_t)flags; + + return critnib_get(ac->class_map_by_unit_size, + RUN_CLASS_KEY_PACK(map_idx_s, flags_s, size_idx_s)); +} + +/* + * alloc_class_by_id -- returns the allocation class with an id + */ +struct alloc_class * +alloc_class_by_id(struct alloc_class_collection *ac, uint8_t id) +{ + return ac->aclasses[id]; +} + +/* + * alloc_class_calc_size_idx -- calculates how many units does the size require + */ +ssize_t +alloc_class_calc_size_idx(struct alloc_class *c, size_t size) +{ + uint32_t size_idx = CALC_SIZE_IDX(c->unit_size, + size + header_type_to_size[c->header_type]); + + if (c->type == CLASS_RUN) { + if (c->header_type == HEADER_NONE && size_idx != 1) + return -1; + else if (size_idx > RUN_UNIT_MAX) + return -1; + else if (size_idx > c->rdsc.nallocs) + return -1; + } + + return size_idx; +} diff --git a/src/common/dav_v2/alloc_class.h b/src/common/dav_v2/alloc_class.h new file mode 100644 index 00000000000..676c064d975 --- /dev/null +++ b/src/common/dav_v2/alloc_class.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2020, Intel Corporation */ + +/* + * alloc_class.h -- internal definitions for allocation classes + */ + +#ifndef __DAOS_COMMON_ALLOC_CLASS_H +#define __DAOS_COMMON_ALLOC_CLASS_H 1 + +#include <stddef.h> +#include <stdint.h> +#include <sys/types.h> +#include "heap_layout.h" +#include "memblock.h" + +#define MAX_ALLOCATION_CLASSES (UINT8_MAX) +#define DEFAULT_ALLOC_CLASS_ID (0) +#define RUN_UNIT_MAX RUN_BITS_PER_VALUE + +struct alloc_class_collection; + +enum alloc_class_type { + CLASS_UNKNOWN, + CLASS_HUGE, + CLASS_RUN, + + MAX_ALLOC_CLASS_TYPES +}; + +struct alloc_class { + uint8_t id; + uint16_t flags; + + size_t unit_size; + + enum header_type header_type; + enum alloc_class_type type; + + /* run-specific data */ + struct run_descriptor rdsc; +}; + +struct alloc_class_collection *alloc_class_collection_new(void); +void alloc_class_collection_delete(struct alloc_class_collection *ac); + +struct alloc_class *alloc_class_by_run( + struct alloc_class_collection *ac, + size_t unit_size, uint16_t flags, uint32_t size_idx); +struct alloc_class 
*alloc_class_by_alloc_size( + struct alloc_class_collection *ac, size_t size); +struct alloc_class *alloc_class_by_id( + struct alloc_class_collection *ac, uint8_t id); + +int alloc_class_reserve(struct alloc_class_collection *ac, uint8_t id); +int alloc_class_find_first_free_slot(struct alloc_class_collection *ac, + uint8_t *slot); + +ssize_t +alloc_class_calc_size_idx(struct alloc_class *c, size_t size); + +struct alloc_class * +alloc_class_new(int id, struct alloc_class_collection *ac, + enum alloc_class_type type, enum header_type htype, + size_t unit_size, size_t alignment, + uint32_t size_idx); + +void alloc_class_delete(struct alloc_class_collection *ac, + struct alloc_class *c); + +#endif /* __DAOS_COMMON_ALLOC_CLASS_H */ diff --git a/src/common/dav_v2/bucket.c b/src/common/dav_v2/bucket.c new file mode 100644 index 00000000000..33aba6167c5 --- /dev/null +++ b/src/common/dav_v2/bucket.c @@ -0,0 +1,275 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * bucket.c -- bucket implementation + * + * Buckets manage volatile state of the heap. They are the abstraction layer + * between the heap-managed chunks/runs and memory allocations. + * + * Each bucket instance can have a different underlying container that is + * responsible for selecting blocks - which means that whether the allocator + * serves memory blocks in best/first/next -fit manner is decided during bucket + * creation. + */ + +#include "alloc_class.h" +#include "bucket.h" +#include "heap.h" +#include "memblock.h" +#include "out.h" +#include "sys_util.h" +#include "valgrind_internal.h" + +struct bucket { + /* this struct is both the lock guard and the locked state */ + struct bucket_locked *locked; + struct alloc_class *aclass; + struct block_container *container; + const struct block_container_ops *c_ops; + struct memory_block_reserved *active_memory_block; + struct zoneset *zset; + int is_active; +}; + +struct bucket_locked { + struct bucket bucket; + pthread_mutex_t lock; +}; + +/* + * bucket_init -- initializes the bucket's runtime state + */ +static int +bucket_init(struct bucket *b, struct block_container *c, + struct alloc_class *aclass) +{ + b->container = c; + b->c_ops = c->c_ops; + + b->is_active = 0; + b->active_memory_block = NULL; + if (aclass && aclass->type == CLASS_RUN) { + D_ALLOC_PTR(b->active_memory_block); + + if (b->active_memory_block == NULL) + return -1; + } + b->aclass = aclass; + + return 0; +} + +/* + * bucket_fini -- destroys the bucket's runtime state + */ +static void +bucket_fini(struct bucket *b) +{ + if (b->active_memory_block) + D_FREE(b->active_memory_block); + b->c_ops->destroy(b->container); +} + +/* + * bucket_locked_new -- creates a new locked bucket instance + */ +struct bucket_locked * +bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct zoneset *zset) +{ + ASSERTne(c, NULL); + + struct bucket_locked *b; + + D_ALLOC_PTR_NZ(b); + if (b == NULL) + return NULL; + + if (bucket_init(&b->bucket, c, aclass) != 0) + goto err_bucket_init; + + util_mutex_init(&b->lock); + b->bucket.locked = b; + b->bucket.zset = zset; + + return b; + +err_bucket_init: + D_FREE(b); + return NULL; +} + +/* + * bucket_locked_delete -- cleanups and deallocates locked bucket instance + */ +void +bucket_locked_delete(struct bucket_locked *b) +{ + bucket_fini(&b->bucket); + util_mutex_destroy(&b->lock); + D_FREE(b); +} + +/* + * bucket_acquire -- acquires a usable bucket struct + */ +struct bucket * +bucket_acquire(struct bucket_locked *b) 
+{ + util_mutex_lock(&b->lock); + return &b->bucket; +} + +/* + * bucket_release -- releases a bucket struct + */ +void +bucket_release(struct bucket *b) +{ + util_mutex_unlock(&b->locked->lock); +} + +/* + * bucket_try_insert_attached_block -- tries to return a previously allocated + * memory block back to the original bucket + */ +void +bucket_try_insert_attached_block(struct bucket *b, const struct memory_block *m) +{ + struct memory_block *active = &b->active_memory_block->m; + + if (b->is_active && + m->chunk_id == active->chunk_id && + m->zone_id == active->zone_id) { + bucket_insert_block(b, m); + } +} + +/* + * bucket_alloc_class -- returns the bucket's alloc class + */ +struct alloc_class * +bucket_alloc_class(struct bucket *b) +{ + return b->aclass; +} + +/* + * bucket_insert_block -- inserts a block into the bucket + */ +int +bucket_insert_block(struct bucket *b, const struct memory_block *m) +{ +#if VG_MEMCHECK_ENABLED || VG_HELGRIND_ENABLED || VG_DRD_ENABLED + if (On_memcheck || On_drd_or_hg) { + size_t size = m->m_ops->get_real_size(m); + void *data = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_NOACCESS(data, size); + VALGRIND_ANNOTATE_NEW_MEMORY(data, size); + } +#endif + return b->c_ops->insert(b->container, m); +} + +/* + * bucket_remove_block -- removes an exact block from the bucket + */ +int +bucket_remove_block(struct bucket *b, const struct memory_block *m) +{ + return b->c_ops->get_rm_exact(b->container, m); +} + +/* + * bucket_alloc_block -- allocates a block from the bucket + */ +int +bucket_alloc_block(struct bucket *b, struct memory_block *m_out) +{ + return b->c_ops->get_rm_bestfit(b->container, m_out); +} + +/* + * bucket_memblock_insert_block -- (internal) bucket insert wrapper + * for callbacks + */ +static int +bucket_memblock_insert_block(const struct memory_block *m, void *b) +{ + return bucket_insert_block(b, m); +} + +/* + * bucket_attach_run - attaches a run to a bucket, making it active + */ +int +bucket_attach_run(struct bucket *b, const struct memory_block *m) +{ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + int ret = m->m_ops->iterate_free(m, bucket_memblock_insert_block, b); + + util_mutex_unlock(lock); + + if (ret == 0) { + b->active_memory_block->m = *m; + b->active_memory_block->bucket = b->locked; + b->is_active = 1; + util_fetch_and_add64(&b->active_memory_block->nresv, 1); + } else { + b->c_ops->rm_all(b->container); + } + return 0; +} + +/* + * bucket_detach_run - gets rid of the active block in the bucket + */ +int +bucket_detach_run(struct bucket *b, struct memory_block *m_out, int *empty) +{ + *empty = 0; + + struct memory_block_reserved **active = &b->active_memory_block; + + if (b->is_active) { + b->c_ops->rm_all(b->container); + if (util_fetch_and_sub64(&(*active)->nresv, 1) == 1) { + *m_out = (*active)->m; + *empty = 1; + + VALGRIND_ANNOTATE_HAPPENS_AFTER(&(*active)->nresv); + (*active)->m = MEMORY_BLOCK_NONE; + } else { + VALGRIND_ANNOTATE_HAPPENS_BEFORE(&(*active)->nresv); + *active = NULL; + } + b->is_active = 0; + } + + if (*active == NULL) { + D_ALLOC_PTR(*active); + if (*active == NULL) + return -1; + } + + return 0; +} + +/* + * bucket_active_block -- returns the bucket active block + */ +struct memory_block_reserved * +bucket_active_block(struct bucket *b) +{ + return b->is_active ? 
b->active_memory_block : NULL; +} + +struct zoneset * +bucket_get_zoneset(struct bucket *b) +{ + return b->zset; +} diff --git a/src/common/dav_v2/bucket.h b/src/common/dav_v2/bucket.h new file mode 100644 index 00000000000..b0d92b66995 --- /dev/null +++ b/src/common/dav_v2/bucket.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * bucket.h -- internal definitions for bucket + */ + +#ifndef __DAOS_COMMON_BUCKET_H +#define __DAOS_COMMON_BUCKET_H 1 + +#include <stddef.h> +#include <stdint.h> + +#include "alloc_class.h" +#include "container.h" +#include "memblock.h" + +#define CALC_SIZE_IDX(_unit_size, _size)\ + ((_size) == 0 ? 0 : (uint32_t)((((_size)-1) / (_unit_size)) + 1)) + +struct bucket_locked; +struct bucket; + +struct bucket_locked * +bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct zoneset *zset); + +struct bucket *bucket_acquire(struct bucket_locked *b); +void bucket_release(struct bucket *b); + +struct alloc_class *bucket_alloc_class(struct bucket *b); +int bucket_insert_block(struct bucket *b, const struct memory_block *m); +void bucket_try_insert_attached_block(struct bucket *b, + const struct memory_block *m); +int bucket_remove_block(struct bucket *b, const struct memory_block *m); +int bucket_alloc_block(struct bucket *b, struct memory_block *m_out); + +int bucket_attach_run(struct bucket *b, const struct memory_block *m); +int bucket_detach_run(struct bucket *b, + struct memory_block *m_out, int *empty); + +struct memory_block_reserved *bucket_active_block(struct bucket *b); + +void bucket_locked_delete(struct bucket_locked *b); +struct zoneset * +bucket_get_zoneset(struct bucket *b); + +#endif /* __DAOS_COMMON_BUCKET_H */ diff --git a/src/common/dav_v2/container.h b/src/common/dav_v2/container.h new file mode 100644 index 00000000000..2ec71e88243 --- /dev/null +++ b/src/common/dav_v2/container.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2020, Intel Corporation */ + +/* + * container.h -- internal definitions for block containers + */ + +#ifndef __DAOS_COMMON_CONTAINER_H +#define __DAOS_COMMON_CONTAINER_H 1 + +#include "memblock.h" + +struct block_container { + const struct block_container_ops *c_ops; + struct palloc_heap *heap; +}; + +struct block_container_ops { + /* inserts a new memory block into the container */ + int (*insert)(struct block_container *c, const struct memory_block *m); + + /* removes exact match memory block */ + int (*get_rm_exact)(struct block_container *c, + const struct memory_block *m); + + /* removes and returns the best-fit memory block for size */ + int (*get_rm_bestfit)(struct block_container *c, + struct memory_block *m); + + /* checks whether the container is empty */ + int (*is_empty)(struct block_container *c); + + /* removes all elements from the container */ + void (*rm_all)(struct block_container *c); + + /* deletes the container */ + void (*destroy)(struct block_container *c); +}; + +struct palloc_heap; +struct block_container *container_new_ravl(struct palloc_heap *heap); +struct block_container *container_new_seglists(struct palloc_heap *heap); + +#endif /* __DAOS_COMMON_CONTAINER_H */ diff --git a/src/common/dav_v2/container_ravl.c b/src/common/dav_v2/container_ravl.c new file mode 100644 index 00000000000..8cf5033c44d --- /dev/null +++ b/src/common/dav_v2/container_ravl.c @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2022, Intel Corporation */ + +/* + * 
container_ravl.c -- implementation of ravl-based block container + */ + +#include "container.h" +#include "ravl.h" +#include "out.h" +#include "sys_util.h" + +struct block_container_ravl { + struct block_container super; + struct ravl *tree; +}; + +/* + * container_compare_memblocks -- (internal) compares two memory blocks + */ +static int +container_compare_memblocks(const void *lhs, const void *rhs) +{ + const struct memory_block *l = lhs; + const struct memory_block *r = rhs; + + int64_t diff = (int64_t)l->size_idx - (int64_t)r->size_idx; + + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->zone_id - (int64_t)r->zone_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->chunk_id - (int64_t)r->chunk_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->block_off - (int64_t)r->block_off; + if (diff != 0) + return diff > 0 ? 1 : -1; + + return 0; +} + +/* + * container_ravl_insert_block -- (internal) inserts a new memory block + * into the container + */ +static int +container_ravl_insert_block(struct block_container *bc, + const struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + struct memory_block *e = m->m_ops->get_user_data(m); + + VALGRIND_DO_MAKE_MEM_DEFINED(e, sizeof(*e)); + VALGRIND_ADD_TO_TX(e, sizeof(*e)); + *e = *m; + VALGRIND_SET_CLEAN(e, sizeof(*e)); + VALGRIND_REMOVE_FROM_TX(e, sizeof(*e)); + + return ravl_insert(c->tree, e); +} + +/* + * container_ravl_get_rm_block_bestfit -- (internal) removes and returns the + * best-fit memory block for size + */ +static int +container_ravl_get_rm_block_bestfit(struct block_container *bc, + struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + struct ravl_node *n = ravl_find(c->tree, m, + RAVL_PREDICATE_GREATER_EQUAL); + + if (n == NULL) + return ENOMEM; + + struct memory_block *e = ravl_data(n); + *m = *e; + ravl_remove(c->tree, n); + + return 0; +} + +/* + * container_ravl_get_rm_block_exact -- + * (internal) removes exact match memory block + */ +static int +container_ravl_get_rm_block_exact(struct block_container *bc, + const struct memory_block *m) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + struct ravl_node *n = ravl_find(c->tree, m, RAVL_PREDICATE_EQUAL); + + if (n == NULL) + return ENOMEM; + + ravl_remove(c->tree, n); + + return 0; +} + +/* + * container_ravl_is_empty -- (internal) checks whether the container is empty + */ +static int +container_ravl_is_empty(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + return ravl_empty(c->tree); +} + +/* + * container_ravl_rm_all -- (internal) removes all elements from the tree + */ +static void +container_ravl_rm_all(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + ravl_clear(c->tree); +} + +/* + * container_ravl_delete -- (internal) deletes the container + */ +static void +container_ravl_destroy(struct block_container *bc) +{ + struct block_container_ravl *c = + (struct block_container_ravl *)bc; + + ravl_delete(c->tree); + + D_FREE(bc); +} + +/* + * Tree-based block container used to provide best-fit functionality to the + * bucket. The time complexity for this particular container is O(k) where k is + * the length of the key. + * + * The get methods also guarantee that the block with lowest possible address + * that best matches the requirements is provided. 
+ */ +static const struct block_container_ops container_ravl_ops = { + .insert = container_ravl_insert_block, + .get_rm_exact = container_ravl_get_rm_block_exact, + .get_rm_bestfit = container_ravl_get_rm_block_bestfit, + .is_empty = container_ravl_is_empty, + .rm_all = container_ravl_rm_all, + .destroy = container_ravl_destroy, +}; + +/* + * container_new_ravl -- allocates and initializes a ravl container + */ +struct block_container * +container_new_ravl(struct palloc_heap *heap) +{ + struct block_container_ravl *bc; + + D_ALLOC_PTR_NZ(bc); + if (bc == NULL) + goto error_container_malloc; + + bc->super.heap = heap; + bc->super.c_ops = &container_ravl_ops; + bc->tree = ravl_new(container_compare_memblocks); + if (bc->tree == NULL) + goto error_ravl_new; + + return (struct block_container *)&bc->super; + +error_ravl_new: + D_FREE(bc); + +error_container_malloc: + return NULL; +} diff --git a/src/common/dav_v2/container_seglists.c b/src/common/dav_v2/container_seglists.c new file mode 100644 index 00000000000..943d70ad87d --- /dev/null +++ b/src/common/dav_v2/container_seglists.c @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2022, Intel Corporation */ + +/* + * container_seglists.c -- implementation of segregated lists block container + * + * This container is constructed from N (up to 64) intrusive lists and a + * single 8 byte bitmap that stores the information whether a given list is + * empty or not. + */ + +#include "container.h" +#include "out.h" +#include "sys_util.h" +#include "util.h" +#include "valgrind_internal.h" +#include "vecq.h" + +#define SEGLIST_BLOCK_LISTS 64U + +struct block_container_seglists { + struct block_container super; + struct memory_block m; + + VECQ(, uint32_t) blocks[SEGLIST_BLOCK_LISTS]; + uint64_t nonempty_lists; +}; + +/* + * container_seglists_insert_block -- (internal) inserts a new memory block + * into the container + */ +static int +container_seglists_insert_block(struct block_container *bc, + const struct memory_block *m) +{ + ASSERT(m->chunk_id < MAX_CHUNK); + ASSERT(m->zone_id < UINT16_MAX); + ASSERTne(m->size_idx, 0); + + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + if (c->nonempty_lists == 0) + c->m = *m; + + ASSERT(m->size_idx <= SEGLIST_BLOCK_LISTS); + ASSERT(m->chunk_id == c->m.chunk_id); + ASSERT(m->zone_id == c->m.zone_id); + + if (VECQ_ENQUEUE(&c->blocks[m->size_idx - 1], m->block_off) != 0) + return -1; + + /* marks the list as nonempty */ + c->nonempty_lists |= 1ULL << (m->size_idx - 1); + + return 0; +} + +/* + * container_seglists_get_rm_block_bestfit -- (internal) removes and returns the + * best-fit memory block for size + */ +static int +container_seglists_get_rm_block_bestfit(struct block_container *bc, + struct memory_block *m) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + ASSERT(m->size_idx <= SEGLIST_BLOCK_LISTS); + uint32_t i = 0; + + /* applicable lists */ + uint64_t size_mask = (1ULL << (m->size_idx - 1)) - 1; + uint64_t v = c->nonempty_lists & ~size_mask; + + if (v == 0) + return ENOMEM; + + /* finds the list that serves the smallest applicable size */ + i = util_lssb_index64(v); + + uint32_t block_offset = VECQ_DEQUEUE(&c->blocks[i]); + + if (VECQ_SIZE(&c->blocks[i]) == 0) /* marks the list as empty */ + c->nonempty_lists &= ~(1ULL << (i)); + + *m = c->m; + m->block_off = block_offset; + m->size_idx = i + 1; + + return 0; +} + +/* + * container_seglists_is_empty -- (internal) checks whether the container is + * 
empty + */ +static int +container_seglists_is_empty(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + return c->nonempty_lists == 0; +} + +/* + * container_seglists_rm_all -- (internal) removes all elements from the tree + */ +static void +container_seglists_rm_all(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_CLEAR(&c->blocks[i]); + + c->nonempty_lists = 0; +} + +/* + * container_seglists_delete -- (internal) deletes the container + */ +static void +container_seglists_destroy(struct block_container *bc) +{ + struct block_container_seglists *c = + (struct block_container_seglists *)bc; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_DELETE(&c->blocks[i]); + + D_FREE(c); +} + +/* + * This container does not support retrieval of exact memory blocks, but other + * than provides best-fit in O(1) time for unit sizes that do not exceed 64. + */ +static const struct block_container_ops container_seglists_ops = { + .insert = container_seglists_insert_block, + .get_rm_exact = NULL, + .get_rm_bestfit = container_seglists_get_rm_block_bestfit, + .is_empty = container_seglists_is_empty, + .rm_all = container_seglists_rm_all, + .destroy = container_seglists_destroy, +}; + +/* + * container_new_seglists -- allocates and initializes a seglists container + */ +struct block_container * +container_new_seglists(struct palloc_heap *heap) +{ + struct block_container_seglists *bc; + + D_ALLOC_PTR_NZ(bc); + if (bc == NULL) + goto error_container_malloc; + + bc->super.heap = heap; + bc->super.c_ops = &container_seglists_ops; + + for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i) + VECQ_INIT(&bc->blocks[i]); + bc->nonempty_lists = 0; + + return (struct block_container *)&bc->super; + +error_container_malloc: + return NULL; +} diff --git a/src/common/dav_v2/critnib.c b/src/common/dav_v2/critnib.c new file mode 100644 index 00000000000..8a33d7d883d --- /dev/null +++ b/src/common/dav_v2/critnib.c @@ -0,0 +1,678 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2022, Intel Corporation */ + +/* + * critnib.c -- implementation of critnib tree + * + * It offers identity lookup (like a hashmap) and <= lookup (like a search + * tree). Unlike some hashing algorithms (cuckoo hash, perfect hashing) the + * complexity isn't constant, but for data sizes we expect it's several + * times as fast as cuckoo, and has no "stop the world" cases that would + * cause latency (ie, better worst case behavior). + */ + +/* + * STRUCTURE DESCRIPTION + * + * Critnib is a hybrid between a radix tree and DJ Bernstein's critbit: + * it skips nodes for uninteresting radix nodes (ie, ones that would have + * exactly one child), this requires adding to every node a field that + * describes the slice (4-bit in our case) that this radix level is for. + * + * This implementation also stores each node's path (ie, bits that are + * common to every key in that subtree) -- this doesn't help with lookups + * at all (unused in == match, could be reconstructed at no cost in <= + * after first dive) but simplifies inserts and removes. If we ever want + * that piece of memory it's easy to trim it down. + */ + +/* + * CONCURRENCY ISSUES + * + * Reads are completely lock-free sync-free, but only almost wait-free: + * if for some reason a read thread gets pathologically stalled, it will + * notice the data being stale and restart the work. 
In usual cases, + * the structure having been modified does _not_ cause a restart. + * + * Writes could be easily made lock-free as well (with only a cmpxchg + * sync), but this leads to problems with removes. A possible solution + * would be doing removes by overwriting by NULL w/o freeing -- yet this + * would lead to the structure growing without bounds. Complex per-node + * locks would increase concurrency but they slow down individual writes + * enough that in practice a simple global write lock works faster. + * + * Removes are the only operation that can break reads. The structure + * can do local RCU well -- the problem being knowing when it's safe to + * free. Any synchronization with reads would kill their speed, thus + * instead we have a remove count. The grace period is DELETED_LIFE, + * after which any read will notice staleness and restart its work. + */ +#include <errno.h> +#include <stdbool.h> + +#include "critnib.h" +#include "out.h" +#include "sys_util.h" +#include "valgrind_internal.h" +#include "util.h" + +/* + * A node that has been deleted is left untouched for this many delete + * cycles. Reads have guaranteed correctness if they took no longer than + * DELETED_LIFE concurrent deletes, otherwise they notice something is + * wrong and restart. The memory of deleted nodes is never freed to + * malloc nor their pointers lead anywhere wrong, thus a stale read will + * (temporarily) get a wrong answer but won't crash. + * + * There's no need to count writes as they never interfere with reads. + * + * Allowing stale reads (of arbitrarily old writes or of deletes less than + * DELETED_LIFE old) might sound counterintuitive, but it doesn't affect + * semantics in any way: the thread could have been stalled just after + * returning from our code. Thus, the guarantee is: the result of get() or + * find_le() is a value that was current at any point between the call + * start and end. + */ +#define DELETED_LIFE 16 + +#define SLICE 4 +#define NIB ((1ULL << SLICE) - 1) +#define SLNODES (1 << SLICE) + +typedef unsigned char sh_t; + +struct critnib_node { + /* + * path is the part of a tree that's already traversed (be it through + * explicit nodes or collapsed links) -- ie, any subtree below has all + * those bits set to this value. + * + * nib is a 4-bit slice that's an index into the node's children. + * + * shift is the length (in bits) of the part of the key below this node. 
+ * + * nib + * |XXXXXXXXXX|?|*****| + * path ^ + * +-----+ + * shift + */ + struct critnib_node *child[SLNODES]; + uint64_t path; + sh_t shift; +}; + +struct critnib_leaf { + uint64_t key; + void *value; +}; + +struct critnib { + struct critnib_node *root; + + /* pool of freed nodes: singly linked list, next at child[0] */ + struct critnib_node *deleted_node; + struct critnib_leaf *deleted_leaf; + + /* nodes removed but not yet eligible for reuse */ + struct critnib_node *pending_del_nodes[DELETED_LIFE]; + struct critnib_leaf *pending_del_leaves[DELETED_LIFE]; + + uint64_t remove_count; + + pthread_mutex_t mutex; /* writes/removes */ +}; + +/* + * atomic load + */ +static void +load(void *src, void *dst) +{ + util_atomic_load_explicit64((uint64_t *)src, (uint64_t *)dst, + memory_order_acquire); +} + +/* + * atomic store + */ +static void +store(void *dst, void *src) +{ + util_atomic_store_explicit64((uint64_t *)dst, (uint64_t)src, + memory_order_release); +} + +/* + * internal: is_leaf -- check tagged pointer for leafness + */ +static inline bool +is_leaf(struct critnib_node *n) +{ + return (uint64_t)n & 1; +} + +/* + * internal: to_leaf -- untag a leaf pointer + */ +static inline struct critnib_leaf * +to_leaf(struct critnib_node *n) +{ + return (void *)((uint64_t)n & ~1ULL); +} + +/* + * internal: path_mask -- return bit mask of a path above a subtree [shift] + * bits tall + */ +static inline uint64_t +path_mask(sh_t shift) +{ + return ~NIB << shift; +} + +/* + * internal: slice_index -- return index of child at the given nib + */ +static inline unsigned +slice_index(uint64_t key, sh_t shift) +{ + return (unsigned)((key >> shift) & NIB); +} + +/* + * critnib_new -- allocates a new critnib structure + */ +struct critnib * +critnib_new(void) +{ + struct critnib *c; + + D_ALLOC_PTR(c); + if (!c) + return NULL; + + util_mutex_init(&c->mutex); + + VALGRIND_HG_DRD_DISABLE_CHECKING(&c->root, sizeof(c->root)); + VALGRIND_HG_DRD_DISABLE_CHECKING(&c->remove_count, + sizeof(c->remove_count)); + + return c; +} + +/* + * internal: delete_node -- recursively free (to malloc) a subtree + */ +static void +delete_node(struct critnib_node *__restrict n) +{ + if (!is_leaf(n)) { + for (int i = 0; i < SLNODES; i++) { + if (n->child[i]) + delete_node(n->child[i]); + } + + D_FREE(n); + } else { + void *ptr; + + ptr = (void *)to_leaf(n); + D_FREE(ptr); + } +} + +/* + * critnib_delete -- destroy and free a critnib struct + */ +void +critnib_delete(struct critnib *c) +{ + if (c->root) + delete_node(c->root); + + util_mutex_destroy(&c->mutex); + + for (struct critnib_node *m = c->deleted_node; m; ) { + struct critnib_node *mm = m->child[0]; + + D_FREE(m); + m = mm; + } + + for (struct critnib_leaf *k = c->deleted_leaf; k; ) { + struct critnib_leaf *kk = k->value; + + D_FREE(k); + k = kk; + } + + for (int i = 0; i < DELETED_LIFE; i++) { + D_FREE(c->pending_del_nodes[i]); + D_FREE(c->pending_del_leaves[i]); + } + + D_FREE(c); +} + +/* + * internal: free_node -- free (to internal pool, not malloc) a node. + * + * We cannot free them to malloc as a stalled reader thread may still walk + * through such nodes; it will notice the result being bogus but only after + * completing the walk, thus we need to ensure any freed nodes still point + * to within the critnib structure. 
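+ *
+ * Freed nodes are therefore kept on an internal singly-linked list threaded
+ * through child[0] (see struct critnib) and are reused by alloc_node(); they
+ * are only returned to malloc when the whole critnib is deleted.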
+ */ +static void +free_node(struct critnib *__restrict c, struct critnib_node *__restrict n) +{ + if (!n) + return; + + ASSERT(!is_leaf(n)); + n->child[0] = c->deleted_node; + c->deleted_node = n; +} + +/* + * internal: alloc_node -- allocate a node from our pool or from malloc + */ +static struct critnib_node * +alloc_node(struct critnib *__restrict c) +{ + if (!c->deleted_node) { + struct critnib_node *n; + + D_ALLOC_PTR_NZ(n); + if (n == NULL) + D_CRIT("Malloc!\n"); + + return n; + } + + struct critnib_node *n = c->deleted_node; + + c->deleted_node = n->child[0]; + VALGRIND_ANNOTATE_NEW_MEMORY(n, sizeof(*n)); + + return n; +} + +/* + * internal: free_leaf -- free (to internal pool, not malloc) a leaf. + * + * See free_node(). + */ +static void +free_leaf(struct critnib *__restrict c, struct critnib_leaf *__restrict k) +{ + if (!k) + return; + + k->value = c->deleted_leaf; + c->deleted_leaf = k; +} + +/* + * internal: alloc_leaf -- allocate a leaf from our pool or from malloc + */ +static struct critnib_leaf * +alloc_leaf(struct critnib *__restrict c) +{ + if (!c->deleted_leaf) { + struct critnib_leaf *k; + + D_ALLOC_PTR_NZ(k); + if (k == NULL) + D_CRIT("Malloc!\n"); + + return k; + } + + struct critnib_leaf *k = c->deleted_leaf; + + c->deleted_leaf = k->value; + VALGRIND_ANNOTATE_NEW_MEMORY(k, sizeof(*k)); + + return k; +} + +/* + * critnib_insert -- write a key:value pair to the critnib structure + * + * Returns: + * - 0 on success + * - EEXIST if such a key already exists + * - ENOMEM if we're out of memory + * + * Takes a global write lock but doesn't stall any readers. + */ +int +critnib_insert(struct critnib *c, uint64_t key, void *value) +{ + util_mutex_lock(&c->mutex); + + struct critnib_leaf *k = alloc_leaf(c); + + if (!k) { + util_mutex_unlock(&c->mutex); + + return ENOMEM; + } + + VALGRIND_HG_DRD_DISABLE_CHECKING(k, sizeof(struct critnib_leaf)); + + k->key = key; + k->value = value; + + struct critnib_node *kn = (void *)((uint64_t)k | 1); + + struct critnib_node *n = c->root; + + if (!n) { + c->root = kn; + + util_mutex_unlock(&c->mutex); + + return 0; + } + + struct critnib_node **parent = &c->root; + struct critnib_node *prev = c->root; + + while (n && !is_leaf(n) && (key & path_mask(n->shift)) == n->path) { + prev = n; + parent = &n->child[slice_index(key, n->shift)]; + n = *parent; + } + + if (!n) { + n = prev; + store(&n->child[slice_index(key, n->shift)], kn); + + util_mutex_unlock(&c->mutex); + + return 0; + } + + uint64_t path = is_leaf(n) ? to_leaf(n)->key : n->path; + /* Find where the path differs from our key. */ + uint64_t at = path ^ key; + + if (!at) { + ASSERT(is_leaf(n)); + free_leaf(c, to_leaf(kn)); + /* fail instead of replacing */ + + util_mutex_unlock(&c->mutex); + + return EEXIST; + } + + /* and convert that to an index. 
*/ + sh_t sh = util_mssb_index64(at) & (sh_t)~(SLICE - 1); + + struct critnib_node *m = alloc_node(c); + + if (!m) { + free_leaf(c, to_leaf(kn)); + + util_mutex_unlock(&c->mutex); + + return ENOMEM; + } + VALGRIND_HG_DRD_DISABLE_CHECKING(m, sizeof(struct critnib_node)); + + for (int i = 0; i < SLNODES; i++) + m->child[i] = NULL; + + m->child[slice_index(key, sh)] = kn; + m->child[slice_index(path, sh)] = n; + m->shift = sh; + m->path = key & path_mask(sh); + store(parent, m); + + util_mutex_unlock(&c->mutex); + + return 0; +} + +/* + * critnib_remove -- delete a key from the critnib structure, return its value + */ +void * +critnib_remove(struct critnib *c, uint64_t key) +{ + struct critnib_leaf *k; + void *value = NULL; + + util_mutex_lock(&c->mutex); + + struct critnib_node *n = c->root; + + if (!n) + goto not_found; + + uint64_t del = util_fetch_and_add64(&c->remove_count, 1) % DELETED_LIFE; + + free_node(c, c->pending_del_nodes[del]); + free_leaf(c, c->pending_del_leaves[del]); + c->pending_del_nodes[del] = NULL; + c->pending_del_leaves[del] = NULL; + + if (is_leaf(n)) { + k = to_leaf(n); + if (k->key == key) { + store(&c->root, NULL); + goto del_leaf; + } + + goto not_found; + } + /* + * n and k are a parent:child pair (after the first iteration); k is the + * leaf that holds the key we're deleting. + */ + struct critnib_node **k_parent = &c->root; + struct critnib_node **n_parent = &c->root; + struct critnib_node *kn = n; + + while (!is_leaf(kn)) { + n_parent = k_parent; + n = kn; + k_parent = &kn->child[slice_index(key, kn->shift)]; + kn = *k_parent; + + if (!kn) + goto not_found; + } + + k = to_leaf(kn); + if (k->key != key) + goto not_found; + + store(&n->child[slice_index(key, n->shift)], NULL); + + /* Remove the node if there's only one remaining child. */ + int ochild = -1; + + for (int i = 0; i < SLNODES; i++) { + if (n->child[i]) { + if (ochild != -1) + goto del_leaf; + + ochild = i; + } + } + + ASSERTne(ochild, -1); + + store(n_parent, n->child[ochild]); + c->pending_del_nodes[del] = n; + +del_leaf: + value = k->value; + c->pending_del_leaves[del] = k; + +not_found: + util_mutex_unlock(&c->mutex); + return value; +} + +/* + * critnib_get -- query for a key ("==" match), returns value or NULL + * + * Doesn't need a lock but if many deletes happened while our thread was + * somehow stalled the query is restarted (as freed nodes remain unused only + * for a grace period). + * + * Counterintuitively, it's pointless to return the most current answer, + * we need only one that was valid at any point after the call started. + */ +void * +critnib_get(struct critnib *c, uint64_t key) +{ + uint64_t wrs1, wrs2; + void *res; + + do { + struct critnib_node *n; + + load(&c->remove_count, &wrs1); + load(&c->root, &n); + + /* + * critbit algorithm: dive into the tree, looking at nothing but + * each node's critical bit^H^H^Hnibble. This means we risk + * going wrong way if our path is missing, but that's ok... + */ + while (n && !is_leaf(n)) + load(&n->child[slice_index(key, n->shift)], &n); + + /* ... as we check it at the end. */ + struct critnib_leaf *k = to_leaf(n); + + res = (n && k->key == key) ? 
k->value : NULL; + load(&c->remove_count, &wrs2); + } while (wrs1 + DELETED_LIFE <= wrs2); + + return res; +} + +/* + * internal: find_successor -- return the rightmost non-null node in a subtree + */ +static void * +find_successor(struct critnib_node *__restrict n) +{ + while (1) { + int nib; + + for (nib = NIB; nib >= 0; nib--) + if (n->child[nib]) + break; + + if (nib < 0) + return NULL; + + n = n->child[nib]; + if (is_leaf(n)) + return to_leaf(n)->value; + } +} + +/* + * internal: find_le -- recursively search <= in a subtree + */ +static void * +find_le(struct critnib_node *__restrict n, uint64_t key) +{ + if (!n) + return NULL; + + if (is_leaf(n)) { + struct critnib_leaf *k = to_leaf(n); + + return (k->key <= key) ? k->value : NULL; + } + + /* + * is our key outside the subtree we're in? + * + * If we're inside, all bits above the nib will be identical; note + * that shift points at the nib's lower rather than upper edge, so it + * needs to be masked away as well. + */ + if ((key ^ n->path) >> (n->shift) & ~NIB) { + /* + * subtree is too far to the left? + * -> its rightmost value is good + */ + if (n->path < key) + return find_successor(n); + + /* + * subtree is too far to the right? + * -> it has nothing of interest to us + */ + return NULL; + } + + unsigned nib = slice_index(key, n->shift); + + /* recursive call: follow the path */ + { + struct critnib_node *m; + + load(&n->child[nib], &m); + + void *value = find_le(m, key); + + if (value) + return value; + } + + /* + * nothing in that subtree? We strayed from the path at this point, + * thus need to search every subtree to our left in this node. No + * need to dive into any but the first non-null, though. + */ + for (; nib > 0; nib--) { + struct critnib_node *m; + + load(&n->child[nib - 1], &m); + if (m) { + n = m; + if (is_leaf(n)) + return to_leaf(n)->value; + + return find_successor(n); + } + } + + return NULL; +} + +/* + * critnib_find_le -- query for a key ("<=" match), returns value or NULL + * + * Same guarantees as critnib_get(). + */ +void * +critnib_find_le(struct critnib *c, uint64_t key) +{ + uint64_t wrs1, wrs2; + void *res; + + do { + load(&c->remove_count, &wrs1); + + struct critnib_node *n; /* avoid a subtle TOCTOU */ + + load(&c->root, &n); + res = n ? find_le(n, key) : NULL; + load(&c->remove_count, &wrs2); + } while (wrs1 + DELETED_LIFE <= wrs2); + + return res; +} diff --git a/src/common/dav_v2/critnib.h b/src/common/dav_v2/critnib.h new file mode 100644 index 00000000000..b07815fba4c --- /dev/null +++ b/src/common/dav_v2/critnib.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2020, Intel Corporation */ + +/* + * critnib.h -- internal definitions for critnib tree + */ + +#ifndef __DAOS_COMMON_CRITNIB_H +#define __DAOS_COMMON_CRITNIB_H 1 + +#include <stdint.h> + +struct critnib; + +struct critnib *critnib_new(void); +void critnib_delete(struct critnib *c); + +int critnib_insert(struct critnib *c, uint64_t key, void *value); +void *critnib_remove(struct critnib *c, uint64_t key); +void *critnib_get(struct critnib *c, uint64_t key); +void *critnib_find_le(struct critnib *c, uint64_t key); + +#endif /* __DAOS_COMMON_CRITNIB_H */ diff --git a/src/common/dav_v2/dav_clogs.c b/src/common/dav_v2/dav_clogs.c new file mode 100644 index 00000000000..a27eabe02d6 --- /dev/null +++ b/src/common/dav_v2/dav_clogs.c @@ -0,0 +1,104 @@ +/** + * (C) Copyright 2015-2022 Intel Corporation. 
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include <sys/types.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <sys/stat.h> +#include <sys/mman.h> + +#include "dav_internal.h" +#include "memops.h" +#include "tx.h" + +static void +clogs_extend_free(struct ulog *redo) +{ + D_FREE(redo); +} + +static int +clogs_extend_redo(struct ulog **redo, uint64_t gen_num) +{ + size_t size = SIZEOF_ALIGNED_ULOG(LANE_REDO_EXTERNAL_SIZE); + + D_ALIGNED_ALLOC_NZ(*redo, CACHELINE_SIZE, size); + if (*redo == NULL) + return -1; + + size_t capacity = ALIGN_DOWN(size - sizeof(struct ulog), CACHELINE_SIZE); + + ulog_construct_new(*redo, capacity, gen_num, 0); + return 0; +} + +static int +clogs_extend_undo(struct ulog **undo, uint64_t gen_num) +{ + size_t size = TX_DEFAULT_RANGE_CACHE_SIZE; + + D_ALIGNED_ALLOC_NZ(*undo, CACHELINE_SIZE, size); + if (*undo == NULL) + return -1; + + size_t capacity = ALIGN_DOWN(size - sizeof(struct ulog), CACHELINE_SIZE); + + ulog_construct_new(*undo, capacity, gen_num, 0); + return 0; +} + +int +dav_create_clogs(dav_obj_t *hdl) +{ + + ulog_construct_new((struct ulog *)&hdl->clogs.external, + LANE_REDO_EXTERNAL_SIZE, 0, 0); + ulog_construct_new((struct ulog *)&hdl->clogs.undo, + LANE_UNDO_SIZE, 0, 0); + + hdl->external = operation_new((struct ulog *)&hdl->clogs.external, + LANE_REDO_EXTERNAL_SIZE, clogs_extend_redo, clogs_extend_free, + &hdl->p_ops, LOG_TYPE_REDO); + if (hdl->external == NULL) + return -1; + hdl->undo = operation_new((struct ulog *)&hdl->clogs.undo, + LANE_UNDO_SIZE, clogs_extend_undo, clogs_extend_free, + &hdl->p_ops, LOG_TYPE_UNDO); + if (hdl->undo == NULL) { + operation_delete(hdl->external); + return -1; + } + return 0; +} + +void +dav_destroy_clogs(dav_obj_t *hdl) +{ + operation_free_logs(hdl->external); + operation_delete(hdl->external); + operation_free_logs(hdl->undo); + operation_delete(hdl->undo); +} + +int +dav_hold_clogs(dav_obj_t *hdl) +{ + if (hdl->nested_tx++ == 0) { + operation_init(hdl->external); + operation_init(hdl->undo); + } + return 0; +} + +int +dav_release_clogs(dav_obj_t *hdl) +{ + if (hdl->nested_tx == 0) + FATAL("release clogs"); + --hdl->nested_tx; + return 0; +} diff --git a/src/common/dav_v2/dav_clogs.h b/src/common/dav_v2/dav_clogs.h new file mode 100644 index 00000000000..8c7af256ccc --- /dev/null +++ b/src/common/dav_v2/dav_clogs.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2022, Intel Corporation */ + +/* + * dav_iface.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) + */ + +#ifndef __DAOS_COMMON_DAV_CLOGS_H +#define __DAOS_COMMON_CLOGS_H 1 + +#include <stdint.h> +#include <sys/types.h> +#include "ulog.h" + +#define LANE_TOTAL_SIZE (3072) /* 3 * 1024 (sum of 3 old lane sections) */ +/* + * We have 3 kilobytes to distribute be split between transactional redo + * and undo logs. + * Since by far the most space consuming operations are transactional + * snapshots, most of the space, 2304 bytes, is assigned to the undo log. + * After that, the remainder, 640 bytes, or 40 ulog entries, is left for the + * transactional redo logs. + * Thanks to this distribution, all small and medium transactions should be + * entirely performed without allocating any additional metadata. + * + * These values must be cacheline size aligned to be used for ulogs. Therefore + * they are parametrized for the size of the struct ulog changes between + * platforms. 
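+ *
+ * For example, with a 64-byte struct ulog and 64-byte cachelines,
+ * LANE_REDO_EXTERNAL_SIZE evaluates to ALIGN_UP(704 - 64, 64) = 640 bytes and
+ * LANE_UNDO_SIZE to 3072 - 640 - 2 * 64 = 2304 bytes, matching the
+ * distribution described above.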
+ */ +#define LANE_UNDO_SIZE (LANE_TOTAL_SIZE \ + - LANE_REDO_EXTERNAL_SIZE \ + - 2 * sizeof(struct ulog)) /* 2304 for 64B ulog */ +#define LANE_REDO_EXTERNAL_SIZE ALIGN_UP(704 - sizeof(struct ulog), \ + CACHELINE_SIZE) /* 640 for 64B ulog */ + +struct dav_clogs { + /* + * Redo log for large operations/transactions. + * Can be extended by the use of internal ulog. + */ + struct ULOG(LANE_REDO_EXTERNAL_SIZE) external; + /* + * Undo log for snapshots done in a transaction. + * Can be extended/shrunk by the use of internal ulog. + */ + struct ULOG(LANE_UNDO_SIZE) undo; +}; + +typedef struct dav_obj dav_obj_t; + +int dav_create_clogs(dav_obj_t *hdl); +void dav_destroy_clogs(dav_obj_t *hdl); +int dav_hold_clogs(dav_obj_t *hdl); +int dav_release_clogs(dav_obj_t *hdl); + +#endif /* __DAOS_COMMON_DAV_CLOGS_H */ diff --git a/src/common/dav_v2/dav_iface.c b/src/common/dav_v2/dav_iface.c new file mode 100644 index 00000000000..3879e56c2d4 --- /dev/null +++ b/src/common/dav_v2/dav_iface.c @@ -0,0 +1,434 @@ +/** + * (C) Copyright 2015-2023 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include <sys/types.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <uuid/uuid.h> + +#include <daos/mem.h> +#include "dav_internal.h" +#include "heap.h" +#include "palloc.h" +#include "mo_wal.h" +#include "obj.h" + +#define DAV_HEAP_INIT 0x1 +#define MEGABYTE ((uintptr_t)1 << 20) + +/* + * get_uuid_lo -- (internal) evaluates XOR sum of least significant + * 8 bytes with most significant 8 bytes. + */ +static inline uint64_t +get_uuid_lo(uuid_t uuid) +{ + uint64_t uuid_lo = 0; + + for (int i = 0; i < 8; i++) + uuid_lo = (uuid_lo << 8) | (uuid[i] ^ uuid[8 + i]); + + return uuid_lo; +} + +static void +setup_dav_phdr(dav_obj_t *hdl) +{ + struct dav_phdr *hptr; + uuid_t uuid; + + ASSERT(hdl->do_base != NULL); + hptr = (struct dav_phdr *)(hdl->do_base); + uuid_generate(uuid); + hptr->dp_uuid_lo = get_uuid_lo(uuid); + hptr->dp_root_offset = 0; + hptr->dp_root_size = 0; + hptr->dp_heap_offset = sizeof(struct dav_phdr); + hptr->dp_heap_size = hdl->do_size - sizeof(struct dav_phdr); + hptr->dp_stats_persistent.heap_curr_allocated = 0; + hdl->do_phdr = hptr; +} + +static void +persist_dav_phdr(dav_obj_t *hdl) +{ + mo_wal_persist(&hdl->p_ops, hdl->do_phdr, offsetof(struct dav_phdr, dp_unused)); +} + +static dav_obj_t * +dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct umem_store *store) +{ + dav_obj_t *hdl = NULL; + void *base; + char *heap_base; + uint64_t heap_size; + uint64_t num_pages; + int persist_hdr = 0; + int err = 0; + int rc; + + base = mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (base == MAP_FAILED) + return NULL; + + D_ALIGNED_ALLOC(hdl, CACHELINE_SIZE, sizeof(dav_obj_t)); + if (hdl == NULL) { + err = ENOMEM; + goto out0; + } + + /* REVISIT: In future pass the meta instance as argument instead of fd */ + hdl->do_fd = fd; + hdl->do_base = base; + hdl->do_size = sz; + hdl->p_ops.base = hdl; + + hdl->do_store = store; + if (hdl->do_store->stor_priv == NULL) { + D_ERROR("meta context not defined. 
WAL commit disabled for %s\n", path); + } else { + rc = umem_cache_alloc(store, 0); + if (rc != 0) { + D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc)); + err = rc; + goto out1; + } + } + + D_STRNDUP(hdl->do_path, path, strlen(path)); + + num_pages = (sz + UMEM_CACHE_PAGE_SZ - 1) >> UMEM_CACHE_PAGE_SZ_SHIFT; + rc = umem_cache_map_range(hdl->do_store, 0, base, num_pages); + if (rc != 0) { + D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc)); + err = rc; + goto out2; + } + + if (flags & DAV_HEAP_INIT) { + setup_dav_phdr(hdl); + heap_base = (char *)hdl->do_base + hdl->do_phdr->dp_heap_offset; + heap_size = hdl->do_phdr->dp_heap_size; + + rc = lw_tx_begin(hdl); + if (rc) { + err = ENOMEM; + goto out2; + } + + rc = heap_init(heap_base, heap_size, &hdl->do_phdr->dp_heap_size, + &hdl->p_ops); + if (rc) { + err = rc; + goto out2; + } + persist_hdr = 1; + } else { + hdl->do_phdr = hdl->do_base; + + D_ASSERT(store != NULL); + + rc = store->stor_ops->so_load(store, hdl->do_base); + if (rc) { + D_ERROR("Failed to read blob to vos file %s, rc = %d\n", path, rc); + goto out2; + } + + rc = hdl->do_store->stor_ops->so_wal_replay(hdl->do_store, dav_wal_replay_cb, hdl); + if (rc) { + err = rc; + goto out2; + } + + heap_base = (char *)hdl->do_base + hdl->do_phdr->dp_heap_offset; + heap_size = hdl->do_phdr->dp_heap_size; + + rc = lw_tx_begin(hdl); + if (rc) { + err = ENOMEM; + goto out2; + } + } + + hdl->do_stats = stats_new(hdl); + if (hdl->do_stats == NULL) + goto out2; + + D_ALLOC_PTR(hdl->do_heap); + if (hdl->do_heap == NULL) { + err = ENOMEM; + goto out2; + } + + rc = heap_boot(hdl->do_heap, heap_base, heap_size, + &hdl->do_phdr->dp_heap_size, hdl->do_base, + &hdl->p_ops, hdl->do_stats, NULL); + if (rc) { + err = rc; + goto out2; + } + +#if VG_MEMCHECK_ENABLED + if (On_memcheck) + palloc_heap_vg_open(hdl->do_heap, 1); +#endif + + rc = dav_create_clogs(hdl); + if (rc) { + err = rc; + heap_cleanup(hdl->do_heap); + goto out2; + } + + if (persist_hdr) + persist_dav_phdr(hdl); + + lw_tx_end(hdl, NULL); + +#if VG_MEMCHECK_ENABLED + if (On_memcheck) { + /* mark unused part of the pool as not accessible */ + void *end = palloc_heap_end(hdl->do_heap); + + VALGRIND_DO_MAKE_MEM_NOACCESS(end, + OBJ_OFF_TO_PTR(hdl, heap_size) - end); + } +#endif + return hdl; + +out2: + if (hdl->do_stats) + stats_delete(hdl, hdl->do_stats); + if (hdl->do_heap) + D_FREE(hdl->do_heap); + if (hdl->do_utx) { + dav_umem_wtx_cleanup(hdl->do_utx); + D_FREE(hdl->do_utx); + } + D_FREE(hdl->do_path); + umem_cache_free(hdl->do_store); +out1: + D_FREE(hdl); +out0: + munmap(base, sz); + errno = err; + return NULL; + +} + +DAV_FUNC_EXPORT dav_obj_t * +dav_obj_create_v2(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store) +{ + int fd; + dav_obj_t *hdl; + struct stat statbuf; + + SUPPRESS_UNUSED(flags); + + if (sz == 0) { + /* Open the file and obtain the size */ + fd = open(path, O_RDWR|O_CLOEXEC); + if (fd == -1) + return NULL; + + if (fstat(fd, &statbuf) != 0) { + close(fd); + return NULL; + } + sz = statbuf.st_size; + } else { + fd = open(path, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, mode); + if (fd == -1) + return NULL; + + if (fallocate(fd, 0, 0, (off_t)sz) == -1) { + close(fd); + errno = ENOSPC; + return NULL; + } + } + + if (!store->stor_size || (sz < store->stor_size)) { + ERR("Invalid umem_store size"); + errno = EINVAL; + close(fd); + return NULL; + } + + hdl = dav_obj_open_internal(fd, DAV_HEAP_INIT, store->stor_size, path, store); + if (hdl == NULL) { + close(fd); + return NULL; + } 
+ DAV_DBG("pool %s created, size="DF_U64"", hdl->do_path, sz); + return hdl; +} + +DAV_FUNC_EXPORT dav_obj_t * +dav_obj_open_v2(const char *path, int flags, struct umem_store *store) +{ + size_t size; + int fd; + dav_obj_t *hdl; + struct stat statbuf; + + SUPPRESS_UNUSED(flags); + + fd = open(path, O_RDWR|O_CLOEXEC); + if (fd == -1) + return NULL; + + if (fstat(fd, &statbuf) != 0) { + close(fd); + return NULL; + } + size = (size_t)statbuf.st_size; + + if (!store->stor_size || (size < store->stor_size)) { + ERR("Invalid umem_store size"); + errno = EINVAL; + close(fd); + return NULL; + } + + hdl = dav_obj_open_internal(fd, 0, store->stor_size, path, store); + if (hdl == NULL) { + close(fd); + return NULL; + } + DAV_DBG("pool %s is open, size="DF_U64"", hdl->do_path, size); + return hdl; +} + +DAV_FUNC_EXPORT void +dav_obj_close_v2(dav_obj_t *hdl) +{ + + if (hdl == NULL) { + ERR("NULL handle"); + return; + } + dav_destroy_clogs(hdl); + heap_cleanup(hdl->do_heap); + D_FREE(hdl->do_heap); + + stats_delete(hdl, hdl->do_stats); + + munmap(hdl->do_base, hdl->do_size); + close(hdl->do_fd); + if (hdl->do_utx) { + dav_umem_wtx_cleanup(hdl->do_utx); + D_FREE(hdl->do_utx); + } + umem_cache_free(hdl->do_store); + DAV_DBG("pool %s is closed", hdl->do_path); + D_FREE(hdl->do_path); + D_FREE(hdl); +} + +DAV_FUNC_EXPORT void * +dav_get_base_ptr_v2(dav_obj_t *hdl) +{ + return hdl->do_base; +} + +DAV_FUNC_EXPORT int +dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p) +{ + uint8_t id = (uint8_t)p->class_id; + struct alloc_class_collection *ac = heap_alloc_classes(pop->do_heap); + + if (p->unit_size <= 0 || p->unit_size > DAV_MAX_ALLOC_SIZE || + p->units_per_block <= 0) { + errno = EINVAL; + return -1; + } + + if (p->alignment != 0 && p->unit_size % p->alignment != 0) { + ERR("unit size must be evenly divisible by alignment"); + errno = EINVAL; + return -1; + } + + if (p->alignment > (MEGABYTE * 2)) { + ERR("alignment cannot be larger than 2 megabytes"); + errno = EINVAL; + return -1; + } + + if (p->class_id >= MAX_ALLOCATION_CLASSES) { + ERR("class id outside of the allowed range"); + errno = ERANGE; + return -1; + } + + enum header_type lib_htype = MAX_HEADER_TYPES; + + switch (p->header_type) { + case DAV_HEADER_LEGACY: + lib_htype = HEADER_LEGACY; + break; + case DAV_HEADER_COMPACT: + lib_htype = HEADER_COMPACT; + break; + case DAV_HEADER_NONE: + lib_htype = HEADER_NONE; + break; + case MAX_DAV_HEADER_TYPES: + default: + ERR("invalid header type"); + errno = EINVAL; + return -1; + } + + if (id == 0) { + if (alloc_class_find_first_free_slot(ac, &id) != 0) { + ERR("no available free allocation class identifier"); + errno = EINVAL; + return -1; + } + } else { + if (alloc_class_reserve(ac, id) != 0) { + ERR("attempted to overwrite an allocation class"); + errno = EEXIST; + return -1; + } + } + + size_t runsize_bytes = + CHUNK_ALIGN_UP((p->units_per_block * p->unit_size) + + RUN_BASE_METADATA_SIZE); + + /* aligning the buffer might require up-to to 'alignment' bytes */ + if (p->alignment != 0) + runsize_bytes += p->alignment; + + uint32_t size_idx = (uint32_t)(runsize_bytes / CHUNKSIZE); + + if (size_idx > UINT16_MAX) + size_idx = UINT16_MAX; + + struct alloc_class *c = alloc_class_new(id, + heap_alloc_classes(pop->do_heap), CLASS_RUN, + lib_htype, p->unit_size, p->alignment, size_idx); + if (c == NULL) { + errno = EINVAL; + return -1; + } + + if (heap_create_alloc_class_buckets(pop->do_heap, c) != 0) { + alloc_class_delete(ac, c); + return -1; + } + + p->class_id = c->id; + 
p->units_per_block = c->rdsc.nallocs; + + return 0; +} diff --git a/src/common/dav_v2/dav_internal.h b/src/common/dav_v2/dav_internal.h new file mode 100644 index 00000000000..408f03a01ae --- /dev/null +++ b/src/common/dav_v2/dav_internal.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2022, Intel Corporation */ + +/* + * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) + */ + +#ifndef __DAOS_COMMON_DAV_INTERNAL_H +#define __DAOS_COMMON_DAV_INTERNAL_H 1 + +#include "dav_v2.h" +#include "dav_clogs.h" +#include "heap.h" +#include "mo_wal.h" +#include "wal_tx.h" + +#define DAV_FUNC_EXPORT __attribute__ ((visibility ("default"))) + +#define DAV_MAX_ALLOC_SIZE ((size_t)0x3FFDFFFC0) + +enum dav_tx_failure_behavior { + DAV_TX_FAILURE_ABORT, + DAV_TX_FAILURE_RETURN, +}; + +enum dav_stats_enabled { + DAV_STATS_ENABLED_TRANSIENT, + DAV_STATS_ENABLED_BOTH, + DAV_STATS_ENABLED_PERSISTENT, + DAV_STATS_DISABLED, +}; + +#define DAV_PHDR_SIZE 4096 + +/* DAV header data that will be persisted */ +struct dav_phdr { + uint64_t dp_uuid_lo; + uint64_t dp_heap_offset; + uint64_t dp_heap_size; + uint64_t dp_root_offset; + uint64_t dp_root_size; + struct stats_persistent dp_stats_persistent; + char dp_unused[DAV_PHDR_SIZE - sizeof(uint64_t)*5 - + sizeof(struct stats_persistent)]; +}; + +/* DAV object handle */ +typedef struct dav_obj { + char *do_path; + uint64_t do_size; + void *do_base; + struct palloc_heap *do_heap; + struct dav_phdr *do_phdr; + struct operation_context *external; + struct operation_context *undo; + struct mo_ops p_ops; /* REVISIT */ + struct stats *do_stats; + int do_fd; + int nested_tx; + struct umem_wal_tx *do_utx; + struct umem_store *do_store; + + struct dav_clogs clogs __attribute__ ((__aligned__(CACHELINE_SIZE))); +} dav_obj_t; + +static inline +struct dav_tx *utx2wtx(struct umem_wal_tx *utx) +{ + return (struct dav_tx *)&utx->utx_private; +} + +static inline +struct umem_wal_tx *wtx2utx(struct dav_tx *wtx) +{ + return (struct umem_wal_tx *)((void *)wtx + - (ptrdiff_t)offsetof(struct umem_wal_tx, utx_private)); +} + +int lw_tx_begin(dav_obj_t *pop); +int lw_tx_end(dav_obj_t *pop, void *data); + +#endif /* __DAOS_COMMON_DAV_INTERNAL_H */ diff --git a/src/common/dav_v2/dav_v2.h b/src/common/dav_v2/dav_v2.h new file mode 100644 index 00000000000..4d5094ba195 --- /dev/null +++ b/src/common/dav_v2/dav_v2.h @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) + */ + +#ifndef __DAOS_COMMON_DAV_V2_H +#define __DAOS_COMMON_DAV_V2_H 1 + +#include <setjmp.h> +#include <stddef.h> +#include <stdint.h> +#include <sys/stat.h> +#include "../dav/dav.h" + +typedef struct dav_obj dav_obj_t; +struct umem_store; + +/** + * Create and initialize a DAV object and return its handle. + * + * \param[in] path Path of the vos file. + * + * \param[in] flags additional flags (Future). + * + * \param[in] sz size of the file/heap. + * + * \param[in] mode permission to use while creating the file. + * + * \param[in] store backing umem store. + * + * \return Returns the pointer to the object handle. Upon failure, + * it returns NULL with errno set appropriately. + */ +dav_obj_t * +dav_obj_create_v2(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store); + +/** + * Open and initialize a DAV object and return its handle. + * + * \param[in] path Path of the vos file. 
+ * + * \param[in] flags additional flags (Future). + * + * \param[in] store backing umem store. + * + * \return Returns the pointer to the object handle. Upon failure, + * it returns NULL with errno set appropriately. + */ +dav_obj_t * +dav_obj_open_v2(const char *path, int flags, struct umem_store *store); + +/** + * Close the DAV object + * + * \param[in] hdl DAV handle + */ +void +dav_obj_close_v2(dav_obj_t *hdl); + +/** + * Return the pointer to the base of the heap. + * + * \param[in] hdl DAV handle + * + * \return Returns the pointer to the base of the heap pointed to + * by hdl. + */ +void * +dav_get_base_ptr_v2(dav_obj_t *hdl); + +typedef int (*dav_constr)(dav_obj_t *pop, void *ptr, void *arg); + +/* + * Allocates a new object from the pool and calls a constructor function before + * returning. It is guaranteed that allocated object is either properly + * initialized, or if it's interrupted before the constructor completes, the + * memory reserved for the object is automatically reclaimed. + */ +int +dav_alloc_v2(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags, + dav_constr constructor, void *arg); + +/** + * Frees the memory at specified offset within the DAV object pointed to by hdl. + * + * \param[in] hdl DAV handle. + * + * \param[in] off offset to the memory location. off should correspond + * to the offset returned by previous call to dav_malloc(). + */ +void +dav_free_v2(dav_obj_t *pop, uint64_t off); + +/* + * DAV version of memcpy. Data copied is made persistent in blob. + */ +void * +dav_memcpy_persist_v2(dav_obj_t *pop, void *dest, const void *src, size_t len); +/* + * DAV version of memcpy with deferred commit to blob. + */ +void * +dav_memcpy_persist_relaxed_v2(dav_obj_t *pop, void *dest, const void *src, size_t len); + +/* + * If called for the first time on a newly created dav heap, the root object + * of given size is allocated. Otherwise, it returns the existing root object. + * In such case, the size must be not less than the actual root object size + * stored in the pool. If it's larger, the root object is automatically + * resized. + * + * This function is currently *not* thread-safe. + */ +uint64_t +dav_root_v2(dav_obj_t *pop, size_t size); + +/* + * Starts a new transaction in the current thread. + * If called within an open transaction, starts a nested transaction. + * + * If successful, transaction stage changes to TX_STAGE_WORK and function + * returns zero. Otherwise, stage changes to TX_STAGE_ONABORT and an error + * number is returned. + */ +int +dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...); + +/* + * Aborts current transaction + * + * Causes transition to TX_STAGE_ONABORT. + * + * This function must be called during TX_STAGE_WORK. + */ +void +dav_tx_abort_v2(int errnum); + +/* + * Commits current transaction + * + * This function must be called during TX_STAGE_WORK. + */ +void +dav_tx_commit_v2(void); + +/* + * Cleanups current transaction. Must always be called after dav_tx_begin, + * even if starting the transaction failed. + * + * If called during TX_STAGE_NONE, has no effect. + * + * Always causes transition to TX_STAGE_NONE. + * + * If transaction was successful, returns 0. Otherwise returns error code set + * by dav_tx_abort. + * + * This function must *not* be called during TX_STAGE_WORK. + */ +int +dav_tx_end_v2(void *data); + +/* + * Returns the current stage of the transaction. + */ +enum dav_tx_stage +dav_tx_stage_v2(void); + +/* + * Returns last transaction error code. 
+ */ +int +dav_tx_errno_v2(void); + +/* + * Transactionally allocates a new object. + * + * If successful, returns offset of the object in the heap. + * Otherwise, stage changes to TX_STAGE_ONABORT and an zero is returned. + * 'Flags' is a bitmask of the following values: + * - POBJ_XALLOC_ZERO - zero the allocated object + * - POBJ_XALLOC_NO_FLUSH - skip flush on commit + * - POBJ_XALLOC_NO_ABORT - if the function does not end successfully, + * - DAV_CLASS_ID(id) - id of allocation class to use. + * - DAV_EZONE_ID(id) - id of zone to use. + * do not abort the transaction and return the error number. + * + * This function must be called during TX_STAGE_WORK. + */ +uint64_t +dav_tx_alloc_v2(size_t size, uint64_t type_num, uint64_t flags); + +/* + * Transactionally frees an existing object. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_free_v2(uint64_t off); + +/* + * Takes a "snapshot" of the memory block of given size and located at given + * offset 'off' in the object 'oid' and saves it in the undo log. + * The application is then free to directly modify the object in that memory + * range. In case of failure or abort, all the changes within this range will + * be rolled-back automatically. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_add_range_v2(uint64_t off, size_t size); + +/* + * Takes a "snapshot" of the given memory region and saves it in the undo log. + * The application is then free to directly modify the object in that memory + * range. In case of failure or abort, all the changes within this range will + * be rolled-back automatically. The supplied block of memory has to be within + * the given pool. + * + * If successful, returns zero. + * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned. + * + * This function must be called during TX_STAGE_WORK. + */ +int +dav_tx_add_range_direct_v2(const void *ptr, size_t size); + +/* + * Behaves exactly the same as dav_tx_add_range when 'flags' equals 0. + * 'Flags' is a bitmask of the following values: + * - POBJ_XADD_NO_FLUSH - skips flush on commit + * - POBJ_XADD_NO_SNAPSHOT - added range will not be snapshotted + * - POBJ_XADD_ASSUME_INITIALIZED - added range is assumed to be initialized + * - POBJ_XADD_NO_ABORT - if the function does not end successfully, + * do not abort the transaction and return the error number. + */ +int +dav_tx_xadd_range_v2(uint64_t off, size_t size, uint64_t flags); + +/* + * Behaves exactly the same as dav_tx_add_range_direct when 'flags' equals + * 0. 'Flags' is a bitmask of the following values: + * - POBJ_XADD_NO_FLUSH - skips flush on commit + * - POBJ_XADD_NO_SNAPSHOT - added range will not be snapshotted + * - POBJ_XADD_ASSUME_INITIALIZED - added range is assumed to be initialized + * - POBJ_XADD_NO_ABORT - if the function does not end successfully, + * do not abort the transaction and return the error number. + */ +int +dav_tx_xadd_range_direct_v2(const void *ptr, size_t size, uint64_t flags); + +/* + * Converts the offset to a pointer in the context of heap associated with + * current transaction. 
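+ *
+ * Illustrative sketch only: transaction begin/commit and error handling are
+ * elided, and 'size'/'type_num' stand in for caller-provided values:
+ *
+ *	uint64_t off = dav_tx_alloc_v2(size, type_num, 0);
+ *
+ *	if (off != 0) {
+ *		dav_tx_add_range_v2(off, size);
+ *		memset(dav_tx_off2ptr_v2(off), 0, size);
+ *	}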
+ */ +void * +dav_tx_off2ptr_v2(uint64_t off); + +#define DAV_ACTION_XRESERVE_VALID_FLAGS \ + (DAV_XALLOC_CLASS_MASK | DAV_XALLOC_EZONE_MASK | DAV_XALLOC_ZERO) + +struct dav_action; +uint64_t +dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, + uint64_t flags); +void +dav_defer_free_v2(dav_obj_t *pop, uint64_t off, struct dav_action *act); +void +dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt); +int +dav_tx_publish_v2(struct dav_action *actv, size_t actvcnt); + +struct dav_alloc_class_desc; +/* + * Registers an allocation class handle with the DAV object. + */ +int +dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p); + +struct dav_heap_stats; +/* + * Returns the heap allocation statistics associated with the + * DAV object. + */ +int +dav_get_heap_stats_v2(dav_obj_t *pop, struct dav_heap_stats *st); + +/** + * Get an evictable zone with sufficient free space within. + * + * \param[in] pop pool handle + * \param[in] flags zone selection criteria. + * + * \return id >= 0. Zero indicates non-evictable zone and will be + * returned if no evictable zone can be chosen. + */ +uint32_t +dav_get_zone_evictable_v2(dav_obj_t *pop, int flags); + +#endif /* __DAOS_COMMON_DAV_V2_H */ diff --git a/src/common/dav_v2/heap.c b/src/common/dav_v2/heap.c new file mode 100644 index 00000000000..9c2ed954d5d --- /dev/null +++ b/src/common/dav_v2/heap.c @@ -0,0 +1,1398 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * heap.c -- heap implementation + */ + +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <float.h> + +#include "bucket.h" +#include "dav_internal.h" +#include "memblock.h" +#include "queue.h" +#include "heap.h" +#include "out.h" +#include "util.h" +#include "sys_util.h" +#include "valgrind_internal.h" +#include "recycler.h" +#include "container.h" +#include "alloc_class.h" + +#define MAX_RUN_LOCKS MAX_CHUNK +#define MAX_RUN_LOCKS_VG 1024 /* avoid perf issues /w drd */ + +/* + * This is the value by which the heap might grow once we hit an OOM. + */ +#define HEAP_DEFAULT_GROW_SIZE (1 << 27) /* 128 megabytes */ + +/* + * zoneset stores the collection of buckets and recyclers for allocation classes. + * Each evictable zone is assigned a zoneset during first allocation. + */ +struct zoneset { + uint32_t zset_id; + uint32_t padding; + struct bucket_locked *default_bucket; /* bucket for free chunks */ + struct bucket_locked *buckets[MAX_ALLOCATION_CLASSES]; /* one bucket per allocation class */ + struct recycler *recyclers[MAX_ALLOCATION_CLASSES]; +}; + +struct heap_rt { + struct alloc_class_collection *alloc_classes; + struct zoneset *default_zset; + struct zoneset **evictable_zsets; + pthread_mutex_t run_locks[MAX_RUN_LOCKS]; + unsigned nlocks; + unsigned nzones; + unsigned zones_exhausted; +}; + +/* + * heap_get_zoneset - returns the reference to the zoneset given + * zone or zoneset id. + */ +struct zoneset * +heap_get_zoneset(struct palloc_heap *heap, uint32_t zone_id) +{ + /* REVISIT: + * Implement the code for evictable zonesets. + */ + return heap->rt->default_zset; +} + +/* + * heap_get_recycler - (internal) retrieves the recycler instance from the zoneset with + * the corresponding class id. Initializes the recycler if needed. 
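+ *
+ * The recycler is created lazily on first use and published with an atomic
+ * compare-and-swap; a thread that loses the race deletes its instance and
+ * reloads the winner's.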
+ */ +static struct recycler * +heap_get_recycler(struct palloc_heap *heap, struct zoneset *zset, size_t id, size_t nallocs) +{ + struct recycler *r; + + D_ASSERT(zset != NULL); + util_atomic_load_explicit64(&zset->recyclers[id], &r, memory_order_acquire); + if (r != NULL) + return r; + + r = recycler_new(heap, nallocs, zset); + if (r && !util_bool_compare_and_swap64(&zset->recyclers[id], NULL, r)) { + /* + * If a different thread succeeded in assigning the recycler + * first, the recycler this thread created needs to be deleted. + */ + recycler_delete(r); + + return heap_get_recycler(heap, zset, id, nallocs); + } + + return r; +} + +/* + * heap_alloc_classes -- returns the allocation classes collection + */ +struct alloc_class_collection * +heap_alloc_classes(struct palloc_heap *heap) +{ + return heap->rt ? heap->rt->alloc_classes : NULL; +} + +/* + * heap_get_best_class -- returns the alloc class that best fits the + * requested size + */ +struct alloc_class * +heap_get_best_class(struct palloc_heap *heap, size_t size) +{ + return alloc_class_by_alloc_size(heap->rt->alloc_classes, size); +} + +/* + * zoneset_bucket_acquire -- fetches by zoneset or by id a bucket exclusive + * for the thread until zoneset_bucket_release is called + */ +struct bucket * +zoneset_bucket_acquire(struct zoneset *zset, uint8_t class_id) +{ + struct bucket_locked *b; + + D_ASSERT(zset != NULL); + + if (class_id == DEFAULT_ALLOC_CLASS_ID) + b = zset->default_bucket; + else + b = zset->buckets[class_id]; + + return bucket_acquire(b); +} + +/* + * zoneset_bucket_release -- puts the bucket back into the heap + */ +void +zoneset_bucket_release(struct bucket *b) +{ + bucket_release(b); +} + +/* + * heap_get_run_lock -- returns the lock associated with memory block + */ +pthread_mutex_t * +heap_get_run_lock(struct palloc_heap *heap, uint32_t chunk_id) +{ + return &heap->rt->run_locks[chunk_id % heap->rt->nlocks]; +} + +/* + * heap_max_zone -- (internal) calculates how many zones can the heap fit + */ +static unsigned +heap_max_zone(size_t size) +{ + unsigned max_zone = 0; + + size -= sizeof(struct heap_header); + + while (size >= ZONE_MIN_SIZE) { + max_zone++; + size -= size <= ZONE_MAX_SIZE ? 
size : ZONE_MAX_SIZE; + } + + return max_zone; +} + +/* + * zone_calc_size_idx -- (internal) calculates zone size index + */ +static uint32_t +zone_calc_size_idx(uint32_t zone_id, unsigned max_zone, size_t heap_size) +{ + ASSERT(max_zone > 0); + if (zone_id < max_zone - 1) + return MAX_CHUNK; + + ASSERT(heap_size >= zone_id * ZONE_MAX_SIZE); + size_t zone_raw_size = heap_size - zone_id * ZONE_MAX_SIZE; + + ASSERT(zone_raw_size >= (sizeof(struct zone_header) + + sizeof(struct chunk_header) * MAX_CHUNK) + + sizeof(struct heap_header)); + zone_raw_size -= sizeof(struct zone_header) + + sizeof(struct chunk_header) * MAX_CHUNK + + sizeof(struct heap_header); + + size_t zone_size_idx = zone_raw_size / CHUNKSIZE; + + ASSERT(zone_size_idx <= UINT32_MAX); + + return (uint32_t)zone_size_idx; +} + +/* + * heap_zone_init -- (internal) writes zone's first chunk and header + */ +static void +heap_zone_init(struct palloc_heap *heap, uint32_t zone_id, + uint32_t first_chunk_id) +{ + struct zone *z = ZID_TO_ZONE(heap->layout, zone_id); + uint32_t size_idx = zone_calc_size_idx(zone_id, heap->rt->nzones, + *heap->sizep); + + ASSERT(size_idx > first_chunk_id); + memblock_huge_init(heap, first_chunk_id, zone_id, + size_idx - first_chunk_id); + + struct zone_header nhdr = { + .size_idx = size_idx, + .magic = ZONE_HEADER_MAGIC, + }; + + z->header = nhdr; /* write the entire header (8 bytes) at once */ + mo_wal_persist(&heap->p_ops, &z->header, sizeof(z->header)); +} + +/* + * heap_get_adjacent_free_block -- locates adjacent free memory block in heap + */ +static int +heap_get_adjacent_free_block(struct palloc_heap *heap, + const struct memory_block *in, struct memory_block *out, int prev) +{ + struct zone *z = ZID_TO_ZONE(heap->layout, in->zone_id); + struct chunk_header *hdr = &z->chunk_headers[in->chunk_id]; + + out->zone_id = in->zone_id; + + if (prev) { + if (in->chunk_id == 0) + return ENOENT; + + struct chunk_header *prev_hdr = + &z->chunk_headers[in->chunk_id - 1]; + out->chunk_id = in->chunk_id - prev_hdr->size_idx; + + if (z->chunk_headers[out->chunk_id].type != CHUNK_TYPE_FREE) + return ENOENT; + + out->size_idx = z->chunk_headers[out->chunk_id].size_idx; + } else { /* next */ + if (in->chunk_id + hdr->size_idx == z->header.size_idx) + return ENOENT; + + out->chunk_id = in->chunk_id + hdr->size_idx; + + if (z->chunk_headers[out->chunk_id].type != CHUNK_TYPE_FREE) + return ENOENT; + + out->size_idx = z->chunk_headers[out->chunk_id].size_idx; + } + memblock_rebuild_state(heap, out); + + return 0; +} + +/* + * heap_coalesce -- (internal) merges adjacent memory blocks + */ +static struct memory_block +heap_coalesce(struct palloc_heap *heap, + const struct memory_block *blocks[], int n) +{ + struct memory_block ret = MEMORY_BLOCK_NONE; + + const struct memory_block *b = NULL; + + ret.size_idx = 0; + for (int i = 0; i < n; ++i) { + if (blocks[i] == NULL) + continue; + b = b ? 
b : blocks[i]; + ret.size_idx += blocks[i]->size_idx; + } + + ASSERTne(b, NULL); + + ret.chunk_id = b->chunk_id; + ret.zone_id = b->zone_id; + ret.block_off = b->block_off; + memblock_rebuild_state(heap, &ret); + + return ret; +} + +/* + * heap_coalesce_huge -- finds neighbors of a huge block, removes them from the + * volatile state and returns the resulting block + */ +static struct memory_block +heap_coalesce_huge(struct palloc_heap *heap, struct bucket *b, + const struct memory_block *m) +{ + const struct memory_block *blocks[3] = {NULL, m, NULL}; + + struct memory_block prev = MEMORY_BLOCK_NONE; + + if (heap_get_adjacent_free_block(heap, m, &prev, 1) == 0 && + bucket_remove_block(b, &prev) == 0) { + blocks[0] = &prev; + } + + struct memory_block next = MEMORY_BLOCK_NONE; + + if (heap_get_adjacent_free_block(heap, m, &next, 0) == 0 && + bucket_remove_block(b, &next) == 0) { + blocks[2] = &next; + } + + return heap_coalesce(heap, blocks, 3); +} + +/* + * heap_free_chunk_reuse -- reuses existing free chunk + */ +int +heap_free_chunk_reuse(struct palloc_heap *heap, + struct bucket *bucket, + struct memory_block *m) +{ + /* + * Perform coalescing just in case there + * are any neighboring free chunks. + */ + struct memory_block nm = heap_coalesce_huge(heap, bucket, m); + + if (nm.size_idx != m->size_idx) + m->m_ops->prep_hdr(&nm, MEMBLOCK_FREE, NULL); + + *m = nm; + + return bucket_insert_block(bucket, m); +} + +/* + * heap_run_into_free_chunk -- (internal) creates a new free chunk in place of + * a run. + */ +static void +heap_run_into_free_chunk(struct palloc_heap *heap, + struct bucket *bucket, + struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + + m->block_off = 0; + m->size_idx = hdr->size_idx; + + STATS_SUB(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + + /* + * The only thing this could race with is heap_memblock_on_free() + * because that function is called after processing the operation, + * which means that a different thread might immediately call this + * function if the free() made the run empty. + * We could forgo this lock if it weren't for helgrind which needs it + * to establish happens-before relation for the chunk metadata. + */ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + *m = memblock_huge_init(heap, m->chunk_id, m->zone_id, m->size_idx); + + heap_free_chunk_reuse(heap, bucket, m); + + util_mutex_unlock(lock); +} + +/* + * heap_reclaim_run -- checks the run for available memory if unclaimed. + * + * Returns 1 if reclaimed chunk, 0 otherwise. 
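+ *
+ * A return value of 1 means the run is completely free, so the caller can
+ * turn it back into a free huge chunk; partially used runs of a known
+ * allocation class are registered with that class's recycler instead.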
+ */ +static int +heap_reclaim_run(struct palloc_heap *heap, struct memory_block *m, int startup) +{ + struct chunk_run *run = heap_get_chunk_run(heap, m); + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + struct zoneset *zset = heap_get_zoneset(heap, m->zone_id); + + struct alloc_class *c = alloc_class_by_run( + heap->rt->alloc_classes, + run->hdr.block_size, hdr->flags, m->size_idx); + + struct recycler_element e = recycler_element_new(heap, m); + + if (c == NULL) { + uint32_t size_idx = m->size_idx; + struct run_bitmap b; + + m->m_ops->get_bitmap(m, &b); + + ASSERTeq(size_idx, m->size_idx); + + return e.free_space == b.nbits; + } + + if (e.free_space == c->rdsc.nallocs) + return 1; + + if (startup) { + STATS_INC(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + STATS_INC(heap->stats, transient, heap_run_allocated, + (c->rdsc.nallocs - e.free_space) * run->hdr.block_size); + } + struct recycler *recycler = heap_get_recycler(heap, zset, c->id, c->rdsc.nallocs); + + if (recycler == NULL || recycler_put(recycler, e) < 0) + ERR("lost runtime tracking info of %u run due to OOM", c->id); + + return 0; +} + +/* + * heap_reclaim_zone_garbage -- (internal) creates volatile state of unused runs + */ +static void +heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, + uint32_t zone_id) +{ + struct zone *z = ZID_TO_ZONE(heap->layout, zone_id); + + for (uint32_t i = 0; i < z->header.size_idx; ) { + struct chunk_header *hdr = &z->chunk_headers[i]; + + ASSERT(hdr->size_idx != 0); + + struct memory_block m = MEMORY_BLOCK_NONE; + + m.zone_id = zone_id; + m.chunk_id = i; + m.size_idx = hdr->size_idx; + + memblock_rebuild_state(heap, &m); + m.m_ops->reinit_chunk(&m); + + switch (hdr->type) { + case CHUNK_TYPE_RUN: + if (heap_reclaim_run(heap, &m, 1) != 0) + heap_run_into_free_chunk(heap, bucket, &m); + break; + case CHUNK_TYPE_FREE: + heap_free_chunk_reuse(heap, bucket, &m); + break; + case CHUNK_TYPE_USED: + break; + default: + ASSERT(0); + } + + i = m.chunk_id + m.size_idx; /* hdr might have changed */ + } +} + +/* + * heap_populate_bucket -- (internal) creates volatile state of memory blocks + */ +static int +heap_populate_bucket(struct palloc_heap *heap, struct bucket *bucket) +{ + struct heap_rt *h = heap->rt; + + /* at this point we are sure that there's no more memory in the heap */ + if (h->zones_exhausted == h->nzones) + return ENOMEM; + + uint32_t zone_id = h->zones_exhausted++; + struct zone *z = ZID_TO_ZONE(heap->layout, zone_id); + + /* ignore zone and chunk headers */ + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(z, sizeof(z->header) + + sizeof(z->chunk_headers)); + + if (z->header.magic != ZONE_HEADER_MAGIC) + heap_zone_init(heap, zone_id, 0); + + heap_reclaim_zone_garbage(heap, bucket, zone_id); + + /* + * It doesn't matter that this function might not have found any + * free blocks because there is still potential that subsequent calls + * will find something in later zones. + */ + return 0; +} + +/* + * heap_recycle_unused -- recalculate scores in the recycler and turn any + * empty runs into free chunks + * + * If force is not set, this function might effectively be a noop if not enough + * of space was freed. 
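+ *
+ * Callers that must make progress, such as heap_reclaim_garbage(), pass
+ * force as 1; heap_reuse_from_recycler() simply forwards its caller's
+ * force flag.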
+ */ +static int +heap_recycle_unused(struct palloc_heap *heap, struct recycler *recycler, + struct bucket *defb, int force) +{ + struct zoneset *zset; + struct memory_block *nm; + struct empty_runs r = recycler_recalc(recycler, force); + struct bucket *nb; + + if (VEC_SIZE(&r) == 0) + return ENOMEM; + + zset = recycler_get_zoneset(recycler); + D_ASSERT(zset != NULL); + + nb = defb == NULL ? zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID) : NULL; + + ASSERT(defb != NULL || nb != NULL); + + VEC_FOREACH_BY_PTR(nm, &r) { + heap_run_into_free_chunk(heap, defb ? defb : nb, nm); + } + + if (nb != NULL) + zoneset_bucket_release(nb); + + VEC_DELETE(&r); + + return 0; +} + +/* + * heap_reclaim_garbage -- (internal) creates volatile state of unused runs + */ +static int +heap_reclaim_garbage(struct palloc_heap *heap, struct bucket *bucket) +{ + int ret = ENOMEM; + struct recycler *r; + struct zoneset *zset = bucket_get_zoneset(bucket); + + for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + r = zset->recyclers[i]; + if (r == NULL) + continue; + + if (heap_recycle_unused(heap, r, bucket, 1) == 0) + ret = 0; + } + + return ret; +} + +/* + * heap_ensure_huge_bucket_filled -- + * (internal) refills the default bucket if needed + */ +static int +heap_ensure_huge_bucket_filled(struct palloc_heap *heap, + struct bucket *bucket) +{ + if (heap_reclaim_garbage(heap, bucket) == 0) + return 0; + + if (heap_populate_bucket(heap, bucket) == 0) + return 0; + +#if 0 /*REVISIT: heap extend not supported*/ + int extend; + + extend = heap_extend(heap, bucket, heap->growsize); + if (extend < 0) + return ENOMEM; + + if (extend == 1) + return 0; +#endif + + /* + * Extending the pool does not automatically add the chunks into the + * runtime state of the bucket - we need to traverse the new zone if + * it was created. + */ + if (heap_populate_bucket(heap, bucket) == 0) + return 0; + + return ENOMEM; +} + +/* + * heap_discard_run -- puts the memory block back into the global heap. 
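+ *
+ * If the run turns out to be completely empty (heap_reclaim_run() returns
+ * 1), it is converted back into a free chunk under the zoneset's default
+ * bucket.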
+ */ +void +heap_discard_run(struct palloc_heap *heap, struct memory_block *m) +{ + struct zoneset *zset = heap_get_zoneset(heap, m->zone_id); + + D_ASSERT(zset != NULL); + if (heap_reclaim_run(heap, m, 0)) { + struct bucket *b = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID); + + heap_run_into_free_chunk(heap, b, m); + + zoneset_bucket_release(b); + } +} + +/* + * heap_detach_and_try_discard_run -- detaches the active from a bucket and + * tries to discard the run if it is completely empty (has no allocations) + */ +static int +heap_detach_and_try_discard_run(struct palloc_heap *heap, struct bucket *b) +{ + int empty = 0; + struct memory_block m; + + if (bucket_detach_run(b, &m, &empty) != 0) + return -1; + + if (empty) + heap_discard_run(heap, &m); + + return 0; +} + +/* + * heap_reuse_from_recycler -- (internal) try reusing runs that are currently + * in the recycler + */ +static int +heap_reuse_from_recycler(struct palloc_heap *heap, + struct bucket *b, uint32_t units, int force) +{ + struct zoneset *zset = bucket_get_zoneset(b); + struct memory_block m = MEMORY_BLOCK_NONE; + + m.size_idx = units; + + struct alloc_class *aclass = bucket_alloc_class(b); + + struct recycler *recycler = heap_get_recycler(heap, zset, aclass->id, aclass->rdsc.nallocs); + + if (recycler == NULL) { + ERR("lost runtime tracking info of %u run due to OOM", + aclass->id); + return 0; + } + + if (!force && recycler_get(recycler, &m) == 0) + return bucket_attach_run(b, &m); + + heap_recycle_unused(heap, recycler, NULL, force); + + if (recycler_get(recycler, &m) == 0) + return bucket_attach_run(b, &m); + + return ENOMEM; +} + +/* + * heap_run_create -- (internal) initializes a new run on an existing free chunk + */ +static int +heap_run_create(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + *m = memblock_run_init(heap, m->chunk_id, m->zone_id, &aclass->rdsc); + + bucket_attach_run(b, m); + + STATS_INC(heap->stats, transient, heap_run_active, + m->size_idx * CHUNKSIZE); + + return 0; +} + +/* + * heap_ensure_run_bucket_filled -- (internal) refills the bucket if needed + */ +static int +heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b, + uint32_t units) +{ + int ret = 0; + struct alloc_class *aclass = bucket_alloc_class(b); + struct zoneset *zset = bucket_get_zoneset(b); + + D_ASSERT(zset != NULL); + ASSERTeq(aclass->type, CLASS_RUN); + + if (heap_detach_and_try_discard_run(heap, b) != 0) + return ENOMEM; + + if (heap_reuse_from_recycler(heap, b, units, 0) == 0) + goto out; + + /* search in the next zone before attempting to create a new run */ + struct bucket *defb = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID); + + heap_populate_bucket(heap, defb); + zoneset_bucket_release(defb); + + if (heap_reuse_from_recycler(heap, b, units, 0) == 0) + goto out; + + struct memory_block m = MEMORY_BLOCK_NONE; + + m.size_idx = aclass->rdsc.size_idx; + + defb = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID); + + /* cannot reuse an existing run, create a new one */ + if (heap_get_bestfit_block(heap, defb, &m) == 0) { + ASSERTeq(m.block_off, 0); + if (heap_run_create(heap, b, &m) != 0) { + zoneset_bucket_release(defb); + return ENOMEM; + } + + zoneset_bucket_release(defb); + + goto out; + } + zoneset_bucket_release(defb); + + if (heap_reuse_from_recycler(heap, b, units, 0) == 0) + goto out; + + ret = ENOMEM; +out: + + return ret; +} + +/* + * heap_memblock_on_free -- bookkeeping actions executed at every free 
of a + * block + */ +void +heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m) +{ + struct zoneset *zset = heap_get_zoneset(heap, m->zone_id); + + if (m->type != MEMORY_BLOCK_RUN) + return; + + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + struct chunk_run *run = heap_get_chunk_run(heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_RUN); + + struct alloc_class *c = alloc_class_by_run( + heap->rt->alloc_classes, + run->hdr.block_size, hdr->flags, hdr->size_idx); + + if (c == NULL) + return; + + struct recycler *recycler = heap_get_recycler(heap, zset, c->id, c->rdsc.nallocs); + + if (recycler == NULL) { + ERR("lost runtime tracking info of %u run due to OOM", + c->id); + } else { + recycler_inc_unaccounted(recycler, m); + } +} + +/* + * heap_split_block -- (internal) splits unused part of the memory block + */ +static void +heap_split_block(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m, uint32_t units) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + + ASSERT(units <= UINT16_MAX); + ASSERT(units > 0); + + if (aclass->type == CLASS_RUN) { + ASSERT((uint64_t)m->block_off + (uint64_t)units <= UINT32_MAX); + struct memory_block r = {m->chunk_id, m->zone_id, + m->size_idx - units, (uint32_t)(m->block_off + units), + NULL, NULL, 0, 0, NULL}; + memblock_rebuild_state(heap, &r); + if (bucket_insert_block(b, &r) != 0) + D_CRIT("failed to allocate memory block runtime tracking info\n"); + } else { + uint32_t new_chunk_id = m->chunk_id + units; + uint32_t new_size_idx = m->size_idx - units; + + struct memory_block n = memblock_huge_init(heap, + new_chunk_id, m->zone_id, new_size_idx); + + *m = memblock_huge_init(heap, m->chunk_id, m->zone_id, units); + + if (bucket_insert_block(b, &n) != 0) + D_CRIT("failed to allocate memory block runtime tracking info\n"); + } + + m->size_idx = units; +} + +/* + * heap_get_bestfit_block -- + * extracts a memory block of equal size index + */ +int +heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, + struct memory_block *m) +{ + struct alloc_class *aclass = bucket_alloc_class(b); + uint32_t units = m->size_idx; + + while (bucket_alloc_block(b, m) != 0) { + if (aclass->type == CLASS_HUGE) { + if (heap_ensure_huge_bucket_filled(heap, b) != 0) + return ENOMEM; + } else { + if (heap_ensure_run_bucket_filled(heap, b, units) != 0) + return ENOMEM; + } + } + + ASSERT(m->size_idx >= units); + + if (units != m->size_idx) + heap_split_block(heap, b, m, units); + + m->m_ops->ensure_header_type(m, aclass->header_type); + m->header_type = aclass->header_type; + + return 0; +} + +/* + * heap_end -- returns first address after heap + */ +void * +heap_end(struct palloc_heap *h) +{ + ASSERT(h->rt->nzones > 0); + + struct zone *last_zone = ZID_TO_ZONE(h->layout, h->rt->nzones - 1); + + return &last_zone->chunks[last_zone->header.size_idx]; +} + +/* + * heap_default_zoneset_init -- (internal) initializes default zone + */ +static int +heap_default_zoneset_init(struct palloc_heap *heap) +{ + struct heap_rt *h = heap->rt; + struct zoneset *default_zset; + struct alloc_class *c; + uint8_t i; + + D_ALLOC_PTR(default_zset); + if (default_zset == NULL) + return -1; + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + c = alloc_class_by_id(h->alloc_classes, i); + + if (c == NULL) + continue; + + default_zset->buckets[c->id] = + bucket_locked_new(container_new_seglists(heap), c, default_zset); + if (default_zset->buckets[c->id] == NULL) + goto error_bucket_create; + } + + default_zset->default_bucket = 
bucket_locked_new( + container_new_ravl(heap), alloc_class_by_id(h->alloc_classes, DEFAULT_ALLOC_CLASS_ID), + default_zset); + + if (default_zset->default_bucket == NULL) + goto error_bucket_create; + + heap->rt->default_zset = default_zset; + return 0; + +error_bucket_create: + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + c = alloc_class_by_id(h->alloc_classes, i); + if (c != NULL) { + if (default_zset->buckets[c->id] != NULL) + bucket_locked_delete(default_zset->buckets[c->id]); + } + } + D_FREE(default_zset); + return -1; +} + +static void +heap_default_zoneset_cleanup(struct palloc_heap *heap) +{ + struct zoneset *default_zset = heap->rt->default_zset; + uint8_t i; + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + if (default_zset->buckets[i] == NULL) + continue; + bucket_locked_delete(default_zset->buckets[i]); + } + bucket_locked_delete(default_zset->default_bucket); + + for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) { + if (default_zset->recyclers[i] == NULL) + continue; + recycler_delete(default_zset->recyclers[i]); + } + D_FREE(default_zset); + heap->rt->default_zset = NULL; +} + +/* + * heap_create_alloc_class_buckets -- allocates all cache bucket + * instances of the specified type + */ +int +heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c) +{ + struct zoneset *default_zset = heap->rt->default_zset; + + if (default_zset->buckets[c->id] == NULL) { + default_zset->buckets[c->id] = + bucket_locked_new(container_new_seglists(heap), c, default_zset); + if (default_zset->buckets[c->id] == NULL) + return -1; + } + + return 0; +} + +/* + * heap_zone_update_if_needed -- updates the zone metadata if the pool has been + * extended. + */ +static void +heap_zone_update_if_needed(struct palloc_heap *heap) +{ + struct zone *z; + + for (uint32_t i = 0; i < heap->rt->nzones; ++i) { + z = ZID_TO_ZONE(heap->layout, i); + if (z->header.magic != ZONE_HEADER_MAGIC) + continue; + + size_t size_idx = zone_calc_size_idx(i, heap->rt->nzones, + *heap->sizep); + + if (size_idx == z->header.size_idx) + continue; + + heap_zone_init(heap, i, z->header.size_idx); + } +} + +/* + * heap_boot -- opens the heap region of the dav_obj pool + * + * If successful function returns zero. Otherwise an error number is returned. + */ +int +heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, + uint64_t *sizep, void *base, struct mo_ops *p_ops, + struct stats *stats, struct pool_set *set) +{ + struct heap_rt *h; + int err; + + /* + * The size can be 0 if interrupted during heap_init or this is the + * first time booting the heap with the persistent size field. + */ + if (*sizep == 0) { + *sizep = heap_size; + + mo_wal_persist(p_ops, sizep, sizeof(*sizep)); + } + + if (heap_size < *sizep) { + ERR("mapped region smaller than the heap size"); + return EINVAL; + } + + D_ALLOC_PTR_NZ(h); + if (h == NULL) { + err = ENOMEM; + goto error_heap_malloc; + } + + h->alloc_classes = alloc_class_collection_new(); + if (h->alloc_classes == NULL) { + err = ENOMEM; + goto error_alloc_classes_new; + } + + h->nzones = heap_max_zone(heap_size); + + h->zones_exhausted = 0; + + h->nlocks = On_valgrind ? 
MAX_RUN_LOCKS_VG : MAX_RUN_LOCKS; + for (unsigned i = 0; i < h->nlocks; ++i) + util_mutex_init(&h->run_locks[i]); + + heap->p_ops = *p_ops; + heap->layout = heap_start; + heap->rt = h; + heap->sizep = sizep; + heap->base = base; + heap->stats = stats; + heap->set = set; + heap->growsize = HEAP_DEFAULT_GROW_SIZE; + heap->alloc_pattern = PALLOC_CTL_DEBUG_NO_PATTERN; + VALGRIND_DO_CREATE_MEMPOOL(heap->layout, 0, 0); + + if (heap_default_zoneset_init(heap) != 0) { + err = ENOMEM; + goto error_zoneset_init; + } + + heap_zone_update_if_needed(heap); + + return 0; + +error_zoneset_init: + alloc_class_collection_delete(h->alloc_classes); +error_alloc_classes_new: + D_FREE(h); + heap->rt = NULL; +error_heap_malloc: + return err; +} + +/* + * heap_write_header -- (internal) creates a clean header + */ +static void +heap_write_header(struct heap_header *hdr) +{ + struct heap_header newhdr = { + .signature = HEAP_SIGNATURE, + .major = HEAP_MAJOR, + .minor = HEAP_MINOR, + .unused = 0, + .chunksize = CHUNKSIZE, + .chunks_per_zone = MAX_CHUNK, + .reserved = {0}, + .checksum = 0 + }; + + util_checksum(&newhdr, sizeof(newhdr), &newhdr.checksum, 1, 0); + *hdr = newhdr; +} + +/* + * heap_init -- initializes the heap + * + * If successful function returns zero. Otherwise an error number is returned. + */ +int +heap_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, + struct mo_ops *p_ops) +{ + if (heap_size < HEAP_MIN_SIZE) + return EINVAL; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(heap_start, heap_size); + + struct heap_layout *layout = heap_start; + + heap_write_header(&layout->header); + mo_wal_persist(p_ops, &layout->header, sizeof(struct heap_header)); + + unsigned zones = heap_max_zone(heap_size); + + for (unsigned i = 0; i < zones; ++i) { + struct zone *zone = ZID_TO_ZONE(layout, i); + + mo_wal_memset(p_ops, &zone->header, 0, + sizeof(struct zone_header), 0); + mo_wal_memset(p_ops, &zone->chunk_headers, 0, + sizeof(struct chunk_header), 0); + + /* only explicitly allocated chunks should be accessible */ + VALGRIND_DO_MAKE_MEM_NOACCESS(&zone->chunk_headers, + sizeof(struct chunk_header)); + } + *sizep = heap_size; + mo_wal_persist(p_ops, sizep, sizeof(*sizep)); + + return 0; +} + +/* + * heap_cleanup -- cleanups the volatile heap state + */ +void +heap_cleanup(struct palloc_heap *heap) +{ + struct heap_rt *rt = heap->rt; + + alloc_class_collection_delete(rt->alloc_classes); + + heap_default_zoneset_cleanup(heap); + + for (unsigned i = 0; i < rt->nlocks; ++i) + util_mutex_destroy(&rt->run_locks[i]); + + VALGRIND_DO_DESTROY_MEMPOOL(heap->layout); + + D_FREE(rt); + heap->rt = NULL; +} + +/* + * heap_verify_header -- (internal) verifies if the heap header is consistent + */ +static int +heap_verify_header(struct heap_header *hdr) +{ + if (util_checksum(hdr, sizeof(*hdr), &hdr->checksum, 0, 0) != 1) { + D_CRIT("heap: invalid header's checksum\n"); + return -1; + } + + if (memcmp(hdr->signature, HEAP_SIGNATURE, HEAP_SIGNATURE_LEN) != 0) { + D_CRIT("heap: invalid signature\n"); + return -1; + } + + return 0; +} + +/* + * heap_verify_zone_header -- + * (internal) verifies if the zone header is consistent + */ +static int +heap_verify_zone_header(struct zone_header *hdr) +{ + if (hdr->magic != ZONE_HEADER_MAGIC) /* not initialized */ + return 0; + + if (hdr->size_idx == 0) { + D_CRIT("heap: invalid zone size\n"); + return -1; + } + + return 0; +} + +/* + * heap_verify_chunk_header -- + * (internal) verifies if the chunk header is consistent + */ +static int +heap_verify_chunk_header(struct chunk_header *hdr) 
+{ + if (hdr->type == CHUNK_TYPE_UNKNOWN) { + D_CRIT("heap: invalid chunk type\n"); + return -1; + } + + if (hdr->type >= MAX_CHUNK_TYPE) { + D_CRIT("heap: unknown chunk type\n"); + return -1; + } + + if (hdr->flags & ~CHUNK_FLAGS_ALL_VALID) { + D_CRIT("heap: invalid chunk flags\n"); + return -1; + } + + return 0; +} + +/* + * heap_verify_zone -- (internal) verifies if the zone is consistent + */ +static int +heap_verify_zone(struct zone *zone) +{ + if (zone->header.magic == 0) + return 0; /* not initialized, and that is OK */ + + if (zone->header.magic != ZONE_HEADER_MAGIC) { + D_CRIT("heap: invalid zone magic\n"); + return -1; + } + + if (heap_verify_zone_header(&zone->header)) + return -1; + + uint32_t i; + + for (i = 0; i < zone->header.size_idx; ) { + if (heap_verify_chunk_header(&zone->chunk_headers[i])) + return -1; + + i += zone->chunk_headers[i].size_idx; + } + + if (i != zone->header.size_idx) { + D_CRIT("heap: chunk sizes mismatch\n"); + return -1; + } + + return 0; +} + +/* + * heap_check -- verifies if the heap is consistent and can be opened properly + * + * If successful function returns zero. Otherwise an error number is returned. + */ +int +heap_check(void *heap_start, uint64_t heap_size) +{ + if (heap_size < HEAP_MIN_SIZE) { + D_CRIT("heap: invalid heap size\n"); + return -1; + } + + struct heap_layout *layout = heap_start; + + if (heap_verify_header(&layout->header)) + return -1; + + for (unsigned i = 0; i < heap_max_zone(heap_size); ++i) { + if (heap_verify_zone(ZID_TO_ZONE(layout, i))) + return -1; + } + + return 0; +} + +/* + * heap_check_remote -- verifies if the heap of a remote pool is consistent + * and can be opened properly + * + * If successful function returns zero. Otherwise an error number is returned. + */ +int +heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops) +{ + struct zone *zone_buff; + + if (heap_size < HEAP_MIN_SIZE) { + D_CRIT("heap: invalid heap size\n"); + return -1; + } + + struct heap_layout *layout = heap_start; + + struct heap_header header; + + if (ops->read(ops->ctx, ops->base, &header, &layout->header, + sizeof(struct heap_header))) { + D_CRIT("heap: obj_read_remote error\n"); + return -1; + } + + if (heap_verify_header(&header)) + return -1; + + D_ALLOC_PTR_NZ(zone_buff); + if (zone_buff == NULL) { + D_CRIT("heap: zone_buff malloc error\n"); + return -1; + } + for (unsigned i = 0; i < heap_max_zone(heap_size); ++i) { + if (ops->read(ops->ctx, ops->base, zone_buff, + ZID_TO_ZONE(layout, i), sizeof(struct zone))) { + D_CRIT("heap: obj_read_remote error\n"); + goto out; + } + + if (heap_verify_zone(zone_buff)) + goto out; + } + D_FREE(zone_buff); + return 0; + +out: + D_FREE(zone_buff); + return -1; +} + +/* + * heap_zone_foreach_object -- (internal) iterates through objects in a zone + */ +static int +heap_zone_foreach_object(struct palloc_heap *heap, object_callback cb, + void *arg, struct memory_block *m) +{ + struct zone *zone = ZID_TO_ZONE(heap->layout, m->zone_id); + + if (zone->header.magic == 0) + return 0; + + for (; m->chunk_id < zone->header.size_idx; ) { + struct chunk_header *hdr = heap_get_chunk_hdr(heap, m); + + memblock_rebuild_state(heap, m); + m->size_idx = hdr->size_idx; + + if (m->m_ops->iterate_used(m, cb, arg) != 0) + return 1; + + m->chunk_id += m->size_idx; + m->block_off = 0; + } + + return 0; +} + +/* + * heap_foreach_object -- (internal) iterates through objects in the heap + */ +void +heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg, + struct memory_block m) 
+{ + for (; m.zone_id < heap->rt->nzones; ++m.zone_id) { + if (heap_zone_foreach_object(heap, cb, arg, &m) != 0) + break; + + m.chunk_id = 0; + } +} + +#if VG_MEMCHECK_ENABLED +/* + * heap_vg_open -- notifies Valgrind about heap layout + */ +void +heap_vg_open(struct palloc_heap *heap, object_callback cb, + void *arg, int objects) +{ + ASSERTne(cb, NULL); + VALGRIND_DO_MAKE_MEM_UNDEFINED(heap->layout, *heap->sizep); + + struct heap_layout *layout = heap->layout; + + VALGRIND_DO_MAKE_MEM_DEFINED(&layout->header, sizeof(layout->header)); + + unsigned zones = heap_max_zone(*heap->sizep); + struct memory_block m = MEMORY_BLOCK_NONE; + + for (unsigned i = 0; i < zones; ++i) { + struct zone *z = ZID_TO_ZONE(layout, i); + uint32_t chunks; + + m.zone_id = i; + m.chunk_id = 0; + + VALGRIND_DO_MAKE_MEM_DEFINED(&z->header, sizeof(z->header)); + + if (z->header.magic != ZONE_HEADER_MAGIC) + continue; + + chunks = z->header.size_idx; + + for (uint32_t c = 0; c < chunks; ) { + struct chunk_header *hdr = &z->chunk_headers[c]; + + /* define the header before rebuilding state */ + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + m.chunk_id = c; + m.size_idx = hdr->size_idx; + + memblock_rebuild_state(heap, &m); + + m.m_ops->vg_init(&m, objects, cb, arg); + m.block_off = 0; + + ASSERT(hdr->size_idx > 0); + + c += hdr->size_idx; + } + + /* mark all unused chunk headers after last as not accessible */ + VALGRIND_DO_MAKE_MEM_NOACCESS(&z->chunk_headers[chunks], + (MAX_CHUNK - chunks) * sizeof(struct chunk_header)); + } +} +#endif diff --git a/src/common/dav_v2/heap.h b/src/common/dav_v2/heap.h new file mode 100644 index 00000000000..21f6d0dfd0b --- /dev/null +++ b/src/common/dav_v2/heap.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * heap.h -- internal definitions for heap + */ + +#ifndef __DAOS_COMMON_HEAP_H +#define __DAOS_COMMON_HEAP_H 1 + +#include <stddef.h> +#include <stdint.h> + +#include "memblock.h" +#include "bucket.h" +#include "memops.h" +#include "palloc.h" +#include "dav_internal.h" + +#define HEAP_OFF_TO_PTR(heap, off) ((void *)((char *)((heap)->base) + (off))) +#define HEAP_PTR_TO_OFF(heap, ptr) ((uintptr_t)(ptr) - (uintptr_t)((heap)->base)) + +#define BIT_IS_CLR(a, i) (!((a) & (1ULL << (i)))) +#define HEAP_ARENA_PER_THREAD (0) + +int +heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, uint64_t *sizep, + void *base, struct mo_ops *p_ops, struct stats *stats, struct pool_set *set); +int +heap_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops); +void +heap_cleanup(struct palloc_heap *heap); +int +heap_check(void *heap_start, uint64_t heap_size); +int +heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops); +int +heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c); +int +heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size); + +struct alloc_class * +heap_get_best_class(struct palloc_heap *heap, size_t size); + +struct bucket * +zoneset_bucket_acquire(struct zoneset *zset, uint8_t class_id); +void +zoneset_bucket_release(struct bucket *b); + +int +heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, struct memory_block *m); +pthread_mutex_t * +heap_get_run_lock(struct palloc_heap *heap, uint32_t chunk_id); + +void +heap_discard_run(struct palloc_heap *heap, struct memory_block *m); + +void +heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m); + +int 
+heap_free_chunk_reuse(struct palloc_heap *heap, struct bucket *bucket, struct memory_block *m); + +void +heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg, + struct memory_block start); + +struct alloc_class_collection * +heap_alloc_classes(struct palloc_heap *heap); + +void * +heap_end(struct palloc_heap *heap); + +void +heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int objects); + +static inline struct chunk_header * +heap_get_chunk_hdr(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK_HDR(heap->layout, m->zone_id, m->chunk_id); +} + +static inline struct chunk * +heap_get_chunk(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK(heap->layout, m->zone_id, m->chunk_id); +} + +static inline struct chunk_run * +heap_get_chunk_run(struct palloc_heap *heap, const struct memory_block *m) +{ + return GET_CHUNK_RUN(heap->layout, m->zone_id, m->chunk_id); +} + +struct zoneset * +heap_get_zoneset(struct palloc_heap *heap, uint32_t zone_id); + +#endif /* __DAOS_COMMON_HEAP_H */ diff --git a/src/common/dav_v2/heap_layout.h b/src/common/dav_v2/heap_layout.h new file mode 100644 index 00000000000..c7209670103 --- /dev/null +++ b/src/common/dav_v2/heap_layout.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2020, Intel Corporation */ + +/* + * heap_layout.h -- internal definitions for heap layout + */ + +#ifndef __DAOS_COMMON_HEAP_LAYOUT_H +#define __DAOS_COMMON_HEAP_LAYOUT_H 1 + +#include <stddef.h> +#include <stdint.h> + +#define HEAP_MAJOR 1 +#define HEAP_MINOR 0 + +#define MAX_CHUNK (UINT16_MAX - 7) /* has to be multiple of 8 */ +#define CHUNK_BASE_ALIGNMENT 1024 +#define CHUNKSIZE ((size_t)1024 * 256) /* 256 kilobytes */ +#define MAX_MEMORY_BLOCK_SIZE (MAX_CHUNK * CHUNKSIZE) +#define HEAP_SIGNATURE_LEN 16 +#define HEAP_SIGNATURE "MEMORY_HEAP_HDR\0" +#define ZONE_HEADER_MAGIC 0xC3F0A2D2 +#define ZONE_MIN_SIZE (sizeof(struct zone) + sizeof(struct chunk)) +#define ZONE_MAX_SIZE (sizeof(struct zone) + sizeof(struct chunk) * MAX_CHUNK) +#define HEAP_MIN_SIZE (sizeof(struct heap_layout) + ZONE_MIN_SIZE) + +/* Base bitmap values, relevant for both normal and flexible bitmaps */ +#define RUN_BITS_PER_VALUE 64U +#define RUN_BASE_METADATA_VALUES\ + ((unsigned)(sizeof(struct chunk_run_header) / sizeof(uint64_t))) +#define RUN_BASE_METADATA_SIZE (sizeof(struct chunk_run_header)) + +#define RUN_CONTENT_SIZE (CHUNKSIZE - RUN_BASE_METADATA_SIZE) + +/* + * Calculates the size in bytes of a single run instance, including bitmap + */ +#define RUN_CONTENT_SIZE_BYTES(size_idx)\ +(RUN_CONTENT_SIZE + (((size_idx) - 1) * CHUNKSIZE)) + +/* Default bitmap values, specific for old, non-flexible, bitmaps */ +#define RUN_DEFAULT_METADATA_VALUES 40 /* in 8 byte words, 320 bytes total */ +#define RUN_DEFAULT_BITMAP_VALUES \ + (RUN_DEFAULT_METADATA_VALUES - RUN_BASE_METADATA_VALUES) +#define RUN_DEFAULT_BITMAP_SIZE (sizeof(uint64_t) * RUN_DEFAULT_BITMAP_VALUES) +#define RUN_DEFAULT_BITMAP_NBITS\ + (RUN_BITS_PER_VALUE * RUN_DEFAULT_BITMAP_VALUES) +#define RUN_DEFAULT_SIZE \ + (CHUNKSIZE - RUN_BASE_METADATA_SIZE - RUN_DEFAULT_BITMAP_SIZE) + +/* + * Calculates the size in bytes of a single run instance, without bitmap, + * but only for the default fixed-bitmap algorithm + */ +#define RUN_DEFAULT_SIZE_BYTES(size_idx)\ +(RUN_DEFAULT_SIZE + (((size_idx) - 1) * CHUNKSIZE)) + +#define CHUNK_MASK ((CHUNKSIZE) - 1) +#define CHUNK_ALIGN_UP(value) ((((value) + CHUNK_MASK) & ~CHUNK_MASK)) + +enum 
chunk_flags { + CHUNK_FLAG_COMPACT_HEADER = 0x0001, + CHUNK_FLAG_HEADER_NONE = 0x0002, + CHUNK_FLAG_ALIGNED = 0x0004, + CHUNK_FLAG_FLEX_BITMAP = 0x0008, +}; + +#define CHUNK_FLAGS_ALL_VALID (\ + CHUNK_FLAG_COMPACT_HEADER |\ + CHUNK_FLAG_HEADER_NONE |\ + CHUNK_FLAG_ALIGNED |\ + CHUNK_FLAG_FLEX_BITMAP\ +) + +enum chunk_type { + CHUNK_TYPE_UNKNOWN, + CHUNK_TYPE_FOOTER, /* not actual chunk type */ + CHUNK_TYPE_FREE, + CHUNK_TYPE_USED, + CHUNK_TYPE_RUN, + CHUNK_TYPE_RUN_DATA, + + MAX_CHUNK_TYPE +}; + +struct chunk { + uint8_t data[CHUNKSIZE]; +}; + +struct chunk_run_header { + uint64_t block_size; + uint64_t alignment; /* valid only /w CHUNK_FLAG_ALIGNED */ +}; + +struct chunk_run { + struct chunk_run_header hdr; + uint8_t content[RUN_CONTENT_SIZE]; /* bitmap + data */ +}; + +struct chunk_header { + uint16_t type; + uint16_t flags; + uint32_t size_idx; +}; + +struct zone_header { + uint32_t magic; + uint32_t size_idx; + uint8_t reserved[56]; +}; + +struct zone { + struct zone_header header; + struct chunk_header chunk_headers[MAX_CHUNK]; + struct chunk chunks[]; +}; + +struct heap_header { + char signature[HEAP_SIGNATURE_LEN]; + uint64_t major; + uint64_t minor; + uint64_t unused; /* might be garbage */ + uint64_t chunksize; + uint64_t chunks_per_zone; + uint8_t reserved[960]; + uint64_t checksum; +}; + +struct heap_layout { + struct heap_header header; + struct zone zone0; /* first element of zones array */ +}; + +#define ALLOC_HDR_SIZE_SHIFT (48ULL) +#define ALLOC_HDR_FLAGS_MASK (((1ULL) << ALLOC_HDR_SIZE_SHIFT) - 1) + +struct allocation_header_legacy { + uint8_t unused[8]; + uint64_t size; + uint8_t unused2[32]; + uint64_t root_size; + uint64_t type_num; +}; + +#define ALLOC_HDR_COMPACT_SIZE sizeof(struct allocation_header_compact) + +struct allocation_header_compact { + uint64_t size; + uint64_t extra; +}; + +enum header_type { + HEADER_LEGACY, + HEADER_COMPACT, + HEADER_NONE, + + MAX_HEADER_TYPES +}; + +static const size_t header_type_to_size[MAX_HEADER_TYPES] = { + sizeof(struct allocation_header_legacy), + sizeof(struct allocation_header_compact), + 0 +}; + +static const enum chunk_flags header_type_to_flag[MAX_HEADER_TYPES] = { + (enum chunk_flags)0, + CHUNK_FLAG_COMPACT_HEADER, + CHUNK_FLAG_HEADER_NONE +}; + +static inline struct zone * +ZID_TO_ZONE(struct heap_layout *layout, size_t zone_id) +{ + return (struct zone *) + ((uintptr_t)&layout->zone0 + ZONE_MAX_SIZE * zone_id); +} + +static inline struct chunk_header * +GET_CHUNK_HDR(struct heap_layout *layout, size_t zone_id, unsigned chunk_id) +{ + return &ZID_TO_ZONE(layout, zone_id)->chunk_headers[chunk_id]; +} + +static inline struct chunk * +GET_CHUNK(struct heap_layout *layout, size_t zone_id, unsigned chunk_id) +{ + return &ZID_TO_ZONE(layout, zone_id)->chunks[chunk_id]; +} + +static inline struct chunk_run * +GET_CHUNK_RUN(struct heap_layout *layout, size_t zone_id, unsigned chunk_id) +{ + return (struct chunk_run *)GET_CHUNK(layout, zone_id, chunk_id); +} + +#endif /* __DAOS_COMMON_HEAP_LAYOUT_H */ diff --git a/src/common/dav_v2/memblock.c b/src/common/dav_v2/memblock.c new file mode 100644 index 00000000000..cf3204432b1 --- /dev/null +++ b/src/common/dav_v2/memblock.c @@ -0,0 +1,1615 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2021, Intel Corporation */ + +/* + * memblock.c -- implementation of memory block + * + * Memory block is a representation of persistent object that resides in the + * heap. A valid memory block must be either a huge (free or used) chunk or a + * block inside a run. 
+ * + * Huge blocks are 1:1 correlated with the chunk headers in the zone whereas + * run blocks are represented by bits in corresponding chunk bitmap. + * + * This file contains implementations of abstract operations on memory blocks. + * Instead of storing the mbops structure inside each memory block the correct + * method implementation is chosen at runtime. + */ + +#include <string.h> + +#include "obj.h" +#include "heap.h" +#include "memblock.h" +#include "out.h" +#include "valgrind_internal.h" +#include "alloc_class.h" + +/* calculates the size of the entire run, including any additional chunks */ +#define SIZEOF_RUN(runp, size_idx)\ + (sizeof(*(runp)) + (((size_idx) - 1) * CHUNKSIZE)) + +/* + * memblock_header_type -- determines the memory block's header type + */ +static enum header_type +memblock_header_type(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->flags & CHUNK_FLAG_COMPACT_HEADER) + return HEADER_COMPACT; + + if (hdr->flags & CHUNK_FLAG_HEADER_NONE) + return HEADER_NONE; + + return HEADER_LEGACY; +} + +/* + * memblock_header_legacy_get_size -- + * (internal) returns the size stored in a legacy header + */ +static size_t +memblock_header_legacy_get_size(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return hdr->size; +} + +/* + * memblock_header_compact_get_size -- + * (internal) returns the size stored in a compact header + */ +static size_t +memblock_header_compact_get_size(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return hdr->size & ALLOC_HDR_FLAGS_MASK; +} + +/* + * memblock_header_none_get_size -- + * (internal) determines the sizes of an object without a header + */ +static size_t +memblock_header_none_get_size(const struct memory_block *m) +{ + return m->m_ops->block_size(m); +} + +/* + * memblock_header_legacy_get_extra -- + * (internal) returns the extra field stored in a legacy header + */ +static uint64_t +memblock_header_legacy_get_extra(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return hdr->type_num; +} + +/* + * memblock_header_compact_get_extra -- + * (internal) returns the extra field stored in a compact header + */ +static uint64_t +memblock_header_compact_get_extra(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return hdr->extra; +} + +/* + * memblock_header_none_get_extra -- + * (internal) objects without a header don't have an extra field + */ +static uint64_t +memblock_header_none_get_extra(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 0; +} + +/* + * memblock_header_legacy_get_flags -- + * (internal) returns the flags stored in a legacy header + */ +static uint16_t +memblock_header_legacy_get_flags(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + return (uint16_t)(hdr->root_size >> ALLOC_HDR_SIZE_SHIFT); +} + +/* + * memblock_header_compact_get_flags -- + * (internal) returns the flags stored in a compact header + */ +static uint16_t +memblock_header_compact_get_flags(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + return (uint16_t)(hdr->size >> ALLOC_HDR_SIZE_SHIFT); +} + +/* + * memblock_header_none_get_flags -- + * (internal) objects without a header do not support flags + */ 
+static uint16_t +memblock_header_none_get_flags(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 0; +} + +/* + * memblock_header_legacy_write -- + * (internal) writes a legacy header of an object + */ +static void +memblock_header_legacy_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + struct allocation_header_legacy hdr; + + hdr.size = size; + hdr.type_num = extra; + hdr.root_size = ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT); + + struct allocation_header_legacy *hdrp = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp)); + + VALGRIND_ADD_TO_TX(hdrp, sizeof(*hdrp)); + memcpy(hdrp, &hdr, sizeof(hdr)); /* legacy header is 64 bytes in size */ + VALGRIND_REMOVE_FROM_TX(hdrp, sizeof(*hdrp)); + + /* unused fields of the legacy headers are used as a red zone */ + VALGRIND_DO_MAKE_MEM_NOACCESS(hdrp->unused, sizeof(hdrp->unused)); +} + +/* + * memblock_header_compact_write -- + * (internal) writes a compact header of an object + */ +static void +memblock_header_compact_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + COMPILE_ERROR_ON(ALLOC_HDR_COMPACT_SIZE > CACHELINE_SIZE); + + struct { + struct allocation_header_compact hdr; + uint8_t padding[CACHELINE_SIZE - ALLOC_HDR_COMPACT_SIZE]; + } padded; + + /* + * REVISIT: + * Below memset is added to prevent valgrind propagating the + * cleared V-Bits of the padding field all the way till DMA buffer + * as part of logging by WAL. + * This code needs to be revisited when valgrind macros are + * enabled within DAV. + */ + padded.hdr.size = size | ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT); + padded.hdr.extra = extra; + + struct allocation_header_compact *hdrp = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp)); + + /* + * If possible write the entire header with a single memcpy, this allows + * the copy implementation to avoid a cache miss on a partial cache line + * write. 
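+ * This is only possible when the header is cacheline-aligned and the
+ * allocation is large enough to cover the padded header; otherwise just
+ * the compact header itself is written.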
+ */ + size_t hdr_size = ALLOC_HDR_COMPACT_SIZE; + + if ((uintptr_t)hdrp % CACHELINE_SIZE == 0 && size >= sizeof(padded)) + hdr_size = sizeof(padded); + + VALGRIND_ADD_TO_TX(hdrp, hdr_size); + + memcpy(hdrp, &padded, hdr_size); + VALGRIND_DO_MAKE_MEM_UNDEFINED((char *)hdrp + ALLOC_HDR_COMPACT_SIZE, + hdr_size - ALLOC_HDR_COMPACT_SIZE); + + VALGRIND_REMOVE_FROM_TX(hdrp, hdr_size); +} + +/* + * memblock_header_none_write -- + * (internal) nothing to write + */ +static void +memblock_header_none_write(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m, size, extra, flags); + + /* NOP */ +} + +/* + * memblock_header_legacy_invalidate -- + * (internal) invalidates a legacy header + */ +static void +memblock_header_legacy_invalidate(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + VALGRIND_SET_CLEAN(hdr, sizeof(*hdr)); +} + +/* + * memblock_header_compact_invalidate -- + * (internal) invalidates a compact header + */ +static void +memblock_header_compact_invalidate(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + VALGRIND_SET_CLEAN(hdr, sizeof(*hdr)); +} + +/* + * memblock_no_header_invalidate -- + * (internal) nothing to invalidate + */ +static void +memblock_header_none_invalidate(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* NOP */ +} + +/* + * memblock_header_legacy_reinit -- + * (internal) reinitializes a legacy header after a heap restart + */ +static void +memblock_header_legacy_reinit(const struct memory_block *m) +{ + struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* unused fields of the legacy headers are used as a red zone */ + VALGRIND_DO_MAKE_MEM_NOACCESS(hdr->unused, sizeof(hdr->unused)); +} + +/* + * memblock_header_compact_reinit -- + * (internal) reinitializes a compact header after a heap restart + */ +static void +memblock_header_compact_reinit(const struct memory_block *m) +{ + struct allocation_header_compact *hdr = m->m_ops->get_real_data(m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); +} + +/* + * memblock_header_none_reinit -- + * (internal) nothing to reinitialize + */ +static void +memblock_header_none_reinit(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* NOP */ +} + +static const struct { + /* determines the sizes of an object */ + size_t (*get_size)(const struct memory_block *m); + + /* returns the extra field (if available, 0 if not) */ + uint64_t (*get_extra)(const struct memory_block *m); + + /* returns the flags stored in a header (if available, 0 if not) */ + uint16_t (*get_flags)(const struct memory_block *m); + + /* + * Stores size, extra info and flags in header of an object + * (if available, does nothing otherwise). + */ + void (*write)(const struct memory_block *m, + size_t size, uint64_t extra, uint16_t flags); + void (*invalidate)(const struct memory_block *m); + + /* + * Reinitializes a header after a heap restart (if available, does + * nothing otherwise) (VG). 
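+ *
+ * Each of these operations is selected at runtime by indexing the table
+ * below with the block's header type, for example:
+ *
+ *	memblock_header_ops[m->header_type].get_size(m);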
+ */ + void (*reinit)(const struct memory_block *m); +} memblock_header_ops[MAX_HEADER_TYPES] = { + [HEADER_LEGACY] = { + memblock_header_legacy_get_size, + memblock_header_legacy_get_extra, + memblock_header_legacy_get_flags, + memblock_header_legacy_write, + memblock_header_legacy_invalidate, + memblock_header_legacy_reinit, + }, + [HEADER_COMPACT] = { + memblock_header_compact_get_size, + memblock_header_compact_get_extra, + memblock_header_compact_get_flags, + memblock_header_compact_write, + memblock_header_compact_invalidate, + memblock_header_compact_reinit, + }, + [HEADER_NONE] = { + memblock_header_none_get_size, + memblock_header_none_get_extra, + memblock_header_none_get_flags, + memblock_header_none_write, + memblock_header_none_invalidate, + memblock_header_none_reinit, + } +}; + +/* + * memblock_run_default_nallocs -- returns the number of memory blocks + * available in the in a run with given parameters using the default + * fixed-bitmap algorithm + */ +static unsigned +memblock_run_default_nallocs(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(flags); + + unsigned nallocs = (unsigned) + (RUN_DEFAULT_SIZE_BYTES(*size_idx) / unit_size); + + while (nallocs > RUN_DEFAULT_BITMAP_NBITS) { + /* trying to create a run with number of units exceeding the bitmap size */ + DAV_DBG("run:%lu number of units %u exceeds bitmap size (%u)", + unit_size, nallocs, RUN_DEFAULT_BITMAP_NBITS); + if (*size_idx > 1) { + *size_idx -= 1; + /* recalculate the number of allocations */ + nallocs = (uint32_t) + (RUN_DEFAULT_SIZE_BYTES(*size_idx) / unit_size); + /* run was constructed with fewer chunks (minus one) */ + D_INFO("run:%lu constructed with fewer chunks:%u\n", + unit_size, *size_idx); + } else { + /* + * run was constructed with fewer units than optimal, + * this might lead to inefficient memory utilization! + */ + D_INFO("run:%lu constructed with fewer units:%u than optimal:%u\n", + unit_size, RUN_DEFAULT_BITMAP_NBITS, nallocs); + + nallocs = RUN_DEFAULT_BITMAP_NBITS; + } + } + + return nallocs - (alignment ? 1 : 0); +} + +/* + * memblock_run_bitmap -- calculate bitmap parameters for given arguments + */ +void +memblock_run_bitmap(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment, void *content, + struct run_bitmap *b) +{ + ASSERTne(*size_idx, 0); + + /* + * Flexible bitmaps have a variably sized values array. The size varies + * depending on: + * alignment - initial run alignment might require up-to a unit + * size idx - the larger the run, the more units it carries + * unit_size - the smaller the unit size, the more units per run + * + * The size of the bitmap also has to be calculated in such a way that + * the beginning of allocations data is cacheline aligned. This is + * required to perform many optimizations throughout the codebase. + * This alignment requirement means that some of the bitmap values might + * remain unused and will serve only as a padding for data. + */ + if (flags & CHUNK_FLAG_FLEX_BITMAP) { + /* + * First calculate the number of values without accounting for + * the bitmap size. + */ + size_t content_size = RUN_CONTENT_SIZE_BYTES(*size_idx); + + b->nbits = (unsigned)(content_size / unit_size); + b->nvalues = util_div_ceil(b->nbits, RUN_BITS_PER_VALUE); + + /* + * Then, align the number of values up, so that the cacheline + * alignment is preserved. 
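+ *
+ * For example, assuming a 64-byte cacheline and 8-byte bitmap values,
+ * the value count (plus the two run-header metadata values) is rounded
+ * up to a multiple of 8 here.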
+ */ + b->nvalues = ALIGN_UP(b->nvalues + RUN_BASE_METADATA_VALUES, + (unsigned)(CACHELINE_SIZE / sizeof(*b->values))) + - RUN_BASE_METADATA_VALUES; + + /* + * This is the total number of bytes needed for the bitmap AND + * padding. + */ + b->size = b->nvalues * sizeof(*b->values); + + /* + * Calculate the number of allocations again, but this time + * accounting for the bitmap/padding. + */ + b->nbits = (unsigned)((content_size - b->size) / unit_size) + - (alignment ? 1U : 0U); + + /* + * The last step is to calculate how much of the padding + * is left at the end of the bitmap. + */ + unsigned unused_bits = (b->nvalues * RUN_BITS_PER_VALUE) + - b->nbits; + unsigned unused_values = unused_bits / RUN_BITS_PER_VALUE; + + b->nvalues -= unused_values; + b->values = (uint64_t *)content; + + return; + } + + b->size = RUN_DEFAULT_BITMAP_SIZE; + b->nbits = memblock_run_default_nallocs(size_idx, flags, + unit_size, alignment); + + unsigned unused_bits = RUN_DEFAULT_BITMAP_NBITS - b->nbits; + unsigned unused_values = unused_bits / RUN_BITS_PER_VALUE; + + b->nvalues = RUN_DEFAULT_BITMAP_VALUES - unused_values; + b->values = (uint64_t *)content; +} + +/* + * run_get_bitmap -- initializes run bitmap information + */ +static void +run_get_bitmap(const struct memory_block *m, struct run_bitmap *b) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + if (m->cached_bitmap != NULL) { + *b = *m->cached_bitmap; + b->values = (uint64_t *)run->content; + } else { + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + uint32_t size_idx = hdr->size_idx; + + memblock_run_bitmap(&size_idx, hdr->flags, run->hdr.block_size, + run->hdr.alignment, run->content, b); + ASSERTeq(size_idx, hdr->size_idx); + } +} + +/* + * huge_block_size -- returns the compile-time constant which defines the + * huge memory block size. + */ +static size_t +huge_block_size(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return CHUNKSIZE; +} + +/* + * run_block_size -- looks for the right chunk and returns the block size + * information that is attached to the run block metadata. + */ +static size_t +run_block_size(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + return run->hdr.block_size; +} + +/* + * huge_get_real_data -- returns pointer to the beginning data of a huge block + */ +static void * +huge_get_real_data(const struct memory_block *m) +{ + return heap_get_chunk(m->heap, m)->data; +} + +/* + * run_get_data_start -- (internal) returns the pointer to the beginning of + * allocations in a run + */ +static char * +run_get_data_start(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + struct run_bitmap b; + + run_get_bitmap(m, &b); + + if (hdr->flags & CHUNK_FLAG_ALIGNED) { + /* + * Alignment is property of user data in allocations. And + * since objects have headers, we need to take them into + * account when calculating the address. 
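+ * The pointer returned below is chosen so that the user data, i.e. the
+ * address just past the allocation header, lands on the requested
+ * alignment boundary.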
+ */ + uintptr_t hsize = header_type_to_size[m->header_type]; + uintptr_t base = (uintptr_t)run->content + + b.size + hsize; + return (char *)(ALIGN_UP(base, run->hdr.alignment) - hsize); + } else { + return (char *)&run->content + b.size; + } +} + +/* + * run_get_data_offset -- (internal) returns the number of bytes between + * run base metadata and data + */ +static size_t +run_get_data_offset(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + return (size_t)run_get_data_start(m) - (size_t)&run->content; +} + +/* + * run_get_real_data -- returns pointer to the beginning data of a run block + */ +static void * +run_get_real_data(const struct memory_block *m) +{ + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + ASSERT(run->hdr.block_size != 0); + + return run_get_data_start(m) + (run->hdr.block_size * m->block_off); +} + +/* + * block_get_user_data -- returns pointer to the data of a block + */ +static void * +block_get_user_data(const struct memory_block *m) +{ + return (char *)m->m_ops->get_real_data(m) + + header_type_to_size[m->header_type]; +} + +/* + * chunk_get_chunk_hdr_value -- (internal) get value of a header for redo log + */ +static uint64_t +chunk_get_chunk_hdr_value(uint16_t type, uint16_t flags, uint32_t size_idx) +{ + uint64_t val; + struct chunk_header hdr; + + COMPILE_ERROR_ON(sizeof(struct chunk_header) != sizeof(uint64_t)); + + hdr.type = type; + hdr.flags = flags; + hdr.size_idx = size_idx; + memcpy(&val, &hdr, sizeof(val)); + + return val; +} + +/* + * huge_prep_operation_hdr -- prepares the new value of a chunk header that will + * be set after the operation concludes. + */ +static void +huge_prep_operation_hdr(const struct memory_block *m, enum memblock_state op, + struct operation_context *ctx) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + /* + * Depending on the operation that needs to be performed a new chunk + * header needs to be prepared with the new chunk state. + */ + uint64_t val = chunk_get_chunk_hdr_value( + op == MEMBLOCK_ALLOCATED ? CHUNK_TYPE_USED : CHUNK_TYPE_FREE, + hdr->flags, + m->size_idx); + + if (ctx == NULL) { + util_atomic_store_explicit64((uint64_t *)hdr, val, + memory_order_relaxed); + mo_wal_persist(&m->heap->p_ops, hdr, sizeof(*hdr)); + } else { + operation_add_entry(ctx, hdr, val, ULOG_OPERATION_SET); + } + + VALGRIND_DO_MAKE_MEM_NOACCESS(hdr + 1, + (hdr->size_idx - 1) * sizeof(struct chunk_header)); + + /* + * In the case of chunks larger than one unit the footer must be + * created immediately AFTER the persistent state is safely updated. + */ + if (m->size_idx == 1) + return; + + struct chunk_header *footer = hdr + m->size_idx - 1; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(footer, sizeof(*footer)); + + val = chunk_get_chunk_hdr_value(CHUNK_TYPE_FOOTER, 0, m->size_idx); + + /* + * It's only safe to write the footer AFTER the persistent part of + * the operation have been successfully processed because the footer + * pointer might point to a currently valid persistent state + * of a different chunk. + * The footer entry change is updated as transient because it will + * be recreated at heap boot regardless - it's just needed for runtime + * operations. 
+ */ + if (ctx == NULL) { + util_atomic_store_explicit64((uint64_t *)footer, val, + memory_order_relaxed); + VALGRIND_SET_CLEAN(footer, sizeof(*footer)); + } else { + operation_add_typed_entry(ctx, + footer, val, ULOG_OPERATION_SET, LOG_TRANSIENT); + } +} + +/* + * run_prep_operation_hdr -- prepares the new value for a select few bytes of + * a run bitmap that will be set after the operation concludes. + * + * It's VERY important to keep in mind that the particular value of the + * bitmap this method is modifying must not be changed after this function + * is called and before the operation is processed. + */ +static void +run_prep_operation_hdr(const struct memory_block *m, enum memblock_state op, + struct operation_context *ctx) +{ + ASSERT(m->size_idx <= RUN_BITS_PER_VALUE); + ASSERT(m->size_idx > 0); + + /* + * Free blocks are represented by clear bits and used blocks by set + * bits - which is the reverse of the commonly used scheme. + * + * Here a bit mask is prepared that flips the bits that represent the + * memory block provided by the caller - because both the size index and + * the block offset are tied 1:1 to the bitmap this operation is + * relatively simple. + */ + uint64_t bmask; + +#ifdef WAL_SUPPORTS_AND_OR_OPS + if (m->size_idx == RUN_BITS_PER_VALUE) { + ASSERTeq(m->block_off % RUN_BITS_PER_VALUE, 0); + bmask = UINT64_MAX; + } else { + bmask = ((1ULL << m->size_idx) - 1ULL) << + (m->block_off % RUN_BITS_PER_VALUE); + } +#else + uint16_t num = m->size_idx; + uint32_t pos = m->block_off % RUN_BITS_PER_VALUE; + + ASSERT_rt(num > 0 && num <= RUN_BITS_PER_VALUE); + bmask = ULOG_ENTRY_TO_VAL(pos, num); +#endif + + /* + * The run bitmap is composed of several 8 byte values, so a proper + * element of the bitmap array must be selected. + */ + unsigned bpos = m->block_off / RUN_BITS_PER_VALUE; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + /* the bit mask is applied immediately by the add entry operations */ + if (op == MEMBLOCK_ALLOCATED) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_OR); +#else + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_SET_BITS); +#endif + } else if (op == MEMBLOCK_FREE) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + operation_add_entry(ctx, &b.values[bpos], + ~bmask, ULOG_OPERATION_AND); +#else + operation_add_entry(ctx, &b.values[bpos], + bmask, ULOG_OPERATION_CLR_BITS); +#endif + } else { + ASSERT(0); + } +} + +/* + * huge_get_lock -- because huge memory blocks are always allocated from a + * single bucket there's no reason to lock them - the bucket itself is + * protected. + */ +static pthread_mutex_t * +huge_get_lock(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return NULL; +} + +/* + * run_get_lock -- gets the runtime mutex from the heap. 
+ */ +static pthread_mutex_t * +run_get_lock(const struct memory_block *m) +{ + return heap_get_run_lock(m->heap, m->chunk_id); +} + +/* + * huge_get_state -- returns whether a huge block is allocated or not + */ +static enum memblock_state +huge_get_state(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->type == CHUNK_TYPE_USED) + return MEMBLOCK_ALLOCATED; + + if (hdr->type == CHUNK_TYPE_FREE) + return MEMBLOCK_FREE; + + return MEMBLOCK_STATE_UNKNOWN; +} + +/* + * huge_get_state -- returns whether a block from a run is allocated or not + */ +static enum memblock_state +run_get_state(const struct memory_block *m) +{ + struct run_bitmap b; + + run_get_bitmap(m, &b); + + unsigned v = m->block_off / RUN_BITS_PER_VALUE; + uint64_t bitmap = b.values[v]; + unsigned bit = m->block_off % RUN_BITS_PER_VALUE; + + unsigned bit_last = bit + m->size_idx; + + ASSERT(bit_last <= RUN_BITS_PER_VALUE); + + for (unsigned i = bit; i < bit_last; ++i) { + if (!BIT_IS_CLR(bitmap, i)) + return MEMBLOCK_ALLOCATED; + } + + return MEMBLOCK_FREE; +} + +/* + * huge_ensure_header_type -- checks the header type of a chunk and modifies + * it if necessary. This is fail-safe atomic. + */ +static void +huge_ensure_header_type(const struct memory_block *m, + enum header_type t) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_FREE); + + if ((hdr->flags & header_type_to_flag[t]) == 0) { + VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr)); + uint16_t f = ((uint16_t)header_type_to_flag[t]); + uint64_t nhdr = chunk_get_chunk_hdr_value(hdr->type, + hdr->flags | f, hdr->size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + nhdr, memory_order_relaxed); + mo_wal_persist(&m->heap->p_ops, hdr, sizeof(*hdr)); + VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr)); + } +} + +/* + * run_ensure_header_type -- runs must be created with appropriate header type. + */ +static void +run_ensure_header_type(const struct memory_block *m, + enum header_type t) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m, t); + +#ifdef DAV_EXTRA_DEBUG + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + ASSERTeq(hdr->type, CHUNK_TYPE_RUN); + ASSERT((hdr->flags & header_type_to_flag[t]) == header_type_to_flag[t]); +#endif +} + +/* + * block_get_real_size -- returns the size of a memory block that includes all + * of the overhead (headers) + */ +static size_t +block_get_real_size(const struct memory_block *m) +{ + /* + * There are two valid ways to get a size. If the memory block + * initialized properly and the size index is set, the chunk unit size + * can be simply multiplied by that index, otherwise we need to look at + * the allocation header. + */ + if (m->size_idx != 0) + return m->m_ops->block_size(m) * m->size_idx; + else + return memblock_header_ops[m->header_type].get_size(m); +} + +/* + * block_get_user_size -- returns the size of a memory block without overheads, + * this is the size of a data block that can be used. 
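+ *
+ * For example, a block with a compact header reports 16 bytes less than
+ * its real size, while a header-less block reports the real size
+ * unchanged.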
+ */ +static size_t +block_get_user_size(const struct memory_block *m) +{ + return block_get_real_size(m) - header_type_to_size[m->header_type]; +} + +/* + * block_write_header -- writes a header of an allocation + */ +static void +block_write_header(const struct memory_block *m, + uint64_t extra_field, uint16_t flags) +{ + memblock_header_ops[m->header_type].write(m, + block_get_real_size(m), extra_field, flags); +} + +/* + * block_invalidate -- invalidates allocation data and header + */ +static void +block_invalidate(const struct memory_block *m) +{ + void *data = m->m_ops->get_user_data(m); + size_t size = m->m_ops->get_user_size(m); + + VALGRIND_SET_CLEAN(data, size); + + memblock_header_ops[m->header_type].invalidate(m); +} + +/* + * block_reinit_header -- reinitializes a block after a heap restart + */ +static void +block_reinit_header(const struct memory_block *m) +{ + memblock_header_ops[m->header_type].reinit(m); +} + +/* + * block_get_extra -- returns the extra field of an allocation + */ +static uint64_t +block_get_extra(const struct memory_block *m) +{ + return memblock_header_ops[m->header_type].get_extra(m); +} + +/* + * block_get_flags -- returns the flags of an allocation + */ +static uint16_t +block_get_flags(const struct memory_block *m) +{ + return memblock_header_ops[m->header_type].get_flags(m); +} + +/* + * heap_run_process_bitmap_value -- (internal) looks for unset bits in the + * value, creates a valid memory block out of them and inserts that + * block into the given bucket. + */ +static int +run_process_bitmap_value(const struct memory_block *m, + uint64_t value, uint32_t base_offset, object_callback cb, void *arg) +{ + int ret = 0; + uint64_t shift = 0; /* already processed bits */ + struct memory_block s = *m; + + do { + /* + * Shift the value so that the next memory block starts on the + * least significant position: + * ..............0 (free block) + * or ..............1 (used block) + */ + uint64_t shifted = value >> shift; + + /* all clear or set bits indicate the end of traversal */ + if (shifted == 0) { + /* + * Insert the remaining blocks as free. Remember that + * unsigned values are always zero-filled, so we must + * take the current shift into account. + */ + s.block_off = (uint32_t)(base_offset + shift); + s.size_idx = (uint32_t)(RUN_BITS_PER_VALUE - shift); + + ret = cb(&s, arg); + if (ret != 0) + return ret; + + break; + } else if (shifted == UINT64_MAX) { + break; + } + + /* + * Offset and size of the next free block, either of these + * can be zero depending on where the free block is located + * in the value. 
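+ * For example (tracing the loop below), if shifted ends in + * ...11100011, with bit 0 being the first block, the first pass + * yields off = 2 used blocks and size = 0 so only the shift is + * advanced, and the next pass starts at the free run and yields + * off = 0 and size = 3, reporting a free block of 3 units at + * base_offset + 2.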
+ */ + unsigned off = (unsigned)util_lssb_index64(~shifted); + unsigned size = (unsigned)util_lssb_index64(shifted); + + shift += off + size; + + if (size != 0) { /* zero size means skip to the next value */ + s.block_off = (uint32_t)(base_offset + (shift - size)); + s.size_idx = (uint32_t)(size); + + memblock_rebuild_state(m->heap, &s); + ret = cb(&s, arg); + if (ret != 0) + return ret; + } + } while (shift != RUN_BITS_PER_VALUE); + + return 0; +} + +/* + * run_iterate_free -- iterates over free blocks in a run + */ +static int +run_iterate_free(const struct memory_block *m, object_callback cb, void *arg) +{ + int ret = 0; + uint32_t block_off = 0; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + struct memory_block nm = *m; + + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t v = b.values[i]; + + ASSERT((uint64_t)RUN_BITS_PER_VALUE * (uint64_t)i + <= UINT32_MAX); + block_off = RUN_BITS_PER_VALUE * i; + ret = run_process_bitmap_value(&nm, v, block_off, cb, arg); + if (ret != 0) + return ret; + } + + return 0; +} + +/* + * run_iterate_used -- iterates over used blocks in a run + */ +static int +run_iterate_used(const struct memory_block *m, object_callback cb, void *arg) +{ + uint32_t i = m->block_off / RUN_BITS_PER_VALUE; + uint32_t block_start = m->block_off % RUN_BITS_PER_VALUE; + uint32_t block_off; + + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + struct memory_block iter = *m; + struct run_bitmap b; + + run_get_bitmap(m, &b); + + for (; i < b.nvalues; ++i) { + uint64_t v = b.values[i]; + + block_off = (uint32_t)(RUN_BITS_PER_VALUE * i); + + for (uint32_t j = block_start; j < RUN_BITS_PER_VALUE; ) { + if (block_off + j >= (uint32_t)b.nbits) + break; + + if (!BIT_IS_CLR(v, j)) { + iter.block_off = (uint32_t)(block_off + j); + + /* + * The size index of this memory block cannot be + * retrieved at this time because the header + * might not be initialized in valgrind yet. + */ + iter.size_idx = 0; + + if (cb(&iter, arg) != 0) + return 1; + + iter.size_idx = CALC_SIZE_IDX( + run->hdr.block_size, + iter.m_ops->get_real_size(&iter)); + j = (uint32_t)(j + iter.size_idx); + } else { + ++j; + } + } + block_start = 0; + } + + return 0; +} + +/* + * huge_iterate_free -- calls cb on memory block if it's free + */ +static int +huge_iterate_free(const struct memory_block *m, object_callback cb, void *arg) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + return hdr->type == CHUNK_TYPE_FREE ? cb(m, arg) : 0; +} + +/* + * huge_iterate_used -- calls cb on memory block if it's used + */ +static int +huge_iterate_used(const struct memory_block *m, object_callback cb, void *arg) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + return hdr->type == CHUNK_TYPE_USED ? cb(m, arg) : 0; +} + +/* + * huge_vg_init -- initializes chunk metadata in memcheck state + */ +static void +huge_vg_init(const struct memory_block *m, int objects, + object_callback cb, void *arg) +{ + struct zone *z = ZID_TO_ZONE(m->heap->layout, m->zone_id); + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk *chunk = heap_get_chunk(m->heap, m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* + * Mark unused chunk headers as not accessible. 
+ */ + VALGRIND_DO_MAKE_MEM_NOACCESS( + &z->chunk_headers[m->chunk_id + 1], + (m->size_idx - 1) * + sizeof(struct chunk_header)); + + size_t size = block_get_real_size(m); + + VALGRIND_DO_MAKE_MEM_NOACCESS(chunk, size); + + if (objects && huge_get_state(m) == MEMBLOCK_ALLOCATED) { + if (cb(m, arg) != 0) + FATAL("failed to initialize valgrind state"); + } +} + +/* + * run_vg_init -- initializes run metadata in memcheck state + */ +static void +run_vg_init(const struct memory_block *m, int objects, + object_callback cb, void *arg) +{ + struct zone *z = ZID_TO_ZONE(m->heap->layout, m->zone_id); + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + struct chunk_run *run = heap_get_chunk_run(m->heap, m); + + VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr)); + + /* set the run metadata as defined */ + VALGRIND_DO_MAKE_MEM_DEFINED(run, RUN_BASE_METADATA_SIZE); + + struct run_bitmap b; + + run_get_bitmap(m, &b); + + /* + * Mark run data headers as defined. + */ + for (unsigned j = 1; j < m->size_idx; ++j) { + struct chunk_header *data_hdr = + &z->chunk_headers[m->chunk_id + j]; + VALGRIND_DO_MAKE_MEM_DEFINED(data_hdr, + sizeof(struct chunk_header)); + ASSERTeq(data_hdr->type, CHUNK_TYPE_RUN_DATA); + } + + VALGRIND_DO_MAKE_MEM_NOACCESS(run, SIZEOF_RUN(run, m->size_idx)); + + /* set the run bitmap as defined */ + VALGRIND_DO_MAKE_MEM_DEFINED(run, b.size + RUN_BASE_METADATA_SIZE); + + if (objects) { + if (run_iterate_used(m, cb, arg) != 0) + FATAL("failed to initialize valgrind state"); + } +} + +/* + * run_reinit_chunk -- run reinitialization on first zone traversal + */ +static void +run_reinit_chunk(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + /* noop */ +} + +/* + * huge_write_footer -- (internal) writes a chunk footer + */ +static void +huge_write_footer(struct chunk_header *hdr, uint32_t size_idx) +{ + if (size_idx == 1) /* that would overwrite the header */ + return; + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr + size_idx - 1, sizeof(*hdr)); + + struct chunk_header f = *hdr; + + f.type = CHUNK_TYPE_FOOTER; + f.size_idx = size_idx; + *(hdr + size_idx - 1) = f; + /* no need to persist, footers are recreated in heap_populate_buckets */ + VALGRIND_SET_CLEAN(hdr + size_idx - 1, sizeof(f)); +} + +/* + * huge_reinit_chunk -- chunk reinitialization on first zone traversal + */ +static void +huge_reinit_chunk(const struct memory_block *m) +{ + struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m); + + if (hdr->type == CHUNK_TYPE_USED) + huge_write_footer(hdr, hdr->size_idx); +} + +/* + * run_calc_free -- calculates the number of free units in a run + */ +static void +run_calc_free(const struct memory_block *m, + uint32_t *free_space, uint32_t *max_free_block) +{ + struct run_bitmap b; + + run_get_bitmap(m, &b); + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t value = ~b.values[i]; + + if (value == 0) + continue; + + uint32_t free_in_value = util_popcount64(value); + + *free_space = *free_space + free_in_value; + + /* + * If this value has less free blocks than already found max, + * there's no point in calculating. + */ + if (free_in_value < *max_free_block) + continue; + + /* if the entire value is empty, no point in calculating */ + if (free_in_value == RUN_BITS_PER_VALUE) { + *max_free_block = RUN_BITS_PER_VALUE; + continue; + } + + /* if already at max, no point in calculating */ + if (*max_free_block == RUN_BITS_PER_VALUE) + continue; + + /* + * Calculate the biggest free block in the bitmap. 
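+ * Each pass of the loop below ANDs the value with itself shifted + * left by one, trimming one bit from every run of set (free) bits, + * so the loop ends after exactly as many passes as the length of the + * longest run; e.g. for value == 0b0111011 it ends after 3 passes + * and n == 3.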
+ * This algorithm is not the most clever imaginable, but it's + * easy to implement and fast enough. + */ + uint16_t n = 0; + + while (value != 0) { + value &= (value << 1ULL); + n++; + } + + if (n > *max_free_block) + *max_free_block = n; + } +} + +/* + * huge_fill_pct -- huge blocks by definition use the entirety of a chunk + */ +static unsigned +huge_fill_pct(const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(m); + + return 100; +} + +/* + * run_fill_pct -- calculates the percentage of allocated units inside of a run + */ +static unsigned +run_fill_pct(const struct memory_block *m) +{ + struct run_bitmap b; + unsigned clearbits = 0; + + run_get_bitmap(m, &b); + for (unsigned i = 0; i < b.nvalues; ++i) { + uint64_t value = ~b.values[i]; + + if (value == 0) + continue; + + clearbits += util_popcount64(value); + } + ASSERT(b.nbits >= clearbits); + unsigned setbits = b.nbits - clearbits; + + return (100 * setbits) / b.nbits; +} + +static const struct memory_block_ops mb_ops[MAX_MEMORY_BLOCK] = { + [MEMORY_BLOCK_HUGE] = { + .block_size = huge_block_size, + .prep_hdr = huge_prep_operation_hdr, + .get_lock = huge_get_lock, + .get_state = huge_get_state, + .get_user_data = block_get_user_data, + .get_real_data = huge_get_real_data, + .get_user_size = block_get_user_size, + .get_real_size = block_get_real_size, + .write_header = block_write_header, + .invalidate = block_invalidate, + .ensure_header_type = huge_ensure_header_type, + .reinit_header = block_reinit_header, + .vg_init = huge_vg_init, + .get_extra = block_get_extra, + .get_flags = block_get_flags, + .iterate_free = huge_iterate_free, + .iterate_used = huge_iterate_used, + .reinit_chunk = huge_reinit_chunk, + .calc_free = NULL, + .get_bitmap = NULL, + .fill_pct = huge_fill_pct, + }, + [MEMORY_BLOCK_RUN] = { + .block_size = run_block_size, + .prep_hdr = run_prep_operation_hdr, + .get_lock = run_get_lock, + .get_state = run_get_state, + .get_user_data = block_get_user_data, + .get_real_data = run_get_real_data, + .get_user_size = block_get_user_size, + .get_real_size = block_get_real_size, + .write_header = block_write_header, + .invalidate = block_invalidate, + .ensure_header_type = run_ensure_header_type, + .reinit_header = block_reinit_header, + .vg_init = run_vg_init, + .get_extra = block_get_extra, + .get_flags = block_get_flags, + .iterate_free = run_iterate_free, + .iterate_used = run_iterate_used, + .reinit_chunk = run_reinit_chunk, + .calc_free = run_calc_free, + .get_bitmap = run_get_bitmap, + .fill_pct = run_fill_pct, + } +}; + +/* + * memblock_huge_init -- initializes a new huge memory block + */ +struct memory_block +memblock_huge_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx) +{ + struct memory_block m = MEMORY_BLOCK_NONE; + + m.chunk_id = chunk_id; + m.zone_id = zone_id; + m.size_idx = size_idx; + m.heap = heap; + + struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr, sizeof(*hdr)); + VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr)); + + uint64_t nhdr = chunk_get_chunk_hdr_value(CHUNK_TYPE_FREE, + 0, size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + nhdr, memory_order_relaxed); + + mo_wal_persist(&heap->p_ops, hdr, sizeof(*hdr)); + + huge_write_footer(hdr, size_idx); + + memblock_rebuild_state(heap, &m); + + return m; +} + +/* + * memblock_run_init -- initializes a new run memory block + */ +struct memory_block +memblock_run_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t 
zone_id, struct run_descriptor *rdsc) +{ + uint32_t size_idx = rdsc->size_idx; + + ASSERTne(size_idx, 0); + + struct memory_block m = MEMORY_BLOCK_NONE; + + m.chunk_id = chunk_id; + m.zone_id = zone_id; + m.size_idx = size_idx; + m.heap = heap; + + struct zone *z = ZID_TO_ZONE(heap->layout, zone_id); + struct chunk_run *run = heap_get_chunk_run(heap, &m); + size_t runsize = SIZEOF_RUN(run, size_idx); + + VALGRIND_DO_MAKE_MEM_UNDEFINED(run, runsize); + + /* add/remove chunk_run and chunk_header to valgrind transaction */ + VALGRIND_ADD_TO_TX(run, runsize); + run->hdr.block_size = rdsc->unit_size; + run->hdr.alignment = rdsc->alignment; + + struct run_bitmap b = rdsc->bitmap; + + b.values = (uint64_t *)run->content; + + size_t bitmap_size = b.size; + + /* set all the bits */ + memset(b.values, 0xFF, bitmap_size); + + /* clear only the bits available for allocations from this bucket */ + memset(b.values, 0, sizeof(*b.values) * (b.nvalues - 1)); + + unsigned trailing_bits = b.nbits % RUN_BITS_PER_VALUE; + uint64_t last_value = UINT64_MAX << trailing_bits; + + b.values[b.nvalues - 1] = last_value; + + VALGRIND_REMOVE_FROM_TX(run, runsize); + + mo_wal_flush(&heap->p_ops, run, + sizeof(struct chunk_run_header) + + bitmap_size, 0); + + struct chunk_header run_data_hdr; + + run_data_hdr.type = CHUNK_TYPE_RUN_DATA; + run_data_hdr.flags = 0; + + VALGRIND_ADD_TO_TX(&z->chunk_headers[chunk_id], + sizeof(struct chunk_header) * size_idx); + + struct chunk_header *data_hdr; + + for (unsigned i = 1; i < size_idx; ++i) { + data_hdr = &z->chunk_headers[chunk_id + i]; + VALGRIND_DO_MAKE_MEM_UNDEFINED(data_hdr, sizeof(*data_hdr)); + VALGRIND_ANNOTATE_NEW_MEMORY(data_hdr, sizeof(*data_hdr)); + run_data_hdr.size_idx = i; + *data_hdr = run_data_hdr; + } + mo_wal_persist(&heap->p_ops, + &z->chunk_headers[chunk_id + 1], + sizeof(struct chunk_header) * (size_idx - 1)); + + struct chunk_header *hdr = &z->chunk_headers[chunk_id]; + + ASSERT(hdr->type == CHUNK_TYPE_FREE); + + VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr)); + + uint64_t run_hdr = chunk_get_chunk_hdr_value(CHUNK_TYPE_RUN, + rdsc->flags, hdr->size_idx); + util_atomic_store_explicit64((uint64_t *)hdr, + run_hdr, memory_order_relaxed); + mo_wal_persist(&heap->p_ops, hdr, sizeof(*hdr)); + + VALGRIND_REMOVE_FROM_TX(&z->chunk_headers[chunk_id], + sizeof(struct chunk_header) * size_idx); + + memblock_rebuild_state(heap, &m); + m.cached_bitmap = &rdsc->bitmap; + + return m; +} + +/* + * memblock_detect_type -- looks for the corresponding chunk header and + * depending on the chunks type returns the right memory block type + */ +static enum memory_block_type +memblock_detect_type(struct palloc_heap *heap, const struct memory_block *m) +{ + enum memory_block_type ret = MEMORY_BLOCK_HUGE; + + switch (heap_get_chunk_hdr(heap, m)->type) { + case CHUNK_TYPE_RUN: + case CHUNK_TYPE_RUN_DATA: + ret = MEMORY_BLOCK_RUN; + break; + case CHUNK_TYPE_FREE: + case CHUNK_TYPE_USED: + case CHUNK_TYPE_FOOTER: + ret = MEMORY_BLOCK_HUGE; + break; + default: + /* unreachable */ + FATAL("possible zone chunks metadata corruption"); + } + return ret; +} + +/* + * memblock_from_offset -- resolves a memory block data from an offset that + * originates from the heap + */ +struct memory_block +memblock_from_offset_opt(struct palloc_heap *heap, uint64_t off, int size) +{ + struct memory_block m = MEMORY_BLOCK_NONE; + + m.heap = heap; + + off -= HEAP_PTR_TO_OFF(heap, &heap->layout->zone0); + m.zone_id = (uint32_t)(off / ZONE_MAX_SIZE); + + off -= (ZONE_MAX_SIZE * m.zone_id) + sizeof(struct 
zone); + m.chunk_id = (uint32_t)(off / CHUNKSIZE); + + struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m); + + if (hdr->type == CHUNK_TYPE_RUN_DATA) + m.chunk_id -= hdr->size_idx; + + off -= CHUNKSIZE * m.chunk_id; + + m.header_type = memblock_header_type(&m); + + off -= header_type_to_size[m.header_type]; + + m.type = off != 0 ? MEMORY_BLOCK_RUN : MEMORY_BLOCK_HUGE; + ASSERTeq(memblock_detect_type(heap, &m), m.type); + + m.m_ops = &mb_ops[m.type]; + + uint64_t unit_size = m.m_ops->block_size(&m); + + if (off != 0) { /* run */ + off -= run_get_data_offset(&m); + off -= RUN_BASE_METADATA_SIZE; + m.block_off = (uint16_t)(off / unit_size); + off -= m.block_off * unit_size; + } + + struct alloc_class_collection *acc = heap_alloc_classes(heap); + + if (acc != NULL) { + struct alloc_class *ac = alloc_class_by_run(acc, + unit_size, hdr->flags, hdr->size_idx); + if (ac != NULL) + m.cached_bitmap = &ac->rdsc.bitmap; + } + + m.size_idx = !size ? 0 : CALC_SIZE_IDX(unit_size, + memblock_header_ops[m.header_type].get_size(&m)); + + ASSERTeq(off, 0); + + return m; +} + +/* + * memblock_from_offset -- returns memory block with size + */ +struct memory_block +memblock_from_offset(struct palloc_heap *heap, uint64_t off) +{ + return memblock_from_offset_opt(heap, off, 1); +} + +/* + * memblock_rebuild_state -- fills in the runtime-state related fields of a + * memory block structure + * + * This function must be called on all memory blocks that were created by hand + * (as opposed to retrieved from memblock_from_offset function). + */ +void +memblock_rebuild_state(struct palloc_heap *heap, struct memory_block *m) +{ + m->heap = heap; + m->header_type = memblock_header_type(m); + m->type = memblock_detect_type(heap, m); + m->m_ops = &mb_ops[m->type]; + m->cached_bitmap = NULL; +} diff --git a/src/common/dav_v2/memblock.h b/src/common/dav_v2/memblock.h new file mode 100644 index 00000000000..f2fe3ee91be --- /dev/null +++ b/src/common/dav_v2/memblock.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2021, Intel Corporation */ + +/* + * memblock.h -- internal definitions for memory block + */ + +#ifndef __DAOS_COMMON_MEMBLOCK_H +#define __DAOS_COMMON_MEMBLOCK_H 1 + +#include <stddef.h> +#include <stdint.h> + +#include "heap_layout.h" +#include "memops.h" +#include "palloc.h" + +#define MEMORY_BLOCK_NONE \ +(struct memory_block)\ +{0, 0, 0, 0, NULL, NULL, MAX_HEADER_TYPES, MAX_MEMORY_BLOCK, NULL} + +#define MEMORY_BLOCK_IS_NONE(_m)\ +((_m).heap == NULL) + +#define MEMORY_BLOCK_EQUALS(lhs, rhs)\ +((lhs).zone_id == (rhs).zone_id && (lhs).chunk_id == (rhs).chunk_id &&\ +(lhs).block_off == (rhs).block_off && (lhs).heap == (rhs).heap) + +enum memory_block_type { + /* + * Huge memory blocks are directly backed by memory chunks. A single + * huge block can consist of several chunks. + * The persistent representation of huge memory blocks can be thought + * of as a doubly linked list with variable length elements. + * That list is stored in the chunk headers array where one element + * directly corresponds to one chunk. + * + * U - used, F - free, R - footer, . - empty + * |U| represents a used chunk with a size index of 1, with type + * information (CHUNK_TYPE_USED) stored in the corresponding header + * array element - chunk_headers[chunk_id]. + * + * |F...R| represents a free chunk with size index of 5. The empty + * chunk headers have undefined values and shouldn't be used. 
All + * chunks with size larger than 1 must have a footer in the last + * corresponding header array element - chunk_headers[chunk_id + size_idx - 1]. + * + * The above representation of chunks will be used to describe the + * way fail-safety is achieved during heap operations. + * + * Allocation of a huge memory block with size index 5: + * Initial heap state: |U| <> |F..R| <> |U| <> |F......R| + * + * The only block that matches that size is at the very end of the chunks + * list: |F......R| + * + * As the request was for a memory block of size 5, and this one's size is + * 7, there's a need to first split the chunk in two. + * 1) The last chunk header of the new allocation is marked as footer + * and the block after that one is marked as free: |F...RF.R| + * This is allowed and has no impact on the heap because this + * modification is to a chunk header that is otherwise unused; in + * other words, the linked list didn't change. + * + * 2) The size index of the first header is changed from the previous value + * of 7 to 5: |F...R||F.R| + * This is a single fail-safe atomic operation and is the + * first change that is noticeable by the heap operations. + * A single linked list element is split into two new ones. + * + * 3) The allocation process either uses the redo log or changes directly + * the chunk header type from free to used: |U...R| <> |F.R| + * + * In a similar fashion the reverse operation, free, is performed: + * Initial heap state: |U| <> |F..R| <> |F| <> |U...R| <> |F.R| + * + * This is the heap after the previous example with the single chunk + * in between changed from used to free. + * + * 1) Determine the neighbors of the memory block which is being + * freed. + * + * 2) Update the footer (if needed) information of the last chunk which + * is the memory block being freed or its neighbor to the right. + * |F| <> |U...R| <> |F.R << this one| + * + * 3) Update the size index and type of the left-most chunk header. + * And so this: |F << this one| <> |U...R| <> |F.R| + * becomes this: |F.......R| + * The entire chunk header can be updated in a single fail-safe + * atomic operation because its size is only 8 bytes. + */ + MEMORY_BLOCK_HUGE, + /* + * Run memory blocks are chunks with CHUNK_TYPE_RUN and size index of 1. + * The entire chunk is subdivided into smaller blocks and has an + * additional metadata attached in the form of a bitmap - each bit + * corresponds to a single block. + * In this case there's no need to perform any coalescing or splitting + * on the persistent metadata. + * The bitmap is stored in a variable number of 64 bit values and + * because of the requirement of allocation fail-safe atomicity the + * maximum size index of a memory block from a run is 64 - since that's + * the limit of the atomic write guarantee. + * + * The allocation/deallocation process is a single 8 byte write that + * sets/clears the corresponding bits. Depending on the user choice + * it can either be made atomically or using the redo log when grouped with + * other operations. + * It's also important to note that in the case of realloc it might so + * happen that a single 8 byte bitmap value has its bits both set and + * cleared - that's why the run memory block metadata changes operate + * on AND'ing or OR'ing a bitmask instead of directly setting the value. 
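+ * For example, a block with size_idx 3 at block offset 5 within a + * bitmap value corresponds to the mask ((1 << 3) - 1) << 5, i.e. + * 0b11100000; an allocation OR's the value with that mask and a free + * AND's it with the negated mask (or uses the equivalent + * SET_BITS/CLR_BITS log operations when plain AND/OR are not + * available).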
+ */ + MEMORY_BLOCK_RUN, + + MAX_MEMORY_BLOCK +}; + +enum memblock_state { + MEMBLOCK_STATE_UNKNOWN, + MEMBLOCK_ALLOCATED, + MEMBLOCK_FREE, + + MAX_MEMBLOCK_STATE, +}; + +/* runtime bitmap information for a run */ +struct run_bitmap { + unsigned nvalues; /* number of 8 byte values - size of values array */ + unsigned nbits; /* number of valid bits */ + + size_t size; /* total size of the bitmap in bytes */ + + uint64_t *values; /* pointer to the bitmap's values array */ +}; + +/* runtime information necessary to create a run */ +struct run_descriptor { + uint16_t flags; /* chunk flags for the run */ + size_t unit_size; /* the size of a single unit in a run */ + uint32_t size_idx; /* size index of a single run instance */ + size_t alignment; /* required alignment of objects */ + unsigned nallocs; /* number of allocs per run */ + struct run_bitmap bitmap; +}; + +struct memory_block_ops { + /* returns memory block size */ + size_t (*block_size)(const struct memory_block *m); + + /* prepares header modification operation */ + void (*prep_hdr)(const struct memory_block *m, + enum memblock_state dest_state, struct operation_context *ctx); + + /* returns lock associated with memory block */ + pthread_mutex_t *(*get_lock)(const struct memory_block *m); + + /* returns whether a block is allocated or not */ + enum memblock_state (*get_state)(const struct memory_block *m); + + /* returns pointer to the data of a block */ + void *(*get_user_data)(const struct memory_block *m); + + /* + * Returns the size of a memory block without overhead. + * This is the size of a data block that can be used. + */ + size_t (*get_user_size)(const struct memory_block *m); + + /* returns pointer to the beginning of data of a run block */ + void *(*get_real_data)(const struct memory_block *m); + + /* returns the size of a memory block, including headers */ + size_t (*get_real_size)(const struct memory_block *m); + + /* writes a header of an allocation */ + void (*write_header)(const struct memory_block *m, + uint64_t extra_field, uint16_t flags); + void (*invalidate)(const struct memory_block *m); + + /* + * Checks the header type of a chunk matches the expected type and + * modifies it if necessary. This is fail-safe atomic. + */ + void (*ensure_header_type)(const struct memory_block *m, + enum header_type t); + + /* + * Reinitializes a block after a heap restart. + * This is called for EVERY allocation, but *only* under Valgrind. + */ + void (*reinit_header)(const struct memory_block *m); + + /* returns the extra field of an allocation */ + uint64_t (*get_extra)(const struct memory_block *m); + + /* returns the flags of an allocation */ + uint16_t (*get_flags)(const struct memory_block *m); + + /* initializes memblock in valgrind */ + void (*vg_init)(const struct memory_block *m, int objects, + object_callback cb, void *arg); + + /* iterates over every free block */ + int (*iterate_free)(const struct memory_block *m, + object_callback cb, void *arg); + + /* iterates over every used block */ + int (*iterate_used)(const struct memory_block *m, + object_callback cb, void *arg); + + /* calculates number of free units, valid only for runs */ + void (*calc_free)(const struct memory_block *m, + uint32_t *free_space, uint32_t *max_free_block); + + /* this is called exactly once for every existing chunk */ + void (*reinit_chunk)(const struct memory_block *m); + + /* + * Initializes bitmap data for a run. 
+ * Do *not* use this function unless absolutely necessary, it breaks + * the abstraction layer by exposing implementation details. + */ + void (*get_bitmap)(const struct memory_block *m, struct run_bitmap *b); + + /* calculates the ratio between occupied and unoccupied space */ + unsigned (*fill_pct)(const struct memory_block *m); +}; + +struct memory_block { + uint32_t chunk_id; /* index of the memory block in its zone */ + uint32_t zone_id; /* index of this block zone in the heap */ + + /* + * Size index of the memory block represented in either multiple of + * CHUNKSIZE in the case of a huge chunk or in multiple of a run + * block size. + */ + uint32_t size_idx; + + /* + * Used only for run chunks, must be zeroed for huge. + * Number of preceding blocks in the chunk. In other words, the + * position of this memory block in run bitmap. + */ + uint32_t block_off; + + /* + * The variables below are associated with the memory block and are + * stored here for convenience. Those fields are filled by either the + * memblock_from_offset or memblock_rebuild_state, and they should not + * be modified manually. + */ + const struct memory_block_ops *m_ops; + struct palloc_heap *heap; + enum header_type header_type; + enum memory_block_type type; + struct run_bitmap *cached_bitmap; +}; + +/* + * This is a representation of a run memory block that is active in a bucket or + * is on a pending list in the recycler. + * This structure should never be passed around by value because the address of + * the nresv variable can be in reservations made through palloc_reserve(). Only + * if the number of reservations equals 0 the structure can be moved/freed. + */ +struct memory_block_reserved { + struct memory_block m; + + struct bucket_locked *bucket; + /* + * Number of reservations made from this run, the pointer to this value + * is stored in a user facing pobj_action structure. Decremented once + * the reservation is published or canceled. + */ + int nresv; +}; + +struct memory_block memblock_from_offset(struct palloc_heap *heap, + uint64_t off); +struct memory_block memblock_from_offset_opt(struct palloc_heap *heap, + uint64_t off, int size); +void memblock_rebuild_state(struct palloc_heap *heap, struct memory_block *m); + +struct memory_block memblock_huge_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx); + +struct memory_block memblock_run_init(struct palloc_heap *heap, + uint32_t chunk_id, uint32_t zone_id, struct run_descriptor *rdsc); + +void memblock_run_bitmap(uint32_t *size_idx, uint16_t flags, + uint64_t unit_size, uint64_t alignment, void *content, + struct run_bitmap *b); + +#endif /* __DAOS_COMMON_MEMBLOCK_H */ diff --git a/src/common/dav_v2/memops.c b/src/common/dav_v2/memops.c new file mode 100644 index 00000000000..a137ac28836 --- /dev/null +++ b/src/common/dav_v2/memops.c @@ -0,0 +1,677 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2022, Intel Corporation */ + +/* + * memops.c -- aggregated memory operations helper implementation + * + * The operation collects all of the required memory modifications that + * need to happen in an atomic way (all of them or none), and abstracts + * away the storage type (transient/persistent) and the underlying + * implementation of how it's actually performed - in some cases using + * the redo log is unnecessary and the allocation process can be sped up + * a bit by completely omitting that whole machinery. + * + * The modifications are not visible until the context is processed. 
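+ * + * A typical call sequence, in sketch form: operation_new() builds the + * context once, operation_start() begins an operation, + * operation_add_entry()/operation_add_buffer() queue the individual + * modifications, operation_process() applies them, and + * operation_finish() (or operation_cancel()) returns the context to + * the idle state.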
+ */ + +#include "memops.h" +#include "obj.h" +#include "out.h" +#include "ravl.h" +#include "valgrind_internal.h" +#include "vecq.h" +#include "sys_util.h" +#include "dav_internal.h" +#include "tx.h" + +static inline int +OBJ_OFF_IS_VALID_FROM_CTX(void *ctx, uint64_t offset) +{ + dav_obj_t *dav_hdl = (dav_obj_t *)ctx; + + return OBJ_OFF_IS_VALID(dav_hdl, offset); +} + +#define ULOG_BASE_SIZE 1024 +#define OP_MERGE_SEARCH 64 + +enum operation_state { + OPERATION_IDLE, + OPERATION_IN_PROGRESS, + OPERATION_CLEANUP, +}; + +struct operation_log { + size_t capacity; /* capacity of the ulog log */ + size_t offset; /* data offset inside of the log */ + struct ulog *ulog; /* DRAM allocated log of modifications */ +}; + +/* + * operation_context -- context of an ongoing palloc operation + */ +struct operation_context { + enum log_type type; + + ulog_extend_fn extend; /* function to allocate next ulog */ + ulog_free_fn ulog_free; /* function to free next ulogs */ + + const struct mo_ops *p_ops; + struct mo_ops t_ops; /* used for transient data processing */ + struct mo_ops s_ops; /* used for shadow copy data processing */ + + size_t ulog_curr_offset; /* offset in the log for buffer stores */ + size_t ulog_curr_capacity; /* capacity of the current log */ + size_t ulog_curr_gen_num; /* transaction counter in the current log */ + struct ulog *ulog_curr; /* current persistent log */ + size_t total_logged; /* total amount of buffer stores in the logs */ + + struct ulog *ulog; /* pointer to the ulog used by context for undo ops */ + size_t ulog_base_nbytes; /* available bytes in initial ulog log */ + size_t ulog_capacity; /* sum of capacity, incl all next ulog logs */ + int ulog_auto_reserve; /* allow or do not to auto ulog reservation */ + + struct ulog_next next; /* vector of 'next' fields of persistent ulog */ + + enum operation_state state; /* operation sanity check */ + + struct operation_log pshadow_ops; /* used by context for redo ops */ + struct operation_log transient_ops; /* log of transient changes */ + + /* collection used to look for potential merge candidates */ + VECQ(, struct ulog_entry_val *) merge_entries; +}; + +/* + * operation_log_transient_init -- (internal) initialize operation log + * containing transient memory resident changes + */ +static int +operation_log_transient_init(struct operation_log *log) +{ + struct ulog *src; + + log->capacity = ULOG_BASE_SIZE; + log->offset = 0; + + D_ALLOC(src, (sizeof(struct ulog) + ULOG_BASE_SIZE)); + if (src == NULL) { + D_CRIT("Zalloc!\n"); + return -1; + } + + /* initialize underlying redo log structure */ + src->capacity = ULOG_BASE_SIZE; + + log->ulog = src; + + return 0; +} + +/* + * operation_log_persistent_init -- (internal) initialize operation log + * containing persistent memory resident changes + */ +static int +operation_log_persistent_init(struct operation_log *log, + size_t ulog_base_nbytes) +{ + struct ulog *src; + + log->capacity = ULOG_BASE_SIZE; + log->offset = 0; + + D_ALLOC(src, (sizeof(struct ulog) + ULOG_BASE_SIZE)); + if (src == NULL) { + D_CRIT("Zalloc!\n"); + return -1; + } + + /* initialize underlying redo log structure */ + src->capacity = ULOG_BASE_SIZE; + memset(src->unused, 0, sizeof(src->unused)); + + log->ulog = src; + + return 0; +} + +/* + * operation_transient_clean -- cleans pmemcheck address state + */ +static int +operation_transient_clean(void *base, const void *addr, size_t len, + unsigned flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base, flags); + + VALGRIND_SET_CLEAN(addr, len); 
+ + return 0; +} + +/* + * operation_transient_drain -- noop + */ +static void +operation_transient_drain(void *base) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base); +} + +/* + * operation_transient_memcpy -- transient memcpy wrapper + */ +static void * +operation_transient_memcpy(void *base, void *dest, const void *src, size_t len, + unsigned flags) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(base, flags); + + return memcpy(dest, src, len); +} + +/* + * operation_new -- creates new operation context + */ +struct operation_context * +operation_new(struct ulog *ulog, size_t ulog_base_nbytes, + ulog_extend_fn extend, ulog_free_fn ulog_free, + const struct mo_ops *p_ops, enum log_type type) +{ + + SUPPRESS_UNUSED(p_ops); + + struct operation_context *ctx; + + D_ALLOC_PTR(ctx); + if (ctx == NULL) { + D_CRIT("Zalloc!\n"); + goto error_ctx_alloc; + } + + ctx->ulog = ulog; + ctx->ulog_base_nbytes = ulog_base_nbytes; + ctx->ulog_capacity = ulog_capacity(ulog, + ulog_base_nbytes); + ctx->extend = extend; + ctx->ulog_free = ulog_free; + ctx->state = OPERATION_IDLE; + VEC_INIT(&ctx->next); + ulog_rebuild_next_vec(ulog, &ctx->next); + ctx->p_ops = p_ops; + ctx->type = type; + + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = 0; + ctx->ulog_curr = NULL; + + ctx->t_ops.base = NULL; + ctx->t_ops.flush = operation_transient_clean; + ctx->t_ops.memcpy = operation_transient_memcpy; + ctx->t_ops.drain = operation_transient_drain; + + ctx->s_ops.base = p_ops->base; + ctx->s_ops.flush = operation_transient_clean; + ctx->s_ops.memcpy = operation_transient_memcpy; + ctx->s_ops.drain = operation_transient_drain; + + VECQ_INIT(&ctx->merge_entries); + + if (operation_log_transient_init(&ctx->transient_ops) != 0) + goto error_ulog_alloc; + + if (operation_log_persistent_init(&ctx->pshadow_ops, + ulog_base_nbytes) != 0) + goto error_ulog_alloc; + + return ctx; + +error_ulog_alloc: + operation_delete(ctx); +error_ctx_alloc: + return NULL; +} + +/* + * operation_delete -- deletes operation context + */ +void +operation_delete(struct operation_context *ctx) +{ + VECQ_DELETE(&ctx->merge_entries); + VEC_DELETE(&ctx->next); + D_FREE(ctx->pshadow_ops.ulog); + D_FREE(ctx->transient_ops.ulog); + D_FREE(ctx); +} + +/* + * operation_free_logs -- free all logs except first + */ +void +operation_free_logs(struct operation_context *ctx) +{ + int freed = ulog_free_next(ctx->ulog, ctx->ulog_free); + + if (freed) { + ctx->ulog_capacity = ulog_capacity(ctx->ulog, + ctx->ulog_base_nbytes); + VEC_CLEAR(&ctx->next); + ulog_rebuild_next_vec(ctx->ulog, &ctx->next); + } + + ASSERTeq(VEC_SIZE(&ctx->next), 0); +} + +/* + * operation_merge -- (internal) performs operation on a field + */ +static inline int +operation_merge(struct ulog_entry_base *entry, uint64_t value, + ulog_operation_type type) +{ + struct ulog_entry_val *e = (struct ulog_entry_val *)entry; + uint16_t num, num1, num2; + uint32_t pos, pos1, pos2; + + switch (type) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + e->value &= value; + break; + case ULOG_OPERATION_OR: + e->value |= value; + break; +#else + case ULOG_OPERATION_SET_BITS: + case ULOG_OPERATION_CLR_BITS: + num1 = ULOG_ENTRY_VAL_TO_BITS(e->value); + pos1 = ULOG_ENTRY_VAL_TO_POS(e->value); + num2 = ULOG_ENTRY_VAL_TO_BITS(value); + pos2 = ULOG_ENTRY_VAL_TO_POS(value); + + if ((pos2 > pos1 + num1) || (pos1 > pos2 + num2)) + return 0; /* there is a gap, no merge */ + + pos = MIN(pos1, pos2); + num = MAX(pos1 + num1, pos2 + num2) - pos; + + e->value = 
ULOG_ENTRY_TO_VAL(pos, num); + break; +#endif + case ULOG_OPERATION_SET: + e->value = value; + break; + default: + ASSERT(0); /* unreachable */ + } + return 1; +} + +/* + * operation_try_merge_entry -- tries to merge the incoming log entry with + * existing entries + * + * Because this requires a reverse foreach, it cannot be implemented using + * the on-media ulog log structure since there's no way to find what's + * the previous entry in the log. Instead, the last N entries are stored + * in a collection and traversed backwards. + */ +static int +operation_try_merge_entry(struct operation_context *ctx, + void *ptr, uint64_t value, ulog_operation_type type) +{ + int ret = 0; + uint64_t offset = OBJ_PTR_TO_OFF(ctx->p_ops->base, ptr); + + struct ulog_entry_val *e; + + VECQ_FOREACH_REVERSE(e, &ctx->merge_entries) { + if (ulog_entry_offset(&e->base) == offset) { + if (ulog_entry_type(&e->base) == type) { + if (operation_merge(&e->base, value, type)) + return 1; + } + break; + } + } + + return ret; +} + +/* + * operation_merge_entry_add -- adds a new entry to the merge collection, + * keeps capacity at OP_MERGE_SEARCH. Removes old entries in FIFO fashion. + */ +static void +operation_merge_entry_add(struct operation_context *ctx, + struct ulog_entry_val *entry) +{ + if (VECQ_SIZE(&ctx->merge_entries) == OP_MERGE_SEARCH) + (void) VECQ_DEQUEUE(&ctx->merge_entries); + + if (VECQ_ENQUEUE(&ctx->merge_entries, entry) != 0) { + /* this is fine, only runtime perf will get slower */ + D_CRIT("out of memory - unable to track entries\n"); + } +} + +/* + * operation_add_typed_entry -- adds a new entry to the current operation; if the + * same ptr address already exists and the operation type is set, + * the new value is not added and the function has no effect. + */ +int +operation_add_typed_entry(struct operation_context *ctx, + void *ptr, uint64_t value, + ulog_operation_type type, enum operation_log_type log_type) +{ + struct operation_log *oplog = log_type == LOG_PERSISTENT ? + &ctx->pshadow_ops : &ctx->transient_ops; + + /* + * Always make sure to have one extra spare cacheline so that the + * ulog log entry creation has enough room for zeroing. + */ + if (oplog->offset + CACHELINE_SIZE == oplog->capacity) { + size_t ncapacity = oplog->capacity + ULOG_BASE_SIZE; + struct ulog *ulog; + + D_REALLOC_NZ(ulog, oplog->ulog, SIZEOF_ULOG(ncapacity)); + if (ulog == NULL) + return -1; + oplog->capacity += ULOG_BASE_SIZE; + oplog->ulog = ulog; + oplog->ulog->capacity = oplog->capacity; + + /* + * Realloc invalidated the ulog entries that are inside of this + * vector, need to clear it to avoid use after free. + */ + VECQ_CLEAR(&ctx->merge_entries); + } + + if (log_type == LOG_PERSISTENT && + operation_try_merge_entry(ctx, ptr, value, type) != 0) + return 0; + + struct ulog_entry_val *entry = ulog_entry_val_create( + oplog->ulog, oplog->offset, ptr, value, type, + log_type == LOG_TRANSIENT ? &ctx->t_ops : &ctx->s_ops); + + if (log_type == LOG_PERSISTENT) + operation_merge_entry_add(ctx, entry); + + oplog->offset += ulog_entry_size(&entry->base); + + return 0; +} + + +/* + * operation_add_entry -- adds a new entry to the current operation with + * the entry type autodetected based on the memory location + */ +int +operation_add_entry(struct operation_context *ctx, void *ptr, uint64_t value, + ulog_operation_type type) +{ + const struct mo_ops *p_ops = ctx->p_ops; + dav_obj_t *pop = (dav_obj_t *)p_ops->base; + + int from_pool = OBJ_PTR_IS_VALID(pop, ptr); + + return operation_add_typed_entry(ctx, ptr, value, type, + from_pool ? 
LOG_PERSISTENT : LOG_TRANSIENT); +} + +/* + * operation_add_buffer -- adds a buffer operation to the log + */ +int +operation_add_buffer(struct operation_context *ctx, + void *dest, void *src, size_t size, ulog_operation_type type) +{ + size_t real_size = size + sizeof(struct ulog_entry_buf); + + /* if there's no space left in the log, reserve some more */ + if (ctx->ulog_curr_capacity == 0) { + ctx->ulog_curr_gen_num = ctx->ulog->gen_num; + if (operation_reserve(ctx, ctx->total_logged + real_size) != 0) + return -1; + + ctx->ulog_curr = ctx->ulog_curr == NULL ? ctx->ulog : + ulog_next(ctx->ulog_curr); + ASSERTne(ctx->ulog_curr, NULL); + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = ctx->ulog_curr->capacity; + } + + size_t curr_size = MIN(real_size, ctx->ulog_curr_capacity); + size_t data_size = curr_size - sizeof(struct ulog_entry_buf); + size_t entry_size = ALIGN_UP(curr_size, CACHELINE_SIZE); + + /* + * To make sure that the log is consistent and contiguous, we need + * make sure that the header of the entry that would be located + * immediately after this one is zeroed. + */ + struct ulog_entry_base *next_entry = NULL; + + if (entry_size == ctx->ulog_curr_capacity) { + struct ulog *u = ulog_next(ctx->ulog_curr); + + if (u != NULL) + next_entry = (struct ulog_entry_base *)u->data; + } else { + size_t next_entry_offset = ctx->ulog_curr_offset + entry_size; + + next_entry = (struct ulog_entry_base *)(ctx->ulog_curr->data + + next_entry_offset); + } + if (next_entry != NULL) + ulog_clobber_entry(next_entry); + + /* create a persistent log entry */ + struct ulog_entry_buf *e = ulog_entry_buf_create(ctx->ulog_curr, + ctx->ulog_curr_offset, + ctx->ulog_curr_gen_num, + dest, src, data_size, + type, ctx->p_ops); + ASSERT(entry_size == ulog_entry_size(&e->base)); + ASSERT(entry_size <= ctx->ulog_curr_capacity); + + ctx->total_logged += entry_size; + ctx->ulog_curr_offset += entry_size; + ctx->ulog_curr_capacity -= entry_size; + + /* + * Recursively add the data to the log until the entire buffer is + * processed. + */ + return size - data_size == 0 ? 
0 : operation_add_buffer(ctx, + (char *)dest + data_size, + (char *)src + data_size, + size - data_size, type); +} + +/* + * operation_set_auto_reserve -- set auto reserve value for context + */ +void +operation_set_auto_reserve(struct operation_context *ctx, int auto_reserve) +{ + ctx->ulog_auto_reserve = auto_reserve; +} + +/* + * operation_process_persistent_redo -- (internal) process using ulog + */ +static void +operation_process_persistent_redo(struct operation_context *ctx) +{ + ASSERTeq(ctx->pshadow_ops.capacity % CACHELINE_SIZE, 0); + + /* Copy the redo log to wal redo */ + ulog_foreach_entry(ctx->pshadow_ops.ulog, tx_create_wal_entry, + NULL, ctx->p_ops); + + ulog_process(ctx->pshadow_ops.ulog, OBJ_OFF_IS_VALID_FROM_CTX, + ctx->p_ops); + + ulog_clobber(ctx->ulog, &ctx->next); +} + +/* + * operation_reserve -- (internal) reserves new capacity in persistent ulog log + */ +int +operation_reserve(struct operation_context *ctx, size_t new_capacity) +{ + if ((ctx->type == LOG_TYPE_UNDO) && (new_capacity > ctx->ulog_capacity)) { + if (ctx->extend == NULL) { + ERR("no extend function present"); + return -1; + } + + if (ulog_reserve(ctx->ulog, + ctx->ulog_base_nbytes, + ctx->ulog_curr_gen_num, + ctx->ulog_auto_reserve, + &new_capacity, ctx->extend, + &ctx->next) != 0) + return -1; + ctx->ulog_capacity = new_capacity; + } + + return 0; +} + +/* + * operation_init -- initializes runtime state of an operation + */ +void +operation_init(struct operation_context *ctx) +{ + struct operation_log *plog = &ctx->pshadow_ops; + struct operation_log *tlog = &ctx->transient_ops; + + VALGRIND_ANNOTATE_NEW_MEMORY(ctx, sizeof(*ctx)); + VALGRIND_ANNOTATE_NEW_MEMORY(tlog->ulog, sizeof(struct ulog) + + tlog->capacity); + VALGRIND_ANNOTATE_NEW_MEMORY(plog->ulog, sizeof(struct ulog) + + plog->capacity); + tlog->offset = 0; + plog->offset = 0; + VECQ_REINIT(&ctx->merge_entries); + + ctx->ulog_curr_offset = 0; + ctx->ulog_curr_capacity = 0; + ctx->ulog_curr_gen_num = 0; + ctx->ulog_curr = NULL; + ctx->total_logged = 0; + ctx->ulog_auto_reserve = 1; +} + +/* + * operation_start -- initializes and starts a new operation + */ +void +operation_start(struct operation_context *ctx) +{ + operation_init(ctx); + ASSERTeq(ctx->state, OPERATION_IDLE); + ctx->state = OPERATION_IN_PROGRESS; +} + +/* + * operation_cancel -- cancels a running operation + */ +void +operation_cancel(struct operation_context *ctx) +{ + ASSERTeq(ctx->state, OPERATION_IN_PROGRESS); + ctx->state = OPERATION_IDLE; +} + +/* + * operation_process -- processes registered operations + * + * The order of processing is important: persistent, transient. + * This is because the transient entries that reside on persistent memory might + * require write to a location that is currently occupied by a valid persistent + * state but becomes a transient state after operation is processed. + */ +void +operation_process(struct operation_context *ctx) +{ + /* + * If there's exactly one persistent entry there's no need to involve + * the redo log. We can simply assign the value, the operation will be + * atomic. 
+ */ + int redo_process = ctx->type == LOG_TYPE_REDO && + ctx->pshadow_ops.offset != 0; + if (redo_process && + ctx->pshadow_ops.offset == sizeof(struct ulog_entry_val)) { + struct ulog_entry_base *e = (struct ulog_entry_base *) + ctx->pshadow_ops.ulog->data; + ulog_operation_type t = ulog_entry_type(e); + + if ((t == ULOG_OPERATION_SET) || ULOG_ENTRY_IS_BIT_OP(t)) { + tx_create_wal_entry(e, NULL, ctx->p_ops); + ulog_entry_apply(e, 1, ctx->p_ops); + redo_process = 0; + } + } + + if (redo_process) { + operation_process_persistent_redo(ctx); + ctx->state = OPERATION_CLEANUP; + } + D_ASSERT(ctx->type != LOG_TYPE_UNDO); + + /* process transient entries with transient memory ops */ + if (ctx->transient_ops.offset != 0) + ulog_process(ctx->transient_ops.ulog, NULL, &ctx->t_ops); +} + +/* + * operation_finish -- finalizes the operation + */ +void +operation_finish(struct operation_context *ctx, unsigned flags) +{ + ASSERTne(ctx->state, OPERATION_IDLE); + + if (ctx->type == LOG_TYPE_UNDO && ctx->total_logged != 0) + ctx->state = OPERATION_CLEANUP; + + if (ctx->state != OPERATION_CLEANUP) + goto out; + + if (ctx->type == LOG_TYPE_UNDO) { + int ret = ulog_clobber_data(ctx->ulog, + &ctx->next, ctx->ulog_free, flags); + + if (ret == 0) + goto out; + } else if (ctx->type == LOG_TYPE_REDO) { + int ret = ulog_free_next(ctx->ulog, ctx->ulog_free); + + if (ret == 0) + goto out; + } + + /* clobbering shrunk the ulog */ + ctx->ulog_capacity = ulog_capacity(ctx->ulog, + ctx->ulog_base_nbytes); + VEC_CLEAR(&ctx->next); + ulog_rebuild_next_vec(ctx->ulog, &ctx->next); + +out: + ctx->state = OPERATION_IDLE; +} diff --git a/src/common/dav_v2/memops.h b/src/common/dav_v2/memops.h new file mode 100644 index 00000000000..035105de0c5 --- /dev/null +++ b/src/common/dav_v2/memops.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2020, Intel Corporation */ + +/* + * memops.h -- aggregated memory operations helper definitions + */ + +#ifndef __DAOS_COMMON_MEMOPS_H +#define __DAOS_COMMON_MEMOPS_H 1 + +#include <stddef.h> +#include <stdint.h> + +#include "vec.h" +#include "mo_wal.h" +#include "ulog.h" + +enum operation_log_type { + LOG_PERSISTENT, /* log of persistent modifications */ + LOG_TRANSIENT, /* log of transient memory modifications */ + + MAX_OPERATION_LOG_TYPE +}; + +enum log_type { + LOG_TYPE_UNDO, + LOG_TYPE_REDO, + + MAX_LOG_TYPE, +}; + +struct user_buffer_def { + void *addr; + size_t size; +}; + +struct operation_context; + +struct operation_context * +operation_new(struct ulog *redo, size_t ulog_base_nbytes, + ulog_extend_fn extend, ulog_free_fn ulog_free, + const struct mo_ops *p_ops, enum log_type type); + +void operation_init(struct operation_context *ctx); +void operation_start(struct operation_context *ctx); + +void operation_delete(struct operation_context *ctx); +void operation_free_logs(struct operation_context *ctx); + +int operation_add_buffer(struct operation_context *ctx, + void *dest, void *src, size_t size, ulog_operation_type type); + +int operation_add_entry(struct operation_context *ctx, + void *ptr, uint64_t value, ulog_operation_type type); +int operation_add_typed_entry(struct operation_context *ctx, + void *ptr, uint64_t value, + ulog_operation_type type, enum operation_log_type log_type); +void operation_set_auto_reserve(struct operation_context *ctx, + int auto_reserve); + +int operation_reserve(struct operation_context *ctx, size_t new_capacity); +void operation_process(struct operation_context *ctx); +void operation_finish(struct operation_context 
*ctx, unsigned flags); +void operation_cancel(struct operation_context *ctx); + +#endif /* __DAOS_COMMON_MEMOPS_H */ diff --git a/src/common/dav_v2/mo_wal.h b/src/common/dav_v2/mo_wal.h new file mode 100644 index 00000000000..9f05eca72a9 --- /dev/null +++ b/src/common/dav_v2/mo_wal.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2020, Intel Corporation */ + +#ifndef __DAOS_COMMON_MO_WAL_H +#define __DAOS_COMMON_MO_WAL_H 1 + +#include <stddef.h> +#include <stdint.h> +#include <sys/mman.h> +#include <string.h> + +#include "out.h" +#include "wal_tx.h" + +typedef int (*persist_fn)(void *base, const void *, size_t, unsigned); +typedef int (*flush_fn)(void *base, const void *, size_t, unsigned); +typedef void (*drain_fn)(void *base); + +typedef void *(*memcpy_fn)(void *base, void *dest, const void *src, size_t len, + unsigned flags); +typedef void *(*memmove_fn)(void *base, void *dest, const void *src, size_t len, + unsigned flags); +typedef void *(*memset_fn)(void *base, void *dest, int c, size_t len, + unsigned flags); + +typedef int (*remote_read_fn)(void *ctx, uintptr_t base, void *dest, void *addr, + size_t length); + +struct mo_ops { + /* for 'master' replica: with or without data replication */ + persist_fn persist; /* persist function */ + flush_fn flush; /* flush function */ + drain_fn drain; /* drain function */ + memcpy_fn memcpy; /* persistent memcpy function */ + memmove_fn memmove; /* persistent memmove function */ + memset_fn memset; /* persistent memset function */ + void *base; + + struct remote_ops { + remote_read_fn read; + + void *ctx; + uintptr_t base; + } remote; +}; + +static force_inline void +mo_wal_persist(const struct mo_ops *p_ops, void *d, size_t s) +{ + dav_wal_tx_snap(p_ops->base, d, s, d, 0); +} + +static force_inline void +mo_wal_flush(const struct mo_ops *p_ops, void *d, size_t s, int flags) +{ + dav_wal_tx_snap(p_ops->base, d, s, d, flags); +} + +static force_inline void +mo_wal_drain(const struct mo_ops *p_ops) +{ + SUPPRESS_UNUSED(p_ops); +} + +static force_inline void * +mo_wal_memcpy(const struct mo_ops *p_ops, void *dest, + const void *src, size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memcpy(dest, src, len); + mo_wal_flush(p_ops, dest, len, 0); + return dest; +} + +static force_inline void * +mo_wal_memmove(const struct mo_ops *p_ops, void *dest, + const void *src, size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memmove(dest, src, len); + mo_wal_flush(p_ops, dest, len, 0); + return dest; +} + +static force_inline void * +mo_wal_memset(const struct mo_ops *p_ops, void *dest, int c, + size_t len, unsigned flags) +{ + SUPPRESS_UNUSED(p_ops); + memset(dest, c, len); + dav_wal_tx_set(p_ops->base, dest, c, len); + return dest; +} + +#endif /* __DAOS_COMMON_MO_WAL_H */ diff --git a/src/common/dav_v2/obj.h b/src/common/dav_v2/obj.h new file mode 100644 index 00000000000..470323da1ef --- /dev/null +++ b/src/common/dav_v2/obj.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2023, Intel Corporation */ + +/* + * obj.h -- internal definitions for obj module + */ + +#ifndef __DAOS_COMMON_OBJ_H +#define __DAOS_COMMON_OBJ_H 1 + +#include <stddef.h> +#include <stdint.h> + +#include "dav_internal.h" +#include "stats.h" + +#define OBJ_OFF_TO_PTR(pop, off) ((void *)((uintptr_t)(((dav_obj_t *)(pop))->do_base) + (off))) +#define OBJ_PTR_TO_OFF(pop, ptr) ((uintptr_t)(ptr) - (uintptr_t)(((dav_obj_t *)(pop))->do_base)) +#define OBJ_OFF_FROM_HEAP(pop, off)\ + ((off) >= ((dav_obj_t 
*)(pop))->do_phdr->dp_heap_offset &&\ + (off) < ((dav_obj_t *)(pop))->do_phdr->dp_heap_offset +\ + ((dav_obj_t *)(pop))->do_phdr->dp_heap_size) + +#define OBJ_OFF_IS_VALID(pop, off)\ + (OBJ_OFF_FROM_HEAP(pop, off) ||\ + (OBJ_PTR_TO_OFF(pop, &((dav_obj_t *)(pop))->do_phdr->dp_root_offset) == (off)) ||\ + (OBJ_PTR_TO_OFF(pop, &((dav_obj_t *)(pop))->do_phdr->dp_root_size) == (off))) + +#define OBJ_PTR_IS_VALID(pop, ptr)\ + OBJ_OFF_IS_VALID(pop, OBJ_PTR_TO_OFF(pop, ptr)) + +#define OBJ_PTR_FROM_POOL(pop, ptr)\ + ((uintptr_t)(ptr) >= (uintptr_t)(((dav_obj_t *)pop)->do_base) &&\ + (uintptr_t)(ptr) < (uintptr_t)(((dav_obj_t *)pop)->do_base) +\ + (((dav_obj_t *)pop)->do_phdr->dp_heap_offset +\ + ((dav_obj_t *)pop)->do_phdr->dp_heap_size)) + +#define OBJ_OFFRANGE_FROM_HEAP(pop, start, end)\ + (((start) >= ((dav_obj_t *)pop)->do_phdr->dp_heap_offset) &&\ + ((end) <= (((dav_obj_t *)pop)->do_phdr->dp_heap_offset + \ + ((dav_obj_t *)pop)->do_phdr->dp_heap_size))) + +typedef uint64_t type_num_t; + +#define CLASS_ID_FROM_FLAG(flag)\ +((uint16_t)((flag) >> 48)) + +#define EZONE_ID_FROM_FLAG(flag) ((uint32_t)((flag) >> 16)) + +#endif /* __DAOS_COMMON_OBJ_H */ diff --git a/src/common/dav_v2/out.h b/src/common/dav_v2/out.h new file mode 100644 index 00000000000..ebe12044db4 --- /dev/null +++ b/src/common/dav_v2/out.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2021, Intel Corporation */ + +/* + * out.h -- definitions for "out" module + */ + +#ifndef __DAOS_COMMON_OUT_H +#define __DAOS_COMMON_OUT_H 1 + +#include <daos/debug.h> +#include "util.h" + +#define DAV_LOG_FAC DB_TRACE + +/* enable extra debug messages and extra checks */ +/*#define DAV_EXTRA_DEBUG*/ + +#ifndef EVALUATE_DBG_EXPRESSIONS +#if defined(DAV_EXTRA_DEBUG) || defined(__clang_analyzer__) || defined(__COVERITY__) ||\ + defined(__KLOCWORK__) +#define EVALUATE_DBG_EXPRESSIONS 1 +#else +#define EVALUATE_DBG_EXPRESSIONS 0 +#endif +#endif + +#define TEST_ALWAYS_TRUE_EXPR(cnd) do { \ + if (__builtin_constant_p(cnd)) \ + COMPILE_ERROR_ON(cnd); \ +} while (0) +#define TEST_ALWAYS_EQ_EXPR(lhs, rhs) do { \ + if (__builtin_constant_p(lhs) && __builtin_constant_p(rhs)) \ + COMPILE_ERROR_ON((lhs) == (rhs)); \ +} while (0) +#define TEST_ALWAYS_NE_EXPR(lhs, rhs) do { \ + if (__builtin_constant_p(lhs) && __builtin_constant_p(rhs)) \ + COMPILE_ERROR_ON((lhs) != (rhs)); \ +} while (0) + +/* produce debug/trace output */ +#if defined(DAV_EXTRA_DEBUG) +#define DAV_DBG(fmt, ...) D_DEBUG(DAV_LOG_FAC, fmt "\n", ##__VA_ARGS__) +#else +#define DAV_DBG(fmt, ...) SUPPRESS_UNUSED(__VA_ARGS__) +#endif + +/* produce output and exit */ +#define FATAL(fmt, ...) 
\ + D_ASSERTF(0, fmt "\n", ## __VA_ARGS__) + +/* assert a condition is true at runtime */ +#define ASSERT_rt(cnd) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || (cnd)) \ + break; \ + D_ASSERT(cnd); \ +} while (0) + +/* assert two integer values are equal at runtime */ +#define ASSERTeq_rt(lhs, rhs) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || ((lhs) == (rhs)))\ + break; \ + D_ASSERTF(((lhs) == (rhs)), \ + "assertion failure: %s (0x%llx) == %s (0x%llx)", #lhs,\ + (unsigned long long)(lhs), #rhs, (unsigned long long)(rhs)); \ +} while (0) + +/* assert two integer values are not equal at runtime */ +#define ASSERTne_rt(lhs, rhs) do { \ + if (!EVALUATE_DBG_EXPRESSIONS || ((lhs) != (rhs)))\ + break; \ + D_ASSERTF(((lhs) != (rhs)), \ + "assertion failure: %s (0x%llx) != %s (0x%llx)", #lhs,\ + (unsigned long long)(lhs), #rhs, (unsigned long long)(rhs)); \ +} while (0) + +/* + * Detect useless asserts on always true expression. Please use + * COMPILE_ERROR_ON(!cnd) or ASSERT_rt(cnd) in such cases. + */ +/* assert a condition is true */ +#define ASSERT(cnd) do {\ + TEST_ALWAYS_TRUE_EXPR(cnd);\ + ASSERT_rt(cnd);\ + } while (0) + +/* assert two integer values are equal */ +#define ASSERTeq(lhs, rhs) do {\ + /* See comment in ASSERT. */\ + TEST_ALWAYS_EQ_EXPR(lhs, rhs);\ + ASSERTeq_rt(lhs, rhs);\ + } while (0) + +/* assert two integer values are not equal */ +#define ASSERTne(lhs, rhs) do {\ + /* See comment in ASSERT. */\ + TEST_ALWAYS_NE_EXPR(lhs, rhs);\ + ASSERTne_rt(lhs, rhs);\ + } while (0) + +#define ERR(fmt, ...)\ + D_ERROR(fmt "\n", ## __VA_ARGS__) + +#endif /* __DAOS_COMMON_OUT_H */ diff --git a/src/common/dav_v2/palloc.c b/src/common/dav_v2/palloc.c new file mode 100644 index 00000000000..cf73303d655 --- /dev/null +++ b/src/common/dav_v2/palloc.c @@ -0,0 +1,977 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * palloc.c -- implementation of pmalloc POSIX-like API + * + * This is the front-end part of the persistent memory allocator. It uses both + * transient and persistent representation of the heap to provide memory blocks + * in a reasonable time and with an acceptable common-case fragmentation. + * + * Lock ordering in the entirety of the allocator is simple, but might be hard + * to follow at times because locks are, by necessity, externalized. + * There are two sets of locks that need to be taken into account: + * - runtime state locks, represented by buckets. + * - persistent state locks, represented by memory block mutexes. + * + * To properly use them, follow these rules: + * - When nesting, always lock runtime state first. + * Doing the reverse might cause deadlocks in other parts of the code. + * + * - When introducing functions that would require runtime state locks, + * always try to move the lock acquiring to the upper most layer. This + * usually means that the functions will simply take "struct bucket" as + * their argument. By doing so most of the locking can happen in + * the frontend part of the allocator and it's easier to follow the first + * rule because all functions in the backend can safely use the persistent + * state locks - the runtime lock, if it is needed, will be already taken + * by the upper layer. + * + * General lock ordering: + * 1. arenas.lock + * 2. buckets (sorted by ID) + * 3. 
memory blocks (sorted by lock address) + */ + +#include "bucket.h" +#include "valgrind_internal.h" +#include "heap_layout.h" +#include "heap.h" +#include "alloc_class.h" +#include "out.h" +#include "sys_util.h" +#include "palloc.h" +#include "ravl.h" +#include "vec.h" + +struct dav_action_internal { + /* type of operation (alloc/free vs set) */ + enum dav_action_type type; + + uint32_t padding; + + /* + * Action-specific lock that needs to be taken for the duration of + * an action. + */ + pthread_mutex_t *lock; + + /* action-specific data */ + union { + /* valid only when type == DAV_ACTION_TYPE_HEAP */ + struct { + uint64_t offset; + uint64_t usable_size; + enum memblock_state new_state; + struct memory_block m; + struct memory_block_reserved *mresv; + }; + + /* valid only when type == DAV_ACTION_TYPE_MEM */ + struct { + uint64_t *ptr; + uint64_t value; + }; + + /* padding, not used */ + uint64_t data2[14]; + }; +}; +D_CASSERT(offsetof(struct dav_action_internal, data2) == offsetof(struct dav_action, data2), + "struct dav_action misaligned!"); + +/* + * palloc_set_value -- creates a new set memory action + */ +void +palloc_set_value(struct palloc_heap *heap, struct dav_action *act, + uint64_t *ptr, uint64_t value) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + act->type = DAV_ACTION_TYPE_MEM; + + struct dav_action_internal *actp = (struct dav_action_internal *)act; + + actp->ptr = ptr; + actp->value = value; + actp->lock = NULL; +} + +/* + * alloc_prep_block -- (internal) prepares a memory block for allocation + * + * Once the block is fully reserved and it's guaranteed that no one else will + * be able to write to this memory region it is safe to write the allocation + * header and call the object construction function. + * + * Because the memory block at this stage is only reserved in transient state + * there's no need to worry about fail-safety of this method because in case + * of a crash the memory will be back in the free blocks collection. + */ +static int +alloc_prep_block(struct palloc_heap *heap, const struct memory_block *m, + palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, + struct dav_action_internal *out) +{ + void *uptr = m->m_ops->get_user_data(m); + size_t usize = m->m_ops->get_user_size(m); + + VALGRIND_DO_MEMPOOL_ALLOC(heap->layout, uptr, usize); + VALGRIND_DO_MAKE_MEM_UNDEFINED(uptr, usize); + VALGRIND_ANNOTATE_NEW_MEMORY(uptr, usize); + + m->m_ops->write_header(m, extra_field, object_flags); + + /* + * Set allocated memory with pattern, if debug.heap.alloc_pattern CTL + * parameter had been set. + */ + if (unlikely(heap->alloc_pattern > PALLOC_CTL_DEBUG_NO_PATTERN)) { + mo_wal_memset(&heap->p_ops, uptr, heap->alloc_pattern, + usize, 0); + VALGRIND_DO_MAKE_MEM_UNDEFINED(uptr, usize); + } + + int ret; + + if (constructor != NULL) { + ret = constructor(heap->p_ops.base, uptr, usize, arg); + if (ret != 0) { + /* + * If canceled, revert the block back to the free + * state in vg machinery. + */ + VALGRIND_DO_MEMPOOL_FREE(heap->layout, uptr); + return ret; + } + } + + /* + * To avoid determining the user data pointer twice this method is also + * responsible for calculating the offset of the object in the pool that + * will be used to set the offset destination pointer provided by the + * caller. + */ + out->offset = HEAP_PTR_TO_OFF(heap, uptr); + out->usable_size = usize; + + return 0; +} + +/* + * palloc_reservation_create -- creates a volatile reservation of a + * memory block. 
+ * + * The first step in the allocation of a new block is reserving it in + * the transient heap - which is represented by the bucket abstraction. + * + * To provide optimal scaling for multi-threaded applications and reduce + * fragmentation the appropriate bucket is chosen depending on the + * current thread context and to which allocation class the requested + * size falls into. + * + * Once the bucket is selected, just enough memory is reserved for the + * requested size. The underlying block allocation algorithm + * (best-fit, next-fit, ...) varies depending on the bucket container. + */ +static int +palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr constructor, + void *arg, uint64_t extra_field, uint16_t object_flags, uint16_t class_id, + uint32_t zset_id, struct dav_action_internal *out) +{ + int err = 0; + struct memory_block *new_block = &out->m; + struct zoneset *zset; + + out->type = DAV_ACTION_TYPE_HEAP; + + ASSERT(class_id < UINT8_MAX); + struct alloc_class *c = class_id == 0 ? + heap_get_best_class(heap, size) : + alloc_class_by_id(heap_alloc_classes(heap), + (uint8_t)class_id); + + if (c == NULL) { + ERR("no allocation class for size %lu bytes", size); + errno = EINVAL; + return -1; + } + + zset = heap_get_zoneset(heap, zset_id); + if (zset == NULL) { + errno = EINVAL; + return -1; + } + + /* + * The caller provided size in bytes, but buckets operate in + * 'size indexes' which are multiples of the block size in the + * bucket. + * + * For example, to allocate 500 bytes from a bucket that + * provides 256 byte blocks two memory 'units' are required. + */ + ssize_t size_idx = alloc_class_calc_size_idx(c, size); + + if (size_idx < 0) { + ERR("allocation class not suitable for size %lu bytes", + size); + errno = EINVAL; + return -1; + } + ASSERT(size_idx <= UINT32_MAX); + *new_block = MEMORY_BLOCK_NONE; + new_block->size_idx = (uint32_t)size_idx; + + struct bucket *b = zoneset_bucket_acquire(zset, c->id); + + err = heap_get_bestfit_block(heap, b, new_block); + if (err != 0) + goto out; + + if (alloc_prep_block(heap, new_block, constructor, arg, + extra_field, object_flags, out) != 0) { + /* + * Constructor returned non-zero value which means + * the memory block reservation has to be rolled back. + */ + if (new_block->type == MEMORY_BLOCK_HUGE) + bucket_insert_block(b, new_block); + err = ECANCELED; + goto out; + } + + /* + * Each as of yet unfulfilled reservation needs to be tracked in the + * runtime state. + * The memory block cannot be put back into the global state unless + * there are no active reservations. + */ + out->mresv = bucket_active_block(b); + if (out->mresv != NULL) + util_fetch_and_add64(&out->mresv->nresv, 1); + + out->lock = new_block->m_ops->get_lock(new_block); + out->new_state = MEMBLOCK_ALLOCATED; + +out: + zoneset_bucket_release(b); + + if (err == 0) + return 0; + + errno = err; + return -1; +} + +/* + * palloc_heap_action_exec -- executes a single heap action (alloc, free) + */ +static void +palloc_heap_action_exec(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx) +{ +#ifdef DAV_EXTRA_DEBUG + if (act->m.m_ops->get_state(&act->m) == act->new_state) { + D_CRIT("invalid operation or heap corruption\n"); + ASSERT(0); + } +#endif + + /* + * The actual required metadata modifications are chunk-type + * dependent, but it always is a modification of a single 8 byte + * value - either modification of few bits in a bitmap or + * changing a chunk type from free to used or vice versa. 
+ */ + act->m.m_ops->prep_hdr(&act->m, act->new_state, ctx); +} + +/* + * palloc_restore_free_chunk_state -- updates the runtime state of a free chunk. + * + * This function also takes care of coalescing of huge chunks. + */ +static void +palloc_restore_free_chunk_state(struct palloc_heap *heap, + struct memory_block *m) +{ + struct zoneset *zset = heap_get_zoneset(heap, m->zone_id); + + if (m->type == MEMORY_BLOCK_HUGE) { + struct bucket *b = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID); + + if (heap_free_chunk_reuse(heap, b, m) != 0) { + if (errno == EEXIST) + FATAL("duplicate runtime chunk state, possible double free"); + else + D_CRIT("unable to track runtime chunk state\n"); + } + zoneset_bucket_release(b); + } +} + +/* + * palloc_mem_action_noop -- empty handler for unused memory action funcs + */ +static void +palloc_mem_action_noop(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap, act); +} + +/* + * palloc_reservation_clear -- clears the reservation state of the block, + * discards the associated memory block if possible + */ +static void +palloc_reservation_clear(struct palloc_heap *heap, + struct dav_action_internal *act, int publish) +{ + if (act->mresv == NULL) + return; + + struct memory_block_reserved *mresv = act->mresv; + struct bucket_locked *locked = mresv->bucket; + + if (!publish) { + /* + * If a memory block used for the action is the currently active + * memory block of the bucket it can be returned back to the + * bucket. This way it will be available for future allocation + * requests, improving performance. + */ + struct bucket *b = bucket_acquire(locked); + + bucket_try_insert_attached_block(b, &act->m); + bucket_release(b); + } + + if (util_fetch_and_sub64(&mresv->nresv, 1) == 1) { + VALGRIND_ANNOTATE_HAPPENS_AFTER(&mresv->nresv); + /* + * If the memory block used for the action is not currently used + * in any bucket nor action it can be discarded (given back to + * the heap). 
+ */ + heap_discard_run(heap, &mresv->m); + D_FREE(mresv); + } else { + VALGRIND_ANNOTATE_HAPPENS_BEFORE(&mresv->nresv); + } +} + +/* + * palloc_heap_action_on_cancel -- restores the state of the heap + */ +static void +palloc_heap_action_on_cancel(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + if (act->new_state == MEMBLOCK_FREE) + return; + + VALGRIND_DO_MEMPOOL_FREE(heap->layout, + act->m.m_ops->get_user_data(&act->m)); + + act->m.m_ops->invalidate(&act->m); + palloc_restore_free_chunk_state(heap, &act->m); + + palloc_reservation_clear(heap, act, 0 /* publish */); +} + +/* + * palloc_heap_action_on_process -- performs finalization steps under a lock + * on the persistent state + */ +static void +palloc_heap_action_on_process(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + if (act->new_state == MEMBLOCK_ALLOCATED) { + STATS_INC(heap->stats, persistent, heap_curr_allocated, + act->m.m_ops->get_real_size(&act->m)); + if (act->m.type == MEMORY_BLOCK_RUN) { + STATS_INC(heap->stats, transient, heap_run_allocated, + act->m.m_ops->get_real_size(&act->m)); + } + } else if (act->new_state == MEMBLOCK_FREE) { + if (On_memcheck) { + void *ptr = act->m.m_ops->get_user_data(&act->m); + + VALGRIND_DO_MEMPOOL_FREE(heap->layout, ptr); + } + + STATS_SUB(heap->stats, persistent, heap_curr_allocated, + act->m.m_ops->get_real_size(&act->m)); + if (act->m.type == MEMORY_BLOCK_RUN) { + STATS_SUB(heap->stats, transient, heap_run_allocated, + act->m.m_ops->get_real_size(&act->m)); + } + heap_memblock_on_free(heap, &act->m); + } +} + +/* + * palloc_heap_action_on_unlock -- performs finalization steps that need to be + * performed without a lock on persistent state + */ +static void +palloc_heap_action_on_unlock(struct palloc_heap *heap, + struct dav_action_internal *act) +{ + if (act->new_state == MEMBLOCK_ALLOCATED) + palloc_reservation_clear(heap, act, 1 /* publish */); + else if (act->new_state == MEMBLOCK_FREE) + palloc_restore_free_chunk_state(heap, &act->m); +} + +/* + * palloc_mem_action_exec -- executes a single memory action (set, and, or) + */ +static void +palloc_mem_action_exec(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + operation_add_entry(ctx, act->ptr, act->value, ULOG_OPERATION_SET); +} + +static const struct { + /* + * Translate action into some number of operation_entry'ies. + */ + void (*exec)(struct palloc_heap *heap, + const struct dav_action_internal *act, + struct operation_context *ctx); + + /* + * Cancel any runtime state changes. Can be called only when action has + * not been translated to persistent operation yet. + */ + void (*on_cancel)(struct palloc_heap *heap, + struct dav_action_internal *act); + + /* + * Final steps after persistent state has been modified. Performed + * under action-specific lock. + */ + void (*on_process)(struct palloc_heap *heap, + struct dav_action_internal *act); + + /* + * Final steps after persistent state has been modified. Performed + * after action-specific lock has been dropped. 
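+	 *
+	 * (Editor's note: with the palloc_exec_actions() flow below, the
+	 * callbacks fire in this order: exec under the action lock, the
+	 * redo log is applied via operation_process(), on_process still
+	 * under the lock, and on_unlock only after every action lock has
+	 * been dropped.)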
+ */ + void (*on_unlock)(struct palloc_heap *heap, + struct dav_action_internal *act); +} action_funcs[DAV_MAX_ACTION_TYPE] = { + [DAV_ACTION_TYPE_HEAP] = { + .exec = palloc_heap_action_exec, + .on_cancel = palloc_heap_action_on_cancel, + .on_process = palloc_heap_action_on_process, + .on_unlock = palloc_heap_action_on_unlock, + }, + [DAV_ACTION_TYPE_MEM] = { + .exec = palloc_mem_action_exec, + .on_cancel = palloc_mem_action_noop, + .on_process = palloc_mem_action_noop, + .on_unlock = palloc_mem_action_noop, + } +}; + +/* + * palloc_action_compare -- compares two actions based on lock address + */ +static int +palloc_action_compare(const void *lhs, const void *rhs) +{ + const struct dav_action_internal *mlhs = lhs; + const struct dav_action_internal *mrhs = rhs; + uintptr_t vlhs = (uintptr_t)(mlhs->lock); + uintptr_t vrhs = (uintptr_t)(mrhs->lock); + + if (vlhs < vrhs) + return -1; + if (vlhs > vrhs) + return 1; + + return 0; +} + +/* + * palloc_exec_actions -- perform the provided free/alloc operations + */ +static void +palloc_exec_actions(struct palloc_heap *heap, + struct operation_context *ctx, + struct dav_action_internal *actv, + size_t actvcnt) +{ + /* + * The operations array is sorted so that proper lock ordering is + * ensured. + */ + if (actv) + qsort(actv, actvcnt, sizeof(struct dav_action_internal), + palloc_action_compare); + else + ASSERTeq(actvcnt, 0); + + struct dav_action_internal *act; + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + /* + * This lock must be held for the duration between the creation + * of the allocation metadata updates in the operation context + * and the operation processing. This is because a different + * thread might operate on the same 8-byte value of the run + * bitmap and override allocation performed by this thread. + */ + if (i == 0 || act->lock != actv[i - 1].lock) { + if (act->lock) + util_mutex_lock(act->lock); + } + + /* translate action to some number of operation_entry'ies */ + action_funcs[act->type].exec(heap, act, ctx); + } + + /* wait for all allocated object headers to be persistent */ + mo_wal_drain(&heap->p_ops); + + /* perform all persistent memory operations */ + operation_process(ctx); + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + action_funcs[act->type].on_process(heap, act); + + if (i == actvcnt - 1 || act->lock != actv[i + 1].lock) { + if (act->lock) + util_mutex_unlock(act->lock); + } + } + + for (size_t i = 0; i < actvcnt; ++i) { + act = &actv[i]; + + action_funcs[act->type].on_unlock(heap, act); + } + + operation_finish(ctx, 0); +} + +/* + * palloc_reserve -- creates a single reservation + */ +int +palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id, + struct dav_action *act) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + return palloc_reservation_create(heap, size, constructor, arg, extra_field, object_flags, + class_id, zset_id, (struct dav_action_internal *)act); +} + +/* + * palloc_action_isalloc - action is a heap reservation + * created by palloc_reserve(). 
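+ *
+ * Editor's illustrative sketch of the reserve/publish path ('heap' and
+ * 'ctx' stand for an already initialized palloc_heap and
+ * operation_context; the size, class id 0 and zoneset id 0 are arbitrary
+ * example values):
+ *
+ *	struct dav_action act;
+ *
+ *	if (palloc_reserve(heap, 128, NULL, NULL, 0, 0, 0, 0, &act) != 0)
+ *		return -1;
+ *	palloc_publish(heap, &act, 1, ctx);
+ *
+ * (palloc_cancel(heap, &act, 1) would roll the reservation back instead.)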
+ */ +int +palloc_action_isalloc(struct dav_action *act) +{ + struct dav_action_internal *actp = (struct dav_action_internal *)act; + + return ((actp->type == DAV_ACTION_TYPE_HEAP) && + (actp->new_state == MEMBLOCK_ALLOCATED)); +} + +uint64_t +palloc_get_realoffset(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return HEAP_PTR_TO_OFF(m.heap, m.m_ops->get_real_data(&m)); +} + +/* + * palloc_get_prange -- get the start offset and size of allocated memory that + * needs to be persisted. + * + * persist_udata - if true, persist the user data. + */ +void +palloc_get_prange(struct dav_action *act, uint64_t *const offp, uint64_t *const sizep, + int persist_udata) +{ + struct dav_action_internal *act_in = (struct dav_action_internal *)act; + + D_ASSERT(act_in->type == DAV_ACTION_TYPE_HEAP); + /* we need to persist the header if present */ + *offp = HEAP_PTR_TO_OFF(act_in->m.heap, act_in->m.m_ops->get_real_data(&act_in->m)); + *sizep = header_type_to_size[act_in->m.header_type]; + + D_ASSERT(act_in->offset == *offp + header_type_to_size[act_in->m.header_type]); + /* persist the user data */ + if (persist_udata) + *sizep += act_in->usable_size; +} + +/* + * palloc_defer_free -- creates an internal deferred free action + */ +static void +palloc_defer_free_create(struct palloc_heap *heap, uint64_t off, + struct dav_action_internal *out) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + out->type = DAV_ACTION_TYPE_HEAP; + out->offset = off; + out->m = memblock_from_offset(heap, off); + + /* + * For the duration of free we may need to protect surrounding + * metadata from being modified. + */ + out->lock = out->m.m_ops->get_lock(&out->m); + out->mresv = NULL; + out->new_state = MEMBLOCK_FREE; +} + +/* + * palloc_defer_free -- creates a deferred free action + */ +void +palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act) +{ + COMPILE_ERROR_ON(sizeof(struct dav_action) != + sizeof(struct dav_action_internal)); + + palloc_defer_free_create(heap, off, (struct dav_action_internal *)act); +} + +/* + * palloc_cancel -- cancels all reservations in the array + */ +void +palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt) +{ + struct dav_action_internal *act; + + for (size_t i = 0; i < actvcnt; ++i) { + act = (struct dav_action_internal *)&actv[i]; + action_funcs[act->type].on_cancel(heap, act); + } +} + +/* + * palloc_publish -- publishes all reservations in the array + */ +void +palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt, + struct operation_context *ctx) +{ + palloc_exec_actions(heap, ctx, + (struct dav_action_internal *)actv, actvcnt); +} + +/* + * palloc_operation -- persistent memory operation. Takes a NULL pointer + * or an existing memory block and modifies it to occupy, at least, 'size' + * number of bytes. + * + * The malloc, free and realloc routines are implemented in the context of this + * common operation which encompasses all of the functionality usually done + * separately in those methods. + * + * The first thing that needs to be done is determining which memory blocks + * will be affected by the operation - this varies depending on the whether the + * operation will need to modify or free an existing block and/or allocate + * a new one. 
+ * + * Simplified allocation process flow is as follows: + * - reserve a new block in the transient heap + * - prepare the new block + * - create redo log of required modifications + * - chunk metadata + * - offset of the new object + * - commit and process the redo log + * + * And similarly, the deallocation process: + * - create redo log of required modifications + * - reverse the chunk metadata back to the 'free' state + * - set the destination of the object offset to zero + * - commit and process the redo log + * There's an important distinction in the deallocation process - it does not + * return the memory block to the transient container. That is done once no more + * memory is available. + * + * Reallocation is a combination of the above, with one additional step + * of copying the old content. + */ +int +palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size, + palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint32_t zset_id, struct operation_context *ctx) +{ + size_t user_size = 0; + + size_t nops = 0; + uint64_t aoff; + uint64_t asize; + struct dav_action_internal ops[2]; + struct dav_action_internal *alloc = NULL; + struct dav_action_internal *dealloc = NULL; + + /* + * The offset of an existing block can be nonzero which means this + * operation is either free or a realloc - either way the offset of the + * object needs to be translated into memory block, which is a structure + * that all of the heap methods expect. + */ + if (off != 0) { + dealloc = &ops[nops++]; + palloc_defer_free_create(heap, off, dealloc); + user_size = dealloc->m.m_ops->get_user_size(&dealloc->m); + if (user_size == size) { + operation_cancel(ctx); + return 0; + } + } + + /* alloc or realloc */ + if (size != 0) { + alloc = &ops[nops++]; + if (palloc_reservation_create(heap, size, constructor, arg, extra_field, + object_flags, class_id, zset_id, alloc) != 0) { + operation_cancel(ctx); + return -1; + } + + palloc_get_prange((struct dav_action *)alloc, &aoff, &asize, 0); + if (asize) /* != CHUNK_FLAG_HEADER_NONE */ + dav_wal_tx_snap(heap->p_ops.base, HEAP_OFF_TO_PTR(heap, aoff), + asize, HEAP_OFF_TO_PTR(heap, aoff), 0); + } + + /* realloc */ + if (alloc != NULL && dealloc != NULL) { + /* copy data to newly allocated memory */ + size_t old_size = user_size; + size_t to_cpy = old_size > size ? size : old_size; + + VALGRIND_ADD_TO_TX( + HEAP_OFF_TO_PTR(heap, alloc->offset), + to_cpy); + mo_wal_memcpy(&heap->p_ops, + HEAP_OFF_TO_PTR(heap, alloc->offset), + HEAP_OFF_TO_PTR(heap, off), + to_cpy, + 0); + VALGRIND_REMOVE_FROM_TX( + HEAP_OFF_TO_PTR(heap, alloc->offset), + to_cpy); + } + + /* + * If the caller provided a destination value to update, it needs to be + * modified atomically alongside the heap metadata, and so the operation + * context must be used. + */ + if (dest_off) { + operation_add_entry(ctx, dest_off, + alloc ? alloc->offset : 0, ULOG_OPERATION_SET); + } + + /* and now actually perform the requested operation! 
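+	 * (a plain allocation enters with off == 0 and size != 0, a plain
+	 * free with off != 0 and size == 0, and a realloc with both
+	 * non-zero; all deferred actions built above are applied in a
+	 * single palloc_exec_actions() pass)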
*/ + palloc_exec_actions(heap, ctx, ops, nops); + + return 0; +} + +/* + * palloc_usable_size -- returns the number of bytes in the memory block + */ +size_t +palloc_usable_size(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_user_size(&m); +} + +/* + * palloc_extra -- returns allocation extra field + */ +uint64_t +palloc_extra(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_extra(&m); +} + +/* + * palloc_flags -- returns allocation flags + */ +uint16_t +palloc_flags(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + + return m.m_ops->get_flags(&m); +} + +/* + * pmalloc_search_cb -- (internal) foreach callback. + */ +static int +pmalloc_search_cb(const struct memory_block *m, void *arg) +{ + struct memory_block *out = arg; + + if (MEMORY_BLOCK_EQUALS(*m, *out)) + return 0; /* skip the same object */ + + *out = *m; + + return 1; +} + +/* + * palloc_first -- returns the first object from the heap. + */ +uint64_t +palloc_first(struct palloc_heap *heap) +{ + struct memory_block search = MEMORY_BLOCK_NONE; + + heap_foreach_object(heap, pmalloc_search_cb, + &search, MEMORY_BLOCK_NONE); + + if (MEMORY_BLOCK_IS_NONE(search)) + return 0; + + void *uptr = search.m_ops->get_user_data(&search); + + return HEAP_PTR_TO_OFF(heap, uptr); +} + +/* + * palloc_next -- returns the next object relative to 'off'. + */ +uint64_t +palloc_next(struct palloc_heap *heap, uint64_t off) +{ + struct memory_block m = memblock_from_offset(heap, off); + struct memory_block search = m; + + heap_foreach_object(heap, pmalloc_search_cb, &search, m); + + if (MEMORY_BLOCK_IS_NONE(search) || + MEMORY_BLOCK_EQUALS(search, m)) + return 0; + + void *uptr = search.m_ops->get_user_data(&search); + + return HEAP_PTR_TO_OFF(heap, uptr); +} + +/* + * palloc_boot -- initializes allocator section + */ +int +palloc_boot(struct palloc_heap *heap, void *heap_start, + uint64_t heap_size, uint64_t *sizep, + void *base, struct mo_ops *p_ops, struct stats *stats, + struct pool_set *set) +{ + return heap_boot(heap, heap_start, heap_size, sizep, + base, p_ops, stats, set); +} + +/* + * palloc_init -- initializes palloc heap + */ +int +palloc_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops) +{ + return heap_init(heap_start, heap_size, sizep, p_ops); +} + +/* + * palloc_heap_end -- returns first address after heap + */ +void * +palloc_heap_end(struct palloc_heap *h) +{ + return heap_end(h); +} + +/* + * palloc_heap_check -- verifies heap state + */ +int +palloc_heap_check(void *heap_start, uint64_t heap_size) +{ + return heap_check(heap_start, heap_size); +} + +/* + * palloc_heap_check_remote -- verifies state of remote replica + */ +int +palloc_heap_check_remote(void *heap_start, uint64_t heap_size, + struct remote_ops *ops) +{ + return heap_check_remote(heap_start, heap_size, ops); +} + +#if VG_MEMCHECK_ENABLED +/* + * palloc_vg_register_alloc -- (internal) registers allocation header + * in Valgrind + */ +static int +palloc_vg_register_alloc(const struct memory_block *m, void *arg) +{ + struct palloc_heap *heap = arg; + + m->m_ops->reinit_header(m); + + void *uptr = m->m_ops->get_user_data(m); + size_t usize = m->m_ops->get_user_size(m); + + VALGRIND_DO_MEMPOOL_ALLOC(heap->layout, uptr, usize); + VALGRIND_DO_MAKE_MEM_DEFINED(uptr, usize); + + return 0; +} + +/* + * palloc_heap_vg_open -- notifies Valgrind 
about heap layout + */ +void +palloc_heap_vg_open(struct palloc_heap *heap, int objects) +{ + heap_vg_open(heap, palloc_vg_register_alloc, heap, objects); +} +#endif diff --git a/src/common/dav_v2/palloc.h b/src/common/dav_v2/palloc.h new file mode 100644 index 00000000000..0560cd97890 --- /dev/null +++ b/src/common/dav_v2/palloc.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2020, Intel Corporation */ + +/* + * palloc.h -- internal definitions for persistent allocator + */ + +#ifndef __DAOS_COMMON_PALLOC_H +#define __DAOS_COMMON_PALLOC_H 1 + +#include <stddef.h> +#include <stdint.h> + +#include "memops.h" +#include "ulog.h" +#include "valgrind_internal.h" +#include "stats.h" +#include "dav_v2.h" + +#define PALLOC_CTL_DEBUG_NO_PATTERN (-1) + +struct palloc_heap { + struct mo_ops p_ops; + struct heap_layout *layout; + struct heap_rt *rt; + uint64_t *sizep; + uint64_t growsize; + struct stats *stats; + struct pool_set *set; + void *base; + int alloc_pattern; +}; + +struct memory_block; +struct zoneset; + +typedef int (*palloc_constr)(void *base, void *ptr, size_t usable_size, void *arg); + +int +palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size, + palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags, + uint16_t class_id, uint32_t zset_id, struct operation_context *ctx); + +int +palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg, + uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id, + struct dav_action *act); + +int +palloc_action_isalloc(struct dav_action *act); +void +palloc_get_prange(struct dav_action *act, uint64_t *const off, uint64_t *const size, + int persist_udata); +uint64_t +palloc_get_realoffset(struct palloc_heap *heap, uint64_t off); + +void +palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act); + +void +palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt); + +void +palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt, + struct operation_context *ctx); + +void +palloc_set_value(struct palloc_heap *heap, struct dav_action *act, uint64_t *ptr, uint64_t value); + +uint64_t +palloc_first(struct palloc_heap *heap); +uint64_t +palloc_next(struct palloc_heap *heap, uint64_t off); + +size_t +palloc_usable_size(struct palloc_heap *heap, uint64_t off); +uint64_t +palloc_extra(struct palloc_heap *heap, uint64_t off); +uint16_t +palloc_flags(struct palloc_heap *heap, uint64_t off); + +int +palloc_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, uint64_t *sizep, + void *base, struct mo_ops *p_ops, struct stats *stats, struct pool_set *set); + +int +palloc_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops); +void * +palloc_heap_end(struct palloc_heap *h); +int +palloc_heap_check(void *heap_start, uint64_t heap_size); +int +palloc_heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops); +size_t +palloc_heap(void *heap_start); + +/* foreach callback, terminates iteration if return value is non-zero */ +typedef int (*object_callback)(const struct memory_block *m, void *arg); + +#if VG_MEMCHECK_ENABLED +void +palloc_heap_vg_open(struct palloc_heap *heap, int objects); +#endif + +#endif /* __DAOS_COMMON_PALLOC_H */ diff --git a/src/common/dav_v2/queue.h b/src/common/dav_v2/queue.h new file mode 100644 index 00000000000..654c60cec9b --- /dev/null +++ 
b/src/common/dav_v2/queue.h @@ -0,0 +1,112 @@ +/* + * Source: glibc 2.24 (git://sourceware.org/glibc.git /misc/sys/queue.h) + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 2016, Microsoft Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef __DAOS_COMMON_QUEUE_H_ +#define __DAOS_COMMON_QUEUE_H_ + +/* + * This file defines five types of data structures: singly-linked lists, + * lists, simple queues, tail queues, and circular queues. + * + * A singly-linked list is headed by a single forward pointer. The + * elements are singly linked for minimum space and pointer manipulation + * overhead at the expense of O(n) removal for arbitrary elements. New + * elements can be added to the list after an existing element or at the + * head of the list. Elements being removed from the head of the list + * should use the explicit macro for this purpose for optimum + * efficiency. A singly-linked list may only be traversed in the forward + * direction. Singly-linked lists are ideal for applications with large + * datasets and few or no removals or for implementing a LIFO queue. + * + * For details on the use of these macros, see the queue(3) manual page. + */ + +/* + * Singly-linked List definitions. + */ +#define DAV_SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define DAV_SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define DAV_SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. 
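+ *
+ * Editor's illustrative sketch of the intended usage ('struct item',
+ * 'new_item' and 'consume' are made up for the example):
+ *
+ *	struct item {
+ *		int			value;
+ *		DAV_SLIST_ENTRY(item)	link;
+ *	};
+ *	DAV_SLIST_HEAD(item_list, item) head = DAV_SLIST_HEAD_INITIALIZER(head);
+ *	struct item *it;
+ *
+ *	DAV_SLIST_INSERT_HEAD(&head, new_item, link);
+ *	DAV_SLIST_FOREACH(it, &head, link)
+ *		consume(it->value);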
+ */ +#define DAV_SLIST_INIT(head) ((head)->slh_first = NULL) + +#define DAV_SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + (elm)->field.sle_next = (slistelm)->field.sle_next; \ + (slistelm)->field.sle_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_INSERT_HEAD(head, elm, field) do { \ + (elm)->field.sle_next = (head)->slh_first; \ + (head)->slh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_REMOVE_HEAD(head, field) \ + ((head)->slh_first = (head)->slh_first->field.sle_next) + +#define DAV_SLIST_REMOVE(head, elm, type, field) do { \ + if ((head)->slh_first == (elm)) { \ + DAV_SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = (head)->slh_first; \ + while (curelm->field.sle_next != (elm)) \ + curelm = curelm->field.sle_next; \ + curelm->field.sle_next = \ + curelm->field.sle_next->field.sle_next; \ + } \ +} while (/*CONSTCOND*/0) + +#define DAV_SLIST_FOREACH(var, head, field) \ + for ((var) = (head)->slh_first; (var); (var) = (var)->field.sle_next) + +/* + * Singly-linked List access methods. + */ +#define DAV_SLIST_EMPTY(head) ((head)->slh_first == NULL) +#define DAV_SLIST_FIRST(head) ((head)->slh_first) +#define DAV_SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#endif /* __DAOS_COMMON_QUEUE_H_ */ diff --git a/src/common/dav_v2/ravl.c b/src/common/dav_v2/ravl.c new file mode 100644 index 00000000000..5192e2abbdb --- /dev/null +++ b/src/common/dav_v2/ravl.c @@ -0,0 +1,613 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2022, Intel Corporation */ + +/* + * ravl.c -- implementation of a RAVL tree + * https://sidsen.azurewebsites.net//papers/ravl-trees-journal.pdf + */ + +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "out.h" +#include "ravl.h" +#include "util.h" + +#define RAVL_DEFAULT_DATA_SIZE (sizeof(void *)) + +enum ravl_slot_type { + RAVL_LEFT, + RAVL_RIGHT, + + MAX_SLOTS, + + RAVL_ROOT +}; + +struct ravl_node { + struct ravl_node *parent; + struct ravl_node *slots[MAX_SLOTS]; + int32_t rank; /* cannot be greater than height of the subtree */ + int32_t pointer_based; + char data[]; +}; + +struct ravl { + struct ravl_node *root; + ravl_compare *compare; + size_t data_size; +}; + +/* + * ravl_new -- creates a new ravl tree instance + */ +struct ravl * +ravl_new_sized(ravl_compare *compare, size_t data_size) +{ + struct ravl *r; + + D_ALLOC_PTR_NZ(r); + if (r == NULL) { + D_CRIT("Malloc!\n"); + return r; + } + + r->compare = compare; + r->root = NULL; + r->data_size = data_size; + + return r; +} + +/* + * ravl_new -- creates a new tree that stores data pointers + */ +struct ravl * +ravl_new(ravl_compare *compare) +{ + return ravl_new_sized(compare, RAVL_DEFAULT_DATA_SIZE); +} + +/* + * ravl_clear_node -- (internal) recursively clears the given subtree, + * calls callback in an in-order fashion. Optionally frees the given node. 
+ */ +static void +ravl_foreach_node(struct ravl_node *n, ravl_cb cb, void *arg, int free_node) +{ + if (n == NULL) + return; + + ravl_foreach_node(n->slots[RAVL_LEFT], cb, arg, free_node); + if (cb) + cb((void *)n->data, arg); + ravl_foreach_node(n->slots[RAVL_RIGHT], cb, arg, free_node); + + if (free_node) + D_FREE(n); +} + +/* + * ravl_clear -- clears the entire tree, starting from the root + */ +void +ravl_clear(struct ravl *ravl) +{ + ravl_foreach_node(ravl->root, NULL, NULL, 1); + ravl->root = NULL; +} + +/* + * ravl_delete_cb -- clears and deletes the given ravl instance, calls callback + */ +void +ravl_delete_cb(struct ravl *ravl, ravl_cb cb, void *arg) +{ + ravl_foreach_node(ravl->root, cb, arg, 1); + D_FREE(ravl); +} + +/* + * ravl_delete -- clears and deletes the given ravl instance + */ +void +ravl_delete(struct ravl *ravl) +{ + ravl_delete_cb(ravl, NULL, NULL); +} + +/* + * ravl_foreach -- traverses the entire tree, calling callback for every node + */ +void +ravl_foreach(struct ravl *ravl, ravl_cb cb, void *arg) +{ + ravl_foreach_node(ravl->root, cb, arg, 0); +} + +/* + * ravl_empty -- checks whether the given tree is empty + */ +int +ravl_empty(struct ravl *ravl) +{ + return ravl->root == NULL; +} + +/* + * ravl_node_insert_constructor -- node data constructor for ravl_insert + */ +static void +ravl_node_insert_constructor(void *data, size_t data_size, const void *arg) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(data_size); + + /* copy only the 'arg' pointer */ + memcpy(data, &arg, sizeof(arg)); +} + +/* + * ravl_node_copy_constructor -- node data constructor for ravl_emplace_copy + */ +static void +ravl_node_copy_constructor(void *data, size_t data_size, const void *arg) +{ + memcpy(data, arg, data_size); +} + +/* + * ravl_new_node -- (internal) allocates and initializes a new node + */ +static struct ravl_node * +ravl_new_node(struct ravl *ravl, ravl_constr constr, const void *arg) +{ + struct ravl_node *n; + + D_ALLOC_NZ(n, (sizeof(*n) + ravl->data_size)); + if (n == NULL) { + D_CRIT("Malloc!\n"); + return n; + } + + n->parent = NULL; + n->slots[RAVL_LEFT] = NULL; + n->slots[RAVL_RIGHT] = NULL; + n->rank = 0; + n->pointer_based = constr == ravl_node_insert_constructor; + constr(n->data, ravl->data_size, arg); + + return n; +} + +/* + * ravl_slot_opposite -- (internal) returns the opposite slot type, cannot be + * called for root type + */ +static enum ravl_slot_type +ravl_slot_opposite(enum ravl_slot_type t) +{ + ASSERTne(t, RAVL_ROOT); + + return t == RAVL_LEFT ? RAVL_RIGHT : RAVL_LEFT; +} + +/* + * ravl_node_slot_type -- (internal) returns the type of the given node: + * left child, right child or root + */ +static enum ravl_slot_type +ravl_node_slot_type(struct ravl_node *n) +{ + if (n->parent == NULL) + return RAVL_ROOT; + + return n->parent->slots[RAVL_LEFT] == n ? RAVL_LEFT : RAVL_RIGHT; +} + +/* + * ravl_node_sibling -- (internal) returns the sibling of the given node, + * NULL if the node is root (has no parent) + */ +static struct ravl_node * +ravl_node_sibling(struct ravl_node *n) +{ + enum ravl_slot_type t = ravl_node_slot_type(n); + + if (t == RAVL_ROOT) + return NULL; + + return n->parent->slots[t == RAVL_LEFT ? RAVL_RIGHT : RAVL_LEFT]; +} + +/* + * ravl_node_ref -- (internal) returns the pointer to the memory location in + * which the given node resides + */ +static struct ravl_node ** +ravl_node_ref(struct ravl *ravl, struct ravl_node *n) +{ + enum ravl_slot_type t = ravl_node_slot_type(n); + + return t == RAVL_ROOT ? 
&ravl->root : &n->parent->slots[t];
+}
+
+/*
+ * ravl_rotate -- (internal) performs a rotation around a given node
+ *
+ * The node n swaps place with its parent. If n is a right child, the parent
+ * becomes the left child of n, otherwise the parent becomes the right child
+ * of n.
+ */
+static void
+ravl_rotate(struct ravl *ravl, struct ravl_node *n)
+{
+	ASSERTne(n->parent, NULL);
+	struct ravl_node *p = n->parent;
+	struct ravl_node **pref = ravl_node_ref(ravl, p);
+
+	enum ravl_slot_type t = ravl_node_slot_type(n);
+	enum ravl_slot_type t_opposite = ravl_slot_opposite(t);
+
+	n->parent = p->parent;
+	p->parent = n;
+	*pref = n;
+
+	p->slots[t] = n->slots[t_opposite];
+	if (p->slots[t] != NULL)
+		p->slots[t]->parent = p;
+	n->slots[t_opposite] = p;
+}
+
+/*
+ * ravl_node_rank -- (internal) returns the rank of the node
+ *
+ * For the purpose of balancing, NULL nodes have rank -1.
+ */
+static int
+ravl_node_rank(struct ravl_node *n)
+{
+	return n == NULL ? -1 : n->rank;
+}
+
+/*
+ * ravl_node_rank_difference_parent -- (internal) returns the rank difference
+ * between parent node p and its child n
+ *
+ * Every rank difference must be positive.
+ *
+ * Either of these can be NULL.
+ */
+static int
+ravl_node_rank_difference_parent(struct ravl_node *p, struct ravl_node *n)
+{
+	return ravl_node_rank(p) - ravl_node_rank(n);
+}
+
+/*
+ * ravl_node_rank_difference -- (internal) returns the rank difference between
+ * parent and its child
+ *
+ * Can be used to check if a given node is an i-child.
+ */
+static int
+ravl_node_rank_difference(struct ravl_node *n)
+{
+	return ravl_node_rank_difference_parent(n->parent, n);
+}
+
+/*
+ * ravl_node_is_i_j -- (internal) checks if a given node is strictly i,j-node
+ */
+static int
+ravl_node_is_i_j(struct ravl_node *n, int i, int j)
+{
+	return (ravl_node_rank_difference_parent(n, n->slots[RAVL_LEFT]) == i &&
+		ravl_node_rank_difference_parent(n, n->slots[RAVL_RIGHT]) == j);
+}
+
+/*
+ * ravl_node_is -- (internal) checks if a given node is i,j-node or j,i-node
+ */
+static int
+ravl_node_is(struct ravl_node *n, int i, int j)
+{
+	return ravl_node_is_i_j(n, i, j) || ravl_node_is_i_j(n, j, i);
+}
+
+/*
+ * ravl_node_promote -- promotes a given node by increasing its rank
+ */
+static void
+ravl_node_promote(struct ravl_node *n)
+{
+	n->rank += 1;
+}
+
+/*
+ * ravl_node_demote -- demotes a given node by decreasing its rank
+ */
+static void
+ravl_node_demote(struct ravl_node *n)
+{
+	ASSERT(n->rank > 0);
+	n->rank -= 1;
+}
+
+/*
+ * ravl_balance -- balances the tree after insert
+ *
+ * This function must restore the invariant that every rank
+ * difference is positive.
+ */
+static void
+ravl_balance(struct ravl *ravl, struct ravl_node *n)
+{
+	/* walk up the tree, promoting nodes */
+	while (n->parent && ravl_node_is(n->parent, 0, 1)) {
+		ravl_node_promote(n->parent);
+		n = n->parent;
+	}
+
+	/*
+	 * Either the rank rule holds or n is a 0-child whose sibling is an
+	 * i-child with i > 1.
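+	 * (Editor's note: the "rank rule" is the invariant referenced in the
+	 * helpers above -- every parent/child rank difference is positive,
+	 * with missing children counted as rank -1, see ravl_node_rank().)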
+ */ + struct ravl_node *s = ravl_node_sibling(n); + + if (!(ravl_node_rank_difference(n) == 0 && + ravl_node_rank_difference_parent(n->parent, s) > 1)) + return; + + struct ravl_node *y = n->parent; + /* if n is a left child, let z be n's right child and vice versa */ + enum ravl_slot_type t = ravl_slot_opposite(ravl_node_slot_type(n)); + struct ravl_node *z = n->slots[t]; + + if (z == NULL || ravl_node_rank_difference(z) == 2) { + ravl_rotate(ravl, n); + ravl_node_demote(y); + } else if (ravl_node_rank_difference(z) == 1) { + ravl_rotate(ravl, z); + ravl_rotate(ravl, z); + ravl_node_promote(z); + ravl_node_demote(n); + ravl_node_demote(y); + } +} + +/* + * ravl_insert -- insert data into the tree + */ +int +ravl_insert(struct ravl *ravl, const void *data) +{ + return ravl_emplace(ravl, ravl_node_insert_constructor, data); +} + +/* + * ravl_insert -- copy construct data inside of a new tree node + */ +int +ravl_emplace_copy(struct ravl *ravl, const void *data) +{ + return ravl_emplace(ravl, ravl_node_copy_constructor, data); +} + +/* + * ravl_emplace -- construct data inside of a new tree node + */ +int +ravl_emplace(struct ravl *ravl, ravl_constr constr, const void *arg) +{ + struct ravl_node *n = ravl_new_node(ravl, constr, arg); + + if (n == NULL) + return -1; + + /* walk down the tree and insert the new node into a missing slot */ + struct ravl_node **dstp = &ravl->root; + struct ravl_node *dst = NULL; + + while (*dstp != NULL) { + dst = (*dstp); + int cmp_result = ravl->compare(ravl_data(n), ravl_data(dst)); + + if (cmp_result == 0) + goto error_duplicate; + + dstp = &dst->slots[cmp_result > 0]; + } + n->parent = dst; + *dstp = n; + + ravl_balance(ravl, n); + + return 0; + +error_duplicate: + errno = EEXIST; + D_FREE(n); + return -1; +} + +/* + * ravl_node_type_most -- (internal) returns left-most or right-most node in + * the subtree + */ +static struct ravl_node * +ravl_node_type_most(struct ravl_node *n, enum ravl_slot_type t) +{ + while (n->slots[t] != NULL) + n = n->slots[t]; + + return n; +} + +/* + * ravl_node_cessor -- (internal) returns the successor or predecessor of the + * node + */ +static struct ravl_node * +ravl_node_cessor(struct ravl_node *n, enum ravl_slot_type t) +{ + /* + * If t child is present, we are looking for t-opposite-most node + * in t child subtree + */ + if (n->slots[t]) + return ravl_node_type_most(n->slots[t], ravl_slot_opposite(t)); + + /* otherwise get the first parent on the t path */ + while (n->parent != NULL && n == n->parent->slots[t]) + n = n->parent; + + return n->parent; +} + +/* + * ravl_node_successor -- (internal) returns node's successor + * + * It's the first node larger than n. + */ +static struct ravl_node * +ravl_node_successor(struct ravl_node *n) +{ + return ravl_node_cessor(n, RAVL_RIGHT); +} + +/* + * ravl_node_successor -- (internal) returns node's successor + * + * It's the first node smaller than n. + */ +static struct ravl_node * +ravl_node_predecessor(struct ravl_node *n) +{ + return ravl_node_cessor(n, RAVL_LEFT); +} + +/* + * ravl_predicate_holds -- (internal) verifies the given predicate for + * the current node in the search path + * + * If the predicate holds for the given node or a node that can be directly + * derived from it, returns 1. Otherwise returns 0. 
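+ *
+ * Editor's illustrative example, assuming a plain numeric comparator and
+ * keys {3, 7} in the tree: ravl_find() for 5 with
+ * RAVL_PREDICATE_GREATER_EQUAL rejects 3 and returns the node holding 7,
+ * while RAVL_PREDICATE_LESS_EQUAL returns the node holding 3.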
+ */ +static int +ravl_predicate_holds(int result, struct ravl_node **ret, + struct ravl_node *n, enum ravl_predicate flags) +{ + if (flags & RAVL_PREDICATE_EQUAL) { + if (result == 0) { + *ret = n; + return 1; + } + } + if (flags & RAVL_PREDICATE_GREATER) { + if (result < 0) { /* data < n->data */ + *ret = n; + return 0; + } else if (result == 0) { + *ret = ravl_node_successor(n); + return 1; + } + } + if (flags & RAVL_PREDICATE_LESS) { + if (result > 0) { /* data > n->data */ + *ret = n; + return 0; + } else if (result == 0) { + *ret = ravl_node_predecessor(n); + return 1; + } + } + + return 0; +} + +/* + * ravl_find -- searches for the node in the tree + */ +struct ravl_node * +ravl_find(struct ravl *ravl, const void *data, enum ravl_predicate flags) +{ + struct ravl_node *r = NULL; + struct ravl_node *n = ravl->root; + + while (n) { + int result = ravl->compare(data, ravl_data(n)); + + if (ravl_predicate_holds(result, &r, n, flags)) + return r; + + n = n->slots[result > 0]; + } + + return r; +} + +/* + * ravl_remove -- removes the given node from the tree + */ +void +ravl_remove(struct ravl *ravl, struct ravl_node *n) +{ + if (n->slots[RAVL_LEFT] != NULL && n->slots[RAVL_RIGHT] != NULL) { + /* if both children are present, remove the successor instead */ + struct ravl_node *s = ravl_node_successor(n); + + memcpy(n->data, s->data, ravl->data_size); + ravl_remove(ravl, s); + } else { + /* swap n with the child that may exist */ + struct ravl_node *r = n->slots[RAVL_LEFT] ? + n->slots[RAVL_LEFT] : n->slots[RAVL_RIGHT]; + + if (r != NULL) + r->parent = n->parent; + + *ravl_node_ref(ravl, n) = r; + D_FREE(n); + } +} + +/* + * ravl_data -- returns the data contained within the node + */ +void * +ravl_data(struct ravl_node *node) +{ + if (node->pointer_based) { + void *data; + + memcpy(&data, node->data, sizeof(void *)); + return data; + } else { + return (void *)node->data; + } +} + +/* + * ravl_first -- returns first (left-most) node in the tree + */ +struct ravl_node * +ravl_first(struct ravl *ravl) +{ + if (ravl->root) + return ravl_node_type_most(ravl->root, RAVL_LEFT); + + return NULL; +} + +/* + * ravl_last -- returns last (right-most) node in the tree + */ +struct ravl_node * +ravl_last(struct ravl *ravl) +{ + if (ravl->root) + return ravl_node_type_most(ravl->root, RAVL_RIGHT); + + return NULL; +} diff --git a/src/common/dav_v2/ravl.h b/src/common/dav_v2/ravl.h new file mode 100644 index 00000000000..e44f1877791 --- /dev/null +++ b/src/common/dav_v2/ravl.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2021, Intel Corporation */ + +/* + * ravl.h -- internal definitions for ravl tree + */ + +#ifndef __DAOS_COMMON_RAVL_H +#define __DAOS_COMMON_RAVL_H 1 + +#include <stddef.h> + +struct ravl; +struct ravl_node; + +enum ravl_predicate { + RAVL_PREDICATE_EQUAL = 1 << 0, + RAVL_PREDICATE_GREATER = 1 << 1, + RAVL_PREDICATE_LESS = 1 << 2, + RAVL_PREDICATE_LESS_EQUAL = + RAVL_PREDICATE_EQUAL | RAVL_PREDICATE_LESS, + RAVL_PREDICATE_GREATER_EQUAL = + RAVL_PREDICATE_EQUAL | RAVL_PREDICATE_GREATER, +}; + +typedef int ravl_compare(const void *lhs, const void *rhs); +typedef void ravl_cb(void *data, void *arg); +typedef void ravl_constr(void *data, size_t data_size, const void *arg); + +struct ravl *ravl_new(ravl_compare *compare); +struct ravl *ravl_new_sized(ravl_compare *compare, size_t data_size); +void ravl_delete(struct ravl *ravl); +void ravl_delete_cb(struct ravl *ravl, ravl_cb cb, void *arg); +void ravl_foreach(struct ravl *ravl, ravl_cb cb, void *arg); 
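+
+/*
+ * Editor's illustrative usage sketch (not part of the original sources;
+ * 'my_cmp' is a caller-supplied ravl_compare and 'obj' an arbitrary
+ * pointer stored in a pointer-based tree):
+ *
+ *	struct ravl *t = ravl_new(my_cmp);
+ *	struct ravl_node *n;
+ *
+ *	if (ravl_insert(t, obj) == 0) {
+ *		n = ravl_find(t, obj, RAVL_PREDICATE_EQUAL);
+ *		if (n != NULL)
+ *			ravl_remove(t, n);
+ *	}
+ *	ravl_delete(t);
+ */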
+int ravl_empty(struct ravl *ravl); +void ravl_clear(struct ravl *ravl); +int ravl_insert(struct ravl *ravl, const void *data); +int ravl_emplace(struct ravl *ravl, ravl_constr constr, const void *arg); +int ravl_emplace_copy(struct ravl *ravl, const void *data); + +struct ravl_node *ravl_find(struct ravl *ravl, const void *data, + enum ravl_predicate predicate_flags); +struct ravl_node *ravl_first(struct ravl *ravl); +struct ravl_node *ravl_last(struct ravl *ravl); +void *ravl_data(struct ravl_node *node); +void ravl_remove(struct ravl *ravl, struct ravl_node *node); + +#endif /* __DAOS_COMMON_RAVL_H */ diff --git a/src/common/dav_v2/ravl_interval.c b/src/common/dav_v2/ravl_interval.c new file mode 100644 index 00000000000..de37ee167a0 --- /dev/null +++ b/src/common/dav_v2/ravl_interval.c @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2020-2022, Intel Corporation */ + +/* + * ravl_interval.c -- ravl_interval implementation + */ + +#include <stdbool.h> + +#include "ravl_interval.h" +#include "sys_util.h" +#include "ravl.h" + +/* + * ravl_interval - structure representing two points + * on the number line + */ +struct ravl_interval { + struct ravl *tree; + ravl_interval_min *get_min; + ravl_interval_max *get_max; +}; + +/* + * ravl_interval_node - structure holding min, max functions and address + */ +struct ravl_interval_node { + void *addr; + ravl_interval_min *get_min; + ravl_interval_max *get_max; + bool overlap; +}; + +/* + * ravl_interval_compare -- compare intervals by its boundaries + */ +static int +ravl_interval_compare(const void *lhs, const void *rhs) +{ + const struct ravl_interval_node *left = lhs; + const struct ravl_interval_node *right = rhs; + + /* + * when searching, comparing should return the + * earliest overlapped record + */ + if (left->overlap) { + if (left->get_min(left->addr) >= right->get_max(right->addr)) + return 1; + if (left->get_min(left->addr) == right->get_min(right->addr)) + return 0; + return -1; + } + + /* when inserting, comparing shouldn't allow overlapping intervals */ + if (left->get_min(left->addr) >= right->get_max(right->addr)) + return 1; + if (left->get_max(left->addr) <= right->get_min(right->addr)) + return -1; + return 0; +} + +/* + * ravl_interval_delete - finalize the ravl interval module + */ +void +ravl_interval_delete(struct ravl_interval *ri) +{ + ravl_delete(ri->tree); + ri->tree = NULL; + D_FREE(ri); +} + +/* + * ravl_interval_delete_cb - finalize the ravl interval module with entries + * and execute provided callback function for each entry. 
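+ *
+ * Editor's note -- a typical lifecycle of the module, as an illustrative
+ * sketch ('range_start'/'range_end' are made-up ravl_interval_min/_max
+ * callbacks and 'free_entry_cb' a made-up ravl_cb invoked for each
+ * remaining entry):
+ *
+ *	struct ravl_interval *ri = ravl_interval_new(range_start, range_end);
+ *
+ *	ravl_interval_insert(ri, obj);
+ *	...
+ *	ravl_interval_delete_cb(ri, free_entry_cb, NULL);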
+ */ +void +ravl_interval_delete_cb(struct ravl_interval *ri, ravl_cb cb, void *arg) +{ + ravl_delete_cb(ri->tree, cb, arg); + ri->tree = NULL; + D_FREE(ri); +} + +/* + * ravl_interval_new -- initialize the ravl interval module + */ +struct ravl_interval * +ravl_interval_new(ravl_interval_min *get_min, ravl_interval_max *get_max) +{ + struct ravl_interval *interval; + + D_ALLOC_PTR_NZ(interval); + if (!interval) + return NULL; + + interval->tree = ravl_new_sized(ravl_interval_compare, + sizeof(struct ravl_interval_node)); + if (!(interval->tree)) + goto free_alloc; + + interval->get_min = get_min; + interval->get_max = get_max; + + return interval; + +free_alloc: + D_FREE(interval); + return NULL; +} + +/* + * ravl_interval_insert -- insert interval entry into the tree + */ +int +ravl_interval_insert(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node rin; + + rin.addr = addr; + rin.get_min = ri->get_min; + rin.get_max = ri->get_max; + rin.overlap = false; + + int ret = ravl_emplace_copy(ri->tree, &rin); + + if (ret && errno) + return -errno; + + return ret; +} + +/* + * ravl_interval_remove -- remove interval entry from the tree + */ +int +ravl_interval_remove(struct ravl_interval *ri, struct ravl_interval_node *rin) +{ + struct ravl_node *node = ravl_find(ri->tree, rin, + RAVL_PREDICATE_EQUAL); + if (!node) + return -ENOENT; + + ravl_remove(ri->tree, node); + + return 0; +} + +/* + * ravl_interval_find_prior -- find overlapping interval starting prior to + * the current one + */ +static struct ravl_interval_node * +ravl_interval_find_prior(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + struct ravl_interval_node *cur; + + node = ravl_find(tree, rin, RAVL_PREDICATE_LESS); + if (!node) + return NULL; + + cur = ravl_data(node); + /* + * If the end of the found interval is below the searched boundary, then + * those intervals are not overlapping. 
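+	 * (e.g. an existing interval covering [0, 10) does not overlap a
+	 * searched [10, 20), because 10 <= 10)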
+ */ + if (cur->get_max(cur->addr) <= rin->get_min(rin->addr)) + return NULL; + + return cur; +} + +/* + * ravl_interval_find_eq -- find overlapping interval starting neither prior or + * lather than the current one + */ +static struct ravl_interval_node * +ravl_interval_find_eq(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + + node = ravl_find(tree, rin, RAVL_PREDICATE_EQUAL); + if (!node) + return NULL; + + return ravl_data(node); +} + +/* + * ravl_interval_find_later -- find overlapping interval starting later than + * the current one + */ +static struct ravl_interval_node * +ravl_interval_find_later(struct ravl *tree, struct ravl_interval_node *rin) +{ + struct ravl_node *node; + struct ravl_interval_node *cur; + + node = ravl_find(tree, rin, RAVL_PREDICATE_GREATER); + if (!node) + return NULL; + + cur = ravl_data(node); + + /* + * If the beginning of the found interval is above the end of + * the searched range, then those interval are not overlapping + */ + if (cur->get_min(cur->addr) >= rin->get_max(rin->addr)) + return NULL; + + return cur; +} + +/* + * ravl_interval_find_equal -- find the interval with exact (min, max) range + */ +struct ravl_interval_node * +ravl_interval_find_equal(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *node; + + node = ravl_find(ri->tree, &range, RAVL_PREDICATE_EQUAL); + if (!node) + return NULL; + + return ravl_data(node); +} + +/* + * ravl_interval_find -- find the earliest interval within (min, max) range + */ +struct ravl_interval_node * +ravl_interval_find(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_interval_node *cur; + + cur = ravl_interval_find_prior(ri->tree, &range); + if (!cur) + cur = ravl_interval_find_eq(ri->tree, &range); + if (!cur) + cur = ravl_interval_find_later(ri->tree, &range); + + return cur; +} + +/* + * ravl_interval_data -- returns the data contained within an interval node + */ +void * +ravl_interval_data(struct ravl_interval_node *rin) +{ + return (void *)rin->addr; +} + +/* + * ravl_interval_find_first -- returns first interval in the tree + */ +struct ravl_interval_node * +ravl_interval_find_first(struct ravl_interval *ri) +{ + struct ravl_node *first; + + first = ravl_first(ri->tree); + if (first) + return ravl_data(first); + + return NULL; +} + +/* + * ravl_interval_find_last -- returns last interval in the tree + */ +struct ravl_interval_node * +ravl_interval_find_last(struct ravl_interval *ri) +{ + struct ravl_node *last; + + last = ravl_last(ri->tree); + if (last) + return ravl_data(last); + + return NULL; +} + +/* + * ravl_interval_find_next -- returns interval succeeding the one provided + */ +struct ravl_interval_node * +ravl_interval_find_next(struct ravl_interval *ri, void *addr) +{ + struct ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *next = NULL; + + next = ravl_find(ri->tree, &range, RAVL_PREDICATE_GREATER); + if (next) + return ravl_data(next); + + return NULL; +} + +/* + * ravl_interval_find_prev -- returns interval preceding the one provided + */ +struct ravl_interval_node * +ravl_interval_find_prev(struct ravl_interval *ri, void *addr) +{ + struct 
ravl_interval_node range; + + range.addr = addr; + range.get_min = ri->get_min; + range.get_max = ri->get_max; + range.overlap = true; + + struct ravl_node *prev = NULL; + + prev = ravl_find(ri->tree, &range, RAVL_PREDICATE_LESS); + if (prev) + return ravl_data(prev); + + return NULL; +} diff --git a/src/common/dav_v2/ravl_interval.h b/src/common/dav_v2/ravl_interval.h new file mode 100644 index 00000000000..0f1370da713 --- /dev/null +++ b/src/common/dav_v2/ravl_interval.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2020-2021, Intel Corporation */ + +/* + * ravl_interval.h -- internal definitions for ravl_interval + */ + +#ifndef RAVL_INTERVAL_H +#define RAVL_INTERVAL_H + +#include "ravl.h" + +struct ravl_interval; +struct ravl_interval_node; + +typedef size_t ravl_interval_min(void *addr); +typedef size_t ravl_interval_max(void *addr); + +struct ravl_interval *ravl_interval_new(ravl_interval_min *min, + ravl_interval_min *max); +void ravl_interval_delete(struct ravl_interval *ri); +void ravl_interval_delete_cb(struct ravl_interval *ri, ravl_cb cb, void *arg); +int ravl_interval_insert(struct ravl_interval *ri, void *addr); +int ravl_interval_remove(struct ravl_interval *ri, + struct ravl_interval_node *rin); +struct ravl_interval_node *ravl_interval_find_equal(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find_first(struct ravl_interval *ri); +struct ravl_interval_node *ravl_interval_find_last(struct ravl_interval *ri); +struct ravl_interval_node *ravl_interval_find_next(struct ravl_interval *ri, + void *addr); +struct ravl_interval_node *ravl_interval_find_prev(struct ravl_interval *ri, + void *addr); +void *ravl_interval_data(struct ravl_interval_node *rin); +#endif diff --git a/src/common/dav_v2/recycler.c b/src/common/dav_v2/recycler.c new file mode 100644 index 00000000000..de948a9f9c5 --- /dev/null +++ b/src/common/dav_v2/recycler.c @@ -0,0 +1,323 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * recycler.c -- implementation of run recycler + */ + +#include "heap.h" +#include "recycler.h" +#include "vec.h" +#include "out.h" +#include "util.h" +#include "sys_util.h" +#include "ravl.h" +#include "valgrind_internal.h" + +#define THRESHOLD_MUL 4 + +/* + * recycler_element_cmp -- compares two recycler elements + */ +static int +recycler_element_cmp(const void *lhs, const void *rhs) +{ + const struct recycler_element *l = lhs; + const struct recycler_element *r = rhs; + + int64_t diff = (int64_t)l->max_free_block - (int64_t)r->max_free_block; + + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->free_space - (int64_t)r->free_space; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->zone_id - (int64_t)r->zone_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + diff = (int64_t)l->chunk_id - (int64_t)r->chunk_id; + if (diff != 0) + return diff > 0 ? 1 : -1; + + return 0; +} + +struct recycler { + struct ravl *runs; + struct palloc_heap *heap; + struct zoneset *zset; + + /* + * How many unaccounted units there *might* be inside of the memory + * blocks stored in the recycler. + * The value is not meant to be accurate, but rather a rough measure on + * how often should the memory block scores be recalculated. + * + * Per-chunk unaccounted units are shared for all zones, which might + * lead to some unnecessary recalculations. 
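+	 *
+	 * (Editor's note: with THRESHOLD_MUL == 4, recycler_recalc() only
+	 * rescores the stored runs once roughly 4 * nallocs units are
+	 * unaccounted for, unless the recalculation is forced.)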
+ */ + size_t unaccounted_units[MAX_CHUNK]; + size_t unaccounted_total; + size_t nallocs; + + VEC(, struct recycler_element) recalc; + + pthread_mutex_t lock; +}; + +/* + * recycler_new -- creates new recycler instance + */ +struct recycler * +recycler_new(struct palloc_heap *heap, size_t nallocs, struct zoneset *zset) +{ + struct recycler *r; + + D_ALLOC_PTR_NZ(r); + if (r == NULL) + goto error_alloc_recycler; + + r->runs = ravl_new_sized(recycler_element_cmp, + sizeof(struct recycler_element)); + if (r->runs == NULL) + goto error_alloc_tree; + + r->heap = heap; + r->nallocs = nallocs; + r->zset = zset; + r->unaccounted_total = 0; + memset(&r->unaccounted_units, 0, sizeof(r->unaccounted_units)); + + VEC_INIT(&r->recalc); + + util_mutex_init(&r->lock); + + return r; + +error_alloc_tree: + D_FREE(r); +error_alloc_recycler: + return NULL; +} + +/* + * recycler_delete -- deletes recycler instance + */ +void +recycler_delete(struct recycler *r) +{ + VEC_DELETE(&r->recalc); + + util_mutex_destroy(&r->lock); + ravl_delete(r->runs); + D_FREE(r); +} + +/* + * recycler_element_new -- calculates how many free bytes does a run have and + * what's the largest request that the run can handle, returns that as + * recycler element struct + */ +struct recycler_element +recycler_element_new(struct palloc_heap *heap, const struct memory_block *m) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(heap); + + /* + * Counting of the clear bits can race with a concurrent deallocation + * that operates on the same run. This race is benign and has absolutely + * no effect on the correctness of this algorithm. Ideally, we would + * avoid grabbing the lock, but helgrind gets very confused if we + * try to disable reporting for this function. + */ + pthread_mutex_t *lock = m->m_ops->get_lock(m); + + util_mutex_lock(lock); + + struct recycler_element e = { + .free_space = 0, + .max_free_block = 0, + .chunk_id = m->chunk_id, + .zone_id = m->zone_id, + }; + m->m_ops->calc_free(m, &e.free_space, &e.max_free_block); + + util_mutex_unlock(lock); + + return e; +} + +/* + * recycler_put -- inserts new run into the recycler + */ +int +recycler_put(struct recycler *r, struct recycler_element element) +{ + int ret = 0; + + util_mutex_lock(&r->lock); + + ret = ravl_emplace_copy(r->runs, &element); + + util_mutex_unlock(&r->lock); + + return ret; +} + +/* + * recycler_get -- retrieves a chunk from the recycler + */ +int +recycler_get(struct recycler *r, struct memory_block *m) +{ + int ret = 0; + + util_mutex_lock(&r->lock); + + struct recycler_element e = { .max_free_block = m->size_idx, 0, 0, 0}; + struct ravl_node *n = ravl_find(r->runs, &e, + RAVL_PREDICATE_GREATER_EQUAL); + if (n == NULL) { + ret = ENOMEM; + goto out; + } + + struct recycler_element *ne = ravl_data(n); + + m->chunk_id = ne->chunk_id; + m->zone_id = ne->zone_id; + + ravl_remove(r->runs, n); + + struct chunk_header *hdr = heap_get_chunk_hdr(r->heap, m); + + m->size_idx = hdr->size_idx; + + memblock_rebuild_state(r->heap, m); + +out: + util_mutex_unlock(&r->lock); + + return ret; +} + +/* + * recycler_recalc -- recalculates the scores of runs in the recycler to match + * the updated persistent state + */ +struct empty_runs +recycler_recalc(struct recycler *r, int force) +{ + struct empty_runs runs; + + VEC_INIT(&runs); + + uint64_t units = r->unaccounted_total; + + uint64_t recalc_threshold = THRESHOLD_MUL * r->nallocs; + + if (!force && units < recalc_threshold) + return runs; + + if (util_mutex_trylock(&r->lock) != 0) + return runs; + + /* If the 
search is forced, recalculate everything */ + uint64_t search_limit = force ? UINT64_MAX : units; + + uint64_t found_units = 0; + struct memory_block nm = MEMORY_BLOCK_NONE; + struct ravl_node *n; + struct recycler_element next = {0, 0, 0, 0}; + enum ravl_predicate p = RAVL_PREDICATE_GREATER_EQUAL; + + do { + n = ravl_find(r->runs, &next, p); + if (n == NULL) + break; + + p = RAVL_PREDICATE_GREATER; + + struct recycler_element *ne = ravl_data(n); + + next = *ne; + + uint64_t chunk_units = r->unaccounted_units[ne->chunk_id]; + + if (!force && chunk_units == 0) + continue; + + uint32_t existing_free_space = ne->free_space; + + nm.chunk_id = ne->chunk_id; + nm.zone_id = ne->zone_id; + memblock_rebuild_state(r->heap, &nm); + + struct recycler_element e = recycler_element_new(r->heap, &nm); + + ASSERT(e.free_space >= existing_free_space); + uint64_t free_space_diff = e.free_space - existing_free_space; + + found_units += free_space_diff; + + if (free_space_diff == 0) + continue; + + /* + * Decrease the per chunk_id counter by the number of nallocs + * found, increased by the blocks potentially freed in the + * active memory block. Cap the sub value to prevent overflow. + */ + util_fetch_and_sub64(&r->unaccounted_units[nm.chunk_id], + MIN(chunk_units, free_space_diff + r->nallocs)); + + ravl_remove(r->runs, n); + + if (e.free_space == r->nallocs) { + memblock_rebuild_state(r->heap, &nm); + if (VEC_PUSH_BACK(&runs, nm) != 0) + ASSERT(0); /* XXX: fix after refactoring */ + } else { + VEC_PUSH_BACK(&r->recalc, e); + } + } while (found_units < search_limit); + + struct recycler_element *e; + + VEC_FOREACH_BY_PTR(e, &r->recalc) { + ravl_emplace_copy(r->runs, e); + } + + VEC_CLEAR(&r->recalc); + + util_mutex_unlock(&r->lock); + + util_fetch_and_sub64(&r->unaccounted_total, units); + + return runs; +} + +/* + * recycler_inc_unaccounted -- increases the number of unaccounted units in the + * recycler + */ +void +recycler_inc_unaccounted(struct recycler *r, const struct memory_block *m) +{ + util_fetch_and_add64(&r->unaccounted_total, m->size_idx); + util_fetch_and_add64(&r->unaccounted_units[m->chunk_id], + m->size_idx); +} + +/* + * Return the zoneset associated with the recycler. + */ +struct zoneset * +recycler_get_zoneset(struct recycler *r) +{ + return r->zset; +} diff --git a/src/common/dav_v2/recycler.h b/src/common/dav_v2/recycler.h new file mode 100644 index 00000000000..7904289937d --- /dev/null +++ b/src/common/dav_v2/recycler.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2016-2023, Intel Corporation */ + +/* + * recycler.h -- internal definitions of run recycler + * + * This is a container that stores runs that are currently not used by any of + * the buckets. 
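As an illustration of the put/get cycle this container supports, a hedged sketch; the heap handle `heap`, run descriptor `m`, and requested unit count `size_idx` are assumptions taken from the surrounding heap code:

	/* score the run and store it for later reuse */
	struct recycler_element e = recycler_element_new(heap, &m);

	if (recycler_put(r, e) != 0)
		D_ERROR("failed to insert run into the recycler\n");

	/* later: fetch any run that can satisfy at least size_idx free units */
	struct memory_block req = MEMORY_BLOCK_NONE;

	req.size_idx = size_idx;
	if (recycler_get(r, &req) == 0) {
		/* req.zone_id / req.chunk_id now identify a usable run */
	}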
+ */ + +#ifndef __DAOS_COMMON_RECYCLER_H +#define __DAOS_COMMON_RECYCLER_H 1 + +#include "memblock.h" +#include "vec.h" + +struct recycler; +VEC(empty_runs, struct memory_block); + +struct recycler_element { + uint32_t max_free_block; + uint32_t free_space; + + uint32_t chunk_id; + uint32_t zone_id; +}; + +struct recycler * +recycler_new(struct palloc_heap *layout, size_t nallocs, struct zoneset *zset); +void recycler_delete(struct recycler *r); +struct recycler_element recycler_element_new(struct palloc_heap *heap, + const struct memory_block *m); + +int recycler_put(struct recycler *r, struct recycler_element element); + +int recycler_get(struct recycler *r, struct memory_block *m); + +struct empty_runs recycler_recalc(struct recycler *r, int force); + +void recycler_inc_unaccounted(struct recycler *r, + const struct memory_block *m); + +struct zoneset * +recycler_get_zoneset(struct recycler *r); + +#endif /* __DAOS_COMMON_RECYCLER_H */ diff --git a/src/common/dav_v2/stats.c b/src/common/dav_v2/stats.c new file mode 100644 index 00000000000..d7162a462f0 --- /dev/null +++ b/src/common/dav_v2/stats.c @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2021, Intel Corporation */ + +/* + * stats.c -- implementation of statistics + */ + +#include <errno.h> + +#include "dav_internal.h" +#include "obj.h" +#include "stats.h" + +/* + * stats_new -- allocates and initializes statistics instance + */ +struct stats * +stats_new(dav_obj_t *pop) +{ + struct stats *s; + + D_ALLOC_PTR_NZ(s); + if (s == NULL) { + D_CRIT("Malloc\n"); + return NULL; + } + + s->persistent = &pop->do_phdr->dp_stats_persistent; + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(s->persistent, sizeof(*s->persistent)); + D_ALLOC_PTR(s->transient); + if (s->transient == NULL) + goto error_transient_alloc; + + return s; + +error_transient_alloc: + D_FREE(s); + return NULL; +} + +/* + * stats_delete -- deletes statistics instance + */ +void +stats_delete(dav_obj_t *pop, struct stats *s) +{ + D_FREE(s->transient); + D_FREE(s); +} + +/* + * stats_persist -- save the persistent statistics to wal + */ +void +stats_persist(dav_obj_t *pop, struct stats *s) +{ + if (s->transient->heap_prev_pval != + s->persistent->heap_curr_allocated) { + mo_wal_persist(&pop->p_ops, s->persistent, + sizeof(struct stats_persistent)); + s->transient->heap_prev_pval = + s->persistent->heap_curr_allocated; + } +} + +DAV_FUNC_EXPORT int +dav_get_heap_stats_v2(dav_obj_t *pop, struct dav_heap_stats *st) +{ + if ((pop == NULL) || (st == NULL)) { + errno = EINVAL; + return -1; + } + + st->curr_allocated = pop->do_stats->persistent->heap_curr_allocated; + st->run_allocated = pop->do_stats->transient->heap_run_allocated; + st->run_active = pop->do_stats->transient->heap_run_active; + return 0; +} diff --git a/src/common/dav_v2/stats.h b/src/common/dav_v2/stats.h new file mode 100644 index 00000000000..ab3a0e33ee0 --- /dev/null +++ b/src/common/dav_v2/stats.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2021, Intel Corporation */ + +/* + * stats.h -- definitions of statistics + */ + +#ifndef __DAOS_COMMON_STATS_H +#define __DAOS_COMMON_STATS_H 1 + +struct stats_transient { + uint64_t heap_run_allocated; + uint64_t heap_run_active; + uint64_t heap_prev_pval; /* previous persisted value of curr allocated */ +}; + +struct stats_persistent { + uint64_t heap_curr_allocated; +}; + +struct stats { + struct stats_transient *transient; + struct stats_persistent *persistent; +}; + +#define STATS_INC(stats, type, name, value) \ 
+	STATS_INC_##type(stats, name, value)
+
+#define STATS_INC_transient(stats, name, value)\
+	util_fetch_and_add64((&(stats)->transient->name), (value))
+
+#define STATS_INC_persistent(stats, name, value)\
+	util_fetch_and_add64((&(stats)->persistent->name), (value))
+
+#define STATS_SUB(stats, type, name, value)\
+	STATS_SUB_##type(stats, name, value)
+
+#define STATS_SUB_transient(stats, name, value)\
+	util_fetch_and_sub64((&(stats)->transient->name), (value))
+
+#define STATS_SUB_persistent(stats, name, value)\
+	util_fetch_and_sub64((&(stats)->persistent->name), (value))
+
+#define STATS_SET(stats, type, name, value)\
+	STATS_SET_##type(stats, name, value)
+
+#define STATS_SET_transient(stats, name, value)\
+	util_atomic_store_explicit64((&(stats)->transient->name),\
+		(value), memory_order_release)\
+
+#define STATS_SET_persistent(stats, name, value)\
+	util_atomic_store_explicit64((&(stats)->persistent->name),\
+		(value), memory_order_release)\
+
+struct dav_obj;
+
+struct stats *stats_new(struct dav_obj *pop);
+void stats_delete(struct dav_obj *pop, struct stats *stats);
+void stats_persist(struct dav_obj *pop, struct stats *s);
+
+#endif /* __DAOS_COMMON_STATS_H */
diff --git a/src/common/dav_v2/sys_util.h b/src/common/dav_v2/sys_util.h
new file mode 100644
index 00000000000..79d1a4f12d7
--- /dev/null
+++ b/src/common/dav_v2/sys_util.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2020, Intel Corporation */
+
+/*
+ * sys_util.h -- internal utility wrappers around system functions
+ */
+
+#ifndef __DAOS_COMMON_SYS_UTIL_H
+#define __DAOS_COMMON_SYS_UTIL_H 1
+
+#include <errno.h>
+
+#include <gurt/common.h>
+#include "out.h"
+
+/*
+ * util_mutex_init -- os_mutex_init variant that never fails from
+ * caller perspective. If os_mutex_init failed, this function aborts
+ * the program.
+ */
+static inline void
+util_mutex_init(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_INIT(m, NULL);
+
+	D_ASSERTF(tmp == 0, "!os_mutex_init");
+}
+
+/*
+ * util_mutex_destroy -- os_mutex_destroy variant that never fails from
+ * caller perspective. If os_mutex_destroy failed, this function aborts
+ * the program.
+ */
+static inline void
+util_mutex_destroy(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_DESTROY(m);
+
+	D_ASSERTF(tmp == 0, "!os_mutex_destroy");
+}
+
+/*
+ * util_mutex_lock -- os_mutex_lock variant that never fails from
+ * caller perspective. If os_mutex_lock failed, this function aborts
+ * the program.
+ */
+static inline void
+util_mutex_lock(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_LOCK(m);
+
+	D_ASSERTF(tmp == 0, "!os_mutex_lock");
+}
+
+/*
+ * util_mutex_trylock -- os_mutex_trylock variant that never fails from
+ * caller perspective (other than EBUSY). If util_mutex_trylock failed, this
+ * function aborts the program.
+ * Returns 0 if locked successfully, otherwise returns EBUSY.
+ */
+static inline int
+util_mutex_trylock(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_TRYLOCK(m);
+
+	D_ASSERTF((!tmp || (tmp == -DER_BUSY)), "!os_mutex_trylock");
+	return tmp ? EBUSY : 0;
+}
+
+/*
+ * util_mutex_unlock -- os_mutex_unlock variant that never fails from
+ * caller perspective. If os_mutex_unlock failed, this function aborts
+ * the program.
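A short, purely illustrative use of these wrappers: all of them abort the process on unexpected errors, so callers check a return code only for util_mutex_trylock().

	pthread_mutex_t lock;

	util_mutex_init(&lock);
	if (util_mutex_trylock(&lock) == 0) {
		/* ... critical section ... */
		util_mutex_unlock(&lock);
	} else {
		/* EBUSY: another thread holds the lock */
	}
	util_mutex_destroy(&lock);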
+ */ +static inline void +util_mutex_unlock(pthread_mutex_t *m) +{ + int tmp = D_MUTEX_UNLOCK(m); + + D_ASSERTF(tmp == 0, "!os_mutex_unlock"); +} + +#endif /* __DAOS_COMMON_SYS_UTIL_H */ diff --git a/src/common/dav_v2/tx.c b/src/common/dav_v2/tx.c new file mode 100644 index 00000000000..6d08757ea70 --- /dev/null +++ b/src/common/dav_v2/tx.c @@ -0,0 +1,1855 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2023, Intel Corporation */ + +/* + * tx.c -- transactions implementation + */ + +#include <inttypes.h> +#include <wchar.h> +#include <errno.h> + +#include "queue.h" +#include "ravl.h" +#include "obj.h" +#include "out.h" +#include "tx.h" +#include "valgrind_internal.h" +#include "memops.h" +#include "dav_internal.h" + +struct tx_data { + DAV_SLIST_ENTRY(tx_data) tx_entry; + jmp_buf env; + enum dav_tx_failure_behavior failure_behavior; +}; + +struct tx { + dav_obj_t *pop; + enum dav_tx_stage stage; + int last_errnum; + + DAV_SLIST_HEAD(txd, tx_data) tx_entries; + + struct ravl *ranges; + + VEC(, struct dav_action) actions; + + dav_tx_callback stage_callback; + void *stage_callback_arg; + + int first_snapshot; +}; + +/* + * get_tx -- returns current transaction + * + * This function should be used only in high-level functions. + */ +static struct tx * +get_tx() +{ + static __thread struct tx tx; + + return &tx; +} + +struct tx_alloc_args { + uint64_t flags; + const void *copy_ptr; + size_t copy_size; +}; + +#define ALLOC_ARGS(flags)\ +(struct tx_alloc_args){flags, NULL, 0} + +struct tx_range_def { + uint64_t offset; + uint64_t size; + uint64_t flags; +}; + +/* + * tx_range_def_cmp -- compares two snapshot ranges + */ +static int +tx_range_def_cmp(const void *lhs, const void *rhs) +{ + const struct tx_range_def *l = lhs; + const struct tx_range_def *r = rhs; + + if (l->offset > r->offset) + return 1; + else if (l->offset < r->offset) + return -1; + + return 0; +} + +static void +obj_tx_abort(int errnum, int user); + +/* + * obj_tx_fail_err -- (internal) dav_tx_abort variant that returns + * error code + */ +static inline int +obj_tx_fail_err(int errnum, uint64_t flags) +{ + if ((flags & DAV_FLAG_TX_NO_ABORT) == 0) + obj_tx_abort(errnum, 0); + errno = errnum; + return errnum; +} + +/* + * obj_tx_fail_null -- (internal) dav_tx_abort variant that returns + * null PMEMoid + */ +static inline uint64_t +obj_tx_fail_null(int errnum, uint64_t flags) +{ + if ((flags & DAV_FLAG_TX_NO_ABORT) == 0) + obj_tx_abort(errnum, 0); + errno = errnum; + return 0; +} + +/* ASSERT_IN_TX -- checks whether there's open transaction */ +#define ASSERT_IN_TX(tx) do {\ + if ((tx)->stage == DAV_TX_STAGE_NONE)\ + FATAL("%s called outside of transaction", __func__);\ +} while (0) + +/* ASSERT_TX_STAGE_WORK -- checks whether current transaction stage is WORK */ +#define ASSERT_TX_STAGE_WORK(tx) do {\ + if ((tx)->stage != DAV_TX_STAGE_WORK)\ + FATAL("%s called in invalid stage %d", __func__, (tx)->stage);\ +} while (0) + +/* + * tx_action_reserve -- (internal) reserve space for the given number of actions + */ +static int +tx_action_reserve(struct tx *tx, size_t n) +{ + size_t entries_size = (VEC_SIZE(&tx->actions) + n) * + sizeof(struct ulog_entry_val); + + if (operation_reserve(tx->pop->external, entries_size) != 0) + return -1; + + return 0; +} + +/* + * tx_action_add -- (internal) reserve space and add a new tx action + */ +static struct dav_action * +tx_action_add(struct tx *tx) +{ + if (tx_action_reserve(tx, 1) != 0) + return NULL; + + VEC_INC_BACK(&tx->actions); + + return &VEC_BACK(&tx->actions); +} + 
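Before the individual entry points, a hedged sketch of how a caller is expected to drive them; the pool handle `pop` and the heap offset `hoff` are assumed to be valid, and the functions used here are defined further down in this file:

	int rc;

	rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE);
	if (rc != 0)
		return rc;

	/* snapshot an existing range before modifying it in place */
	dav_tx_add_range_v2(hoff, 64);
	memset(dav_tx_off2ptr_v2(hoff), 0, 64);

	/* transactional allocation; 0 is returned on failure, which by
	 * default also aborts the transaction */
	uint64_t off = dav_tx_alloc_v2(128, 0 /* type_num */, DAV_FLAG_ZERO);

	/* off can now be linked into persistent structures */
	if (dav_tx_stage_v2() == DAV_TX_STAGE_WORK)
		dav_tx_commit_v2();
	rc = dav_tx_end_v2(NULL);	/* reports the first error, if any */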
+/* + * tx_action_remove -- (internal) remove last tx action + */ +static void +tx_action_remove(struct tx *tx) +{ + VEC_POP_BACK(&tx->actions); +} + +/* + * constructor_tx_alloc -- (internal) constructor for normal alloc + */ +static int +constructor_tx_alloc(void *ctx, void *ptr, size_t usable_size, void *arg) +{ + ASSERTne(ptr, NULL); + ASSERTne(arg, NULL); + + struct tx_alloc_args *args = arg; + + /* do not report changes to the new object */ + VALGRIND_ADD_TO_TX(ptr, usable_size); + + if (args->flags & DAV_FLAG_ZERO) + memset(ptr, 0, usable_size); + + if (args->copy_ptr && args->copy_size != 0) { + FATAL("dav xalloc does not support copy_ptr\n"); + memcpy(ptr, args->copy_ptr, args->copy_size); + } + + return 0; +} + +/* + * tx_restore_range -- (internal) restore a single range from undo log + */ +static void +tx_restore_range(dav_obj_t *pop, struct ulog_entry_buf *range) +{ + void *begin, *end; + size_t size = range->size; + uint64_t range_offset = ulog_entry_offset(&range->base); + + begin = OBJ_OFF_TO_PTR(pop, range_offset); + end = (char *)begin + size; + ASSERT((char *)end >= (char *)begin); + + memcpy(begin, range->data, size); +} + +/* + * tx_undo_entry_apply -- applies modifications of a single ulog entry + */ +static int +tx_undo_entry_apply(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + struct ulog_entry_buf *eb; + + switch (ulog_entry_type(e)) { + case ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)e; + + tx_restore_range(p_ops->base, eb); + break; +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + case ULOG_OPERATION_OR: +#else + case ULOG_OPERATION_CLR_BITS: + case ULOG_OPERATION_SET_BITS: +#endif + case ULOG_OPERATION_SET: + case ULOG_OPERATION_BUF_SET: + default: + ASSERT(0); + } + + return 0; +} + +/* + * tx_abort_set -- (internal) abort all set operations + */ +static void +tx_abort_set(dav_obj_t *pop) +{ + ulog_foreach_entry((struct ulog *)&pop->clogs.undo, + tx_undo_entry_apply, NULL, &pop->p_ops); + operation_finish(pop->undo, ULOG_INC_FIRST_GEN_NUM); +} + +/* + * tx_flush_range -- (internal) flush one range + */ +static void +tx_flush_range(void *data, void *ctx) +{ + dav_obj_t *pop = ctx; + struct tx_range_def *range = data; + + if (!(range->flags & DAV_FLAG_NO_FLUSH)) { + mo_wal_flush(&pop->p_ops, OBJ_OFF_TO_PTR(pop, range->offset), + range->size, range->flags & DAV_XADD_WAL_CPTR); + } + VALGRIND_REMOVE_FROM_TX(OBJ_OFF_TO_PTR(pop, range->offset), + range->size); +} + +/* + * tx_clean_range -- (internal) clean one range + */ +static void +tx_clean_range(void *data, void *ctx) +{ + dav_obj_t *pop = ctx; + struct tx_range_def *range = data; + + VALGRIND_REMOVE_FROM_TX(OBJ_OFF_TO_PTR(pop, range->offset), + range->size); + VALGRIND_SET_CLEAN(OBJ_OFF_TO_PTR(pop, range->offset), range->size); +} + +/* + * tx_pre_commit -- (internal) do pre-commit operations + */ +static void +tx_pre_commit(struct tx *tx) +{ + /* Flush all regions and destroy the whole tree. 
*/ + ravl_delete_cb(tx->ranges, tx_flush_range, tx->pop); + tx->ranges = NULL; +} + +/* + * tx_abort -- (internal) abort all allocated objects + */ +static void +tx_abort(dav_obj_t *pop) +{ + struct tx *tx = get_tx(); + + tx_abort_set(pop); + + ravl_delete_cb(tx->ranges, tx_clean_range, pop); + palloc_cancel(pop->do_heap, + VEC_ARR(&tx->actions), VEC_SIZE(&tx->actions)); + tx->ranges = NULL; +} + +/* + * tx_ranges_insert_def -- (internal) allocates and inserts a new range + * definition into the ranges tree + */ +static int +tx_ranges_insert_def(dav_obj_t *pop, struct tx *tx, + const struct tx_range_def *rdef) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(pop); + + DAV_DBG("(%lu,%lu) size=%zu", + rdef->offset / 4096, rdef->offset % 4096, rdef->size); + + int ret = ravl_emplace_copy(tx->ranges, rdef); + + if (ret && errno == EEXIST) + FATAL("invalid state of ranges tree"); + return ret; +} + +/* + * tx_alloc_common -- (internal) common function for alloc and zalloc + */ +static uint64_t +tx_alloc_common(struct tx *tx, size_t size, type_num_t type_num, + palloc_constr constructor, struct tx_alloc_args args) +{ + const struct tx_range_def *r; + uint64_t off; + + if (size > DAV_MAX_ALLOC_SIZE) { + ERR("requested size too large"); + return obj_tx_fail_null(ENOMEM, args.flags); + } + + dav_obj_t *pop = tx->pop; + + struct dav_action *action = tx_action_add(tx); + + if (action == NULL) + return obj_tx_fail_null(ENOMEM, args.flags); + + if (palloc_reserve(pop->do_heap, size, constructor, &args, type_num, 0, + CLASS_ID_FROM_FLAG(args.flags), EZONE_ID_FROM_FLAG(args.flags), + action) != 0) + goto err_oom; + + palloc_get_prange(action, &off, &size, 1); + r = &(struct tx_range_def){off, size, args.flags}; + if (tx_ranges_insert_def(pop, tx, r) != 0) + goto err_oom; + + return action->heap.offset; + +err_oom: + tx_action_remove(tx); + D_CRIT("out of memory\n"); + return obj_tx_fail_null(ENOMEM, args.flags); +} + +/* + * tx_create_wal_entry -- convert to WAL a single ulog UNDO entry + */ +int +tx_create_wal_entry(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + int rc = 0; + uint64_t offset = ulog_entry_offset(e); + daos_size_t dst_size = sizeof(uint64_t); + struct ulog_entry_val *ev; + struct ulog_entry_buf *eb; + uint64_t v; + uint64_t *dst; + + D_ASSERT(p_ops->base != NULL); + dst = (uint64_t *)((uintptr_t)((dav_obj_t *)p_ops->base)->do_base + offset); + + switch (ulog_entry_type(e)) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_and(p_ops->base, dst, v); + break; + case ULOG_OPERATION_OR: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_or(p_ops->base, dst, v); + break; +#else + case ULOG_OPERATION_CLR_BITS: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_clr_bits(p_ops->base, dst, ULOG_ENTRY_VAL_TO_POS(v), + ULOG_ENTRY_VAL_TO_BITS(v)); + break; + case ULOG_OPERATION_SET_BITS: + ev = (struct ulog_entry_val *)e; + v = ev->value; + + rc = dav_wal_tx_set_bits(p_ops->base, dst, ULOG_ENTRY_VAL_TO_POS(v), + ULOG_ENTRY_VAL_TO_BITS(v)); + break; +#endif + case ULOG_OPERATION_SET: + ev = (struct ulog_entry_val *)e; + + rc = dav_wal_tx_snap(p_ops->base, dst, dst_size, (void *)&ev->value, 0); + break; + case ULOG_OPERATION_BUF_SET: + eb = (struct ulog_entry_buf *)e; + + dst_size = eb->size; + rc = dav_wal_tx_set(p_ops->base, dst, 0, dst_size); + break; + case 
ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)e; + + dst_size = eb->size; + /* The only undo entry from dav that needs to be + * transformed into redo + */ + rc = dav_wal_tx_snap(p_ops->base, dst, dst_size, dst, 0); + break; + default: + ASSERT(0); + } + + return rc; +} + +int +lw_tx_begin(dav_obj_t *pop) +{ + struct umem_wal_tx *utx = NULL; + int rc; + uint64_t wal_id; + + rc = dav_wal_tx_reserve(pop, &wal_id); + if (rc) { + D_ERROR("so_wal_reserv failed, "DF_RC"\n", DP_RC(rc)); + return rc; + } + if (pop->do_utx == NULL) { + utx = dav_umem_wtx_new(pop); + if (utx == NULL) + return obj_tx_fail_err(EINVAL, 0); + } + pop->do_utx->utx_id = wal_id; + return rc; +} + +int +lw_tx_end(dav_obj_t *pop, void *data) +{ + struct umem_wal_tx *utx; + int rc; + + /* Persist the frequently updated persistent globals */ + stats_persist(pop, pop->do_stats); + + utx = pop->do_utx; + D_ASSERT(utx != NULL); + pop->do_utx = NULL; + + rc = dav_wal_tx_commit(pop, utx, data); + D_FREE(utx); + return rc; +} + +/* + * dav_tx_begin -- initializes new transaction + */ +DAV_FUNC_EXPORT int +dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...) +{ + int err = 0; + struct tx *tx = get_tx(); + uint64_t wal_id; + + enum dav_tx_failure_behavior failure_behavior = DAV_TX_FAILURE_ABORT; + + if (tx->stage == DAV_TX_STAGE_WORK) { + if (tx->pop != pop) { + ERR("nested transaction for different pool"); + return obj_tx_fail_err(EINVAL, 0); + } + + /* inherits this value from the parent transaction */ + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + failure_behavior = txd->failure_behavior; + + VALGRIND_START_TX; + } else if (tx->stage == DAV_TX_STAGE_NONE) { + struct umem_wal_tx *utx = NULL; + + DAV_DBG(""); + err = dav_wal_tx_reserve(pop, &wal_id); + if (err) { + D_ERROR("so_wal_reserv failed, "DF_RC"\n", DP_RC(err)); + goto err_abort; + } + + if (pop->do_utx == NULL) { + utx = dav_umem_wtx_new(pop); + if (utx == NULL) { + err = ENOMEM; + goto err_abort; + } + } + pop->do_utx->utx_id = wal_id; + + tx = get_tx(); + + VALGRIND_START_TX; + + dav_hold_clogs(pop); + operation_start(pop->undo); + + VEC_INIT(&tx->actions); + DAV_SLIST_INIT(&tx->tx_entries); + + tx->ranges = ravl_new_sized(tx_range_def_cmp, + sizeof(struct tx_range_def)); + tx->first_snapshot = 1; + tx->pop = pop; + } else { + FATAL("Invalid stage %d to begin new transaction", tx->stage); + } + + struct tx_data *txd; + + D_ALLOC_PTR_NZ(txd); + if (txd == NULL) { + err = errno; + D_CRIT("Malloc!\n"); + goto err_abort; + } + + tx->last_errnum = 0; + ASSERT(env == NULL); + if (env != NULL) + memcpy(txd->env, env, sizeof(jmp_buf)); + else + memset(txd->env, 0, sizeof(jmp_buf)); + + txd->failure_behavior = failure_behavior; + + DAV_SLIST_INSERT_HEAD(&tx->tx_entries, txd, tx_entry); + + tx->stage = DAV_TX_STAGE_WORK; + + /* handle locks */ + va_list argp; + + va_start(argp, env); + + enum dav_tx_param param_type; + + while ((param_type = va_arg(argp, enum dav_tx_param)) != + DAV_TX_PARAM_NONE) { + if (param_type == DAV_TX_PARAM_CB) { + dav_tx_callback cb = + va_arg(argp, dav_tx_callback); + void *arg = va_arg(argp, void *); + + if (tx->stage_callback && + (tx->stage_callback != cb || + tx->stage_callback_arg != arg)) { + FATAL( + "transaction callback is already set, old %p new %p old_arg %p new_arg %p", + tx->stage_callback, cb, + tx->stage_callback_arg, arg); + } + + tx->stage_callback = cb; + tx->stage_callback_arg = arg; + } else { + ASSERT(param_type == DAV_TX_PARAM_CB); + } + } + va_end(argp); + + ASSERT(err == 0); + return 0; + +err_abort: + if 
(tx->stage == DAV_TX_STAGE_WORK) + obj_tx_abort(err, 0); + else + tx->stage = DAV_TX_STAGE_ONABORT; + return err; +} + +/* + * tx_abort_on_failure_flag -- (internal) return 0 or DAV_FLAG_TX_NO_ABORT + * based on transaction setting + */ +static uint64_t +tx_abort_on_failure_flag(struct tx *tx) +{ + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (txd->failure_behavior == DAV_TX_FAILURE_RETURN) + return DAV_FLAG_TX_NO_ABORT; + return 0; +} + +/* + * obj_tx_callback -- (internal) executes callback associated with current stage + */ +static void +obj_tx_callback(struct tx *tx) +{ + if (!tx->stage_callback) + return; + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + /* is this the outermost transaction? */ + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) + tx->stage_callback(tx->pop, tx->stage, tx->stage_callback_arg); +} + +/* + * dav_tx_stage -- returns current transaction stage + */ +DAV_FUNC_EXPORT enum dav_tx_stage +dav_tx_stage_v2(void) +{ + return get_tx()->stage; +} + +/* + * obj_tx_abort -- aborts current transaction + */ +static void +obj_tx_abort(int errnum, int user) +{ + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop != NULL); + + if (errnum == 0) + errnum = ECANCELED; + + tx->stage = DAV_TX_STAGE_ONABORT; + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) { + /* this is the outermost transaction */ + + /* process the undo log */ + tx_abort(tx->pop); + + dav_release_clogs(tx->pop); + } + + tx->last_errnum = errnum; + errno = errnum; + if (user) { + DAV_DBG("!explicit transaction abort"); + } + + /* ONABORT */ + obj_tx_callback(tx); + + if (!util_is_zeroed(txd->env, sizeof(jmp_buf))) + longjmp(txd->env, errnum); +} + +/* + * dav_tx_abort -- aborts current transaction + * + * Note: this function should not be called from inside of dav. 
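For completeness, a sketch of the stage-callback mechanism referenced above; the callback name and argument are hypothetical. The callback fires for the outermost transaction at each stage change, plus one final call with DAV_TX_STAGE_NONE after dav_tx_end_v2().

	static void
	on_stage_change(dav_obj_t *pop, enum dav_tx_stage stage, void *arg)
	{
		(void)pop;
		(void)arg;
		if (stage == DAV_TX_STAGE_ONABORT)
			/* e.g. drop volatile state built up during the tx */;
	}

	/* registration happens at dav_tx_begin_v2() time */
	dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_CB, on_stage_change, NULL,
			DAV_TX_PARAM_NONE);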
+ */ +DAV_FUNC_EXPORT void +dav_tx_abort_v2(int errnum) +{ + DAV_API_START(); + DAV_DBG(""); + obj_tx_abort(errnum, 1); + DAV_API_END(); +} + +/* + * dav_tx_errno -- returns last transaction error code + */ +DAV_FUNC_EXPORT int +dav_tx_errno_v2(void) +{ + DAV_DBG("err:%d", get_tx()->last_errnum); + + return get_tx()->last_errnum; +} + +static void +tx_post_commit(struct tx *tx) +{ + operation_finish(tx->pop->undo, 0); +} + +/* + * dav_tx_commit -- commits current transaction + */ +DAV_FUNC_EXPORT void +dav_tx_commit_v2(void) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop); + DAV_DBG(""); + + /* WORK */ + obj_tx_callback(tx); + dav_obj_t *pop = tx->pop; + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) { + /* this is the outermost transaction */ + + /* pre-commit phase */ + tx_pre_commit(tx); + + mo_wal_drain(&pop->p_ops); + + operation_start(pop->external); + + palloc_publish(pop->do_heap, VEC_ARR(&tx->actions), + VEC_SIZE(&tx->actions), pop->external); + + tx_post_commit(tx); + + dav_release_clogs(pop); + } + + tx->stage = DAV_TX_STAGE_ONCOMMIT; + + /* ONCOMMIT */ + obj_tx_callback(tx); + DAV_API_END(); +} + +/* + * dav_tx_end -- ends current transaction + */ +DAV_FUNC_EXPORT int +dav_tx_end_v2(void *data) +{ + struct tx *tx = get_tx(); + + if (tx->stage == DAV_TX_STAGE_WORK) + FATAL("dav_tx_end called without dav_tx_commit"); + + if (tx->pop == NULL) + FATAL("dav_tx_end called without dav_tx_begin"); + + if (tx->stage_callback && + (tx->stage == DAV_TX_STAGE_ONCOMMIT || + tx->stage == DAV_TX_STAGE_ONABORT)) { + tx->stage = DAV_TX_STAGE_FINALLY; + obj_tx_callback(tx); + } + + struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries); + + DAV_SLIST_REMOVE_HEAD(&tx->tx_entries, tx_entry); + + D_FREE(txd); + + VALGRIND_END_TX; + int ret = tx->last_errnum; + + if (DAV_SLIST_EMPTY(&tx->tx_entries)) { + dav_obj_t *pop = tx->pop; + dav_tx_callback cb = tx->stage_callback; + void *arg = tx->stage_callback_arg; + int rc; + + DAV_DBG(""); + ASSERT(pop); + tx->pop = NULL; + tx->stage = DAV_TX_STAGE_NONE; + tx->stage_callback = NULL; + tx->stage_callback_arg = NULL; + + VEC_DELETE(&tx->actions); + /* tx should not be accessed after this */ + + /* commit to WAL */ + rc = lw_tx_end(pop, data); + /* TODO: Handle WAL commit errors */ + D_ASSERT(rc == 0); + + if (cb) + cb(pop, DAV_TX_STAGE_NONE, arg); + } else { + /* resume the next transaction */ + tx->stage = DAV_TX_STAGE_WORK; + + /* abort called within inner transaction, waterfall the error */ + if (tx->last_errnum) + obj_tx_abort(tx->last_errnum, 0); + } + + return ret; +} + +/* + * vg_verify_initialized -- when executed under Valgrind verifies that + * the buffer has been initialized; explicit check at snapshotting time, + * because Valgrind may find it much later when it's impossible to tell + * for which snapshot it triggered + */ +static void +vg_verify_initialized(dav_obj_t *pop, const struct tx_range_def *def) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(pop, def); +#if VG_MEMCHECK_ENABLED + if (!On_memcheck) + return; + + VALGRIND_DO_DISABLE_ERROR_REPORTING; + char *start = OBJ_OFF_TO_PTR(pop, def->offset); + char *uninit = (char *)VALGRIND_CHECK_MEM_IS_DEFINED(start, def->size); + + if (uninit) { + VALGRIND_PRINTF( + "Snapshotting uninitialized data in range <%p,%p> (<offset:0x%lx,size:0x%lx>)\n", + start, start + def->size, def->offset, def->size); + + if (uninit != start) + 
VALGRIND_PRINTF("Uninitialized data starts at: %p\n", + uninit); + + VALGRIND_DO_ENABLE_ERROR_REPORTING; + VALGRIND_CHECK_MEM_IS_DEFINED(start, def->size); + } else { + VALGRIND_DO_ENABLE_ERROR_REPORTING; + } +#endif +} + +/* + * dav_tx_add_snapshot -- (internal) creates a variably sized snapshot + */ +static int +dav_tx_add_snapshot(struct tx *tx, struct tx_range_def *snapshot) +{ + /* + * Depending on the size of the block, either allocate an + * entire new object or use cache. + */ + void *ptr = OBJ_OFF_TO_PTR(tx->pop, snapshot->offset); + + VALGRIND_ADD_TO_TX(ptr, snapshot->size); + + /* do nothing */ + if (snapshot->flags & DAV_XADD_NO_SNAPSHOT) + return 0; + + if (!(snapshot->flags & DAV_XADD_ASSUME_INITIALIZED)) + vg_verify_initialized(tx->pop, snapshot); + + /* + * If we are creating the first snapshot, setup a redo log action to + * increment counter in the undo log, so that the log becomes + * invalid once the redo log is processed. + */ + if (tx->first_snapshot) { + struct dav_action *action = tx_action_add(tx); + + if (action == NULL) + return -1; + + uint64_t *n = &tx->pop->clogs.undo.gen_num; + + palloc_set_value(tx->pop->do_heap, action, + n, *n + 1); + + tx->first_snapshot = 0; + } + + return operation_add_buffer(tx->pop->undo, ptr, ptr, snapshot->size, + ULOG_OPERATION_BUF_CPY); +} + +/* + * dav_tx_merge_flags -- (internal) common code for merging flags between + * two ranges to ensure resultant behavior is correct + */ +static void +dav_tx_merge_flags(struct tx_range_def *dest, struct tx_range_def *merged) +{ + /* + * DAV_XADD_NO_FLUSH should only be set in merged range if set in + * both ranges + */ + if ((dest->flags & DAV_XADD_NO_FLUSH) && + !(merged->flags & DAV_XADD_NO_FLUSH)) { + dest->flags = dest->flags & (~DAV_XADD_NO_FLUSH); + } + + /* + * Extend DAV_XADD_WAL_CPTR when merged. + * REVISIT: Ideally merge should happen only if address ranges + * overlap. Current code merges adjacent ranges even if only one + * of them has this flag set. Fix this before closing DAOS-11049. + */ + if (merged->flags & DAV_XADD_WAL_CPTR) + dest->flags = dest->flags | DAV_XADD_WAL_CPTR; +} + +/* + * dav_tx_add_common -- (internal) common code for adding persistent memory + * into the transaction + */ +static int +dav_tx_add_common(struct tx *tx, struct tx_range_def *args) +{ + if (args->size > DAV_MAX_ALLOC_SIZE) { + ERR("snapshot size too large"); + return obj_tx_fail_err(EINVAL, args->flags); + } + + if (!OBJ_OFFRANGE_FROM_HEAP(tx->pop, args->offset, (args->offset + args->size))) { + ERR("object outside of heap"); + return obj_tx_fail_err(EINVAL, args->flags); + } + + int ret = 0; + + /* + * Search existing ranges backwards starting from the end of the + * snapshot. + */ + struct tx_range_def r = *args; + + DAV_DBG("(%lu,%lu) size=%zu", r.offset / 4096, r.offset % 4096, r.size); + struct tx_range_def search = {0, 0, 0}; + /* + * If the range is directly adjacent to an existing one, + * they can be merged, so search for less or equal elements. + */ + enum ravl_predicate p = RAVL_PREDICATE_LESS_EQUAL; + struct ravl_node *nprev = NULL; + + while (r.size != 0) { + search.offset = r.offset + r.size; + struct ravl_node *n = ravl_find(tx->ranges, &search, p); + /* + * We have to skip searching for LESS_EQUAL because + * the snapshot we would find is the one that was just + * created. + */ + p = RAVL_PREDICATE_LESS; + + struct tx_range_def *f = n ? ravl_data(n) : NULL; + + size_t fend = f == NULL ? 
0 : f->offset + f->size;
+		size_t rend = r.offset + r.size;
+
+		if (fend == 0 || fend < r.offset) {
+			/*
+			 * If found no range or the found range is not
+			 * overlapping or adjacent on the left side, we can just
+			 * create the entire r.offset + r.size snapshot.
+			 *
+			 * Snapshot:
+			 *	--+-
+			 * Existing ranges:
+			 *	---- (no ranges)
+			 *	or +--- (no overlap)
+			 *	or ---+ (adjacent on the right side)
+			 */
+			if (nprev != NULL) {
+				/*
+				 * But, if we have an existing adjacent snapshot
+				 * on the right side, we can just extend it to
+				 * include the desired range.
+				 */
+				struct tx_range_def *fprev = ravl_data(nprev);
+
+				ASSERTeq(rend, fprev->offset);
+				fprev->offset -= r.size;
+				fprev->size += r.size;
+			} else {
+				/*
+				 * If we don't have anything adjacent, create
+				 * a new range in the tree.
+				 */
+				ret = tx_ranges_insert_def(tx->pop,
+					tx, &r);
+				if (ret != 0)
+					break;
+			}
+			ret = dav_tx_add_snapshot(tx, &r);
+			break;
+		} else if (fend <= rend) {
+			/*
+			 * If found range has its end inside of the desired
+			 * snapshot range, we can extend the found range by the
+			 * size leftover on the left side.
+			 *
+			 * Snapshot:
+			 *	--+++--
+			 * Existing ranges:
+			 *	+++---- (overlap on left)
+			 *	or ---+--- (found snapshot is inside)
+			 *	or ---+-++ (inside, and adjacent on the right)
+			 *	or +++++-- (desired snapshot is inside)
+			 *
+			 */
+			struct tx_range_def snapshot = *args;
+
+			snapshot.offset = fend;
+			/* the side not yet covered by an existing snapshot */
+			snapshot.size = rend - fend;
+
+			/* the number of bytes intersecting in both ranges */
+			size_t intersection = fend - MAX(f->offset, r.offset);
+
+			r.size -= intersection + snapshot.size;
+			f->size += snapshot.size;
+			dav_tx_merge_flags(f, args);
+
+			if (snapshot.size != 0) {
+				ret = dav_tx_add_snapshot(tx, &snapshot);
+				if (ret != 0)
+					break;
+			}
+
+			/*
+			 * If there's a snapshot adjacent on the right side,
+			 * merge the two ranges together.
+			 */
+			if (nprev != NULL) {
+				struct tx_range_def *fprev = ravl_data(nprev);
+
+				ASSERTeq(rend, fprev->offset);
+				f->size += fprev->size;
+				dav_tx_merge_flags(f, fprev);
+				ravl_remove(tx->ranges, nprev);
+			}
+		} else if (fend >= r.offset) {
+			/*
+			 * If found range has its end extending beyond the
+			 * desired snapshot.
+			 *
+			 * Snapshot:
+			 *	--+++--
+			 * Existing ranges:
+			 *	-----++ (adjacent on the right)
+			 *	or ----++- (overlapping on the right)
+			 *	or ----+++ (overlapping and adjacent on the right)
+			 *	or --+++++ (desired snapshot is inside)
+			 *
+			 * Notice that we cannot create a snapshot based solely
+			 * on this information without risking overwriting an
+			 * existing one. We have to continue iterating, but we
+			 * keep the information about adjacent snapshots in the
+			 * nprev variable.
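A concrete, purely illustrative trace of the merge behavior described above; the offsets are invented and would in practice have to be valid heap offsets inside the pool:

	dav_tx_add_range_v2(4096, 64);	/* ranges tree now holds [4096, 4160) */
	dav_tx_add_range_v2(4160, 64);	/* the LESS_EQUAL search finds [4096, 4160),
					 * which ends exactly where the new range
					 * starts, so only the new 64 bytes are
					 * snapshotted and the existing node is
					 * extended to [4096, 4224) */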
+ */ + size_t overlap = rend - MAX(f->offset, r.offset); + + r.size -= overlap; + dav_tx_merge_flags(f, args); + } else { + ASSERT(0); + } + + nprev = n; + } + + if (ret != 0) { + DAV_DBG("out of memory\n"); + return obj_tx_fail_err(ENOMEM, args->flags); + } + + return 0; +} + +/* + * dav_tx_add_range_direct -- adds persistent memory range into the + * transaction + */ +DAV_FUNC_EXPORT int +dav_tx_add_range_direct_v2(const void *ptr, size_t size) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop != NULL); + + int ret; + + uint64_t flags = tx_abort_on_failure_flag(tx); + + if (!OBJ_PTR_FROM_POOL(tx->pop, ptr)) { + ERR("object outside of pool"); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + struct tx_range_def args = { + .offset = OBJ_PTR_TO_OFF(tx->pop, ptr), + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_xadd_range_direct -- adds persistent memory range into the + * transaction + */ +DAV_FUNC_EXPORT int +dav_tx_xadd_range_direct_v2(const void *ptr, size_t size, uint64_t flags) +{ + + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + uint64_t off; + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XADD_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~DAV_XADD_VALID_FLAGS); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + if (!OBJ_PTR_FROM_POOL(tx->pop, ptr)) { + ERR("object outside of pool"); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + off = OBJ_PTR_TO_OFF(tx->pop, ptr); + struct tx_range_def args = { + .offset = off, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_add_range -- adds persistent memory range into the transaction + */ +DAV_FUNC_EXPORT int +dav_tx_add_range_v2(uint64_t hoff, size_t size) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + + uint64_t flags = tx_abort_on_failure_flag(tx); + + ASSERT(OBJ_OFF_IS_VALID(tx->pop, hoff)); + + struct tx_range_def args = { + .offset = hoff, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_xadd_range -- adds persistent memory range into the transaction + */ +DAV_FUNC_EXPORT int +dav_tx_xadd_range_v2(uint64_t hoff, size_t size, uint64_t flags) +{ + DAV_API_START(); + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + int ret; + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XADD_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~DAV_XADD_VALID_FLAGS); + ret = obj_tx_fail_err(EINVAL, flags); + DAV_API_END(); + return ret; + } + + ASSERT(OBJ_OFF_IS_VALID(tx->pop, hoff)); + + struct tx_range_def args = { + .offset = hoff, + .size = size, + .flags = flags, + }; + + ret = dav_tx_add_common(tx, &args); + + DAV_API_END(); + return ret; +} + +/* + * dav_tx_alloc -- allocates a new object + */ +DAV_FUNC_EXPORT uint64_t +dav_tx_alloc_v2(size_t size, uint64_t type_num, uint64_t flags) +{ + uint64_t off; + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + DAV_API_START(); + + if (size == 0) { + ERR("allocation with size 0"); + off = obj_tx_fail_null(EINVAL, flags); + 
DAV_API_END(); + return off; + } + + if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags + & ~(DAV_TX_XALLOC_VALID_FLAGS)); + off = obj_tx_fail_null(EINVAL, flags); + DAV_API_END(); + return off; + } + + off = tx_alloc_common(tx, size, (type_num_t)type_num, + constructor_tx_alloc, ALLOC_ARGS(flags)); + + DAV_API_END(); + return off; +} + +/* + * dav_tx_xfree -- frees an existing object, with no_abort option + */ +static int +dav_tx_xfree(uint64_t off, uint64_t flags) +{ + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + if (flags & ~DAV_XFREE_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, + flags & ~DAV_XFREE_VALID_FLAGS); + return obj_tx_fail_err(EINVAL, flags); + } + + if (off == 0) + return 0; + + dav_obj_t *pop = tx->pop; + + ASSERT(pop != NULL); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + + DAV_API_START(); + + struct dav_action *action; + uint64_t roff = palloc_get_realoffset(pop->do_heap, off); + + struct tx_range_def range = {roff, 0, 0}; + struct ravl_node *n = ravl_find(tx->ranges, &range, + RAVL_PREDICATE_LESS_EQUAL); + + /* + * If attempting to free an object allocated within the same + * transaction, simply cancel the alloc and remove it from the actions. + */ + if (n != NULL) { + struct tx_range_def *r = ravl_data(n); + + if ((r->offset + r->size) < roff) + goto out; + + VEC_FOREACH_BY_PTR(action, &tx->actions) { + if (action->type == DAV_ACTION_TYPE_HEAP && + action->heap.offset == off) { + void *ptr = OBJ_OFF_TO_PTR(pop, roff); + uint64_t toff, usize; + + palloc_get_prange(action, &toff, &usize, 1); + D_ASSERT(usize <= r->size); + if ((r->offset == roff) && (r->size == usize)) { + /* Exact match. */ + ravl_remove(tx->ranges, n); + } else if (r->offset == roff) { + /* Retain the right portion. */ + r->offset += usize; + r->size -= usize; + } else { + /* Retain the left portion. */ + uint64_t osize = r->size; + + r->size = roff - r->offset; + + /* Still data after range remove. 
*/ + osize -= (r->size + usize); + if (osize) { + struct tx_range_def *r1 = + &(struct tx_range_def) + {roff + usize, osize, r->flags}; + + tx_ranges_insert_def(pop, tx, r1); + } + } + + VALGRIND_SET_CLEAN(ptr, usize); + VALGRIND_REMOVE_FROM_TX(ptr, usize); + palloc_cancel(pop->do_heap, action, 1); + VEC_ERASE_BY_PTR(&tx->actions, action); + DAV_API_END(); + return 0; + } + } + } + +out: + action = tx_action_add(tx); + if (action == NULL) { + int ret = obj_tx_fail_err(errno, flags); + + DAV_API_END(); + return ret; + } + + palloc_defer_free(pop->do_heap, off, action); + + DAV_API_END(); + return 0; +} + +/* + * dav_tx_free -- frees an existing object + */ +DAV_FUNC_EXPORT int +dav_tx_free_v2(uint64_t off) +{ + return dav_tx_xfree(off, 0); +} + +DAV_FUNC_EXPORT void* +dav_tx_off2ptr_v2(uint64_t off) +{ + struct tx *tx = get_tx(); + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + ASSERT(tx->pop != NULL); + + ASSERT(OBJ_OFF_IS_VALID(tx->pop, off)); + return (void *)OBJ_OFF_TO_PTR(tx->pop, off); +} + +/* arguments for constructor_alloc */ +struct constr_args { + int zero_init; + dav_constr constructor; + void *arg; +}; + +/* arguments for constructor_alloc_root */ +struct carg_root { + size_t size; + dav_constr constructor; + void *arg; +}; + +/* arguments for constructor_realloc and constructor_zrealloc */ +struct carg_realloc { + void *ptr; + size_t old_size; + size_t new_size; + int zero_init; + type_num_t user_type; + dav_constr constructor; + void *arg; +}; + +/* + * constructor_zrealloc_root -- (internal) constructor for dav_root + */ +static int +constructor_zrealloc_root(void *ctx, void *ptr, size_t usable_size, void *arg) +{ + dav_obj_t *pop = ctx; + + DAV_DBG("pop %p ptr %p arg %p", pop, ptr, arg); + + ASSERTne(ptr, NULL); + ASSERTne(arg, NULL); + + VALGRIND_ADD_TO_TX(ptr, usable_size); + + struct carg_realloc *carg = arg; + + if (usable_size > carg->old_size) { + size_t grow_len = usable_size - carg->old_size; + void *new_data_ptr = (void *)((uintptr_t)ptr + carg->old_size); + + mo_wal_memset(&pop->p_ops, new_data_ptr, 0, grow_len, 0); + } + int ret = 0; + + if (carg->constructor) + ret = carg->constructor(pop, ptr, carg->arg); + + VALGRIND_REMOVE_FROM_TX(ptr, usable_size); + + return ret; +} + +/* + * obj_realloc_root -- (internal) reallocate root object + */ +static int +obj_alloc_root(dav_obj_t *pop, size_t size) +{ + struct operation_context *ctx; + struct carg_realloc carg; + + DAV_DBG("pop %p size %zu", pop, size); + + carg.ptr = OBJ_OFF_TO_PTR(pop, pop->do_phdr->dp_root_offset); + carg.old_size = pop->do_phdr->dp_root_size; + carg.new_size = size; + carg.user_type = 0; + carg.constructor = NULL; + carg.zero_init = 1; + carg.arg = NULL; + + lw_tx_begin(pop); + ctx = pop->external; + operation_start(ctx); + + operation_add_entry(ctx, &pop->do_phdr->dp_root_size, size, ULOG_OPERATION_SET); + + int ret = palloc_operation(pop->do_heap, pop->do_phdr->dp_root_offset, + &pop->do_phdr->dp_root_offset, size, + constructor_zrealloc_root, &carg, + 0, 0, 0, 0, ctx); /* REVISIT: object_flags and type num ignored*/ + + lw_tx_end(pop, NULL); + return ret; +} + +/* + * dav_root_construct -- returns root object + */ +DAV_FUNC_EXPORT uint64_t +dav_root_v2(dav_obj_t *pop, size_t size) +{ + DAV_DBG("pop %p size %zu", pop, size); + + DAV_API_START(); + if (size > DAV_MAX_ALLOC_SIZE) { + ERR("requested size too large"); + errno = ENOMEM; + DAV_API_END(); + return 0; + } + + if (size == 0 && pop->do_phdr->dp_root_offset == 0) { + ERR("requested size cannot equals zero"); + errno = EINVAL; + 
DAV_API_END();
+		return 0;
+	}
+
+	/* REVISIT START
+	 * For thread safety the below block has to be protected by lock
+	 */
+	if (size > pop->do_phdr->dp_root_size &&
+	    obj_alloc_root(pop, size)) {
+		ERR("dav_root failed");
+		DAV_API_END();
+		return 0;
+	}
+
+	/* REVISIT END */
+
+	DAV_API_END();
+	return pop->do_phdr->dp_root_offset;
+}
+
+/*
+ * constructor_alloc -- (internal) constructor for obj_alloc_construct
+ */
+static int
+constructor_alloc(void *ctx, void *ptr, size_t usable_size, void *arg)
+{
+	dav_obj_t *pop = ctx;
+
+	struct mo_ops *p_ops = &pop->p_ops;
+
+	DAV_DBG("pop %p ptr %p arg %p", pop, ptr, arg);
+
+	ASSERTne(ptr, NULL);
+	ASSERTne(arg, NULL);
+
+	struct constr_args *carg = arg;
+
+	if (carg->zero_init)
+		mo_wal_memset(p_ops, ptr, 0, usable_size, 0);
+
+	int ret = 0;
+
+	if (carg->constructor)
+		ret = carg->constructor(pop, ptr, carg->arg);
+
+	return ret;
+}
+
+/*
+ * obj_alloc_construct -- (internal) allocates a new object with constructor
+ */
+static int
+obj_alloc_construct(dav_obj_t *pop, uint64_t *offp, size_t size,
+	type_num_t type_num, uint64_t flags,
+	dav_constr constructor, void *arg)
+{
+	struct operation_context *ctx;
+	struct constr_args carg;
+
+	if (size > DAV_MAX_ALLOC_SIZE) {
+		ERR("requested size too large");
+		errno = ENOMEM;
+		return -1;
+	}
+
+	carg.zero_init = flags & DAV_FLAG_ZERO;
+	carg.constructor = constructor;
+	carg.arg = arg;
+
+	lw_tx_begin(pop);
+	ctx = pop->external;
+	operation_start(ctx);
+
+	int ret = palloc_operation(pop->do_heap, 0, offp, size, constructor_alloc, &carg, type_num,
+				   0, CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), ctx);
+
+	lw_tx_end(pop, NULL);
+	return ret;
+}
+
+/*
+ * dav_alloc -- allocates a new object
+ */
+DAV_FUNC_EXPORT int
+dav_alloc_v2(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags,
+	     dav_constr constructor, void *arg)
+{
+	DAV_DBG(3, "pop %p offp %p size %zu type_num %llx flags %llx constructor %p arg %p", pop,
+		offp, size, (unsigned long long)type_num, (unsigned long long)flags, constructor,
+		arg);
+
+	if (size == 0) {
+		ERR("allocation with size 0");
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) {
+		ERR("unknown flags 0x%" PRIx64, flags & ~DAV_TX_XALLOC_VALID_FLAGS);
+		errno = EINVAL;
+		return -1;
+	}
+
+	DAV_API_START();
+	int ret = obj_alloc_construct(pop, offp, size, type_num, flags, constructor, arg);
+
+	DAV_API_END();
+	return ret;
+}
+
+/*
+ * dav_free -- frees an existing object
+ */
+DAV_FUNC_EXPORT void
+dav_free_v2(dav_obj_t *pop, uint64_t off)
+{
+	struct operation_context *ctx;
+
+	DAV_DBG("oid.off 0x%016" PRIx64, off);
+
+	if (off == 0)
+		return;
+
+	DAV_API_START();
+
+	ASSERTne(pop, NULL);
+	ASSERT(OBJ_OFF_IS_VALID(pop, off));
+	lw_tx_begin(pop);
+	ctx = pop->external;
+	operation_start(ctx);
+
+	palloc_operation(pop->do_heap, off, NULL, 0, NULL, NULL,
+			 0, 0, 0, 0, ctx);
+
+	lw_tx_end(pop, NULL);
+	DAV_API_END();
+}
+
+/*
+ * dav_memcpy_persist -- dav version of memcpy
+ */
+DAV_FUNC_EXPORT void *
+dav_memcpy_persist_v2(dav_obj_t *pop, void *dest, const void *src,
+		      size_t len)
+{
+	DAV_DBG("pop %p dest %p src %p len %zu", pop, dest, src, len);
+	D_ASSERT((dav_tx_stage_v2() == DAV_TX_STAGE_NONE));
+
+	DAV_API_START();
+	lw_tx_begin(pop);
+
+	void *ptr = mo_wal_memcpy(&pop->p_ops, dest, src, len, 0);
+
+	lw_tx_end(pop, NULL);
+	DAV_API_END();
+	return ptr;
+}
+
+/*
+ * dav_memcpy_persist_relaxed -- dav version of memcpy with deferred commit to blob.
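The atomic (non-transactional) entry points above can be exercised on their own; a hedged sketch, assuming an open pool handle `pop`, a destination pointer `dst` that lies inside the pool, and a volatile source buffer `src` of length `len`:

	uint64_t off = 0;
	int	 rc;

	/* stand-alone allocation; internally wrapped in a lightweight WAL tx */
	rc = dav_alloc_v2(pop, &off, 256, 0 /* type_num */, DAV_FLAG_ZERO,
			  NULL /* no constructor */, NULL);
	if (rc == 0)
		dav_free_v2(pop, off);

	/* persist a plain memcpy through the WAL */
	dav_memcpy_persist_v2(pop, dst, src, len);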
+ */ +DAV_FUNC_EXPORT void * +dav_memcpy_persist_relaxed_v2(dav_obj_t *pop, void *dest, const void *src, + size_t len) +{ + DAV_DBG("pop %p dest %p src %p len %zu", pop, dest, src, len); + DAV_API_START(); + if (pop->do_utx == NULL && dav_umem_wtx_new(pop) == NULL) + return 0; + + void *ptr = mo_wal_memcpy(&pop->p_ops, dest, src, len, 0); + + DAV_API_END(); + return ptr; +} + +/* + * dav_reserve -- reserves a single object + */ +DAV_FUNC_EXPORT uint64_t +dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, + uint64_t flags) +{ + struct constr_args carg; + + DAV_DBG(3, "pop %p act %p size %zu type_num %llx flags %llx", pop, act, size, + (unsigned long long)type_num, (unsigned long long)flags); + + if (flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS) { + ERR("unknown flags 0x%" PRIx64, flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS); + errno = EINVAL; + return 0; + } + + DAV_API_START(); + + if (pop->do_utx == NULL && dav_umem_wtx_new(pop) == NULL) + return 0; + + carg.zero_init = flags & DAV_FLAG_ZERO; + carg.constructor = NULL; + carg.arg = NULL; + + if (palloc_reserve(pop->do_heap, size, constructor_alloc, &carg, type_num, 0, + CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), act) != 0) { + DAV_API_END(); + return 0; + } + + DAV_API_END(); + return act->heap.offset; +} + +/* + * dav_defer_free -- creates a deferred free action + */ +DAV_FUNC_EXPORT void +dav_defer_free_v2(dav_obj_t *pop, uint64_t off, struct dav_action *act) +{ + ASSERT(off != 0); + ASSERT(OBJ_OFF_IS_VALID(pop, off)); + palloc_defer_free(pop->do_heap, off, act); +} + +#if 0 +/* + * dav_publish -- publishes a collection of actions + */ +int +dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) +{ + DAV_API_START(); + struct operation_context *ctx = pmalloc_operation_hold(pop); + + size_t entries_size = actvcnt * sizeof(struct ulog_entry_val); + + if (operation_reserve(ctx, entries_size) != 0) { + DAV_API_END(); + return -1; + } + + palloc_publish(&pop->do_heap, actv, actvcnt, ctx); + + pmalloc_operation_release(pop); + + DAV_API_END(); + return 0; +} +#endif + +/* + * dav_cancel -- cancels collection of actions + */ +DAV_FUNC_EXPORT void +dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt) +{ + DAV_DBG("actvcnt=%zu", actvcnt); + DAV_API_START(); + palloc_cancel(pop->do_heap, actv, actvcnt); + DAV_API_END(); +} + +/* + * dav_tx_publish -- publishes actions inside of a transaction, + * with no_abort option + */ +DAV_FUNC_EXPORT int +dav_tx_publish_v2(struct dav_action *actv, size_t actvcnt) +{ + struct tx *tx = get_tx(); + uint64_t flags = 0; + uint64_t off, size; + int ret; + + ASSERT_IN_TX(tx); + ASSERT_TX_STAGE_WORK(tx); + + flags |= tx_abort_on_failure_flag(tx); + + DAV_API_START(); + + if (tx_action_reserve(tx, actvcnt) != 0) { + ret = obj_tx_fail_err(ENOMEM, flags); + + DAV_API_END(); + return ret; + } + + for (size_t i = 0; i < actvcnt; ++i) { + VEC_PUSH_BACK(&tx->actions, actv[i]); + if (palloc_action_isalloc(&actv[i])) { + palloc_get_prange(&actv[i], &off, &size, 1); + struct tx_range_def r = {off, size, + DAV_XADD_NO_SNAPSHOT | DAV_XADD_WAL_CPTR}; + + ret = dav_tx_add_common(tx, &r); + D_ASSERT(ret == 0); + } + } + + DAV_API_END(); + return 0; +} + +/* + * dav_get_zone_evictable -- Returns an evictable zone id that can be used for + * allocations. If there are no evictable zone with sufficient free space then + * zero is returned which maps to non-evictable zone. 
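A sketch of the reserve/publish/cancel flow built from the interfaces above; sizes and type numbers are illustrative and error handling is abbreviated:

	struct dav_action act;
	uint64_t	  off;

	/* reserve outside of any transaction; nothing is durable yet */
	off = dav_reserve_v2(pop, &act, 128, 0 /* type_num */, 0 /* flags */);
	if (off == 0)
		return -1;	/* reservation failed */

	/* ... initialize the object behind `off` ... */

	/* either make the reservation durable inside a transaction ... */
	dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE);
	dav_tx_publish_v2(&act, 1);
	dav_tx_commit_v2();
	dav_tx_end_v2(NULL);

	/* ... or drop it without ever touching persistent state:
	 * dav_cancel_v2(pop, &act, 1);
	 */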
+ */
+DAV_FUNC_EXPORT uint32_t
+dav_get_zone_evictable_v2(dav_obj_t *pop, int flags)
+{
+	D_ASSERT(flags == 0);
+	/* REVISIT: TBD
+	 * Return an evictable zone that is currently marked as in-use and has sufficient free space.
+	 * Else, find an evictable zone that has more than x% of free memory and mark it as in-use.
+	 */
+	return 0;
+}
diff --git a/src/common/dav_v2/tx.h b/src/common/dav_v2/tx.h
new file mode 100644
index 00000000000..ba1fca6fc93
--- /dev/null
+++ b/src/common/dav_v2/tx.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2020, Intel Corporation */
+
+/*
+ * tx.h -- internal definitions for transactions
+ */
+
+#ifndef __DAOS_COMMON_INTERNAL_TX_H
+#define __DAOS_COMMON_INTERNAL_TX_H 1
+
+#include <stdint.h>
+
+#define TX_DEFAULT_RANGE_CACHE_SIZE (1 << 15)
+
+struct ulog_entry_base;
+struct mo_ops;
+/*
+ * tx_create_wal_entry -- convert to WAL a single ulog UNDO entry
+ */
+int tx_create_wal_entry(struct ulog_entry_base *e, void *arg, const struct mo_ops *p_ops);
+
+#endif
diff --git a/src/common/dav_v2/ulog.c b/src/common/dav_v2/ulog.c
new file mode 100644
index 00000000000..d04d2e6732a
--- /dev/null
+++ b/src/common/dav_v2/ulog.c
@@ -0,0 +1,695 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2022, Intel Corporation */
+
+/*
+ * ulog.c -- unified log implementation
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "dav_internal.h"
+#include "mo_wal.h"
+#include "ulog.h"
+#include "obj.h"
+#include "out.h"
+#include "valgrind_internal.h"
+
+/*
+ * Operation flag at the three most significant bits
+ */
+#define ULOG_OPERATION(op)		((uint64_t)(op))
+#define ULOG_OPERATION_MASK		((uint64_t)(0b111ULL << 61ULL))
+#define ULOG_OPERATION_FROM_OFFSET(off)	\
+	((ulog_operation_type) ((off) & ULOG_OPERATION_MASK))
+#define ULOG_OFFSET_MASK		(~(ULOG_OPERATION_MASK))
+
+#define CACHELINE_ALIGN(size) ALIGN_UP(size, CACHELINE_SIZE)
+#define IS_CACHELINE_ALIGNED(ptr)\
+	(((uintptr_t)(ptr) & (CACHELINE_SIZE - 1)) == 0)
+
+/*
+ * ulog_next -- retrieves the pointer to the next ulog
+ */
+struct ulog *
+ulog_next(struct ulog *ulog)
+{
+	return ulog->next;
+}
+
+/*
+ * ulog_entry_type -- returns the type of entry operation
+ */
+ulog_operation_type
+ulog_entry_type(const struct ulog_entry_base *entry)
+{
+	return ULOG_OPERATION_FROM_OFFSET(entry->offset);
+}
+
+/*
+ * ulog_entry_offset -- returns the offset stored in an entry
+ */
+uint64_t
+ulog_entry_offset(const struct ulog_entry_base *entry)
+{
+	return entry->offset & ULOG_OFFSET_MASK;
+}
+
+/*
+ * ulog_entry_size -- returns the size of a ulog entry
+ */
+size_t
+ulog_entry_size(const struct ulog_entry_base *entry)
+{
+	struct ulog_entry_buf *eb;
+
+	switch (ulog_entry_type(entry)) {
+#ifdef WAL_SUPPORTS_AND_OR_OPS
+	case ULOG_OPERATION_AND:
+	case ULOG_OPERATION_OR:
+#else
+	case ULOG_OPERATION_CLR_BITS:
+	case ULOG_OPERATION_SET_BITS:
+#endif
+	case ULOG_OPERATION_SET:
+		return sizeof(struct ulog_entry_val);
+	case ULOG_OPERATION_BUF_SET:
+	case ULOG_OPERATION_BUF_CPY:
+		eb = (struct ulog_entry_buf *)entry;
+		return CACHELINE_ALIGN(
+			sizeof(struct ulog_entry_buf) + eb->size);
+	default:
+		ASSERT(0);
+	}
+
+	return 0;
+}
+
+/*
+ * ulog_entry_valid -- (internal) checks if a ulog entry is valid
+ * Returns 1 if the range is valid, otherwise 0 is returned.
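To make the encoding above concrete: both the operation type and the target offset live in the single 64-bit offset word of an entry. A sketch, assuming (as the mask implies) that the ULOG_OPERATION_* values declared in ulog.h occupy only the three most significant bits:

	struct ulog_entry_base e;

	e.offset = 0x1000 | ULOG_OPERATION(ULOG_OPERATION_SET);
	ASSERT(ulog_entry_type(&e) == ULOG_OPERATION_SET);
	ASSERT(ulog_entry_offset(&e) == 0x1000);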
+ */ +static int +ulog_entry_valid(struct ulog *ulog, const struct ulog_entry_base *entry) +{ + if (entry->offset == 0) + return 0; + + size_t size; + struct ulog_entry_buf *b; + + switch (ulog_entry_type(entry)) { + case ULOG_OPERATION_BUF_CPY: + case ULOG_OPERATION_BUF_SET: + size = ulog_entry_size(entry); + b = (struct ulog_entry_buf *)entry; + + uint64_t csum = util_checksum_compute(b, size, + &b->checksum, 0); + csum = util_checksum_seq(&ulog->gen_num, + sizeof(ulog->gen_num), csum); + + if (b->checksum != csum) + return 0; + break; + default: + break; + } + + return 1; +} + +/* + * ulog_construct -- initializes the ulog structure + */ +void +ulog_construct_new(struct ulog *ulog, size_t capacity, uint64_t gen_num, uint64_t flags) +{ + ASSERTne(ulog, NULL); + + ulog->capacity = capacity; + ulog->checksum = 0; + ulog->next = 0; + ulog->gen_num = gen_num; + ulog->flags = flags; + memset(ulog->unused, 0, sizeof(ulog->unused)); + + /* we only need to zero out the header of ulog's first entry */ + size_t zeroed_data = CACHELINE_ALIGN(sizeof(struct ulog_entry_base)); + /* + * We want to avoid replicating zeroes for every ulog of every + * lane, to do that, we need to use plain old memset. + */ + memset(ulog->data, 0, zeroed_data); +} + +/* + * ulog_foreach_entry -- iterates over every existing entry in the ulog + */ +int +ulog_foreach_entry(struct ulog *ulog, ulog_entry_cb cb, void *arg, const struct mo_ops *ops) +{ + struct ulog_entry_base *e; + int ret = 0; + + for (struct ulog *r = ulog; r != NULL; r = ulog_next(r)) { + for (size_t offset = 0; offset < r->capacity; ) { + e = (struct ulog_entry_base *)(r->data + offset); + if (!ulog_entry_valid(ulog, e)) + return ret; + + ret = cb(e, arg, ops); + if (ret != 0) + return ret; + + offset += ulog_entry_size(e); + } + } + + return ret; +} + +/* + * ulog_capacity -- (internal) returns the total capacity of the ulog + */ +size_t +ulog_capacity(struct ulog *ulog, size_t ulog_base_bytes) +{ + size_t capacity = ulog_base_bytes; + + ulog = ulog_next(ulog); + /* skip the first one, we count it in 'ulog_base_bytes' */ + while (ulog != NULL) { + capacity += ulog->capacity; + ulog = ulog_next(ulog); + } + + return capacity; +} + +/* + * ulog_rebuild_next_vec -- rebuilds the vector of next entries + */ +void +ulog_rebuild_next_vec(struct ulog *ulog, struct ulog_next *next) +{ + do { + if (ulog->next != 0) + VEC_PUSH_BACK(next, ulog->next); + } while ((ulog = ulog_next(ulog)) != NULL); +} + +/* + * ulog_reserve -- reserves new capacity in the ulog + */ +int +ulog_reserve(struct ulog *ulog, + size_t ulog_base_nbytes, size_t gen_num, + int auto_reserve, size_t *new_capacity, + ulog_extend_fn extend, struct ulog_next *next) +{ + if (!auto_reserve) { + D_CRIT("cannot auto reserve next ulog\n"); + return -1; + } + + size_t capacity = ulog_base_nbytes; + + VEC_FOREACH(ulog, next) { + ASSERTne(ulog, NULL); + capacity += ulog->capacity; + } + + while (capacity < *new_capacity) { + if (extend(&ulog->next, gen_num) != 0) + return -1; + VEC_PUSH_BACK(next, ulog->next); + ulog = ulog_next(ulog); + ASSERTne(ulog, NULL); + + capacity += ulog->capacity; + } + *new_capacity = capacity; + + return 0; +} + +/* + * ulog_checksum -- (internal) calculates ulog checksum + */ +static int +ulog_checksum(struct ulog *ulog, size_t ulog_base_bytes, int insert) +{ + return util_checksum(ulog, SIZEOF_ULOG(ulog_base_bytes), + &ulog->checksum, insert, 0); +} + +/* + * ulog_entry_val_create -- creates a new log value entry in the ulog + * + * This function requires at least a cacheline 
of space to be available in the + * ulog. + */ +struct ulog_entry_val * +ulog_entry_val_create(struct ulog *ulog, size_t offset, uint64_t *dest, + uint64_t value, ulog_operation_type type, const struct mo_ops *p_ops) +{ + struct ulog_entry_val *e = + (struct ulog_entry_val *)(ulog->data + offset); + + struct { + struct ulog_entry_val v; + struct ulog_entry_base zeroes; + } data; + COMPILE_ERROR_ON(sizeof(data) != sizeof(data.v) + sizeof(data.zeroes)); + + /* + * Write a little bit more to the buffer so that the next entry that + * resides in the log is erased. This will prevent leftovers from + * a previous, clobbered, log from being incorrectly applied. + */ + data.zeroes.offset = 0; + data.v.base.offset = p_ops->base ? (uint64_t)(dest) - + (uint64_t)((dav_obj_t *)p_ops->base)->do_base : + (uint64_t)dest; + data.v.base.offset |= ULOG_OPERATION(type); + data.v.value = value; + + memcpy(e, &data, sizeof(data)); + + return e; +} + +/* + * ulog_clobber_entry -- zeroes out a single log entry header + */ +void +ulog_clobber_entry(const struct ulog_entry_base *e) +{ + static const size_t aligned_entry_size = + CACHELINE_ALIGN(sizeof(struct ulog_entry_base)); + + memset((char *)e, 0, aligned_entry_size); +} + +/* + * ulog_entry_buf_create -- atomically creates a buffer entry in the log + */ +struct ulog_entry_buf * +ulog_entry_buf_create(struct ulog *ulog, size_t offset, uint64_t gen_num, + uint64_t *dest, const void *src, uint64_t size, + ulog_operation_type type, const struct mo_ops *p_ops) +{ + struct ulog_entry_buf *e = + (struct ulog_entry_buf *)(ulog->data + offset); + + /* + * Depending on the size of the source buffer, we might need to perform + * up to three separate copies: + * 1. The first cacheline, 24b of metadata and 40b of data + * If there's still data to be logged: + * 2. The entire remainder of the data aligned down to a cacheline, + * for example, if there's 150b left, this step will copy only + * 128b. + * Now, we are left with between 0 and 63 bytes. If nonzero: + * 3. Create a stack-allocated cacheline-sized buffer, fill in the + * remainder of the data, and copy the entire cacheline. + * + * This is done so that we avoid a cache-miss on misaligned writes. + */ + + struct ulog_entry_buf *b = alloca(CACHELINE_SIZE); + + ASSERT(p_ops->base != NULL); + b->base.offset = (uint64_t)dest - (uint64_t)((dav_obj_t *)p_ops->base)->do_base; + b->base.offset |= ULOG_OPERATION(type); + b->size = size; + b->checksum = 0; + + size_t bdatasize = CACHELINE_SIZE - sizeof(struct ulog_entry_buf); + size_t ncopy = MIN(size, bdatasize); + + memcpy(b->data, src, ncopy); + memset(b->data + ncopy, 0, bdatasize - ncopy); + + size_t remaining_size = ncopy > size ?
0 : size - ncopy; + + char *srcof = (char *)src + ncopy; + size_t rcopy = ALIGN_DOWN(remaining_size, CACHELINE_SIZE); + size_t lcopy = remaining_size - rcopy; + + uint8_t last_cacheline[CACHELINE_SIZE]; + + if (lcopy != 0) { + memcpy(last_cacheline, srcof + rcopy, lcopy); + memset(last_cacheline + lcopy, 0, CACHELINE_SIZE - lcopy); + } + + if (rcopy != 0) { + void *rdest = e->data + ncopy; + + ASSERT(IS_CACHELINE_ALIGNED(rdest)); + memcpy(rdest, srcof, rcopy); + } + + if (lcopy != 0) { + void *ldest = e->data + ncopy + rcopy; + + ASSERT(IS_CACHELINE_ALIGNED(ldest)); + + memcpy(ldest, last_cacheline, CACHELINE_SIZE); + } + + b->checksum = util_checksum_seq(b, CACHELINE_SIZE, 0); + if (rcopy != 0) + b->checksum = util_checksum_seq(srcof, rcopy, b->checksum); + if (lcopy != 0) + b->checksum = util_checksum_seq(last_cacheline, + CACHELINE_SIZE, b->checksum); + + b->checksum = util_checksum_seq(&gen_num, sizeof(gen_num), + b->checksum); + + ASSERT(IS_CACHELINE_ALIGNED(e)); + + memcpy(e, b, CACHELINE_SIZE); + + /* + * Allow having uninitialized data in the buffer - this requires marking + * data as defined so that comparing checksums is not reported as an + * error by memcheck. + */ + VALGRIND_DO_MAKE_MEM_DEFINED(e->data, ncopy + rcopy + lcopy); + VALGRIND_DO_MAKE_MEM_DEFINED(&e->checksum, sizeof(e->checksum)); + + ASSERT(ulog_entry_valid(ulog, &e->base)); + + return e; +} + +/* + * ulog_entry_apply -- applies modifications of a single ulog entry + */ +void +ulog_entry_apply(const struct ulog_entry_base *e, int persist, + const struct mo_ops *p_ops) +{ + ulog_operation_type t = ulog_entry_type(e); + uint64_t offset = ulog_entry_offset(e); + + size_t dst_size = sizeof(uint64_t); + uint64_t *dst = p_ops->base ? + (uint64_t *)((uintptr_t)((dav_obj_t *)p_ops->base)->do_base + offset) : + (uint64_t *)offset; + + struct ulog_entry_val *ev; + struct ulog_entry_buf *eb; + + uint16_t nbits; + uint32_t pos; + uint64_t bmask; + + SUPPRESS_UNUSED(persist); + + switch (t) { +#ifdef WAL_SUPPORTS_AND_OR_OPS + case ULOG_OPERATION_AND: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst &= ev->value; + break; + case ULOG_OPERATION_OR: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst |= ev->value; + break; +#else + case ULOG_OPERATION_CLR_BITS: + ev = (struct ulog_entry_val *)e; + pos = ULOG_ENTRY_VAL_TO_POS(ev->value); + nbits = ULOG_ENTRY_VAL_TO_BITS(ev->value); + if (nbits == RUN_BITS_PER_VALUE) + bmask = UINT64_MAX; + else + bmask = ((1ULL << nbits) - 1ULL) << pos; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst &= ~bmask; + break; + case ULOG_OPERATION_SET_BITS: + ev = (struct ulog_entry_val *)e; + pos = ULOG_ENTRY_VAL_TO_POS(ev->value); + nbits = ULOG_ENTRY_VAL_TO_BITS(ev->value); + if (nbits == RUN_BITS_PER_VALUE) + bmask = UINT64_MAX; + else + bmask = ((1ULL << nbits) - 1ULL) << pos; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst |= bmask; + break; +#endif + case ULOG_OPERATION_SET: + ev = (struct ulog_entry_val *)e; + + VALGRIND_ADD_TO_TX(dst, dst_size); + *dst = ev->value; + break; + case ULOG_OPERATION_BUF_CPY: + eb = (struct ulog_entry_buf *)e; + + dst_size = eb->size; + VALGRIND_ADD_TO_TX(dst, dst_size); + mo_wal_memcpy(p_ops, dst, eb->data, eb->size, 0); + break; + case ULOG_OPERATION_BUF_SET: + default: + ASSERT(0); + } + VALGRIND_REMOVE_FROM_TX(dst, dst_size); +} + +/* + * ulog_process_entry -- (internal) processes a single ulog entry + */ +static int +ulog_process_entry(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops) +{ 
+ /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(arg); + + ulog_entry_apply(e, 0, p_ops); + + return 0; +} +/* + * ulog_inc_gen_num -- (internal) increments gen num in the ulog + */ +static void +ulog_inc_gen_num(struct ulog *ulog) +{ + ulog->gen_num++; +} + +/* + * ulog_free_next -- free all ulogs starting from the indicated one. + * Returns 1 if any ulog has been freed or unpinned, 0 otherwise. + */ +int +ulog_free_next(struct ulog *u, ulog_free_fn ulog_free) +{ + int ret = 0; + + if (u == NULL) + return ret; + + VEC(, struct ulog **) ulogs_internal_except_first; + VEC_INIT(&ulogs_internal_except_first); + + while (u->next != 0) { + if (VEC_PUSH_BACK(&ulogs_internal_except_first, + &u->next) != 0) { + /* this is fine, it will just use more memory */ + DAV_DBG("unable to free transaction logs memory"); + goto out; + } + u = u->next; + } + + /* free non-user defined logs */ + struct ulog **ulog_ptr; + + VEC_FOREACH_REVERSE(ulog_ptr, &ulogs_internal_except_first) { + ulog_free(*ulog_ptr); + *ulog_ptr = NULL; + ret = 1; + } + +out: + VEC_DELETE(&ulogs_internal_except_first); + return ret; +} + +/* + * ulog_clobber -- zeroes the metadata of the ulog + */ +void +ulog_clobber(struct ulog *dest, struct ulog_next *next) +{ + struct ulog empty; + + memset(&empty, 0, sizeof(empty)); + + if (next != NULL) + empty.next = VEC_SIZE(next) == 0 ? 0 : VEC_FRONT(next); + else + empty.next = dest->next; + + memcpy(dest, &empty, sizeof(empty)); +} + +/* + * ulog_clobber_data -- invalidates the log data and frees unneeded ulog buffers + */ +int +ulog_clobber_data(struct ulog *ulog_first, + struct ulog_next *next, ulog_free_fn ulog_free, + unsigned flags) +{ + ASSERTne(ulog_first, NULL); + + /* In case of abort we need to increment counter in the first ulog. */ + if (flags & ULOG_INC_FIRST_GEN_NUM) + ulog_inc_gen_num(ulog_first); + + /* + * In the case of abort or commit, we are not going to free all ulogs, + * but rather increment the generation number to be consistent in the + * first two ulogs. + */ + struct ulog *ulog_second = VEC_SIZE(next) == 0 ? 0 : *VEC_GET(next, 0); + + if (ulog_second && !(flags & ULOG_FREE_AFTER_FIRST)) + /* + * We want to keep gen_nums consistent between ulogs. + * If the transaction commits successfully we'll reuse the + * second buffer (third and next ones will be freed anyway). + * If the application crashes we'll free the 2nd ulog on + * recovery, which means we'll never read gen_num of the + * second ulog in case of an ungraceful shutdown. + */ + ulog_inc_gen_num(ulog_second); + + struct ulog *u; + + /* + * To make sure that transaction logs do not occupy too + * much space, all of them, except for the first one, + * are freed at the end of the operation. The reasoning for + * this is that pmalloc() is a relatively cheap operation for + * transactions where many hundreds of kilobytes are being + * snapshot, and so, allocating and freeing the buffer for + * each transaction is an acceptable overhead for the average + * case.
+ */ + if (flags & ULOG_FREE_AFTER_FIRST) + u = ulog_first; + else + u = ulog_second; + + if (u == NULL) + return 0; + + return ulog_free_next(u, ulog_free); +} + +/* + * ulog_process -- process ulog entries + */ +void +ulog_process(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops) +{ + /* suppress unused-parameter errors */ + SUPPRESS_UNUSED(check); + +#ifdef DAV_EXTRA_DEBUG + if (check) + ulog_check(ulog, check, p_ops); +#endif + + ulog_foreach_entry(ulog, ulog_process_entry, NULL, p_ops); + mo_wal_drain(p_ops); +} + +/* + * ulog_base_nbytes -- (internal) counts the actual of number of bytes + * occupied by the ulog + */ +size_t +ulog_base_nbytes(struct ulog *ulog) +{ + size_t offset = 0; + struct ulog_entry_base *e; + + for (offset = 0; offset < ulog->capacity; ) { + e = (struct ulog_entry_base *)(ulog->data + offset); + if (!ulog_entry_valid(ulog, e)) + break; + + offset += ulog_entry_size(e); + } + + return offset; +} + +/* + * ulog_recovery_needed -- checks if the logs needs recovery + */ +int +ulog_recovery_needed(struct ulog *ulog, int verify_checksum) +{ + size_t nbytes = MIN(ulog_base_nbytes(ulog), ulog->capacity); + + if (nbytes == 0) + return 0; + + if (verify_checksum && !ulog_checksum(ulog, nbytes, 0)) + return 0; + + return 1; +} + +/* + * ulog_check_entry -- + * (internal) checks consistency of a single ulog entry + */ +static int +ulog_check_entry(struct ulog_entry_base *e, void *arg, const struct mo_ops *p_ops) +{ + uint64_t offset = ulog_entry_offset(e); + ulog_check_offset_fn check = arg; + + if (!check(p_ops->base, offset)) { + DAV_DBG("ulog %p invalid offset %" PRIu64, + e, e->offset); + return -1; + } + + return offset == 0 ? -1 : 0; +} + +/* + * ulog_check -- (internal) check consistency of ulog entries + */ +int +ulog_check(struct ulog *ulog, ulog_check_offset_fn check, const struct mo_ops *p_ops) +{ + DAV_DBG("ulog %p", ulog); + + return ulog_foreach_entry(ulog, + ulog_check_entry, check, p_ops); +} diff --git a/src/common/dav_v2/ulog.h b/src/common/dav_v2/ulog.h new file mode 100644 index 00000000000..0873dfdeb64 --- /dev/null +++ b/src/common/dav_v2/ulog.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2021, Intel Corporation */ + +/* + * ulog.h -- unified log public interface + */ + +#ifndef __DAOS_COMMON_ULOG_H +#define __DAOS_COMMON_ULOG_H 1 + +#include <stddef.h> +#include <stdint.h> + +#include "util.h" +#include "vec.h" +#include "mo_wal.h" + +struct ulog_entry_base { + uint64_t offset; /* offset with operation type flag */ +}; + +/* + * ulog_entry_val -- log entry + */ +struct ulog_entry_val { + struct ulog_entry_base base; + uint64_t value; /* value to be applied */ +}; + +/* + * ulog_entry_buf - ulog buffer entry + */ +struct ulog_entry_buf { + struct ulog_entry_base base; /* offset with operation type flag */ + uint64_t checksum; /* checksum of the entire log entry */ + uint64_t size; /* size of the buffer to be modified */ + uint8_t data[]; /* content to fill in */ +}; + +#define ULOG_UNUSED ((CACHELINE_SIZE - 40) / 8) +/* + * This structure *must* be located at a cacheline boundary. To achieve this, + * the next field is always allocated with extra padding, and then the offset + * is additionally aligned. 
+ */ +#define ULOG(capacity_bytes) {\ + /* 64 bytes of metadata */\ + uint64_t checksum; /* checksum of ulog header and its entries */\ + struct ulog *next; /* offset of ulog extension */\ + uint64_t capacity; /* capacity of this ulog in bytes */\ + uint64_t gen_num; /* generation counter */\ + uint64_t flags; /* ulog flags */\ + uint64_t unused[ULOG_UNUSED]; /* must be 0 */\ + uint8_t data[capacity_bytes]; /* N bytes of data */\ +} + +#define SIZEOF_ULOG(base_capacity)\ +(sizeof(struct ulog) + base_capacity) + +/* + * Ulog buffer allocated by the user must be marked by this flag. + * It is important to not free it at the end: + * what user has allocated - user should free himself. + */ +#define ULOG_USER_OWNED (1U << 0) + +/* use this for allocations of aligned ulog extensions */ +#define SIZEOF_ALIGNED_ULOG(base_capacity)\ +ALIGN_UP(SIZEOF_ULOG(base_capacity + (2 * CACHELINE_SIZE)), CACHELINE_SIZE) + +struct ulog ULOG(0); + +VEC(ulog_next, struct ulog *); + +typedef uint64_t ulog_operation_type; + +#define ULOG_OPERATION_SET (0b000ULL << 61ULL) +#ifdef WAL_SUPPORTS_AND_OR_OPS +#define ULOG_OPERATION_AND (0b001ULL << 61ULL) +#define ULOG_OPERATION_OR (0b010ULL << 61ULL) +#else +#define ULOG_OPERATION_CLR_BITS (0b001ULL << 61ULL) +#define ULOG_OPERATION_SET_BITS (0b010ULL << 61ULL) +#endif +#define ULOG_OPERATION_BUF_SET (0b101ULL << 61ULL) +#define ULOG_OPERATION_BUF_CPY (0b110ULL << 61ULL) + +#ifndef WAL_SUPPORTS_AND_OR_OPS +#endif + +#ifdef WAL_SUPPORTS_AND_OR_OPS +#define ULOG_ENTRY_IS_BIT_OP(opc) ((opc == ULOG_OPERATION_AND) || \ + (opc == ULOG_OPERATION_OR)) +#else +#define ULOG_ENTRY_IS_BIT_OP(opc) ((opc == ULOG_OPERATION_CLR_BITS) || \ + (opc == ULOG_OPERATION_SET_BITS)) +#define ULOG_ENTRY_OPS_POS 16 /* bits' pos at value:16 */ +#define ULOG_ENTRY_OPS_BITS_MASK ((1ULL << ULOG_ENTRY_OPS_POS) - 1) +#define ULOG_ENTRY_VAL_TO_BITS(val) ((val) & ULOG_ENTRY_OPS_BITS_MASK) +#define ULOG_ENTRY_VAL_TO_POS(val) ((val) >> ULOG_ENTRY_OPS_POS) +#define ULOG_ENTRY_OPS_POS_MASK (RUN_BITS_PER_VALUE - 1ULL) +#define ULOG_ENTRY_TO_VAL(pos, nbits) (((uint64_t)(nbits) & ULOG_ENTRY_OPS_BITS_MASK) | \ + ((pos) & ULOG_ENTRY_OPS_POS_MASK) << ULOG_ENTRY_OPS_POS) +#endif + +/* immediately frees all associated ulog structures */ +#define ULOG_FREE_AFTER_FIRST (1U << 0) +/* increments gen_num of the first, preallocated, ulog */ +#define ULOG_INC_FIRST_GEN_NUM (1U << 1) + +typedef int (*ulog_check_offset_fn)(void *ctx, uint64_t offset); +typedef int (*ulog_extend_fn)(struct ulog **, uint64_t); +typedef int (*ulog_entry_cb)(struct ulog_entry_base *e, void *arg, + const struct mo_ops *p_ops); +typedef void (*ulog_free_fn)(struct ulog *ptr); + +struct ulog *ulog_next(struct ulog *ulog); + +void ulog_construct(uint64_t offset, size_t capacity, uint64_t gen_num, + int flush, uint64_t flags, const struct mo_ops *p_ops); +void ulog_construct_new(struct ulog *ulog, size_t capacity, uint64_t gen_num, + uint64_t flags); + +size_t ulog_capacity(struct ulog *ulog, size_t ulog_base_bytes); +void ulog_rebuild_next_vec(struct ulog *ulog, struct ulog_next *next); + +int ulog_foreach_entry(struct ulog *ulog, + ulog_entry_cb cb, void *arg, const struct mo_ops *ops); + +int ulog_reserve(struct ulog *ulog, + size_t ulog_base_nbytes, size_t gen_num, + int auto_reserve, size_t *new_capacity_bytes, + ulog_extend_fn extend, struct ulog_next *next); + +int ulog_free_next(struct ulog *u, ulog_free_fn ulog_free); +void ulog_clobber(struct ulog *dest, struct ulog_next *next); +int ulog_clobber_data(struct ulog *dest, + struct ulog_next 
*next, ulog_free_fn ulog_free, unsigned flags); +void ulog_clobber_entry(const struct ulog_entry_base *e); + +void ulog_process(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops); + +size_t ulog_base_nbytes(struct ulog *ulog); +int ulog_recovery_needed(struct ulog *ulog, int verify_checksum); + +uint64_t ulog_entry_offset(const struct ulog_entry_base *entry); +ulog_operation_type ulog_entry_type(const struct ulog_entry_base *entry); + +struct ulog_entry_val * +ulog_entry_val_create(struct ulog *ulog, size_t offset, uint64_t *dest, uint64_t value, + ulog_operation_type type, const struct mo_ops *p_ops); + +struct ulog_entry_buf * +ulog_entry_buf_create(struct ulog *ulog, size_t offset, uint64_t gen_num, + uint64_t *dest, const void *src, uint64_t size, + ulog_operation_type type, const struct mo_ops *p_ops); + +void ulog_entry_apply(const struct ulog_entry_base *e, int persist, + const struct mo_ops *p_ops); + +size_t ulog_entry_size(const struct ulog_entry_base *entry); + +int ulog_check(struct ulog *ulog, ulog_check_offset_fn check, + const struct mo_ops *p_ops); + +#endif /* __DAOS_COMMON_ULOG_H */ diff --git a/src/common/dav_v2/util.c b/src/common/dav_v2/util.c new file mode 100644 index 00000000000..5ef73b0577d --- /dev/null +++ b/src/common/dav_v2/util.c @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2022, Intel Corporation */ + +/* + * util.c -- very basic utilities + */ + +#include <stdlib.h> +#include <string.h> +#include <endian.h> + +#include "util.h" +#include "valgrind_internal.h" + + +#if ANY_VG_TOOL_ENABLED +/* Initialized to true if the process is running inside Valgrind. */ +unsigned _On_valgrind; +#endif + +#if VG_HELGRIND_ENABLED +/* Initialized to true if the process is running inside Valgrind helgrind. */ +unsigned _On_helgrind; +#endif + +#if VG_DRD_ENABLED +/* Initialized to true if the process is running inside Valgrind drd. */ +unsigned _On_drd; +#endif + +#if VG_HELGRIND_ENABLED || VG_DRD_ENABLED +/* Initialized to true if the process is running inside Valgrind drd or hg. */ +unsigned _On_drd_or_hg; +#endif + +#if VG_MEMCHECK_ENABLED +/* Initialized to true if the process is running inside Valgrind memcheck. */ +unsigned _On_memcheck; +#endif + +#if VG_TXINFO_ENABLED +/* true if DAV API and TX-related messages has to be enabled in Valgrind log. */ +int _Vg_txinfo_emit; +#endif /* VG_TXINFO_ENABLED */ + +/* + * util_is_zeroed -- check if given memory range is all zero + */ +int +util_is_zeroed(const void *addr, size_t len) +{ + const char *a = addr; + + if (len == 0) + return 1; + + if (a[0] == 0 && memcmp(a, a + 1, len - 1) == 0) + return 1; + + return 0; +} + +/* + * util_checksum_compute -- compute Fletcher64-like checksum + * + * csump points to where the checksum lives, so that location + * is treated as zeros while calculating the checksum. The + * checksummed data is assumed to be in little endian order. 
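+ * Concretely, for each 32-bit little-endian word w the running sums are updated + * as lo32 += w and hi32 += lo32, and the result is ((uint64_t)hi32 << 32) | lo32; + * words at or beyond skip_off (when nonzero) are likewise treated as zeros.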
+ */ +uint64_t +util_checksum_compute(void *addr, size_t len, uint64_t *csump, size_t skip_off) +{ + if (len % 4 != 0) + abort(); + + uint32_t *p32 = addr; + uint32_t *p32end = (uint32_t *)((char *)addr + len); + uint32_t *skip; + uint32_t lo32 = 0; + uint32_t hi32 = 0; + + if (skip_off) + skip = (uint32_t *)((char *)addr + skip_off); + else + skip = (uint32_t *)((char *)addr + len); + + while (p32 < p32end) + if (p32 == (uint32_t *)csump || p32 >= skip) { + /* lo32 += 0; treat first 32-bits as zero */ + p32++; + hi32 += lo32; + /* lo32 += 0; treat second 32-bits as zero */ + p32++; + hi32 += lo32; + } else { + lo32 += le32toh(*p32); + ++p32; + hi32 += lo32; + } + + return (uint64_t)hi32 << 32 | lo32; +} + +/* + * util_checksum -- compute Fletcher64-like checksum + * + * csump points to where the checksum lives, so that location + * is treated as zeros while calculating the checksum. + * If insert is true, the calculated checksum is inserted into + * the range at *csump. Otherwise the calculated checksum is + * checked against *csump and the result returned (true means + * the range checksummed correctly). + */ +int +util_checksum(void *addr, size_t len, uint64_t *csump, + int insert, size_t skip_off) +{ + uint64_t csum = util_checksum_compute(addr, len, csump, skip_off); + + if (insert) { + *csump = htole64(csum); + return 1; + } + + return *csump == htole64(csum); +} + +/* + * util_checksum_seq -- compute sequential Fletcher64-like checksum + * + * Merges checksum from the old buffer with checksum for current buffer. + */ +uint64_t +util_checksum_seq(const void *addr, size_t len, uint64_t csum) +{ + if (len % 4 != 0) + abort(); + const uint32_t *p32 = addr; + const uint32_t *p32end = (const uint32_t *)((const char *)addr + len); + uint32_t lo32 = (uint32_t)csum; + uint32_t hi32 = (uint32_t)(csum >> 32); + + while (p32 < p32end) { + lo32 += le32toh(*p32); + ++p32; + hi32 += lo32; + } + return (uint64_t)hi32 << 32 | lo32; +} + +/* + * util_init -- initialize the utils + * + * This is called from the library initialization code. + */ +#if ANY_VG_TOOL_ENABLED +__attribute__((constructor)) +static void +_util_init(void) +{ + util_init(); +} +#endif + +void +util_init(void) +{ +#if ANY_VG_TOOL_ENABLED + _On_valgrind = RUNNING_ON_VALGRIND; +#endif + +#if VG_MEMCHECK_ENABLED + if (_On_valgrind) { + unsigned tmp; + unsigned result; + unsigned res = VALGRIND_GET_VBITS(&tmp, &result, sizeof(tmp)); + + _On_memcheck = res ? 1 : 0; + } else { + _On_memcheck = 0; + } +#endif + +#if VG_DRD_ENABLED + if (_On_valgrind) + _On_drd = DRD_GET_DRD_THREADID ? 1 : 0; + else + _On_drd = 0; +#endif + +#if VG_HELGRIND_ENABLED + if (_On_valgrind) { + unsigned tmp; + unsigned result; + /* + * As of now (pmem-3.15) VALGRIND_HG_GET_ABITS is broken on + * the upstream version of Helgrind headers. It generates + * a sign-conversion error and actually returns UINT32_MAX-1 + * when not running under Helgrind. + */ + long res = VALGRIND_HG_GET_ABITS(&tmp, &result, sizeof(tmp)); + + _On_helgrind = res != -2 ? 
1 : 0; + } else { + _On_helgrind = 0; + } +#endif + +#if VG_DRD_ENABLED || VG_HELGRIND_ENABLED + _On_drd_or_hg = (unsigned)(On_helgrind + On_drd); +#endif + +#if VG_TXINFO_ENABLED + if (_On_valgrind) { + char *txinfo_env = secure_getenv("D_DAV_VG_TXINFO"); + + if (txinfo_env) + _Vg_txinfo_emit = atoi(txinfo_env); + } else { + _Vg_txinfo_emit = 0; + } +#endif +} diff --git a/src/common/dav_v2/util.h b/src/common/dav_v2/util.h new file mode 100644 index 00000000000..f1e12321918 --- /dev/null +++ b/src/common/dav_v2/util.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2021, Intel Corporation */ +/* + * Copyright (c) 2016-2020, Microsoft Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * util.h -- internal definitions for util module + */ + +#ifndef __DAOS_COMMON_UTIL_H +#define __DAOS_COMMON_UTIL_H 1 + +#include <string.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <ctype.h> +#include <stdatomic.h> +#include <sys/param.h> + +#if defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || \ + defined(__riscv) +#define PAGESIZE 4096 +#elif defined(__PPC64__) +#define PAGESIZE 65536 +#else +#error unable to recognize ISA at compile time +#endif + +#if defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || \ + defined(__riscv) +#define CACHELINE_SIZE 64ULL +#elif defined(__PPC64__) +#define CACHELINE_SIZE 128ULL +#else +#error unable to recognize architecture at compile time +#endif + +#define ALIGN_UP(size, align) (((size) + (align) - 1) & ~((align) - 1)) +#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1)) + +void util_init(void); +int util_is_zeroed(const void *addr, size_t len); +uint64_t util_checksum_compute(void *addr, size_t len, uint64_t *csump, + size_t skip_off); +int util_checksum(void *addr, size_t len, uint64_t *csump, + int insert, size_t skip_off); +uint64_t util_checksum_seq(const void *addr, size_t len, uint64_t csum); + +#define force_inline __attribute__((always_inline)) inline + +typedef uint64_t ua_uint64_t __attribute__((aligned(1))); +typedef uint32_t ua_uint32_t __attribute__((aligned(1))); +typedef uint16_t ua_uint16_t __attribute__((aligned(1))); + +/* + * util_div_ceil -- divides a by b and rounds up the result + */ +static force_inline unsigned +util_div_ceil(unsigned a, unsigned b) +{ + return (unsigned)(((unsigned long)a + b - 1) / b); +} + +/* + * util_bool_compare_and_swap -- perform an atomic compare and swap + * util_fetch_and_* -- perform an operation atomically, return old value + * util_popcount -- count number of set bits + * util_lssb_index -- return index of least significant set bit, + * undefined on zero + * util_mssb_index -- return index of most significant set bit + * undefined on zero + * + * XXX assertions needed on (value != 0) in both versions of bitscans + * + */ + +/* + * ISO C11 -- 7.17.7.2 The atomic_load generic functions + * Integer width specific versions as supplement for: + * + * + * #include <stdatomic.h> + * C atomic_load(volatile A *object); + * C atomic_load_explicit(volatile A *object, memory_order order); + * + * The atomic_load interface doesn't return the loaded value, but instead + * copies it to a specified address. + * + * void util_atomic_load64(volatile A *object, A *destination); + * void util_atomic_load_explicit32(volatile A *object, A *destination, + * memory_order order); + * void util_atomic_load_explicit64(volatile A *object, A *destination, + * memory_order order); + * Also, instead of generic functions, two versions are available: + * for 32 bit fundamental integers, and for 64 bit ones. 
+ */ + +#define util_atomic_load_explicit32 __atomic_load +#define util_atomic_load_explicit64 __atomic_load + +/* ISO C11 -- 7.17.7.1 The atomic_store generic functions */ +/* + * ISO C11 -- 7.17.7.1 The atomic_store generic functions + * Integer width specific versions as supplement for: + * + * #include <stdatomic.h> + * void atomic_store(volatile A *object, C desired); + * void atomic_store_explicit(volatile A *object, C desired, + * memory_order order); + */ +#define util_atomic_store_explicit32 __atomic_store_n +#define util_atomic_store_explicit64 __atomic_store_n + +/* + * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html + * https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html + * https://clang.llvm.org/docs/LanguageExtensions.html#builtin-functions + */ +#define util_bool_compare_and_swap64 __sync_bool_compare_and_swap +#define util_fetch_and_add64 __sync_fetch_and_add +#define util_fetch_and_sub64 __sync_fetch_and_sub +#define util_popcount64(value) ((unsigned char)__builtin_popcountll(value)) + +#define util_lssb_index64(value) ((unsigned char)__builtin_ctzll(value)) +#define util_mssb_index64(value) ((unsigned char)(63 - __builtin_clzll(value))) + +/* ISO C11 -- 7.17.7 Operations on atomic types */ +#define util_atomic_load64(object, dest)\ + util_atomic_load_explicit64(object, dest, memory_order_seq_cst) + +#define COMPILE_ERROR_ON(cond) ((void)sizeof(char[(cond) ? -1 : 1])) + +/* macro for counting the number of varargs (up to 9) */ +#define COUNT(...)\ + COUNT_11TH(_, ##__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#define COUNT_11TH(_11, _10, _9, _8, _7, _6, _5, _4, _3, _2, X, ...) X + +/* concatenation macro */ +#define GLUE(A, B) GLUE_I(A, B) +#define GLUE_I(A, B) A##B + +/* macro for suppressing errors from unused variables (zero to 9) */ +#define SUPPRESS_UNUSED(...)\ + GLUE(SUPPRESS_ARG_, COUNT(__VA_ARGS__))(__VA_ARGS__) +#define SUPPRESS_ARG_0(X) +#define SUPPRESS_ARG_1(X) ((void)(X)) +#define SUPPRESS_ARG_2(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_1(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_3(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_2(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_4(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_3(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_5(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_4(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_6(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_5(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_7(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_6(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_8(X, ...) do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_7(__VA_ARGS__);\ +} while (0) +#define SUPPRESS_ARG_9(X, ...) 
do {\ + SUPPRESS_ARG_1(X); SUPPRESS_ARG_8(__VA_ARGS__);\ +} while (0) + +#endif /* __DAOS_COMMON_UTIL_H */ diff --git a/src/common/dav_v2/valgrind_internal.h b/src/common/dav_v2/valgrind_internal.h new file mode 100644 index 00000000000..57253b9bac0 --- /dev/null +++ b/src/common/dav_v2/valgrind_internal.h @@ -0,0 +1,293 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2015-2021, Intel Corporation */ + +/* + * valgrind_internal.h -- internal definitions for valgrind macros + */ + +#ifndef __DAOS_COMMON_VALGRIND_INTERNAL_H +#define __DAOS_COMMON_VALGRIND_INTERNAL_H 1 + +#ifdef D_HAS_VALGRIND +#if !defined(_WIN32) && !defined(__FreeBSD__) && !defined(__riscv) +#define VG_TXINFO_ENABLED 1 +#define VG_HELGRIND_ENABLED 1 +#define VG_MEMCHECK_ENABLED 1 +#define VG_DRD_ENABLED 1 +#endif +#endif + +#if VG_TXINFO_ENABLED || VG_HELGRIND_ENABLED || VG_MEMCHECK_ENABLED || \ + VG_DRD_ENABLED +#define ANY_VG_TOOL_ENABLED 1 +#else +#define ANY_VG_TOOL_ENABLED 0 +#endif + +#if ANY_VG_TOOL_ENABLED +extern unsigned _On_valgrind; +#define On_valgrind __builtin_expect(_On_valgrind, 0) +#include "valgrind/valgrind.h" +#else +#define On_valgrind (0) +#endif + +#if VG_HELGRIND_ENABLED +extern unsigned _On_helgrind; +#define On_helgrind __builtin_expect(_On_helgrind, 0) +#include "valgrind/helgrind.h" +#else +#define On_helgrind (0) +#endif + +#if VG_DRD_ENABLED +extern unsigned _On_drd; +#define On_drd __builtin_expect(_On_drd, 0) +#include "valgrind/drd.h" +#else +#define On_drd (0) +#endif + +#if VG_HELGRIND_ENABLED || VG_DRD_ENABLED + +extern unsigned _On_drd_or_hg; +#define On_drd_or_hg __builtin_expect(_On_drd_or_hg, 0) + +#define VALGRIND_ANNOTATE_HAPPENS_BEFORE(obj) do {\ + if (On_drd_or_hg) \ + ANNOTATE_HAPPENS_BEFORE((obj));\ +} while (0) + +#define VALGRIND_ANNOTATE_HAPPENS_AFTER(obj) do {\ + if (On_drd_or_hg) \ + ANNOTATE_HAPPENS_AFTER((obj));\ +} while (0) + +#define VALGRIND_ANNOTATE_NEW_MEMORY(addr, size) do {\ + if (On_drd_or_hg) \ + ANNOTATE_NEW_MEMORY((addr), (size));\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_BEGIN() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_READS_BEGIN();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_END() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_READS_END();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_BEGIN() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_WRITES_BEGIN();\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_END() do {\ + if (On_drd_or_hg) \ + ANNOTATE_IGNORE_WRITES_END();\ +} while (0) + +/* Supported by both helgrind and drd. 
*/ +#define VALGRIND_HG_DRD_DISABLE_CHECKING(addr, size) do {\ + if (On_drd_or_hg) \ + VALGRIND_HG_DISABLE_CHECKING((addr), (size));\ +} while (0) + +#else + +#define On_drd_or_hg (0) + +#define VALGRIND_ANNOTATE_HAPPENS_BEFORE(obj) { (void)(obj); } + +#define VALGRIND_ANNOTATE_HAPPENS_AFTER(obj) { (void)(obj); } + +#define VALGRIND_ANNOTATE_NEW_MEMORY(addr, size) do {\ + (void) (addr);\ + (void) (size);\ +} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_BEGIN() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_READS_END() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_BEGIN() do {} while (0) + +#define VALGRIND_ANNOTATE_IGNORE_WRITES_END() do {} while (0) + +#define VALGRIND_HG_DRD_DISABLE_CHECKING(addr, size) do {\ + (void) (addr);\ + (void) (size);\ +} while (0) + +#endif + +#if VG_TXINFO_ENABLED + +extern int _Vg_txinfo_emit; +#define VG_txinfo_emit __builtin_expect(_Vg_txinfo_emit, 0) + +void util_emit_log(const char *func, int order); + +#define VALGRIND_SET_CLEAN(addr, len) do {\ + (void)(addr);\ + (void)(len);\ +} while (0) + +#define VALGRIND_START_TX do {} while (0) + +#define VALGRIND_END_TX do {} while (0) + +#define VALGRIND_ADD_TO_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_REMOVE_FROM_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +/* + * Logs library and function name with proper suffix + * to VG log file. + */ +#define DAV_API_START() do {\ + if (VG_txinfo_emit)\ + VALGRIND_PRINTF("%s BEGIN\n", __func__);\ +} while (0) +#define DAV_API_END() do {\ + if (VG_txinfo_emit)\ + VALGRIND_PRINTF("%s END\n", __func__);\ +} while (0) + +#else /* VG_TXINFO_ENABLED */ + +#define VG_txinfo_emit (0) + +#define VALGRIND_SET_CLEAN(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_START_TX do {} while (0) + +#define VALGRIND_END_TX do {} while (0) + +#define VALGRIND_ADD_TO_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_REMOVE_FROM_TX(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(addr, len) do {\ + (void) (addr);\ + (void) (len);\ +} while (0) + +#define DAV_API_START() do {} while (0) + +#define DAV_API_END() do {} while (0) + +#endif /* VG_TXINFO_ENABLED */ + +#if VG_MEMCHECK_ENABLED + +extern unsigned _On_memcheck; +#define On_memcheck __builtin_expect(_On_memcheck, 0) + +#include "valgrind/memcheck.h" + +#define VALGRIND_DO_DISABLE_ERROR_REPORTING do {\ + if (On_valgrind)\ + VALGRIND_DISABLE_ERROR_REPORTING;\ +} while (0) + +#define VALGRIND_DO_ENABLE_ERROR_REPORTING do {\ + if (On_valgrind)\ + VALGRIND_ENABLE_ERROR_REPORTING;\ +} while (0) + +#define VALGRIND_DO_CREATE_MEMPOOL(heap, rzB, is_zeroed) do {\ + if (On_memcheck)\ + VALGRIND_CREATE_MEMPOOL(heap, rzB, is_zeroed);\ +} while (0) + +#define VALGRIND_DO_DESTROY_MEMPOOL(heap) do {\ + if (On_memcheck)\ + VALGRIND_DESTROY_MEMPOOL(heap);\ +} while (0) + +#define VALGRIND_DO_MEMPOOL_ALLOC(heap, addr, size) do {\ + if (On_memcheck)\ + VALGRIND_MEMPOOL_ALLOC(heap, addr, size);\ +} while (0) + +#define VALGRIND_DO_MEMPOOL_FREE(heap, addr) do {\ + if (On_memcheck)\ + VALGRIND_MEMPOOL_FREE(heap, addr);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_DEFINED(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_MAKE_MEM_DEFINED(addr, len);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_UNDEFINED(addr, len) do {\ + if 
(On_memcheck)\ + VALGRIND_MAKE_MEM_UNDEFINED(addr, len);\ +} while (0) + +#define VALGRIND_DO_MAKE_MEM_NOACCESS(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_MAKE_MEM_NOACCESS(addr, len);\ +} while (0) + +#define VALGRIND_DO_CHECK_MEM_IS_ADDRESSABLE(addr, len) do {\ + if (On_memcheck)\ + VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, len);\ +} while (0) + +#else /* VG_MEMCHECK_ENABLED */ + +#define On_memcheck (0) + +#define VALGRIND_DO_DISABLE_ERROR_REPORTING do {} while (0) + +#define VALGRIND_DO_ENABLE_ERROR_REPORTING do {} while (0) + +#define VALGRIND_DO_CREATE_MEMPOOL(heap, rzB, is_zeroed)\ + do { (void) (heap); (void) (rzB); (void) (is_zeroed); } while (0) + +#define VALGRIND_DO_DESTROY_MEMPOOL(heap) { (void) (heap); } + +#define VALGRIND_DO_MEMPOOL_ALLOC(heap, addr, size)\ + do { (void) (heap); (void) (addr); (void) (size); } while (0) + +#define VALGRIND_DO_MEMPOOL_FREE(heap, addr)\ + do { (void) (heap); (void) (addr); } while (0) + +#define VALGRIND_DO_MAKE_MEM_DEFINED(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_MAKE_MEM_UNDEFINED(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_MAKE_MEM_NOACCESS(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#define VALGRIND_DO_CHECK_MEM_IS_ADDRESSABLE(addr, len)\ + do { (void) (addr); (void) (len); } while (0) + +#endif /* VG_MEMCHECK_ENABLED */ + +#endif /* __DAOS_COMMON_VALGRIND_INTERNAL_H */ diff --git a/src/common/dav_v2/vec.h b/src/common/dav_v2/vec.h new file mode 100644 index 00000000000..14bbe667687 --- /dev/null +++ b/src/common/dav_v2/vec.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2017-2020, Intel Corporation */ + +/* + * vec.h -- vector interface + */ + +#ifndef __DAOS_COMMON_VEC_H +#define __DAOS_COMMON_VEC_H 1 + +#include <stddef.h> +#include "valgrind_internal.h" +#include "util.h" +#include "out.h" + +#define VEC_INIT_SIZE (64) + +#define VEC(name, type)\ +struct name {\ + type *buffer;\ + size_t size;\ + size_t capacity;\ +} + +#define VEC_INITIALIZER {NULL, 0, 0} + +#define VEC_INIT(vec) do {\ + (vec)->buffer = NULL;\ + (vec)->size = 0;\ + (vec)->capacity = 0;\ +} while (0) + +#define VEC_MOVE(vecl, vecr) do {\ + D_FREE((vecl)->buffer);\ + (vecl)->buffer = (vecr)->buffer;\ + (vecl)->size = (vecr)->size;\ + (vecl)->capacity = (vecr)->capacity;\ + (vecr)->buffer = NULL;\ + (vecr)->size = 0;\ + (vecr)->capacity = 0;\ +} while (0) + +#define VEC_REINIT(vec) do {\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec), sizeof(*vec));\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec)->buffer,\ + (sizeof(*(vec)->buffer) * ((vec)->capacity)));\ + (vec)->size = 0;\ +} while (0) + +static inline int +vec_reserve(void *vec, size_t ncapacity, size_t s) +{ + void *tbuf; + size_t ncap = ncapacity == 0 ? 
VEC_INIT_SIZE : ncapacity; + + VEC(vvec, void) *vecp = (struct vvec *)vec; + + D_REALLOC_NZ(tbuf, vecp->buffer, s * ncap); + if (tbuf == NULL) { + D_CRIT("Realloc!\n"); + return -1; + } + vecp->buffer = tbuf; + vecp->capacity = ncap; + return 0; +} + +#define VEC_RESERVE(vec, ncapacity)\ +(((vec)->size == 0 || (ncapacity) > (vec)->size) ?\ + vec_reserve((void *)vec, ncapacity, sizeof(*(vec)->buffer)) :\ + 0) + +#define VEC_POP_BACK(vec) ((vec)->size -= 1) + +#define VEC_FRONT(vec) ((vec)->buffer[0]) + +#define VEC_BACK(vec) ((vec)->buffer[(vec)->size - 1]) + +#define VEC_ERASE_BY_POS(vec, pos) do {\ + if ((pos) != ((vec)->size - 1))\ + (vec)->buffer[(pos)] = VEC_BACK(vec);\ + VEC_POP_BACK(vec);\ +} while (0) + +#define VEC_ERASE_BY_PTR(vec, element) do {\ + if ((element) != &VEC_BACK(vec))\ + *(element) = VEC_BACK(vec);\ + VEC_POP_BACK(vec);\ +} while (0) + +#define VEC_INSERT(vec, element)\ +((vec)->buffer[(vec)->size - 1] = (element), 0) + +#define VEC_INC_SIZE(vec)\ +(((vec)->size++), 0) + +#define VEC_INC_BACK(vec)\ +((vec)->capacity == (vec)->size ? \ + (VEC_RESERVE((vec), ((vec)->capacity * 2)) == 0 ? \ + VEC_INC_SIZE(vec) : -1) : \ + VEC_INC_SIZE(vec)) + +#define VEC_PUSH_BACK(vec, element)\ +(VEC_INC_BACK(vec) == 0 ? VEC_INSERT(vec, element) : -1) + +#define VEC_FOREACH(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < (vec)->size && (((el) = (vec)->buffer[_vec_i]), 1);\ + ++_vec_i) + +#define VEC_FOREACH_REVERSE(el, vec)\ +for (size_t _vec_i = ((vec)->size);\ + _vec_i != 0 && (((el) = (vec)->buffer[_vec_i - 1]), 1);\ + --_vec_i) + +#define VEC_FOREACH_BY_POS(elpos, vec)\ +for ((elpos) = 0; (elpos) < (vec)->size; ++(elpos)) + +#define VEC_FOREACH_BY_PTR(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < (vec)->size && (((el) = &(vec)->buffer[_vec_i]), 1);\ + ++_vec_i) + +#define VEC_SIZE(vec)\ +((vec)->size) + +#define VEC_CAPACITY(vec)\ +((vec)->capacity) + +#define VEC_ARR(vec)\ +((vec)->buffer) + +#define VEC_GET(vec, id)\ +(&(vec)->buffer[id]) + +#define VEC_CLEAR(vec) ((vec)->size = 0) + +#define VEC_DELETE(vec) do {\ + D_FREE((vec)->buffer);\ + (vec)->buffer = NULL;\ + (vec)->size = 0;\ + (vec)->capacity = 0;\ +} while (0) + +#endif /* __DAOS_COMMON_VEC_H */ diff --git a/src/common/dav_v2/vecq.h b/src/common/dav_v2/vecq.h new file mode 100644 index 00000000000..8af909439e0 --- /dev/null +++ b/src/common/dav_v2/vecq.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2018-2020, Intel Corporation */ + +/* + * vecq.h -- vector queue (FIFO) interface + */ + +#ifndef __DAOS_COMMON_VECQ_H +#define __DAOS_COMMON_VECQ_H 1 + +#include <stddef.h> + +#include "util.h" +#include "out.h" + +#define VECQ_INIT_SIZE (64) + +#define VECQ(name, type)\ +struct name {\ + type *buffer;\ + size_t capacity;\ + size_t front;\ + size_t back;\ +} + +#define VECQ_INIT(vec) do {\ + (vec)->buffer = NULL;\ + (vec)->capacity = 0;\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_REINIT(vec) do {\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec), sizeof(*vec));\ + VALGRIND_ANNOTATE_NEW_MEMORY((vec)->buffer,\ + (sizeof(*(vec)->buffer) * ((vec)->capacity)));\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_FRONT_POS(vec)\ +((vec)->front & ((vec)->capacity - 1)) + +#define VECQ_BACK_POS(vec)\ +((vec)->back & ((vec)->capacity - 1)) + +#define VECQ_FRONT(vec)\ +((vec)->buffer[VECQ_FRONT_POS(vec)]) + +#define VECQ_BACK(vec) ((vec)->buffer[VECQ_BACK_POS(vec)]) + +#define VECQ_DEQUEUE(vec)\ +((vec)->buffer[(((vec)->front++) & ((vec)->capacity - 1))]) + +#define 
VECQ_SIZE(vec)\ +((vec)->back - (vec)->front) + +static inline int +realloc_set(void **buf, size_t s) +{ + void *tbuf; + + D_REALLOC_NZ(tbuf, *buf, s); + if (tbuf == NULL) { + D_CRIT("Realloc!\n"); + return -1; + } + *buf = tbuf; + return 0; +} + +#define VECQ_NCAPACITY(vec)\ +((vec)->capacity == 0 ? VECQ_INIT_SIZE : (vec)->capacity * 2) +#define VECQ_GROW(vec)\ +(realloc_set((void **)&(vec)->buffer,\ + VECQ_NCAPACITY(vec) * sizeof(*(vec)->buffer)) ? -1 :\ + (memcpy((vec)->buffer + (vec)->capacity, (vec)->buffer,\ + VECQ_FRONT_POS(vec) * sizeof(*(vec)->buffer)),\ + (vec)->front = VECQ_FRONT_POS(vec),\ + (vec)->back = (vec)->front + (vec)->capacity,\ + (vec)->capacity = VECQ_NCAPACITY(vec),\ + 0\ +)) + +#define VECQ_INSERT(vec, element)\ +(VECQ_BACK(vec) = element, (vec)->back += 1, 0) + +#define VECQ_ENQUEUE(vec, element)\ +((vec)->capacity == VECQ_SIZE(vec) ?\ + (VECQ_GROW(vec) == 0 ? VECQ_INSERT(vec, element) : -1) :\ +VECQ_INSERT(vec, element)) + +#define VECQ_CAPACITY(vec)\ +((vec)->capacity) + +#define VECQ_FOREACH(el, vec)\ +for (size_t _vec_i = 0;\ + _vec_i < VECQ_SIZE(vec) &&\ + (((el) = (vec)->buffer[_vec_i & ((vec)->capacity - 1)]), 1);\ + ++_vec_i) + +#define VECQ_FOREACH_REVERSE(el, vec)\ +for (size_t _vec_i = VECQ_SIZE(vec);\ + _vec_i > 0 &&\ + (((el) = (vec)->buffer[(_vec_i - 1) & ((vec)->capacity - 1)]), 1);\ + --_vec_i) + +#define VECQ_CLEAR(vec) do {\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#define VECQ_DELETE(vec) do {\ + D_FREE((vec)->buffer);\ + (vec)->buffer = NULL;\ + (vec)->capacity = 0;\ + (vec)->front = 0;\ + (vec)->back = 0;\ +} while (0) + +#endif /* __DAOS_COMMON_VECQ_H */ diff --git a/src/common/dav_v2/wal_tx.c b/src/common/dav_v2/wal_tx.c new file mode 100644 index 00000000000..8776127a1f1 --- /dev/null +++ b/src/common/dav_v2/wal_tx.c @@ -0,0 +1,509 @@ +/** + * (C) Copyright 2022-2023 Intel Corporation. 
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#include <daos/mem.h> +#include "dav_internal.h" +#include "wal_tx.h" +#include "util.h" + +struct umem_wal_tx_ops dav_wal_tx_ops; + +static inline uint64_t +mdblob_addr2offset(struct dav_obj *hdl, void *addr) +{ + D_ASSERT(((uintptr_t)addr >= (uintptr_t)hdl->do_base) && + ((uintptr_t)addr <= ((uintptr_t)hdl->do_base + hdl->do_size))); + return (uintptr_t)addr - (uintptr_t)hdl->do_base; +} + +#define AD_TX_ACT_ADD(tx, wa) \ + do { \ + d_list_add_tail(&(wa)->wa_link, &(tx)->wt_redo); \ + (tx)->wt_redo_cnt++; \ + if ((wa)->wa_act.ac_opc == UMEM_ACT_COPY || \ + (wa)->wa_act.ac_opc == UMEM_ACT_COPY_PTR) { \ + (tx)->wt_redo_payload_len += (wa)->wa_act.ac_copy.size; \ + } else if ((wa)->wa_act.ac_opc == UMEM_ACT_MOVE) { \ + /* ac_move src addr is playload after wal_trans_entry */\ + (tx)->wt_redo_payload_len += sizeof(uint64_t); \ + } \ + } while (0) + +/** allocate wal_action, if success the wa_link and wa_act.ac_opc will be init-ed */ +#define D_ALLOC_ACT(wa, opc, size) \ + do { \ + if (opc == UMEM_ACT_COPY) \ + D_ALLOC(wa, offsetof(struct wal_action, \ + wa_act.ac_copy.payload[size])); \ + else \ + D_ALLOC_PTR(wa); \ + if (likely(wa != NULL)) { \ + D_INIT_LIST_HEAD(&wa->wa_link); \ + wa->wa_act.ac_opc = opc; \ + } \ + } while (0) + +static inline void +act_copy_payload(struct umem_action *act, void *addr, daos_size_t size) +{ + char *dst = (char *)&act->ac_copy.payload[0]; + + if (size > 0) + memcpy(dst, addr, size); +} + +static void +dav_wal_tx_init(struct umem_wal_tx *utx, struct dav_obj *dav_hdl) +{ + struct dav_tx *tx = utx2wtx(utx); + + D_INIT_LIST_HEAD(&tx->wt_redo); + tx->wt_redo_cnt = 0; + tx->wt_redo_payload_len = 0; + tx->wt_redo_act_pos = NULL; + tx->wt_dav_hdl = dav_hdl; +} + +struct umem_wal_tx * +dav_umem_wtx_new(struct dav_obj *dav_hdl) +{ + struct umem_wal_tx *umem_wtx; + + D_ASSERT(dav_hdl->do_utx == NULL); + D_ALLOC_PTR(umem_wtx); + if (umem_wtx == NULL) + return NULL; + + umem_wtx->utx_ops = &dav_wal_tx_ops; + umem_wtx->utx_id = ULLONG_MAX; + dav_wal_tx_init(umem_wtx, dav_hdl); + dav_hdl->do_utx = umem_wtx; + return umem_wtx; +} + +void +dav_umem_wtx_cleanup(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + d_list_t *list = &tx->wt_redo; + struct wal_action *wa, *next; + + d_list_for_each_entry_safe(wa, next, list, wa_link) { + d_list_del(&wa->wa_link); + D_FREE(wa); + } +} + +static int +dav_wal_tx_submit(struct dav_obj *dav_hdl, struct umem_wal_tx *utx, void *data) +{ + struct wal_action *wa, *next; + struct umem_action *ua; + struct umem_store *store = dav_hdl->do_store; + struct dav_tx *tx = utx2wtx(utx); + d_list_t *redo_list = &tx->wt_redo; + + char *pathname = basename(dav_hdl->do_path); + uint64_t id = utx->utx_id; + int rc; + + if (wal_tx_act_nr(utx) == 0) + return 0; + + d_list_for_each_entry_safe(wa, next, redo_list, wa_link) { + ua = &wa->wa_act; + switch (ua->ac_opc) { + case UMEM_ACT_COPY: + D_DEBUG(DB_TRACE, + "%s: ACT_COPY txid=%lu, (p,o)=%lu,%lu size=%lu\n", + pathname, id, + ua->ac_copy.addr / PAGESIZE, ua->ac_copy.addr % PAGESIZE, + ua->ac_copy.size); + break; + case UMEM_ACT_COPY_PTR: + D_DEBUG(DB_TRACE, + "%s: ACT_COPY_PTR txid=%lu, (p,o)=%lu,%lu size=%lu ptr=0x%lx\n", + pathname, id, + ua->ac_copy_ptr.addr / PAGESIZE, ua->ac_copy_ptr.addr % PAGESIZE, + ua->ac_copy_ptr.size, ua->ac_copy_ptr.ptr); + break; + case UMEM_ACT_ASSIGN: + D_DEBUG(DB_TRACE, + "%s: ACT_ASSIGN txid=%lu, (p,o)=%lu,%lu size=%u\n", + pathname, id, + ua->ac_assign.addr / PAGESIZE, ua->ac_assign.addr % 
PAGESIZE, + ua->ac_assign.size); + break; + case UMEM_ACT_SET: + D_DEBUG(DB_TRACE, + "%s: ACT_SET txid=%lu, (p,o)=%lu,%lu size=%u val=%u\n", + pathname, id, + ua->ac_set.addr / PAGESIZE, ua->ac_set.addr % PAGESIZE, + ua->ac_set.size, ua->ac_set.val); + break; + case UMEM_ACT_SET_BITS: + D_DEBUG(DB_TRACE, + "%s: ACT_SET_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + pathname, id, + ua->ac_op_bits.addr / PAGESIZE, ua->ac_op_bits.addr % PAGESIZE, + ua->ac_op_bits.pos, ua->ac_op_bits.num); + break; + case UMEM_ACT_CLR_BITS: + D_DEBUG(DB_TRACE, + "%s: ACT_CLR_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + pathname, id, + ua->ac_op_bits.addr / PAGESIZE, ua->ac_op_bits.addr % PAGESIZE, + ua->ac_op_bits.pos, ua->ac_op_bits.num); + break; + default: + D_ERROR("%s: unknown opc %d\n", dav_hdl->do_path, ua->ac_opc); + ASSERT(0); + } + } + DAV_DBG("tx_id:%lu submitting to WAL: %u bytes in %u actions", + id, tx->wt_redo_payload_len, tx->wt_redo_cnt); + rc = store->stor_ops->so_wal_submit(store, utx, data); + return rc; +} + +/** complete the wl transaction */ +int +dav_wal_tx_commit(struct dav_obj *hdl, struct umem_wal_tx *utx, void *data) +{ + int rc; + + /* write actions in redo list to WAL */ + rc = dav_wal_tx_submit(hdl, utx, data); + + /* FAIL the engine if commit fails */ + D_ASSERT(rc == 0); + dav_umem_wtx_cleanup(utx); + return 0; +} + +int +dav_wal_tx_reserve(struct dav_obj *hdl, uint64_t *id) +{ + int rc; + + rc = hdl->do_store->stor_ops->so_wal_reserv(hdl->do_store, id); + /* REVISIT: + * Remove this assert once callers of dav_free() and dav_memcpy_persist() + * are modified to handle failures. + */ + D_ASSERT(rc == 0); + return rc; +} + +/** + * snapshot data from src to either wal redo log. + */ +int +dav_wal_tx_snap(void *hdl, void *addr, daos_size_t size, void *src, uint32_t flags) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + + if (addr == NULL || size == 0 || size > UMEM_ACT_PAYLOAD_MAX_LEN) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), size); + if (rc != 0) + return rc; + + if (flags & DAV_XADD_WAL_CPTR) { + D_ALLOC_ACT(wa_redo, UMEM_ACT_COPY_PTR, size); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_copy_ptr.ptr = (uintptr_t)src; + wa_redo->wa_act.ac_copy_ptr.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_copy_ptr.size = size; + } else { + D_ALLOC_ACT(wa_redo, UMEM_ACT_COPY, size); + if (wa_redo == NULL) + return -DER_NOMEM; + act_copy_payload(&wa_redo->wa_act, src, size); + wa_redo->wa_act.ac_copy.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_copy.size = size; + } + AD_TX_ACT_ADD(tx, wa_redo); + return 0; +} + +/** assign uint64_t value to @addr */ +int +dav_wal_tx_assign(void *hdl, void *addr, uint64_t val) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_ASSIGN, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_assign.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_assign.size = 8; 
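+ /* the full 64-bit value is logged; ac_assign.size is expressed in bytes */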
+ wa_redo->wa_act.ac_assign.val = val; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** Set bits starting from pos */ +int +dav_wal_tx_set_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_SET_BITS, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_op_bits.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_op_bits.num = num_bits; + wa_redo->wa_act.ac_op_bits.pos = pos; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** Clr bits starting from pos */ +int +dav_wal_tx_clr_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + if (addr == NULL) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t)); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_CLR_BITS, sizeof(uint64_t)); + if (wa_redo == NULL) + return -DER_NOMEM; + wa_redo->wa_act.ac_op_bits.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_op_bits.num = num_bits; + wa_redo->wa_act.ac_op_bits.pos = pos; + AD_TX_ACT_ADD(tx, wa_redo); + + return 0; +} + +/** + * memset a storage region, save the operation for redo + */ +int +dav_wal_tx_set(void *hdl, void *addr, char c, daos_size_t size) +{ + struct dav_obj *dav_hdl = (struct dav_obj *)hdl; + struct dav_tx *tx = utx2wtx(dav_hdl->do_utx); + struct wal_action *wa_redo; + int rc; + + D_ASSERT(hdl != NULL); + + if (addr == NULL || size == 0 || size > UMEM_ACT_PAYLOAD_MAX_LEN) + return -DER_INVAL; + + rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id, + mdblob_addr2offset(tx->wt_dav_hdl, addr), size); + if (rc != 0) + return rc; + + D_ALLOC_ACT(wa_redo, UMEM_ACT_SET, size); + if (wa_redo == NULL) + return -DER_NOMEM; + + wa_redo->wa_act.ac_set.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr); + wa_redo->wa_act.ac_set.size = size; + wa_redo->wa_act.ac_set.val = c; + AD_TX_ACT_ADD(tx, wa_redo); + return 0; +} + +/** + * query action number in redo list. + */ +uint32_t +wal_tx_act_nr(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + return tx->wt_redo_cnt; +} + +/** + * query payload length in redo list. + */ +uint32_t +wal_tx_payload_len(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + return tx->wt_redo_payload_len; +} + +/** + * get first action pointer, NULL for list empty. + */ +struct umem_action * +wal_tx_act_first(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + if (d_list_empty(&tx->wt_redo)) { + tx->wt_redo_act_pos = NULL; + return NULL; + } + + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo); + return &tx->wt_redo_act_pos->wa_act; +} + +/** + * get next action pointer, NULL for done or list empty. 
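+ * The iteration cursor is kept in wt_redo_act_pos; wal_tx_act_first() (re)starts + * the walk from the head of the redo list.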
+ */ +struct umem_action * +wal_tx_act_next(struct umem_wal_tx *utx) +{ + struct dav_tx *tx = utx2wtx(utx); + + if (tx->wt_redo_act_pos == NULL) { + if (d_list_empty(&tx->wt_redo)) + return NULL; + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo); + return &tx->wt_redo_act_pos->wa_act; + } + + D_ASSERT(!d_list_empty(&tx->wt_redo)); + tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo_act_pos->wa_link); + if (&tx->wt_redo_act_pos->wa_link == &tx->wt_redo) { + tx->wt_redo_act_pos = NULL; + return NULL; + } + return &tx->wt_redo_act_pos->wa_act; +} + +struct umem_wal_tx_ops dav_wal_tx_ops = { + .wtx_act_nr = wal_tx_act_nr, + .wtx_payload_sz = wal_tx_payload_len, + .wtx_act_first = wal_tx_act_first, + .wtx_act_next = wal_tx_act_next, +}; + +int +dav_wal_replay_cb(uint64_t tx_id, struct umem_action *act, void *arg) +{ + void *src, *dst; + ptrdiff_t off; + uint64_t *p, mask; + daos_size_t size; + int pos, num, val; + int rc = 0; + dav_obj_t *dav_hdl = arg; + void *base = dav_hdl->do_base; + struct umem_store *store = dav_hdl->do_store; + + switch (act->ac_opc) { + case UMEM_ACT_COPY: + D_DEBUG(DB_TRACE, + "ACT_COPY txid=%lu, (p,o)=%lu,%lu size=%lu\n", + tx_id, + act->ac_copy.addr / PAGESIZE, act->ac_copy.addr % PAGESIZE, + act->ac_copy.size); + off = act->ac_copy.addr; + dst = base + off; + src = (void *)&act->ac_copy.payload; + size = act->ac_copy.size; + memcpy(dst, src, size); + break; + case UMEM_ACT_ASSIGN: + D_DEBUG(DB_TRACE, + "ACT_ASSIGN txid=%lu, (p,o)=%lu,%lu size=%u\n", + tx_id, + act->ac_assign.addr / PAGESIZE, act->ac_assign.addr % PAGESIZE, + act->ac_assign.size); + off = act->ac_assign.addr; + dst = base + off; + size = act->ac_assign.size; + ASSERT_rt(size == 1 || size == 2 || size == 4); + src = &act->ac_assign.val; + memcpy(dst, src, size); + break; + case UMEM_ACT_SET: + D_DEBUG(DB_TRACE, + "ACT_SET txid=%lu, (p,o)=%lu,%lu size=%u val=%u\n", + tx_id, + act->ac_set.addr / PAGESIZE, act->ac_set.addr % PAGESIZE, + act->ac_set.size, act->ac_set.val); + off = act->ac_set.addr; + dst = base + off; + size = act->ac_set.size; + val = act->ac_set.val; + memset(dst, val, size); + break; + case UMEM_ACT_SET_BITS: + case UMEM_ACT_CLR_BITS: + D_DEBUG(DB_TRACE, + "ACT_CLR_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n", + tx_id, + act->ac_op_bits.addr / PAGESIZE, act->ac_op_bits.addr % PAGESIZE, + act->ac_op_bits.pos, act->ac_op_bits.num); + off = act->ac_op_bits.addr; + size = sizeof(uint64_t); + p = (uint64_t *)(base + off); + num = act->ac_op_bits.num; + pos = act->ac_op_bits.pos; + ASSERT_rt((pos >= 0) && (pos + num) <= 64); + mask = ((1ULL << num) - 1) << pos; + if (act->ac_opc == UMEM_ACT_SET_BITS) + *p |= mask; + else + *p &= ~mask; + break; + default: + D_ASSERT(0); + break; + } + + if (rc == 0) + rc = umem_cache_touch(store, tx_id, off, size); + + return rc; +} diff --git a/src/common/dav_v2/wal_tx.h b/src/common/dav_v2/wal_tx.h new file mode 100644 index 00000000000..e02759b9b3f --- /dev/null +++ b/src/common/dav_v2/wal_tx.h @@ -0,0 +1,44 @@ +/** + * (C) Copyright 2021-2022 Intel Corporation. 
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ + +#ifndef __DAOS_COMMON_DAV_WAL_TX_ +#define __DAOS_COMMON_DAV_WAL_TX_ + +#include <gurt/list.h> +#include <daos_types.h> +#include <daos/mem.h> + +struct dav_obj; + +struct wal_action { + d_list_t wa_link; + struct umem_action wa_act; +}; + +struct dav_tx { + struct dav_obj *wt_dav_hdl; + d_list_t wt_redo; + uint32_t wt_redo_cnt; + uint32_t wt_redo_payload_len; + struct wal_action *wt_redo_act_pos; +}; +D_CASSERT(sizeof(struct dav_tx) <= UTX_PRIV_SIZE, + "Size of struct dav_tx is too big!"); + +#define dav_action_get_next(it) d_list_entry(it.next, struct wal_action, wa_link) + +struct umem_wal_tx *dav_umem_wtx_new(struct dav_obj *dav_hdl); +void dav_umem_wtx_cleanup(struct umem_wal_tx *utx); +int dav_wal_tx_reserve(struct dav_obj *hdl, uint64_t *id); +int dav_wal_tx_commit(struct dav_obj *hdl, struct umem_wal_tx *utx, void *data); +int dav_wal_tx_snap(void *hdl, void *addr, daos_size_t size, void *src, uint32_t flags); +int dav_wal_tx_assign(void *hdl, void *addr, uint64_t val); +int dav_wal_tx_clr_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits); +int dav_wal_tx_set_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits); +int dav_wal_tx_set(void *hdl, void *addr, char c, daos_size_t size); +int dav_wal_replay_cb(uint64_t tx_id, struct umem_action *act, void *base); + +#endif /*__DAOS_COMMON_DAV_WAL_TX_*/ diff --git a/src/common/mem.c b/src/common/mem.c index afc7f70eb66..03f870e51ab 100644 --- a/src/common/mem.c +++ b/src/common/mem.c @@ -17,7 +17,9 @@ #ifdef DAOS_PMEM_BUILD #include <libpmemobj.h> #include <daos_srv/ad_mem.h> +#define DAV_V2_BUILD #include "dav/dav.h" +#include "dav_v2/dav_v2.h" #endif #define UMEM_TX_DATA_MAGIC (0xc01df00d) @@ -44,7 +46,7 @@ umem_get_mb_evictable(struct umem_instance *umm, int flags) if (umm->umm_pool->up_store.store_type == DAOS_MD_BMEM) { pop = (dav_obj_t *)umm->umm_pool->up_priv; - return dav_get_zone_evictable(pop, flags); + return dav_get_zone_evictable_v2(pop, flags); } return 0; } @@ -82,6 +84,9 @@ umempobj_settings_init(bool md_on_ssd) case DAOS_MD_ADMEM: D_INFO("UMEM will use AD-hoc Memory as the metadata backend interface\n"); break; + case DAOS_MD_BMEM_V2: + D_INFO("UMEM will use Blob Backed Memory v2 as the metadata backend interface\n"); + break; default: D_ERROR("DAOS_MD_ON_SSD_MODE=%d envar invalid, use %d for BMEM or %d for ADMEM\n", md_mode, DAOS_MD_BMEM, DAOS_MD_ADMEM); @@ -106,6 +111,8 @@ int umempobj_backend_type2class_id(int backend) return UMEM_CLASS_BMEM; case DAOS_MD_ADMEM: return UMEM_CLASS_ADMEM; + case DAOS_MD_BMEM_V2: + return UMEM_CLASS_BMEM_V2; default: D_ASSERTF(0, "bad daos_md_backend %d\n", backend); @@ -173,6 +180,16 @@ set_slab_desc(struct umem_pool *ph_p, struct umem_slab_desc *slab) /* update with the new slab id */ slab->class_id = davslab.class_id; break; + case DAOS_MD_BMEM_V2: + davslab.unit_size = slab->unit_size; + davslab.alignment = 0; + davslab.units_per_block = 1000; + davslab.header_type = DAV_HEADER_NONE; + davslab.class_id = slab->class_id; + rc = dav_class_register_v2((dav_obj_t *)ph_p->up_priv, &davslab); + /* update with the new slab id */ + slab->class_id = davslab.class_id; + break; case DAOS_MD_ADMEM: /* NOOP for ADMEM now */ slab->class_id = class_id++; @@ -337,6 +354,15 @@ umempobj_create(const char *path, const char *layout_name, int flags, } umm_pool->up_priv = dav_hdl; break; + case DAOS_MD_BMEM_V2: + dav_hdl = dav_obj_create_v2(path, 0, poolsize, mode, &umm_pool->up_store); + if (!dav_hdl) { + D_ERROR("Failed to 
create pool %s, size="DF_U64": errno = %d\n", + path, poolsize, errno); + goto error; + } + umm_pool->up_priv = dav_hdl; + break; case DAOS_MD_ADMEM: rc = ad_blob_create(path, 0, store, &bh); if (rc) { @@ -420,6 +446,16 @@ umempobj_open(const char *path, const char *layout_name, int flags, struct umem_ goto error; } + umm_pool->up_priv = dav_hdl; + break; + case DAOS_MD_BMEM_V2: + dav_hdl = dav_obj_open_v2(path, 0, &umm_pool->up_store); + if (!dav_hdl) { + D_ERROR("Error in opening the pool %s: errno =%d\n", + path, errno); + goto error; + } + umm_pool->up_priv = dav_hdl; break; case DAOS_MD_ADMEM: @@ -464,6 +500,9 @@ umempobj_close(struct umem_pool *ph_p) case DAOS_MD_BMEM: dav_obj_close((dav_obj_t *)ph_p->up_priv); break; + case DAOS_MD_BMEM_V2: + dav_obj_close_v2((dav_obj_t *)ph_p->up_priv); + break; case DAOS_MD_ADMEM: bh.bh_blob = (struct ad_blob *)ph_p->up_priv; ad_blob_close(bh); @@ -503,6 +542,9 @@ umempobj_get_rootptr(struct umem_pool *ph_p, size_t size) case DAOS_MD_BMEM: off = dav_root((dav_obj_t *)ph_p->up_priv, size); return (char *)dav_get_base_ptr((dav_obj_t *)ph_p->up_priv) + off; + case DAOS_MD_BMEM_V2: + off = dav_root_v2((dav_obj_t *)ph_p->up_priv, size); + return (char *)dav_get_base_ptr((dav_obj_t *)ph_p->up_priv) + off; case DAOS_MD_ADMEM: bh.bh_blob = (struct ad_blob *)ph_p->up_priv; return ad_root(bh, size); @@ -540,6 +582,11 @@ umempobj_get_heapusage(struct umem_pool *ph_p, daos_size_t *curr_allocated) if (rc == 0) *curr_allocated = st.curr_allocated; break; + case DAOS_MD_BMEM_V2: + rc = dav_get_heap_stats_v2((dav_obj_t *)ph_p->up_priv, &st); + if (rc == 0) + *curr_allocated = st.curr_allocated; + break; case DAOS_MD_ADMEM: *curr_allocated = 40960; /* TODO */ break; @@ -579,6 +626,12 @@ umempobj_log_fraginfo(struct umem_pool *ph_p) DF_U64", run_active: "DF_U64"\n", st.run_allocated, st.run_active); break; + case DAOS_MD_BMEM_V2: + dav_get_heap_stats_v2((dav_obj_t *)ph_p->up_priv, &st); + D_ERROR("Fragmentation info, run_allocated: " + DF_U64", run_active: "DF_U64"\n", + st.run_allocated, st.run_active); + break; case DAOS_MD_ADMEM: /* TODO */ D_ERROR("Fragmentation info, not implemented in ADMEM yet.\n"); @@ -1074,9 +1127,7 @@ bmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned i pflags |= DAV_FLAG_ZERO; if (flags & UMEM_FLAG_NO_FLUSH) pflags |= DAV_FLAG_NO_FLUSH; - if (mbkt_id != 0) - pflags |= DAV_EZONE_ID(mbkt_id); - return dav_tx_alloc(size, type_num, pflags); + return dav_tx_xalloc(size, type_num, pflags); } static int @@ -1183,9 +1234,8 @@ bmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int typ unsigned int mbkt_id) { dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; - uint64_t flags = DAV_EZONE_ID(mbkt_id); - return dav_reserve(pop, (struct dav_action *)act, size, type_num, flags); + return dav_reserve(pop, (struct dav_action *)act, size, type_num); } static void @@ -1228,9 +1278,8 @@ bmem_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num, uint64_t off; dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; int rc; - uint64_t flags = DAV_EZONE_ID(mbkt_id); - rc = dav_alloc(pop, &off, size, type_num, flags, NULL, NULL); + rc = dav_alloc(pop, &off, size, type_num, NULL, NULL); if (rc) return UMOFF_NULL; return off; @@ -1277,6 +1326,251 @@ static umem_ops_t bmem_ops = { .mo_tx_add_callback = umem_tx_add_cb, }; +/** BMEM v2 operations (depends on dav) */ + +static int +bmem_tx_free_v2(struct umem_instance *umm, umem_off_t umoff) +{ + /* + * This free call could be on error cleanup 
code path where + * the transaction is already aborted due to previous failed + * pmemobj_tx call. Let's just skip it in this case. + * + * The reason we don't fix caller to avoid calling tx_free() + * in an aborted transaction is that the caller code could be + * shared by both transactional and non-transactional (where + * UMEM_CLASS_VMEM is used, see btree code) interfaces, and + * the explicit umem_free() on error cleanup is necessary for + * non-transactional case. + */ + if (dav_tx_stage_v2() == DAV_TX_STAGE_ONABORT) + return 0; + + if (!UMOFF_IS_NULL(umoff)) { + int rc; + + rc = dav_tx_free_v2(umem_off2offset(umoff)); + return rc ? umem_tx_errno(rc) : 0; + } + + return 0; +} + +static umem_off_t +bmem_tx_alloc_v2(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num, + unsigned int mbkt_id) +{ + uint64_t pflags = 0; + + get_slab(umm, &pflags, &size); + + if (flags & UMEM_FLAG_ZERO) + pflags |= DAV_FLAG_ZERO; + if (flags & UMEM_FLAG_NO_FLUSH) + pflags |= DAV_FLAG_NO_FLUSH; + if (mbkt_id != 0) + pflags |= DAV_EZONE_ID(mbkt_id); + return dav_tx_alloc_v2(size, type_num, pflags); +} + +static int +bmem_tx_add_v2(struct umem_instance *umm, umem_off_t umoff, + uint64_t offset, size_t size) +{ + int rc; + + rc = dav_tx_add_range_v2(umem_off2offset(umoff), size); + return rc ? umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_xadd_v2(struct umem_instance *umm, umem_off_t umoff, uint64_t offset, + size_t size, uint64_t flags) +{ + int rc; + uint64_t pflags = 0; + + if (flags & UMEM_XADD_NO_SNAPSHOT) + pflags |= DAV_XADD_NO_SNAPSHOT; + + rc = dav_tx_xadd_range_v2(umem_off2offset(umoff), size, pflags); + return rc ? umem_tx_errno(rc) : 0; +} + + +static int +bmem_tx_add_ptr_v2(struct umem_instance *umm, void *ptr, size_t size) +{ + int rc; + + rc = dav_tx_add_range_direct_v2(ptr, size); + return rc ? umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_abort_v2(struct umem_instance *umm, int err) +{ + /* + * obj_tx_abort() may have already been called in the error + * handling code of pmemobj APIs. + */ + if (dav_tx_stage_v2() != DAV_TX_STAGE_ONABORT) + dav_tx_abort_v2(err); + + err = dav_tx_end_v2(NULL); + return err ? umem_tx_errno(err) : 0; +} + +static int +bmem_tx_begin_v2(struct umem_instance *umm, struct umem_tx_stage_data *txd) +{ + int rc; + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + if (txd != NULL) { + D_ASSERT(txd->txd_magic == UMEM_TX_DATA_MAGIC); + rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_CB, pmem_stage_callback, + txd, DAV_TX_PARAM_NONE); + } else { + rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE); + } + + if (rc != 0) { + /* + * dav_tx_end() needs to be called to re-initialize the + * tx state when dav_tx_begin() failed. + */ + rc = dav_tx_end_v2(NULL); + return rc ? umem_tx_errno(rc) : 0; + } + return 0; +} + +static int +bmem_tx_commit_v2(struct umem_instance *umm, void *data) +{ + int rc; + + dav_tx_commit_v2(); + rc = dav_tx_end_v2(data); + + return rc ?
umem_tx_errno(rc) : 0; +} + +static int +bmem_tx_stage_v2(void) +{ + return dav_tx_stage_v2(); +} + +static void +bmem_defer_free_v2(struct umem_instance *umm, umem_off_t off, void *act) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + dav_defer_free_v2(pop, umem_off2offset(off), + (struct dav_action *)act); +} + +static umem_off_t +bmem_reserve_v2(struct umem_instance *umm, void *act, size_t size, unsigned int type_num, + unsigned int mbkt_id) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + uint64_t flags = DAV_EZONE_ID(mbkt_id); + + return dav_reserve_v2(pop, (struct dav_action *)act, size, type_num, flags); +} + +static void +bmem_cancel_v2(struct umem_instance *umm, void *actv, int actv_cnt) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + dav_cancel_v2(pop, (struct dav_action *)actv, actv_cnt); +} + +static int +bmem_tx_publish_v2(struct umem_instance *umm, void *actv, int actv_cnt) +{ + int rc; + + rc = dav_tx_publish_v2((struct dav_action *)actv, actv_cnt); + return rc ? umem_tx_errno(rc) : 0; +} + +static void * +bmem_atomic_copy_v2(struct umem_instance *umm, void *dest, const void *src, + size_t len, enum acopy_hint hint) +{ + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + + if (hint == UMEM_RESERVED_MEM) { + memcpy(dest, src, len); + return dest; + } else if (hint == UMEM_COMMIT_IMMEDIATE) { + return dav_memcpy_persist_v2(pop, dest, src, len); + } else { /* UMEM_COMMIT_DEFER */ + return dav_memcpy_persist_relaxed_v2(pop, dest, src, len); + } +} + +static umem_off_t +bmem_atomic_alloc_v2(struct umem_instance *umm, size_t size, unsigned int type_num, + unsigned int mbkt_id) +{ + uint64_t off; + dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + int rc; + uint64_t flags = DAV_EZONE_ID(mbkt_id); + + rc = dav_alloc_v2(pop, &off, size, type_num, flags, NULL, NULL); + if (rc) + return UMOFF_NULL; + return off; +} + +static int +bmem_atomic_free_v2(struct umem_instance *umm, umem_off_t umoff) +{ + if (!UMOFF_IS_NULL(umoff)) { + uint64_t off = umem_off2offset(umoff); + + dav_free_v2((dav_obj_t *)umm->umm_pool->up_priv, off); + } + return 0; +} + +static void +bmem_atomic_flush_v2(struct umem_instance *umm, void *addr, size_t len) +{ + /* REVISIT: We need to update the WAL with this info + * dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv; + * dav_flush(pop, addr, len); + */ +} + +static umem_ops_t bmem_v2_ops = { + .mo_tx_free = bmem_tx_free_v2, + .mo_tx_alloc = bmem_tx_alloc_v2, + .mo_tx_add = bmem_tx_add_v2, + .mo_tx_xadd = bmem_tx_xadd_v2, + .mo_tx_add_ptr = bmem_tx_add_ptr_v2, + .mo_tx_abort = bmem_tx_abort_v2, + .mo_tx_begin = bmem_tx_begin_v2, + .mo_tx_commit = bmem_tx_commit_v2, + .mo_tx_stage = bmem_tx_stage_v2, + .mo_reserve = bmem_reserve_v2, + .mo_defer_free = bmem_defer_free_v2, + .mo_cancel = bmem_cancel_v2, + .mo_tx_publish = bmem_tx_publish_v2, + .mo_atomic_copy = bmem_atomic_copy_v2, + .mo_atomic_alloc = bmem_atomic_alloc_v2, + .mo_atomic_free = bmem_atomic_free_v2, + .mo_atomic_flush = bmem_atomic_flush_v2, + .mo_tx_add_callback = umem_tx_add_cb, +}; + int umem_tx_errno(int err) { @@ -1366,6 +1660,11 @@ static struct umem_class umem_class_defined[] = { .umc_ops = &bmem_ops, .umc_name = "bmem", }, + { + .umc_id = UMEM_CLASS_BMEM_V2, + .umc_ops = &bmem_v2_ops, + .umc_name = "bmem_v2", + }, { .umc_id = UMEM_CLASS_ADMEM, .umc_ops = &ad_mem_ops, @@ -1415,6 +1714,11 @@ set_offsets(struct umem_instance *umm) umm->umm_base = (uint64_t)dav_get_base_ptr(dav_pop); break; + case UMEM_CLASS_BMEM_V2: + dav_pop = (dav_obj_t 
*)umm->umm_pool->up_priv; + + umm->umm_base = (uint64_t)dav_get_base_ptr_v2(dav_pop); + break; case UMEM_CLASS_ADMEM: bh.bh_blob = (struct ad_blob *)umm->umm_pool->up_priv; umm->umm_base = (uint64_t)ad_base(bh); @@ -1560,6 +1864,7 @@ umem_rsrvd_item_size(struct umem_instance *umm) case UMEM_CLASS_ADMEM: return sizeof(struct ad_reserv_act); case UMEM_CLASS_BMEM: + case UMEM_CLASS_BMEM_V2: return sizeof(struct dav_action); default: D_ERROR("bad umm_id %d\n", umm->umm_id); diff --git a/src/common/tests/umem_test_bmem.c b/src/common/tests/umem_test_bmem.c index 99cf620fe5d..fb963f9b21a 100644 --- a/src/common/tests/umem_test_bmem.c +++ b/src/common/tests/umem_test_bmem.c @@ -132,6 +132,7 @@ global_setup(void **state) print_message("Failed to set the md_on_ssd tunable\n"); return 1; } + ustore.store_type = umempobj_get_backend_type(); D_ALLOC_PTR(arg); if (arg == NULL) { diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h index a06a9785270..db4af896c95 100644 --- a/src/include/daos/mem.h +++ b/src/include/daos/mem.h @@ -38,6 +38,7 @@ enum { DAOS_MD_PMEM = 0, DAOS_MD_BMEM = 1, DAOS_MD_ADMEM = 2, + DAOS_MD_BMEM_V2 = 3, }; /* return umem backend type */ @@ -271,6 +272,8 @@ typedef enum { UMEM_CLASS_BMEM, /** ad-hoc memory */ UMEM_CLASS_ADMEM, + /** blob backed memory v2 */ + UMEM_CLASS_BMEM_V2, /** unknown */ UMEM_CLASS_UNKNOWN, } umem_class_id_t; diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index cfce1a490ec..068e6d4c9b7 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -360,6 +360,7 @@ d_realpath(const char *path, char *resolved_path) _dalloc_; #define D_SPIN_LOCK(x) __D_PTHREAD(pthread_spin_lock, x) #define D_SPIN_UNLOCK(x) __D_PTHREAD(pthread_spin_unlock, x) #define D_MUTEX_LOCK(x) __D_PTHREAD(pthread_mutex_lock, x) +#define D_MUTEX_TRYLOCK(x) __D_PTHREAD_TRYLOCK(pthread_mutex_trylock, x) #define D_MUTEX_UNLOCK(x) __D_PTHREAD(pthread_mutex_unlock, x) #define D_RWLOCK_RDLOCK(x) __D_PTHREAD(pthread_rwlock_rdlock, x) #define D_RWLOCK_WRLOCK(x) __D_PTHREAD(pthread_rwlock_wrlock, x) diff --git a/utils/rpms/daos.rpmlintrc b/utils/rpms/daos.rpmlintrc index 2c905deda8e..889bb3b53f1 100644 --- a/utils/rpms/daos.rpmlintrc +++ b/utils/rpms/daos.rpmlintrc @@ -44,7 +44,7 @@ addFilter("E: static-library-without-debuginfo \/usr\/lib64\/lib(dfuse|ioil)\.a" # these need to be fixed: # https://daosio.atlassian.net/browse/DAOS-11539 -addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)).so") +addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)|dav_v2).so") # Tests rpm needs to be able to build daos from source so pulls in build deps and is expected. 
addFilter("daos-client-tests.x86_64: E: devel-dependency protobuf-c-devel") diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index e77db2c49f8..71c2c2902a6 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -15,7 +15,7 @@ Name: daos Version: 2.5.100 -Release: 9%{?relval}%{?dist} +Release: 10%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -451,6 +451,7 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent %{_libdir}/daos_srv/libplacement.so %{_libdir}/daos_srv/libpipeline.so %{_libdir}/libdaos_common_pmem.so +%{_libdir}/libdav_v2.so %config(noreplace) %{conf_dir}/vos_size_input.yaml %{_bindir}/daos_storage_estimator.py %{python3_sitearch}/storage_estimator/*.py @@ -585,6 +586,10 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Wed Oct 16 2023 Sherin T George <sherin-t.george@hpe.com> 2.5.100-10 +- The modified DAV allocator with memory bucket support for md_on_ssd + phase-2 is delivered as dav_v2.so. + * Wed Aug 23 2023 Brian J. Murrell <brian.murrell@intel.com> 2.5.100-9 - Update fuse3 requirement to R: /usr/bin/fusermount3 by path rather than by package name, for portability and future-proofing diff --git a/utils/utest.yaml b/utils/utest.yaml index fd7580be142..211346090d4 100644 --- a/utils/utest.yaml +++ b/utils/utest.yaml @@ -114,6 +114,16 @@ sudo: True required_src: ["src/vos/tests/bio_ut.c"] tests: + - cmd: ["bin/vos_tests", "-A", "50"] + env_vars: + DAOS_MD_ON_SSD_MODE: "3" + aio: "AIO_7" + size: 13 + - cmd: ["bin/bio_ut"] + env_vars: + DAOS_MD_ON_SSD_MODE: "3" + aio: "AIO_7" + size: 4 - cmd: ["bin/vos_tests", "-A", "50"] aio: "AIO_7" size: 13