From 4a0bedeb3831fcfb1c177fe9ddc56a11a08887a5 Mon Sep 17 00:00:00 2001
From: Sherin T George <sherin-t.george@hpe.com>
Date: Mon, 16 Oct 2023 13:05:39 +0530
Subject: [PATCH] DAOS-14491: Retain support for phase-1 DAV heap

The phase-2 DAV allocator is placed under the subdirectory
src/common/dav_v2. This allocator is built as a standalone shared
library and linked to the libdaos_common_pmem library.
umem now supports one more mode, DAOS_MD_BMEM_V2. Setting this mode
on a umem instance makes it use the phase-2 DAV allocator
interfaces.
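
To make the mode selection concrete, below is a minimal, self-contained
sketch of the dispatch this mode enables. The enum, struct, and function
names are simplified stand-ins, not the actual umem/dav symbols added by
this patch:

    #include <stdio.h>

    /* Stand-ins for the umem modes; the V2 mode is the new one.         */
    enum md_mode { MD_BMEM = 1, MD_BMEM_V2 = 2 };

    /* Toy allocator vtable: one set of ops per allocator generation.    */
    struct md_ops {
        const char *(*backend_name)(void);
    };

    static const char *dav_name(void)    { return "phase-1 DAV (src/common/dav)"; }
    static const char *dav_v2_name(void) { return "phase-2 DAV (libdav_v2.so)"; }

    static const struct md_ops dav_ops    = { dav_name };
    static const struct md_ops dav_v2_ops = { dav_v2_name };

    /* Setting the V2 mode on an instance selects the phase-2 interfaces. */
    static const struct md_ops *
    md_ops_get(enum md_mode mode)
    {
        return mode == MD_BMEM_V2 ? &dav_v2_ops : &dav_ops;
    }

    int main(void)
    {
        printf("%s\n", md_ops_get(MD_BMEM_V2)->backend_name());
        return 0;
    }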

Signed-off-by: Sherin T George <sherin-t.george@hpe.com>
---
 debian/changelog                       |    7 +
 debian/daos-server.install             |    1 +
 src/common/SConscript                  |    3 +-
 src/common/dav/bucket.c                |   25 +-
 src/common/dav/bucket.h                |    8 +-
 src/common/dav/dav.h                   |  238 ++-
 src/common/dav/dav_iface.c             |    7 +
 src/common/dav/dav_internal.h          |    5 +
 src/common/dav/heap.c                  |  880 +++++++++--
 src/common/dav/heap.h                  |  108 +-
 src/common/dav/obj.h                   |    5 +-
 src/common/dav/palloc.c                |   70 +-
 src/common/dav/palloc.h                |  105 +-
 src/common/dav/recycler.c              |   26 +-
 src/common/dav/recycler.h              |    9 +-
 src/common/dav/tx.c                    |  350 +++--
 src/common/dav_v2/README.md            |    6 +
 src/common/dav_v2/SConscript           |   30 +
 src/common/dav_v2/alloc_class.c        |  647 +++++++++
 src/common/dav_v2/alloc_class.h        |   71 +
 src/common/dav_v2/bucket.c             |  275 ++++
 src/common/dav_v2/bucket.h             |   47 +
 src/common/dav_v2/container.h          |   44 +
 src/common/dav_v2/container_ravl.c     |  194 +++
 src/common/dav_v2/container_seglists.c |  175 +++
 src/common/dav_v2/critnib.c            |  678 +++++++++
 src/common/dav_v2/critnib.h            |   23 +
 src/common/dav_v2/dav_clogs.c          |  104 ++
 src/common/dav_v2/dav_clogs.h          |   56 +
 src/common/dav_v2/dav_iface.c          |  434 ++++++
 src/common/dav_v2/dav_internal.h       |   82 ++
 src/common/dav_v2/dav_v2.h             |  307 ++++
 src/common/dav_v2/heap.c               | 1398 ++++++++++++++++++
 src/common/dav_v2/heap.h               |   98 ++
 src/common/dav_v2/heap_layout.h        |  198 +++
 src/common/dav_v2/memblock.c           | 1615 +++++++++++++++++++++
 src/common/dav_v2/memblock.h           |  297 ++++
 src/common/dav_v2/memops.c             |  677 +++++++++
 src/common/dav_v2/memops.h             |   66 +
 src/common/dav_v2/mo_wal.h             |   95 ++
 src/common/dav_v2/obj.h                |   50 +
 src/common/dav_v2/out.h                |  104 ++
 src/common/dav_v2/palloc.c             |  977 +++++++++++++
 src/common/dav_v2/palloc.h             |  105 ++
 src/common/dav_v2/queue.h              |  112 ++
 src/common/dav_v2/ravl.c               |  613 ++++++++
 src/common/dav_v2/ravl.h               |   48 +
 src/common/dav_v2/ravl_interval.c      |  344 +++++
 src/common/dav_v2/ravl_interval.h      |   37 +
 src/common/dav_v2/recycler.c           |  323 +++++
 src/common/dav_v2/recycler.h           |   46 +
 src/common/dav_v2/stats.c              |   78 +
 src/common/dav_v2/stats.h              |   61 +
 src/common/dav_v2/sys_util.h           |   83 ++
 src/common/dav_v2/tx.c                 | 1855 ++++++++++++++++++++++++
 src/common/dav_v2/tx.h                 |   22 +
 src/common/dav_v2/ulog.c               |  695 +++++++++
 src/common/dav_v2/ulog.h               |  167 +++
 src/common/dav_v2/util.c               |  223 +++
 src/common/dav_v2/util.h               |  202 +++
 src/common/dav_v2/valgrind_internal.h  |  293 ++++
 src/common/dav_v2/vec.h                |  145 ++
 src/common/dav_v2/vecq.h               |  121 ++
 src/common/dav_v2/wal_tx.c             |  509 +++++++
 src/common/dav_v2/wal_tx.h             |   44 +
 src/common/mem.c                       |  321 +++-
 src/common/tests/umem_test_bmem.c      |    1 +
 src/include/daos/mem.h                 |    3 +
 src/include/gurt/common.h              |    1 +
 utils/rpms/daos.rpmlintrc              |    2 +-
 utils/rpms/daos.spec                   |    7 +-
 utils/utest.yaml                       |   10 +
 72 files changed, 16468 insertions(+), 598 deletions(-)
 create mode 100644 src/common/dav_v2/README.md
 create mode 100644 src/common/dav_v2/SConscript
 create mode 100644 src/common/dav_v2/alloc_class.c
 create mode 100644 src/common/dav_v2/alloc_class.h
 create mode 100644 src/common/dav_v2/bucket.c
 create mode 100644 src/common/dav_v2/bucket.h
 create mode 100644 src/common/dav_v2/container.h
 create mode 100644 src/common/dav_v2/container_ravl.c
 create mode 100644 src/common/dav_v2/container_seglists.c
 create mode 100644 src/common/dav_v2/critnib.c
 create mode 100644 src/common/dav_v2/critnib.h
 create mode 100644 src/common/dav_v2/dav_clogs.c
 create mode 100644 src/common/dav_v2/dav_clogs.h
 create mode 100644 src/common/dav_v2/dav_iface.c
 create mode 100644 src/common/dav_v2/dav_internal.h
 create mode 100644 src/common/dav_v2/dav_v2.h
 create mode 100644 src/common/dav_v2/heap.c
 create mode 100644 src/common/dav_v2/heap.h
 create mode 100644 src/common/dav_v2/heap_layout.h
 create mode 100644 src/common/dav_v2/memblock.c
 create mode 100644 src/common/dav_v2/memblock.h
 create mode 100644 src/common/dav_v2/memops.c
 create mode 100644 src/common/dav_v2/memops.h
 create mode 100644 src/common/dav_v2/mo_wal.h
 create mode 100644 src/common/dav_v2/obj.h
 create mode 100644 src/common/dav_v2/out.h
 create mode 100644 src/common/dav_v2/palloc.c
 create mode 100644 src/common/dav_v2/palloc.h
 create mode 100644 src/common/dav_v2/queue.h
 create mode 100644 src/common/dav_v2/ravl.c
 create mode 100644 src/common/dav_v2/ravl.h
 create mode 100644 src/common/dav_v2/ravl_interval.c
 create mode 100644 src/common/dav_v2/ravl_interval.h
 create mode 100644 src/common/dav_v2/recycler.c
 create mode 100644 src/common/dav_v2/recycler.h
 create mode 100644 src/common/dav_v2/stats.c
 create mode 100644 src/common/dav_v2/stats.h
 create mode 100644 src/common/dav_v2/sys_util.h
 create mode 100644 src/common/dav_v2/tx.c
 create mode 100644 src/common/dav_v2/tx.h
 create mode 100644 src/common/dav_v2/ulog.c
 create mode 100644 src/common/dav_v2/ulog.h
 create mode 100644 src/common/dav_v2/util.c
 create mode 100644 src/common/dav_v2/util.h
 create mode 100644 src/common/dav_v2/valgrind_internal.h
 create mode 100644 src/common/dav_v2/vec.h
 create mode 100644 src/common/dav_v2/vecq.h
 create mode 100644 src/common/dav_v2/wal_tx.c
 create mode 100644 src/common/dav_v2/wal_tx.h

diff --git a/debian/changelog b/debian/changelog
index 44c317daedd..806a855fbe4 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+daos (2.5.100-10) unstable; urgency=medium
+
+  [ Sherin T George ]
+  * Add DAV v2 lib
+
+ -- Sherin T George <sherin-t.george@hpe.com>  Mon, 16 Oct 2023 11:54:00 +0530
+
 daos (2.5.100-9) unstable; urgency=medium
 
   [ Brian J. Murrell ]
diff --git a/debian/daos-server.install b/debian/daos-server.install
index e34d8e92ae8..77ad815788a 100644
--- a/debian/daos-server.install
+++ b/debian/daos-server.install
@@ -25,6 +25,7 @@ usr/lib64/daos_srv/libbio.so
 usr/lib64/daos_srv/libplacement.so
 usr/lib64/daos_srv/libpipeline.so
 usr/lib64/libdaos_common_pmem.so
+usr/lib64/libdav_v2.so
 usr/share/daos/control/setup_spdk.sh
 usr/lib/systemd/system/daos_server.service
 usr/lib/sysctl.d/10-daos_server.conf
diff --git a/src/common/SConscript b/src/common/SConscript
index c61ecdeebe3..ca19d27e94a 100644
--- a/src/common/SConscript
+++ b/src/common/SConscript
@@ -30,7 +30,7 @@ def build_daos_common(denv, client):
                    'dav/ravl_interval.c', 'dav/recycler.c', 'dav/stats.c', 'dav/tx.c', 'dav/ulog.c',
                    'dav/util.c', 'dav/wal_tx.c']
         ad_mem_files = ['ad_mem.c', 'ad_tx.c']
-        common_libs.extend(['pmemobj', 'abt'])
+        common_libs.extend(['pmemobj', 'abt', 'dav_v2'])
         benv.AppendUnique(RPATH_FULL=['$PREFIX/lib64/daos_srv'])
         benv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD'])
         benv.Append(OBJPREFIX="v_")
@@ -53,6 +53,7 @@ def scons():
     """Execute build"""
     Import('env', 'base_env', 'prereqs')
 
+    SConscript('dav_v2/SConscript')
     env.AppendUnique(LIBPATH=[Dir('.')])
     base_env.AppendUnique(LIBPATH=[Dir('.')])
     base_env.d_add_build_rpath()
diff --git a/src/common/dav/bucket.c b/src/common/dav/bucket.c
index d3a975a5f26..8df41288a13 100644
--- a/src/common/dav/bucket.c
+++ b/src/common/dav/bucket.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2015-2023, Intel Corporation */
+/* Copyright 2015-2022, Intel Corporation */
 
 /*
  * bucket.c -- bucket implementation
@@ -23,13 +23,15 @@
 
 struct bucket {
 	/* this struct is both the lock guard and the locked state */
-	struct bucket_locked             *locked;
-	struct alloc_class               *aclass;
-	struct block_container           *container;
+	struct bucket_locked *locked;
+
+	struct alloc_class *aclass;
+
+	struct block_container *container;
 	const struct block_container_ops *c_ops;
-	struct memory_block_reserved     *active_memory_block;
-	struct zoneset                   *zset;
-	int                               is_active;
+
+	struct memory_block_reserved *active_memory_block;
+	int is_active;
 };
 
 struct bucket_locked {
@@ -75,7 +77,7 @@ bucket_fini(struct bucket *b)
  * bucket_locked_new -- creates a new locked bucket instance
  */
 struct bucket_locked *
-bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct zoneset *zset)
+bucket_locked_new(struct block_container *c, struct alloc_class *aclass)
 {
 	ASSERTne(c, NULL);
 
@@ -90,7 +92,6 @@ bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct
 
 	util_mutex_init(&b->lock);
 	b->bucket.locked = b;
-	b->bucket.zset   = zset;
 
 	return b;
 
@@ -267,9 +268,3 @@ bucket_active_block(struct bucket *b)
 {
 	return b->is_active ? b->active_memory_block : NULL;
 }
-
-struct zoneset *
-bucket_get_zoneset(struct bucket *b)
-{
-	return b->zset;
-}
diff --git a/src/common/dav/bucket.h b/src/common/dav/bucket.h
index b0d92b66995..aadc6e714fc 100644
--- a/src/common/dav/bucket.h
+++ b/src/common/dav/bucket.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2015-2023, Intel Corporation */
+/* Copyright 2015-2021, Intel Corporation */
 
 /*
  * bucket.h -- internal definitions for bucket
@@ -21,8 +21,8 @@
 struct bucket_locked;
 struct bucket;
 
-struct bucket_locked *
-bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct zoneset *zset);
+struct bucket_locked *bucket_locked_new(struct block_container *c,
+					struct alloc_class *aclass);
 
 struct bucket *bucket_acquire(struct bucket_locked *b);
 void bucket_release(struct bucket *b);
@@ -41,7 +41,5 @@ int bucket_detach_run(struct bucket *b,
 struct memory_block_reserved *bucket_active_block(struct bucket *b);
 
 void bucket_locked_delete(struct bucket_locked *b);
-struct zoneset *
-bucket_get_zoneset(struct bucket *b);
 
 #endif /* __DAOS_COMMON_BUCKET_H */
diff --git a/src/common/dav/dav.h b/src/common/dav/dav.h
index b505d739f8a..40af0351af3 100644
--- a/src/common/dav/dav.h
+++ b/src/common/dav/dav.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2015-2023, Intel Corporation */
+/* Copyright 2015-2022, Intel Corporation */
 
 /*
  * dav_flags.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV)
@@ -16,42 +16,52 @@
 /*
  * allocation functions flags
  */
-#define DAV_FLAG_ZERO               (((uint64_t)1) << 0)
-#define DAV_FLAG_NO_FLUSH           (((uint64_t)1) << 1)
-#define DAV_FLAG_NO_SNAPSHOT        (((uint64_t)1) << 2)
-#define DAV_FLAG_ASSUME_INITIALIZED (((uint64_t)1) << 3)
-#define DAV_FLAG_TX_NO_ABORT        (((uint64_t)1) << 4)
-
-#define DAV_CLASS_ID(id)            (((uint64_t)(id)) << 48)
-#define DAV_EZONE_ID(id)            (((uint64_t)(id)) << 16)
-
-#define DAV_XALLOC_CLASS_MASK       ((((uint64_t)1 << 16) - 1) << 48)
-#define DAV_XALLOC_EZONE_MASK       ((((uint64_t)1 << 32) - 1) << 16)
-#define DAV_XALLOC_ZERO             DAV_FLAG_ZERO
-#define DAV_XALLOC_NO_FLUSH         DAV_FLAG_NO_FLUSH
-#define DAV_XALLOC_NO_ABORT         DAV_FLAG_TX_NO_ABORT
-
-#define DAV_TX_XALLOC_VALID_FLAGS                                                                  \
-	(DAV_XALLOC_ZERO | DAV_XALLOC_NO_FLUSH | DAV_XALLOC_EZONE_MASK | DAV_XALLOC_CLASS_MASK |   \
-	 DAV_XALLOC_NO_ABORT)
-
-#define DAV_XADD_NO_FLUSH           DAV_FLAG_NO_FLUSH
-#define DAV_XADD_NO_SNAPSHOT        DAV_FLAG_NO_SNAPSHOT
-#define DAV_XADD_ASSUME_INITIALIZED DAV_FLAG_ASSUME_INITIALIZED
-#define DAV_XADD_NO_ABORT           DAV_FLAG_TX_NO_ABORT
-#define DAV_XADD_VALID_FLAGS                                                                       \
-	(DAV_XADD_NO_FLUSH | DAV_XADD_NO_SNAPSHOT | DAV_XADD_ASSUME_INITIALIZED | DAV_XADD_NO_ABORT)
+#define DAV_FLAG_ZERO			(((uint64_t)1) << 0)
+#define DAV_FLAG_NO_FLUSH		(((uint64_t)1) << 1)
+#define DAV_FLAG_NO_SNAPSHOT		(((uint64_t)1) << 2)
+#define DAV_FLAG_ASSUME_INITIALIZED	(((uint64_t)1) << 3)
+#define DAV_FLAG_TX_NO_ABORT		(((uint64_t)1) << 4)
+
+#define DAV_CLASS_ID(id)		(((uint64_t)(id)) << 48)
+#ifdef	DAV_V2_BUILD
+#define DAV_EZONE_ID(id)		(((uint64_t)(id)) << 16)
+#endif	/* DAV_V2_BUILD */
+
+#define DAV_XALLOC_CLASS_MASK		((((uint64_t)1 << 16) - 1) << 48)
+#ifdef	DAV_V2_BUILD
+#define DAV_XALLOC_EZONE_MASK		((((uint64_t)1 << 16) - 1) << 32)
+#else	/* DAV_V2_BUILD */
+#define DAV_XALLOC_EZONE_MASK		0
+#endif	/* DAV_V2_BUILD */
+#define DAV_XALLOC_ZERO			DAV_FLAG_ZERO
+#define DAV_XALLOC_NO_FLUSH		DAV_FLAG_NO_FLUSH
+#define DAV_XALLOC_NO_ABORT		DAV_FLAG_TX_NO_ABORT
+
+#define DAV_TX_XALLOC_VALID_FLAGS	(DAV_XALLOC_ZERO |\
+					DAV_XALLOC_NO_FLUSH |\
+					DAV_XALLOC_EZONE_MASK |\
+					DAV_XALLOC_CLASS_MASK |\
+					DAV_XALLOC_NO_ABORT)
+
+#define DAV_XADD_NO_FLUSH		DAV_FLAG_NO_FLUSH
+#define DAV_XADD_NO_SNAPSHOT		DAV_FLAG_NO_SNAPSHOT
+#define DAV_XADD_ASSUME_INITIALIZED	DAV_FLAG_ASSUME_INITIALIZED
+#define DAV_XADD_NO_ABORT		DAV_FLAG_TX_NO_ABORT
+#define DAV_XADD_VALID_FLAGS		(DAV_XADD_NO_FLUSH |\
+					DAV_XADD_NO_SNAPSHOT |\
+					DAV_XADD_ASSUME_INITIALIZED |\
+					DAV_XADD_NO_ABORT)
 
 /*
  * WAL Redo hints.
  */
-#define DAV_XADD_WAL_CPTR     (((uint64_t)1) << 5)
+#define DAV_XADD_WAL_CPTR		(((uint64_t)1) << 5)
 
-#define DAV_XLOCK_NO_ABORT    DAV_FLAG_TX_NO_ABORT
-#define DAV_XLOCK_VALID_FLAGS (DAV_XLOCK_NO_ABORT)
+#define DAV_XLOCK_NO_ABORT	DAV_FLAG_TX_NO_ABORT
+#define DAV_XLOCK_VALID_FLAGS	(DAV_XLOCK_NO_ABORT)
 
-#define DAV_XFREE_NO_ABORT    DAV_FLAG_TX_NO_ABORT
-#define DAV_XFREE_VALID_FLAGS (DAV_XFREE_NO_ABORT)
+#define DAV_XFREE_NO_ABORT	DAV_FLAG_TX_NO_ABORT
+#define DAV_XFREE_VALID_FLAGS	(DAV_XFREE_NO_ABORT)
 
 typedef struct dav_obj dav_obj_t;
 struct umem_store;
@@ -73,7 +83,8 @@ struct umem_store;
  *			it returns NULL with errno set appropriately.
  */
 dav_obj_t *
-dav_obj_create(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store);
+dav_obj_create(const char *path, int flags, size_t sz, mode_t mode,
+	       struct umem_store *store);
 
 /**
  * Open and initialize a DAV object and return its handle.
@@ -117,9 +128,8 @@ typedef int (*dav_constr)(dav_obj_t *pop, void *ptr, void *arg);
  * initialized, or if it's interrupted before the constructor completes, the
  * memory reserved for the object is automatically reclaimed.
  */
-int
-dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags,
-	   dav_constr constructor, void *arg);
+int dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size,
+	      uint64_t type_num, dav_constr constructor, void *arg);
 
 /**
  * Frees the memory at specified offset within the DAV object pointed to by hdl.
@@ -135,13 +145,13 @@ dav_free(dav_obj_t *pop, uint64_t off);
 /*
  * DAV version of memcpy. Data copied is made persistent in blob.
  */
-void *
-dav_memcpy_persist(dav_obj_t *pop, void *dest, const void *src, size_t len);
+void *dav_memcpy_persist(dav_obj_t *pop, void *dest, const void *src,
+			 size_t len);
 /*
  * DAV version of memcpy with deferred commit to blob.
  */
-void *
-dav_memcpy_persist_relaxed(dav_obj_t *pop, void *dest, const void *src, size_t len);
+void *dav_memcpy_persist_relaxed(dav_obj_t *pop, void *dest, const void *src,
+				 size_t len);
 
 /*
  * If called for the first time on a newly created dav heap, the root object
@@ -152,8 +162,8 @@ dav_memcpy_persist_relaxed(dav_obj_t *pop, void *dest, const void *src, size_t l
  *
  * This function is currently *not* thread-safe.
  */
-uint64_t
-dav_root(dav_obj_t *pop, size_t size);
+uint64_t dav_root(dav_obj_t *pop, size_t size);
+
 
 /*
  * Transactions
@@ -163,22 +173,23 @@ dav_root(dav_obj_t *pop, size_t size);
  * the dav_tx_begin function.
  */
 enum dav_tx_stage {
-	DAV_TX_STAGE_NONE,     /* no transaction in this thread */
-	DAV_TX_STAGE_WORK,     /* transaction in progress */
-	DAV_TX_STAGE_ONCOMMIT, /* successfully committed */
-	DAV_TX_STAGE_ONABORT,  /* tx_begin failed or transaction aborted */
-	DAV_TX_STAGE_FINALLY,  /* always called */
+	DAV_TX_STAGE_NONE,	/* no transaction in this thread */
+	DAV_TX_STAGE_WORK,	/* transaction in progress */
+	DAV_TX_STAGE_ONCOMMIT,	/* successfully committed */
+	DAV_TX_STAGE_ONABORT,	/* tx_begin failed or transaction aborted */
+	DAV_TX_STAGE_FINALLY,	/* always called */
 
 	DAV_MAX_TX_STAGE
 };
 
-typedef void (*dav_tx_callback)(dav_obj_t *pop, enum dav_tx_stage stage, void *);
+typedef void (*dav_tx_callback)(dav_obj_t *pop, enum dav_tx_stage stage,
+	       void *);
 
 enum dav_tx_param {
 	DAV_TX_PARAM_NONE,
-	DAV_TX_PARAM_UNUSED1, /* For parity with libpmemobj */
-	DAV_TX_PARAM_UNUSED2, /* For parity with libpmemobj */
-	DAV_TX_PARAM_CB,      /* dav_tx_callback cb, void *arg */
+	DAV_TX_PARAM_UNUSED1,	/* For parity with libpmemobj */
+	DAV_TX_PARAM_UNUSED2,	/* For parity with libpmemobj */
+	DAV_TX_PARAM_CB,	/* dav_tx_callback cb, void *arg */
 };
 
 /*
@@ -189,8 +200,7 @@ enum dav_tx_param {
  * returns zero. Otherwise, stage changes to TX_STAGE_ONABORT and an error
  * number is returned.
  */
-int
-dav_tx_begin(dav_obj_t *pop, jmp_buf env, ...);
+int dav_tx_begin(dav_obj_t *pop, jmp_buf env, ...);
 
 /*
  * Aborts current transaction
@@ -199,16 +209,14 @@ dav_tx_begin(dav_obj_t *pop, jmp_buf env, ...);
  *
  * This function must be called during TX_STAGE_WORK.
  */
-void
-dav_tx_abort(int errnum);
+void dav_tx_abort(int errnum);
 
 /*
  * Commits current transaction
  *
  * This function must be called during TX_STAGE_WORK.
  */
-void
-dav_tx_commit(void);
+void dav_tx_commit(void);
 
 /*
  * Cleanups current transaction. Must always be called after dav_tx_begin,
@@ -223,38 +231,52 @@ dav_tx_commit(void);
  *
  * This function must *not* be called during TX_STAGE_WORK.
  */
-int
-dav_tx_end(void *data);
+int dav_tx_end(void *data);
 
 /*
  * Returns the current stage of the transaction.
  */
-enum dav_tx_stage
-dav_tx_stage(void);
+enum dav_tx_stage dav_tx_stage(void);
 
 /*
  * Returns last transaction error code.
  */
-int
-dav_tx_errno(void);
+int dav_tx_errno(void);
+
+/*
+ * Transactionally allocates a new object.
+ *
+ * If successful, returns PMEMoid.
+ * Otherwise, stage changes to TX_STAGE_ONABORT and an OID_NULL is returned.
+ *
+ * This function must be called during TX_STAGE_WORK.
+ */
+uint64_t dav_tx_alloc(size_t size, uint64_t type_num);
 
 /*
  * Transactionally allocates a new object.
  *
- * If successful, returns offset of the object in the heap.
- * Otherwise, stage changes to TX_STAGE_ONABORT and an zero is returned.
+ * If successful, returns PMEMoid.
+ * Otherwise, stage changes to TX_STAGE_ONABORT and an OID_NULL is returned.
  * 'Flags' is a bitmask of the following values:
  *  - POBJ_XALLOC_ZERO - zero the allocated object
  *  - POBJ_XALLOC_NO_FLUSH - skip flush on commit
  *  - POBJ_XALLOC_NO_ABORT - if the function does not end successfully,
- *  - DAV_CLASS_ID(id)	   - id of allocation class to use.
- *  - DAV_EZONE_ID(id)	   - id of zone to use.
  *  do not abort the transaction and return the error number.
  *
  * This function must be called during TX_STAGE_WORK.
  */
-uint64_t
-dav_tx_alloc(size_t size, uint64_t type_num, uint64_t flags);
+uint64_t dav_tx_xalloc(size_t size, uint64_t type_num, uint64_t flags);
+
+/*
+ * Transactionally allocates new zeroed object.
+ *
+ * If successful, returns PMEMoid.
+ * Otherwise, stage changes to TX_STAGE_ONABORT and an OID_NULL is returned.
+ *
+ * This function must be called during TX_STAGE_WORK.
+ */
+uint64_t dav_tx_zalloc(size_t size, uint64_t type_num);
 
 /*
  * Transactionally frees an existing object.
@@ -264,8 +286,7 @@ dav_tx_alloc(size_t size, uint64_t type_num, uint64_t flags);
  *
  * This function must be called during TX_STAGE_WORK.
  */
-int
-dav_tx_free(uint64_t off);
+int dav_tx_free(uint64_t off);
 
 /*
  * Takes a "snapshot" of the memory block of given size and located at given
@@ -279,8 +300,7 @@ dav_tx_free(uint64_t off);
  *
  * This function must be called during TX_STAGE_WORK.
  */
-int
-dav_tx_add_range(uint64_t off, size_t size);
+int dav_tx_add_range(uint64_t off, size_t size);
 
 /*
  * Takes a "snapshot" of the given memory region and saves it in the undo log.
@@ -294,8 +314,7 @@ dav_tx_add_range(uint64_t off, size_t size);
  *
  * This function must be called during TX_STAGE_WORK.
  */
-int
-dav_tx_add_range_direct(const void *ptr, size_t size);
+int dav_tx_add_range_direct(const void *ptr, size_t size);
 
 /*
  * Behaves exactly the same as dav_tx_add_range when 'flags' equals 0.
@@ -306,8 +325,7 @@ dav_tx_add_range_direct(const void *ptr, size_t size);
  *  - POBJ_XADD_NO_ABORT - if the function does not end successfully,
  *  do not abort the transaction and return the error number.
  */
-int
-dav_tx_xadd_range(uint64_t off, size_t size, uint64_t flags);
+int dav_tx_xadd_range(uint64_t off, size_t size, uint64_t flags);
 
 /*
  * Behaves exactly the same as dav_tx_add_range_direct when 'flags' equals
@@ -318,15 +336,13 @@ dav_tx_xadd_range(uint64_t off, size_t size, uint64_t flags);
  *  - POBJ_XADD_NO_ABORT - if the function does not end successfully,
  *  do not abort the transaction and return the error number.
  */
-int
-dav_tx_xadd_range_direct(const void *ptr, size_t size, uint64_t flags);
+int dav_tx_xadd_range_direct(const void *ptr, size_t size, uint64_t flags);
 
 /*
  * Converts the offset to a pointer in the context of heap associated with
  * current transaction.
  */
-void *
-dav_tx_off2ptr(uint64_t off);
+void *dav_tx_off2ptr(uint64_t off);
 
 enum dav_action_type {
 	/* a heap action (e.g., alloc) */
@@ -353,27 +369,18 @@ struct dav_action {
 	 * This structure should NEVER be stored on persistent memory!
 	 */
 	enum dav_action_type type;
-	uint32_t             data[3];
+	uint32_t data[3];
 	union {
 		struct dav_action_heap heap;
-		uint64_t               data2[14];
+		uint64_t data2[14];
 	};
 };
 
-#define DAV_ACTION_XRESERVE_VALID_FLAGS                                                            \
-	(DAV_XALLOC_CLASS_MASK | DAV_XALLOC_EZONE_MASK | DAV_XALLOC_ZERO)
-
-uint64_t
-dav_reserve(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num,
-	     uint64_t flags);
-void
-dav_defer_free(dav_obj_t *pop, uint64_t off, struct dav_action *act);
-int
-dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt);
-void
-dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt);
-int
-dav_tx_publish(struct dav_action *actv, size_t actvcnt);
+uint64_t dav_reserve(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num);
+void dav_defer_free(dav_obj_t *pop, uint64_t off, struct dav_action *act);
+int dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt);
+void dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt);
+int dav_tx_publish(struct dav_action *actv, size_t actvcnt);
 
 /*
  * Allocation class interface
@@ -456,7 +463,7 @@ struct dav_alloc_class_desc {
 	 * containing 256 bytes that spans two units. The usable size of that
 	 * allocation will be 240 bytes: 2 * 128 - 16 (header).
 	 */
-	size_t               unit_size;
+	size_t unit_size;
 
 	/*
 	 * Desired alignment of objects from the allocation class.
@@ -467,7 +474,7 @@ struct dav_alloc_class_desc {
 	 * compact one this means that the alignment is 48 bytes.
 	 *
 	 */
-	size_t               alignment;
+	size_t alignment;
 
 	/*
 	 * The minimum number of units that must be present in a
@@ -480,7 +487,7 @@ struct dav_alloc_class_desc {
 	 * allocate, but blocks do go back to the global heap if they are no
 	 * longer actively used for allocation.
 	 */
-	unsigned             units_per_block;
+	unsigned units_per_block;
 
 	/*
 	 * The header of allocations that originate from this allocation class.
@@ -490,14 +497,14 @@ struct dav_alloc_class_desc {
 	/*
 	 * The identifier of this allocation class.
 	 */
-	unsigned             class_id;
+	unsigned class_id;
 };
 
 /*
  * Registers an allocation class handle with the DAV object.
  */
-int
-dav_class_register(dav_obj_t *pop, struct dav_alloc_class_desc *p);
+int dav_class_register(dav_obj_t *pop, struct dav_alloc_class_desc *p);
+
 
 struct dav_heap_stats {
 	uint64_t curr_allocated;
@@ -508,30 +515,13 @@ struct dav_heap_stats {
  * Returns the heap allocation statistics associated  with the
  * DAV object.
  */
-int
-dav_get_heap_stats(dav_obj_t *pop, struct dav_heap_stats *st);
+int dav_get_heap_stats(dav_obj_t *pop, struct dav_heap_stats *st);
 
 struct umem_wal_tx;
 
-uint32_t
-wal_tx_act_nr(struct umem_wal_tx *tx);
-uint32_t
-wal_tx_payload_len(struct umem_wal_tx *tx);
-struct umem_action *
-wal_tx_act_first(struct umem_wal_tx *tx);
-struct umem_action *
-wal_tx_act_next(struct umem_wal_tx *tx);
-
-/**
- * Get an evictable zone with sufficient free space within.
- *
- * \param[in]		pop		pool handle
- * \param[in]		flags		zone selection criteria.
- *
- * \return id >= 0. Zero indicates non-evictable zone and will be
- *	returned if no evictable zone can be chosen.
- */
-uint32_t
-dav_get_zone_evictable(dav_obj_t *pop, int flags);
+uint32_t wal_tx_act_nr(struct umem_wal_tx *tx);
+uint32_t wal_tx_payload_len(struct umem_wal_tx *tx);
+struct umem_action *wal_tx_act_first(struct umem_wal_tx *tx);
+struct umem_action *wal_tx_act_next(struct umem_wal_tx *tx);
 
 #endif /* __DAOS_COMMON_DAV_H */
diff --git a/src/common/dav/dav_iface.c b/src/common/dav/dav_iface.c
index c1686570390..36d3c17a162 100644
--- a/src/common/dav/dav_iface.c
+++ b/src/common/dav/dav_iface.c
@@ -180,6 +180,13 @@ dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct ume
 		palloc_heap_vg_open(hdl->do_heap, 1);
 #endif
 
+	rc = heap_buckets_init(hdl->do_heap);
+	if (rc) {
+		err = rc;
+		heap_cleanup(hdl->do_heap);
+		goto out2;
+	}
+
 	rc = dav_create_clogs(hdl);
 	if (rc) {
 		err = rc;
diff --git a/src/common/dav/dav_internal.h b/src/common/dav/dav_internal.h
index 9c9b263c494..0f8ddff5916 100644
--- a/src/common/dav/dav_internal.h
+++ b/src/common/dav/dav_internal.h
@@ -28,6 +28,11 @@ enum dav_stats_enabled {
 	DAV_STATS_DISABLED,
 };
 
+enum dav_arenas_assignment_type {
+	DAV_ARENAS_ASSIGNMENT_THREAD_KEY,
+	DAV_ARENAS_ASSIGNMENT_GLOBAL,
+};
+
 #define	DAV_PHDR_SIZE	4096
 
 /* DAV header data that will be persisted */
diff --git a/src/common/dav/heap.c b/src/common/dav/heap.c
index 6e4aec10edf..4384fe40f8c 100644
--- a/src/common/dav/heap.c
+++ b/src/common/dav/heap.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2015-2023, Intel Corporation */
+/* Copyright 2015-2022, Intel Corporation */
 
 /*
  * heap.c -- heap implementation
@@ -31,70 +31,126 @@
  * This is the value by which the heap might grow once we hit an OOM.
  */
 #define HEAP_DEFAULT_GROW_SIZE (1 << 27) /* 128 megabytes */
+#define MAX_DEFAULT_ARENAS (1 << 10) /* 1024 arenas */
 
-/*
- * zoneset stores the collection of buckets and recyclers for allocation classes.
- * Each evictable zone is assigned a zoneset during first allocation.
- */
-struct zoneset {
-	uint32_t              zset_id;
-	uint32_t              padding;
-	struct bucket_locked *default_bucket;                  /* bucket for free chunks */
-	struct bucket_locked *buckets[MAX_ALLOCATION_CLASSES]; /* one bucket per allocation class */
-	struct recycler      *recyclers[MAX_ALLOCATION_CLASSES];
+enum dav_arenas_assignment_type Default_arenas_assignment_type =
+	DAV_ARENAS_ASSIGNMENT_GLOBAL;
+
+size_t Default_arenas_max;
+
+struct arenas_thread_assignment {
+	enum dav_arenas_assignment_type type;
+	union {
+		os_tls_key_t thread;
+		struct arena *global;
+	};
 };
 
-struct heap_rt {
-	struct alloc_class_collection *alloc_classes;
-	struct zoneset                *default_zset;
-	struct zoneset               **evictable_zsets;
-	os_mutex_t                     run_locks[MAX_RUN_LOCKS];
-	unsigned                       nlocks;
-	unsigned                       nzones;
-	unsigned                       zones_exhausted;
+struct arenas {
+	VEC(, struct arena *) vec;
+	size_t nactive;
+
+	/*
+	 * When nesting with other locks, this one must be acquired first,
+	 * prior to locking any buckets or memory blocks.
+	 */
+	os_mutex_t lock;
+
+	/* stores a pointer to one of the arenas */
+	struct arenas_thread_assignment assignment;
 };
 
 /*
- * heap_get_zoneset - returns the reference to the zoneset given
- *		      zone or zoneset id.
+ * Arenas store the collection of buckets for allocation classes.
+ * Each thread is assigned an arena on its first allocator operation
+ * if arena is set to auto.
  */
-struct zoneset *
-heap_get_zoneset(struct palloc_heap *heap, uint32_t zone_id)
-{
-	/* REVISIT:
-	 * Implement the code for evictable zonesets.
+struct arena {
+	/* one bucket per allocation class */
+	struct bucket_locked *buckets[MAX_ALLOCATION_CLASSES];
+
+	/*
+	 * Decides whether the arena can be
+	 * automatically assigned to a thread.
 	 */
-	return heap->rt->default_zset;
-}
+	int automatic;
+	size_t nthreads;
+	struct arenas *arenas;
+};
+
+struct heap_rt {
+	struct alloc_class_collection *alloc_classes;
+
+	struct bucket_locked *default_bucket;
+
+	struct arenas arenas;
+
+	struct recycler *recyclers[MAX_ALLOCATION_CLASSES];
+
+	os_mutex_t run_locks[MAX_RUN_LOCKS];
+	unsigned nlocks;
+
+	unsigned nzones;
+	unsigned zones_exhausted;
+};
 
 /*
- * heap_get_recycler - (internal) retrieves the recycler instance from the zoneset with
+ * heap_get_recycler - (internal) retrieves the recycler instance with
  *	the corresponding class id. Initializes the recycler if needed.
+ *
  */
 static struct recycler *
-heap_get_recycler(struct palloc_heap *heap, struct zoneset *zset, size_t id, size_t nallocs)
+heap_get_recycler(struct palloc_heap *heap, size_t id, size_t nallocs)
 {
 	struct recycler *r;
 
-	D_ASSERT(zset != NULL);
-	util_atomic_load_explicit64(&zset->recyclers[id], &r, memory_order_acquire);
+	util_atomic_load_explicit64(&heap->rt->recyclers[id], &r,
+		memory_order_acquire);
+
 	if (r != NULL)
 		return r;
 
-	r = recycler_new(heap, nallocs, zset);
-	if (r && !util_bool_compare_and_swap64(&zset->recyclers[id], NULL, r)) {
+	r = recycler_new(heap, nallocs,
+		&heap->rt->arenas.nactive);
+	if (r && !util_bool_compare_and_swap64(&heap->rt->recyclers[id],
+		NULL, r)) {
 		/*
 		 * If a different thread succeeded in assigning the recycler
 		 * first, the recycler this thread created needs to be deleted.
 		 */
 		recycler_delete(r);
 
-		return heap_get_recycler(heap, zset, id, nallocs);
+		return heap_get_recycler(heap, id, nallocs);
 	}
 
 	return r;
 }
 
+/*
+ * heap_arenas_init - (internal) initialize generic arenas info
+ */
+static int
+heap_arenas_init(struct arenas *arenas)
+{
+	util_mutex_init(&arenas->lock);
+	VEC_INIT(&arenas->vec);
+	arenas->nactive = 0;
+
+	if (VEC_RESERVE(&arenas->vec, MAX_DEFAULT_ARENAS) == -1)
+		return -1;
+	return 0;
+}
+
+/*
+ * heap_arenas_fini - (internal) destroy generic arenas info
+ */
+static void
+heap_arenas_fini(struct arenas *arenas)
+{
+	util_mutex_destroy(&arenas->lock);
+	VEC_DELETE(&arenas->vec);
+}
+
 /*
  * heap_alloc_classes -- returns the allocation classes collection
  */
@@ -104,6 +160,58 @@ heap_alloc_classes(struct palloc_heap *heap)
 	return heap->rt ? heap->rt->alloc_classes : NULL;
 }
 
+/*
+ * heap_arena_delete -- (internal) destroys arena instance
+ */
+static void
+heap_arena_delete(struct arena *arena)
+{
+	for (int i = 0; i < MAX_ALLOCATION_CLASSES; ++i)
+		if (arena->buckets[i] != NULL)
+			bucket_locked_delete(arena->buckets[i]);
+	D_FREE(arena);
+}
+
+/*
+ * heap_arena_new -- (internal) initializes arena instance
+ */
+static struct arena *
+heap_arena_new(struct palloc_heap *heap, int automatic)
+{
+	struct heap_rt *rt = heap->rt;
+	struct arena *arena;
+
+	D_ALLOC_PTR(arena);
+	if (arena == NULL) {
+		D_CRIT("!heap: arena malloc error\n");
+		return NULL;
+	}
+	arena->nthreads = 0;
+	arena->automatic = automatic;
+	arena->arenas = &heap->rt->arenas;
+
+	COMPILE_ERROR_ON(MAX_ALLOCATION_CLASSES > UINT8_MAX);
+	for (uint8_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		struct alloc_class *ac =
+			alloc_class_by_id(rt->alloc_classes, i);
+		if (ac != NULL) {
+			arena->buckets[i] =
+				bucket_locked_new(container_new_seglists(heap),
+					ac);
+			if (arena->buckets[i] == NULL)
+				goto error_bucket_create;
+		} else {
+			arena->buckets[i] = NULL;
+		}
+	}
+
+	return arena;
+
+error_bucket_create:
+	heap_arena_delete(arena);
+	return NULL;
+}
+
 /*
  * heap_get_best_class -- returns the alloc class that best fits the
  *	requested size
@@ -115,29 +223,283 @@ heap_get_best_class(struct palloc_heap *heap, size_t size)
 }
 
 /*
- * zoneset_bucket_acquire -- fetches by zoneset or by id a bucket exclusive
- * for the thread until zoneset_bucket_release is called
+ * heap_arena_thread_detach -- detaches arena from the current thread
+ *
+ * Must be called with arenas lock taken.
+ */
+static void
+heap_arena_thread_detach(struct arena *a)
+{
+	/*
+	 * Even though this is under a lock, nactive variable can also be read
+	 * concurrently from the recycler (without the arenas lock).
+	 * That's why we are using an atomic operation.
+	 */
+	if ((--a->nthreads) == 0)
+		util_fetch_and_sub64(&a->arenas->nactive, 1);
+}
+
+/*
+ * heap_arena_thread_attach -- assign arena to the current thread
+ *
+ * Must be called with arenas lock taken.
+ */
+static void
+heap_arena_thread_attach(struct palloc_heap *heap, struct arena *a)
+{
+	struct heap_rt *h = heap->rt;
+
+	struct arenas_thread_assignment *assignment = &h->arenas.assignment;
+
+	ASSERTeq(assignment->type, DAV_ARENAS_ASSIGNMENT_THREAD_KEY);
+
+	struct arena *thread_arena = os_tls_get(assignment->thread);
+
+	if (thread_arena)
+		heap_arena_thread_detach(thread_arena);
+
+	ASSERTne(a, NULL);
+
+	/*
+	 * Even though this is under a lock, nactive variable can also be read
+	 * concurrently from the recycler (without the arenas lock).
+	 * That's why we are using an atomic operation.
+	 */
+	if ((a->nthreads++) == 0)
+		util_fetch_and_add64(&a->arenas->nactive, 1);
+
+	os_tls_set(assignment->thread, a);
+}
+
+/*
+ * heap_thread_arena_destructor -- (internal) removes arena thread assignment
+ */
+static void
+heap_thread_arena_destructor(void *arg)
+{
+	struct arena *a = arg;
+
+	os_mutex_lock(&a->arenas->lock);
+	heap_arena_thread_detach(a);
+	os_mutex_unlock(&a->arenas->lock);
+}
+
+/*
+ * arena_thread_assignment_init -- (internal) initializes thread assignment
+ *	type for arenas.
+ */
+static int
+arena_thread_assignment_init(struct arenas_thread_assignment *assignment,
+	enum dav_arenas_assignment_type type)
+{
+	int ret = 0;
+
+	assignment->type = type;
+
+	switch (type) {
+	case DAV_ARENAS_ASSIGNMENT_THREAD_KEY:
+		ret = os_tls_key_create(&assignment->thread,
+			heap_thread_arena_destructor);
+		break;
+	case DAV_ARENAS_ASSIGNMENT_GLOBAL:
+		assignment->global = NULL;
+		break;
+	default:
+		ASSERT(0); /* unreachable */
+	}
+
+	return ret;
+}
+
+/*
+ * arena_thread_assignment_fini -- (internal) destroys thread assignment
+ *	type for arenas.
+ */
+static void
+arena_thread_assignment_fini(struct arenas_thread_assignment *assignment)
+{
+	switch (assignment->type) {
+	case DAV_ARENAS_ASSIGNMENT_THREAD_KEY:
+		os_tls_key_delete(assignment->thread);
+		break;
+	case DAV_ARENAS_ASSIGNMENT_GLOBAL:
+		break;
+	default:
+		ASSERT(0); /* unreachable */
+	}
+}
+
+/*
+ * heap_get_arena_by_id -- returns arena by id
+ *
+ * Must be called with arenas lock taken.
+ */
+static struct arena *
+heap_get_arena_by_id(struct palloc_heap *heap, unsigned arena_id)
+{
+	return VEC_ARR(&heap->rt->arenas.vec)[arena_id - 1];
+}
+
+/*
+ * heap_global_arena_assign -- (internal) assigns the first automatic arena
+ *	as the heaps' global arena assignment.
+ */
+static struct arena *
+heap_global_arena_assign(struct palloc_heap *heap)
+{
+	util_mutex_lock(&heap->rt->arenas.lock);
+
+	ASSERTne(VEC_SIZE(&heap->rt->arenas.vec), 0);
+
+	struct arena *a = NULL;
+
+	VEC_FOREACH(a, &heap->rt->arenas.vec) {
+		if (a->automatic)
+			break;
+	}
+
+	DAV_DBG("assigning %p arena to current thread", a);
+
+	/* at least one automatic arena must exist */
+	ASSERTne(a, NULL);
+	heap->rt->arenas.assignment.global = a;
+
+	util_mutex_unlock(&heap->rt->arenas.lock);
+
+	return a;
+}
+
+/*
+ * heap_thread_arena_assign -- (internal) assigns the least used arena
+ *	to current thread
+ *
+ * To avoid complexities with regards to races in the search for the least
+ * used arena, a lock is used, but the nthreads counter of the arena is still
+ * bumped using atomic instruction because it can happen in parallel to a
+ * destructor of a thread, which also touches that variable.
+ */
+static struct arena *
+heap_thread_arena_assign(struct palloc_heap *heap)
+{
+	util_mutex_lock(&heap->rt->arenas.lock);
+
+	struct arena *least_used = NULL;
+
+	ASSERTne(VEC_SIZE(&heap->rt->arenas.vec), 0);
+
+	struct arena *a;
+
+	VEC_FOREACH(a, &heap->rt->arenas.vec) {
+		if (!a->automatic)
+			continue;
+		if (least_used == NULL ||
+			a->nthreads < least_used->nthreads)
+			least_used = a;
+	}
+
+	DAV_DBG("assigning %p arena to current thread", least_used);
+
+	/* at least one automatic arena must exist */
+	ASSERTne(least_used, NULL);
+	heap_arena_thread_attach(heap, least_used);
+
+	util_mutex_unlock(&heap->rt->arenas.lock);
+
+	return least_used;
+}
+
+/*
+ * heap_thread_arena -- (internal) returns the arena assigned to the current
+ *	thread
+ */
+static struct arena *
+heap_thread_arena(struct palloc_heap *heap)
+{
+	struct arenas_thread_assignment *assignment =
+		&heap->rt->arenas.assignment;
+	struct arena *arena = NULL;
+
+	switch (assignment->type) {
+	case DAV_ARENAS_ASSIGNMENT_THREAD_KEY:
+		arena = os_tls_get(assignment->thread);
+		if (arena == NULL)
+			arena = heap_thread_arena_assign(heap);
+		break;
+	case DAV_ARENAS_ASSIGNMENT_GLOBAL:
+		arena = assignment->global;
+		if (arena == NULL)
+			arena = heap_global_arena_assign(heap);
+		break;
+	default:
+		ASSERT(0); /* unreachable */
+	}
+
+	ASSERTne(arena, NULL);
+
+	return arena;
+}
+
+/*
+ * heap_get_thread_arena_id -- returns the arena id assigned to the current
+ *	thread
+ */
+unsigned
+heap_get_thread_arena_id(struct palloc_heap *heap)
+{
+	unsigned arena_id = 1;
+	struct arena *arenap = heap_thread_arena(heap);
+	struct arena *arenav;
+	struct heap_rt *rt = heap->rt;
+
+	util_mutex_lock(&rt->arenas.lock);
+	VEC_FOREACH(arenav, &heap->rt->arenas.vec) {
+		if (arenav == arenap) {
+			util_mutex_unlock(&rt->arenas.lock);
+			return arena_id;
+		}
+		arena_id++;
+	}
+
+	util_mutex_unlock(&rt->arenas.lock);
+	ASSERT(0);
+	return arena_id;
+}
+
+/*
+ * heap_bucket_acquire -- fetches by arena or by id a bucket exclusive
+ * for the thread until heap_bucket_release is called
  */
 struct bucket *
-zoneset_bucket_acquire(struct zoneset *zset, uint8_t class_id)
+heap_bucket_acquire(struct palloc_heap *heap, uint8_t class_id,
+		uint16_t arena_id)
 {
+	struct heap_rt *rt = heap->rt;
 	struct bucket_locked *b;
 
-	D_ASSERT(zset != NULL);
+	if (class_id == DEFAULT_ALLOC_CLASS_ID) {
+		b = rt->default_bucket;
+		goto out;
+	}
+
+	if (arena_id == HEAP_ARENA_PER_THREAD) {
+		struct arena *arena = heap_thread_arena(heap);
 
-	if (class_id == DEFAULT_ALLOC_CLASS_ID)
-		b = zset->default_bucket;
-	else
-		b = zset->buckets[class_id];
+		ASSERTne(arena->buckets, NULL);
+		b = arena->buckets[class_id];
+	} else {
+		b = (VEC_ARR(&heap->rt->arenas.vec)
+			[arena_id - 1])->buckets[class_id];
+	}
 
+out:
 	return bucket_acquire(b);
 }
 
 /*
- * zoneset_bucket_release -- puts the bucket back into the heap
+ * heap_bucket_release -- puts the bucket back into the heap
  */
 void
-zoneset_bucket_release(struct bucket *b)
+heap_bucket_release(struct bucket *b)
 {
 	bucket_release(b);
 }
@@ -382,9 +744,8 @@ heap_run_into_free_chunk(struct palloc_heap *heap,
 static int
 heap_reclaim_run(struct palloc_heap *heap, struct memory_block *m, int startup)
 {
-	struct chunk_run    *run  = heap_get_chunk_run(heap, m);
+	struct chunk_run *run = heap_get_chunk_run(heap, m);
 	struct chunk_header *hdr = heap_get_chunk_hdr(heap, m);
-	struct zoneset      *zset = heap_get_zoneset(heap, m->zone_id);
 
 	struct alloc_class *c = alloc_class_by_run(
 		heap->rt->alloc_classes,
@@ -412,7 +773,8 @@ heap_reclaim_run(struct palloc_heap *heap, struct memory_block *m, int startup)
 		STATS_INC(heap->stats, transient, heap_run_allocated,
 			(c->rdsc.nallocs - e.free_space) * run->hdr.block_size);
 	}
-	struct recycler *recycler = heap_get_recycler(heap, zset, c->id, c->rdsc.nallocs);
+	struct recycler *recycler = heap_get_recycler(heap, c->id,
+		c->rdsc.nallocs);
 
 	if (recycler == NULL || recycler_put(recycler, e) < 0)
 		ERR("lost runtime tracking info of %u run due to OOM", c->id);
@@ -504,27 +866,24 @@ static int
 heap_recycle_unused(struct palloc_heap *heap, struct recycler *recycler,
 	struct bucket *defb, int force)
 {
-	struct zoneset      *zset;
-	struct memory_block *nm;
-	struct empty_runs    r = recycler_recalc(recycler, force);
-	struct bucket       *nb;
+	struct empty_runs r = recycler_recalc(recycler, force);
 
 	if (VEC_SIZE(&r) == 0)
 		return ENOMEM;
 
-	zset = recycler_get_zoneset(recycler);
-	D_ASSERT(zset != NULL);
-
-	nb = defb == NULL ? zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID) : NULL;
+	struct bucket *nb = defb == NULL ? heap_bucket_acquire(heap,
+		DEFAULT_ALLOC_CLASS_ID, HEAP_ARENA_PER_THREAD) : NULL;
 
 	ASSERT(defb != NULL || nb != NULL);
 
+	struct memory_block *nm;
+
 	VEC_FOREACH_BY_PTR(nm, &r) {
 		heap_run_into_free_chunk(heap, defb ? defb : nb, nm);
 	}
 
 	if (nb != NULL)
-		zoneset_bucket_release(nb);
+		heap_bucket_release(nb);
 
 	VEC_DELETE(&r);
 
@@ -537,12 +896,11 @@ heap_recycle_unused(struct palloc_heap *heap, struct recycler *recycler,
 static int
 heap_reclaim_garbage(struct palloc_heap *heap, struct bucket *bucket)
 {
-	int              ret = ENOMEM;
+	int ret = ENOMEM;
 	struct recycler *r;
-	struct zoneset  *zset = bucket_get_zoneset(bucket);
 
 	for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
-		r = zset->recyclers[i];
+		r = heap->rt->recyclers[i];
 		if (r == NULL)
 			continue;
 
@@ -595,15 +953,14 @@ heap_ensure_huge_bucket_filled(struct palloc_heap *heap,
 void
 heap_discard_run(struct palloc_heap *heap, struct memory_block *m)
 {
-	struct zoneset *zset = heap_get_zoneset(heap, m->zone_id);
-
-	D_ASSERT(zset != NULL);
 	if (heap_reclaim_run(heap, m, 0)) {
-		struct bucket *b = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID);
+		struct bucket *b =
+			heap_bucket_acquire(heap,
+			DEFAULT_ALLOC_CLASS_ID, 0);
 
 		heap_run_into_free_chunk(heap, b, m);
 
-		zoneset_bucket_release(b);
+		heap_bucket_release(b);
 	}
 }
 
@@ -626,6 +983,34 @@ heap_detach_and_try_discard_run(struct palloc_heap *heap, struct bucket *b)
 	return 0;
 }
 
+/*
+ * heap_force_recycle -- detaches all memory from arenas, and forces global
+ *	recycling of all memory blocks
+ */
+void
+heap_force_recycle(struct palloc_heap *heap)
+{
+	util_mutex_lock(&heap->rt->arenas.lock);
+	struct arena *arenap;
+
+	VEC_FOREACH(arenap, &heap->rt->arenas.vec) {
+		for (int i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+			struct bucket_locked *locked = arenap->buckets[i];
+
+			if (locked == NULL)
+				continue;
+
+			struct bucket *b = bucket_acquire(locked);
+
+			heap_detach_and_try_discard_run(heap, b);
+
+			bucket_release(b);
+		}
+	}
+	util_mutex_unlock(&heap->rt->arenas.lock);
+	heap_reclaim_garbage(heap, NULL);
+}
+
 /*
  * heap_reuse_from_recycler -- (internal) try reusing runs that are currently
  *	in the recycler
@@ -634,14 +1019,14 @@ static int
 heap_reuse_from_recycler(struct palloc_heap *heap,
 	struct bucket *b, uint32_t units, int force)
 {
-	struct zoneset     *zset = bucket_get_zoneset(b);
 	struct memory_block m = MEMORY_BLOCK_NONE;
 
 	m.size_idx = units;
 
 	struct alloc_class *aclass = bucket_alloc_class(b);
 
-	struct recycler *recycler = heap_get_recycler(heap, zset, aclass->id, aclass->rdsc.nallocs);
+	struct recycler *recycler = heap_get_recycler(heap, aclass->id,
+		aclass->rdsc.nallocs);
 
 	if (recycler == NULL) {
 		ERR("lost runtime tracking info of %u run due to OOM",
@@ -687,9 +1072,7 @@ heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b,
 {
 	int ret = 0;
 	struct alloc_class *aclass = bucket_alloc_class(b);
-	struct zoneset     *zset   = bucket_get_zoneset(b);
 
-	D_ASSERT(zset != NULL);
 	ASSERTeq(aclass->type, CLASS_RUN);
 
 	if (heap_detach_and_try_discard_run(heap, b) != 0)
@@ -699,10 +1082,12 @@ heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b,
 		goto out;
 
 	/* search in the next zone before attempting to create a new run */
-	struct bucket *defb = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID);
+	struct bucket *defb = heap_bucket_acquire(heap,
+		DEFAULT_ALLOC_CLASS_ID,
+		HEAP_ARENA_PER_THREAD);
 
 	heap_populate_bucket(heap, defb);
-	zoneset_bucket_release(defb);
+	heap_bucket_release(defb);
 
 	if (heap_reuse_from_recycler(heap, b, units, 0) == 0)
 		goto out;
@@ -711,21 +1096,23 @@ heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b,
 
 	m.size_idx = aclass->rdsc.size_idx;
 
-	defb = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID);
+	defb = heap_bucket_acquire(heap,
+		DEFAULT_ALLOC_CLASS_ID,
+		HEAP_ARENA_PER_THREAD);
 
 	/* cannot reuse an existing run, create a new one */
 	if (heap_get_bestfit_block(heap, defb, &m) == 0) {
 		ASSERTeq(m.block_off, 0);
 		if (heap_run_create(heap, b, &m) != 0) {
-			zoneset_bucket_release(defb);
+			heap_bucket_release(defb);
 			return ENOMEM;
 		}
 
-		zoneset_bucket_release(defb);
+		heap_bucket_release(defb);
 
 		goto out;
 	}
-	zoneset_bucket_release(defb);
+	heap_bucket_release(defb);
 
 	if (heap_reuse_from_recycler(heap, b, units, 0) == 0)
 		goto out;
@@ -743,8 +1130,6 @@ heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b,
 void
 heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m)
 {
-	struct zoneset *zset = heap_get_zoneset(heap, m->zone_id);
-
 	if (m->type != MEMORY_BLOCK_RUN)
 		return;
 
@@ -760,7 +1145,8 @@ heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m)
 	if (c == NULL)
 		return;
 
-	struct recycler *recycler = heap_get_recycler(heap, zset, c->id, c->rdsc.nallocs);
+	struct recycler *recycler = heap_get_recycler(heap, c->id,
+		c->rdsc.nallocs);
 
 	if (recycler == NULL) {
 		ERR("lost runtime tracking info of %u run due to OOM",
@@ -852,74 +1238,210 @@ heap_end(struct palloc_heap *h)
 }
 
 /*
- * heap_default_zoneset_init -- (internal) initializes default zone
+ * heap_arena_create -- create a new arena, push it to the vector
+ * and return new arena id or -1 on failure
  */
-static int
-heap_default_zoneset_init(struct palloc_heap *heap)
+int
+heap_arena_create(struct palloc_heap *heap)
 {
-	struct heap_rt     *h = heap->rt;
-	struct zoneset     *default_zset;
-	struct alloc_class *c;
-	uint8_t             i;
+	struct heap_rt *h = heap->rt;
+	struct arena *arena = heap_arena_new(heap, 0);
 
-	D_ALLOC_PTR(default_zset);
-	if (default_zset == NULL)
+	if (arena == NULL)
 		return -1;
 
-	for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
-		c = alloc_class_by_id(h->alloc_classes, i);
+	util_mutex_lock(&h->arenas.lock);
 
-		if (c == NULL)
-			continue;
+	if (VEC_PUSH_BACK(&h->arenas.vec, arena))
+		goto err_push_back;
 
-		default_zset->buckets[c->id] =
-		    bucket_locked_new(container_new_seglists(heap), c, default_zset);
-		if (default_zset->buckets[c->id] == NULL)
-			goto error_bucket_create;
-	}
+	int ret = (int)VEC_SIZE(&h->arenas.vec);
 
-	default_zset->default_bucket = bucket_locked_new(
-	    container_new_ravl(heap), alloc_class_by_id(h->alloc_classes, DEFAULT_ALLOC_CLASS_ID),
-	    default_zset);
+	util_mutex_unlock(&h->arenas.lock);
 
-	if (default_zset->default_bucket == NULL)
-		goto error_bucket_create;
+	return ret;
 
-	heap->rt->default_zset = default_zset;
-	return 0;
+err_push_back:
+	util_mutex_unlock(&h->arenas.lock);
+	heap_arena_delete(arena);
+	return -1;
+}
 
-error_bucket_create:
-	for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
-		c = alloc_class_by_id(h->alloc_classes, i);
-		if (c != NULL) {
-			if (default_zset->buckets[c->id] != NULL)
-				bucket_locked_delete(default_zset->buckets[c->id]);
-		}
+/*
+ * heap_get_narenas_total -- returns the number of all arenas in the heap
+ */
+unsigned
+heap_get_narenas_total(struct palloc_heap *heap)
+{
+	struct heap_rt *h = heap->rt;
+
+	util_mutex_lock(&h->arenas.lock);
+
+	unsigned total = (unsigned)VEC_SIZE(&h->arenas.vec);
+
+	util_mutex_unlock(&h->arenas.lock);
+
+	return total;
+}
+
+/*
+ * heap_get_narenas_max -- returns the max number of arenas
+ */
+unsigned
+heap_get_narenas_max(struct palloc_heap *heap)
+{
+	struct heap_rt *h = heap->rt;
+
+	util_mutex_lock(&h->arenas.lock);
+
+	unsigned max = (unsigned)VEC_CAPACITY(&h->arenas.vec);
+
+	util_mutex_unlock(&h->arenas.lock);
+
+	return max;
+}
+
+/*
+ * heap_set_narenas_max -- change the max number of arenas
+ */
+int
+heap_set_narenas_max(struct palloc_heap *heap, unsigned size)
+{
+	struct heap_rt *h = heap->rt;
+	int ret = -1;
+
+	util_mutex_lock(&h->arenas.lock);
+
+	unsigned capacity = (unsigned)VEC_CAPACITY(&h->arenas.vec);
+
+	if (size < capacity) {
+		ERR("cannot decrease max number of arenas");
+		goto out;
+	} else if (size == capacity) {
+		ret = 0;
+		goto out;
 	}
-	D_FREE(default_zset);
-	return -1;
+
+	ret = VEC_RESERVE(&h->arenas.vec, size);
+
+out:
+	util_mutex_unlock(&h->arenas.lock);
+	return ret;
 }
 
-static void
-heap_default_zoneset_cleanup(struct palloc_heap *heap)
+/*
+ * heap_get_narenas_auto -- returns the number of all automatic arenas
+ */
+unsigned
+heap_get_narenas_auto(struct palloc_heap *heap)
 {
-	struct zoneset  *default_zset = heap->rt->default_zset;
-	uint8_t          i;
+	struct heap_rt *h = heap->rt;
+	struct arena *arena;
+	unsigned narenas = 0;
 
-	for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
-		if (default_zset->buckets[i] == NULL)
-			continue;
-		bucket_locked_delete(default_zset->buckets[i]);
+	util_mutex_lock(&h->arenas.lock);
+
+	VEC_FOREACH(arena, &h->arenas.vec) {
+		if (arena->automatic)
+			narenas++;
 	}
-	bucket_locked_delete(default_zset->default_bucket);
 
-	for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
-		if (default_zset->recyclers[i] == NULL)
-			continue;
-		recycler_delete(default_zset->recyclers[i]);
+	util_mutex_unlock(&h->arenas.lock);
+
+	return narenas;
+}
+
+/*
+ * heap_get_arena_buckets -- returns a pointer to buckets from the arena
+ */
+struct bucket_locked **
+heap_get_arena_buckets(struct palloc_heap *heap, unsigned arena_id)
+{
+	util_mutex_lock(&heap->rt->arenas.lock);
+	struct arena *a = heap_get_arena_by_id(heap, arena_id);
+
+	util_mutex_unlock(&heap->rt->arenas.lock);
+
+	return a->buckets;
+}
+
+/*
+ * heap_get_arena_auto -- returns arena automatic value
+ */
+int
+heap_get_arena_auto(struct palloc_heap *heap, unsigned arena_id)
+{
+	int value;
+
+	util_mutex_lock(&heap->rt->arenas.lock);
+
+	struct arena *a = heap_get_arena_by_id(heap, arena_id);
+	value           = a->automatic;
+
+	util_mutex_unlock(&heap->rt->arenas.lock);
+
+	return value;
+}
+
+/*
+ * heap_set_arena_auto -- sets arena automatic value
+ */
+int
+heap_set_arena_auto(struct palloc_heap *heap, unsigned arena_id,
+		int automatic)
+{
+	unsigned nautomatic = 0;
+	struct arena *a;
+	struct heap_rt *h = heap->rt;
+	int ret = 0;
+
+	util_mutex_lock(&h->arenas.lock);
+	VEC_FOREACH(a, &h->arenas.vec)
+		if (a->automatic)
+			nautomatic++;
+
+	a = VEC_ARR(&heap->rt->arenas.vec)[arena_id - 1];
+
+	if (!automatic && nautomatic <= 1 && a->automatic) {
+		D_CRIT("at least one automatic arena must exist\n");
+		ret = -1;
+		goto out;
 	}
-	D_FREE(default_zset);
-	heap->rt->default_zset = NULL;
+	a->automatic = automatic;
+
+out:
+	util_mutex_unlock(&h->arenas.lock);
+	return ret;
+
+}
+
+/*
+ * heap_set_arena_thread -- assign arena with given id to the current thread
+ */
+void
+heap_set_arena_thread(struct palloc_heap *heap, unsigned arena_id)
+{
+	os_mutex_lock(&heap->rt->arenas.lock);
+	heap_arena_thread_attach(heap, heap_get_arena_by_id(heap, arena_id));
+	os_mutex_unlock(&heap->rt->arenas.lock);
+}
+
+/*
+ * heap_get_procs -- returns the number of arenas to create
+ */
+unsigned
+heap_get_procs(void)
+{
+	long cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+	if (cpus < 1)
+		cpus = 1;
+
+	unsigned arenas = (unsigned)cpus;
+
+	DAV_DBG("creating %u arenas", arenas);
+
+	return arenas;
 }
 
 /*
@@ -929,16 +1451,61 @@ heap_default_zoneset_cleanup(struct palloc_heap *heap)
 int
 heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c)
 {
-	struct zoneset *default_zset = heap->rt->default_zset;
+	struct heap_rt *h = heap->rt;
+	size_t i;
+	struct arena *arena;
+
+	VEC_FOREACH_BY_POS(i, &h->arenas.vec) {
+		arena = VEC_ARR(&h->arenas.vec)[i];
+		if (arena->buckets[c->id] == NULL)
+			arena->buckets[c->id] = bucket_locked_new(
+				container_new_seglists(heap), c);
+		if (arena->buckets[c->id] == NULL)
+			goto error_cache_bucket_new;
+	}
 
-	if (default_zset->buckets[c->id] == NULL) {
-		default_zset->buckets[c->id] =
-		    bucket_locked_new(container_new_seglists(heap), c, default_zset);
-		if (default_zset->buckets[c->id] == NULL)
-			return -1;
+	return 0;
+
+error_cache_bucket_new:
+	for (; i != 0; --i)
+		bucket_locked_delete(
+			VEC_ARR(&h->arenas.vec)[i - 1]->buckets[c->id]);
+
+	return -1;
+}
+
+/*
+ * heap_buckets_init -- (internal) initializes bucket instances
+ */
+int
+heap_buckets_init(struct palloc_heap *heap)
+{
+	struct heap_rt *h = heap->rt;
+
+	for (uint8_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		struct alloc_class *c = alloc_class_by_id(h->alloc_classes, i);
+
+		if (c != NULL) {
+			if (heap_create_alloc_class_buckets(heap, c) != 0)
+				goto error_bucket_create;
+		}
 	}
 
+	h->default_bucket = bucket_locked_new(container_new_ravl(heap),
+		alloc_class_by_id(h->alloc_classes, DEFAULT_ALLOC_CLASS_ID));
+
+	if (h->default_bucket == NULL)
+		goto error_bucket_create;
+
 	return 0;
+
+error_bucket_create: {
+		struct arena *arena;
+
+		VEC_FOREACH(arena, &h->arenas.vec)
+			heap_arena_delete(arena);
+	}
+	return -1;
 }
 
 /*
@@ -999,12 +1566,25 @@ heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size,
 		goto error_heap_malloc;
 	}
 
+	err = arena_thread_assignment_init(&h->arenas.assignment,
+		Default_arenas_assignment_type);
+	if (err != 0)
+		goto error_assignment_init;
+
 	h->alloc_classes = alloc_class_collection_new();
 	if (h->alloc_classes == NULL) {
 		err = ENOMEM;
 		goto error_alloc_classes_new;
 	}
 
+	unsigned narenas_default = Default_arenas_max == 0 ?
+		heap_get_procs() : (unsigned)Default_arenas_max;
+
+	if (heap_arenas_init(&h->arenas) != 0) {
+		err = ENOMEM;
+		goto error_arenas_malloc;
+	}
+
 	h->nzones = heap_max_zone(heap_size);
 
 	h->zones_exhausted = 0;
@@ -1024,18 +1604,27 @@ heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size,
 	heap->alloc_pattern = PALLOC_CTL_DEBUG_NO_PATTERN;
 	VALGRIND_DO_CREATE_MEMPOOL(heap->layout, 0, 0);
 
-	if (heap_default_zoneset_init(heap) != 0) {
-		err = ENOMEM;
-		goto error_zoneset_init;
+	for (unsigned i = 0; i < narenas_default; ++i) {
+		if (VEC_PUSH_BACK(&h->arenas.vec, heap_arena_new(heap, 1))) {
+			err = ENOMEM;
+			goto error_vec_reserve;
+		}
 	}
 
+	for (unsigned i = 0; i < MAX_ALLOCATION_CLASSES; ++i)
+		h->recyclers[i] = NULL;
+
 	heap_zone_update_if_needed(heap);
 
 	return 0;
 
-error_zoneset_init:
+error_vec_reserve:
+	heap_arenas_fini(&h->arenas);
+error_arenas_malloc:
 	alloc_class_collection_delete(h->alloc_classes);
 error_alloc_classes_new:
+	arena_thread_assignment_fini(&h->arenas.assignment);
+error_assignment_init:
 	D_FREE(h);
 	heap->rt = NULL;
 error_heap_malloc:
@@ -1112,11 +1701,26 @@ heap_cleanup(struct palloc_heap *heap)
 
 	alloc_class_collection_delete(rt->alloc_classes);
 
-	heap_default_zoneset_cleanup(heap);
+	arena_thread_assignment_fini(&rt->arenas.assignment);
+	bucket_locked_delete(rt->default_bucket);
+
+	struct arena *arena;
+
+	VEC_FOREACH(arena, &rt->arenas.vec)
+		heap_arena_delete(arena);
 
 	for (unsigned i = 0; i < rt->nlocks; ++i)
 		util_mutex_destroy(&rt->run_locks[i]);
 
+	heap_arenas_fini(&rt->arenas);
+
+	for (int i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		if (heap->rt->recyclers[i] == NULL)
+			continue;
+
+		recycler_delete(rt->recyclers[i]);
+	}
+
 	VALGRIND_DO_DESTROY_MEMPOOL(heap->layout);
 
 	D_FREE(rt);
diff --git a/src/common/dav/heap.h b/src/common/dav/heap.h
index e1e205d076d..d3e2bba4cdf 100644
--- a/src/common/dav/heap.h
+++ b/src/common/dav/heap.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2015-2023, Intel Corporation */
+/* Copyright 2015-2021, Intel Corporation */
 
 /*
  * heap.h -- internal definitions for heap
@@ -18,62 +18,81 @@
 #include "os_thread.h"
 #include "dav_internal.h"
 
+extern enum dav_arenas_assignment_type Default_arenas_assignment_type;
+extern size_t Default_arenas_max;
+
 #define HEAP_OFF_TO_PTR(heap, off) ((void *)((char *)((heap)->base) + (off)))
-#define HEAP_PTR_TO_OFF(heap, ptr) ((uintptr_t)(ptr) - (uintptr_t)((heap)->base))
-
-#define BIT_IS_CLR(a, i)           (!((a) & (1ULL << (i))))
-#define HEAP_ARENA_PER_THREAD      (0)
-
-int
-heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, uint64_t *sizep,
-	  void *base, struct mo_ops *p_ops, struct stats *stats, struct pool_set *set);
-int
-heap_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops);
-void
-heap_cleanup(struct palloc_heap *heap);
-int
-heap_check(void *heap_start, uint64_t heap_size);
-int
-heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops);
-int
-heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c);
-int
-heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size);
+#define HEAP_PTR_TO_OFF(heap, ptr)\
+	((uintptr_t)(ptr) - (uintptr_t)((heap)->base))
+
+#define BIT_IS_CLR(a, i)	(!((a) & (1ULL << (i))))
+#define HEAP_ARENA_PER_THREAD (0)
+
+int heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size,
+	      uint64_t *sizep, void *base, struct mo_ops *p_ops,
+	      struct stats *stats, struct pool_set *set);
+int heap_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops);
+void heap_cleanup(struct palloc_heap *heap);
+int heap_check(void *heap_start, uint64_t heap_size);
+int heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops);
+int heap_buckets_init(struct palloc_heap *heap);
+int heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c);
+int heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size);
 
 struct alloc_class *
 heap_get_best_class(struct palloc_heap *heap, size_t size);
 
 struct bucket *
-zoneset_bucket_acquire(struct zoneset *zset, uint8_t class_id);
-void
-zoneset_bucket_release(struct bucket *b);
+heap_bucket_acquire(struct palloc_heap *heap, uint8_t class_id, uint16_t arena_id);
+void heap_bucket_release(struct bucket *b);
+
+int heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b,
+			   struct memory_block *m);
+os_mutex_t *heap_get_run_lock(struct palloc_heap *heap,
+			      uint32_t chunk_id);
+
+void heap_force_recycle(struct palloc_heap *heap);
+
+void heap_discard_run(struct palloc_heap *heap, struct memory_block *m);
+
+void heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m);
+
+int heap_free_chunk_reuse(struct palloc_heap *heap,
+			  struct bucket *bucket, struct memory_block *m);
+
+void heap_foreach_object(struct palloc_heap *heap, object_callback cb,
+			 void *arg, struct memory_block start);
 
-int
-heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, struct memory_block *m);
-os_mutex_t *
-heap_get_run_lock(struct palloc_heap *heap, uint32_t chunk_id);
+struct alloc_class_collection *heap_alloc_classes(struct palloc_heap *heap);
 
-void
-heap_discard_run(struct palloc_heap *heap, struct memory_block *m);
+void *heap_end(struct palloc_heap *heap);
 
-void
-heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m);
+unsigned heap_get_narenas_total(struct palloc_heap *heap);
 
-int
-heap_free_chunk_reuse(struct palloc_heap *heap, struct bucket *bucket, struct memory_block *m);
+unsigned heap_get_narenas_max(struct palloc_heap *heap);
 
-void
-heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg,
-		    struct memory_block start);
+int heap_set_narenas_max(struct palloc_heap *heap, unsigned size);
 
-struct alloc_class_collection *
-heap_alloc_classes(struct palloc_heap *heap);
+unsigned heap_get_narenas_auto(struct palloc_heap *heap);
 
-void *
-heap_end(struct palloc_heap *heap);
+unsigned heap_get_thread_arena_id(struct palloc_heap *heap);
 
-void
-heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int objects);
+int heap_arena_create(struct palloc_heap *heap);
+
+struct bucket_locked **
+heap_get_arena_buckets(struct palloc_heap *heap, unsigned arena_id);
+
+int heap_get_arena_auto(struct palloc_heap *heap, unsigned arena_id);
+
+int heap_set_arena_auto(struct palloc_heap *heap, unsigned arena_id,
+			int automatic);
+
+void heap_set_arena_thread(struct palloc_heap *heap, unsigned arena_id);
+
+unsigned heap_get_procs(void);
+
+void heap_vg_open(struct palloc_heap *heap, object_callback cb,
+		  void *arg, int objects);
 
 static inline struct chunk_header *
 heap_get_chunk_hdr(struct palloc_heap *heap, const struct memory_block *m)
@@ -93,7 +112,4 @@ heap_get_chunk_run(struct palloc_heap *heap, const struct memory_block *m)
 	return GET_CHUNK_RUN(heap->layout, m->zone_id, m->chunk_id);
 }
 
-struct zoneset *
-heap_get_zoneset(struct palloc_heap *heap, uint32_t zone_id);
-
 #endif /* __DAOS_COMMON_HEAP_H */
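
For orientation: the hunks above replace the zoneset-based helpers in the phase-1 header with the arena-based API. The sketch below is illustrative only and not part of the patch; it assumes the in-tree heap.h/memblock.h headers and a memory_block whose size_idx the caller has already set, and it elides error handling. The real call sites are in palloc.c.

#include "heap.h"
#include "memblock.h"

/*
 * Illustrative sketch: acquire a bucket for the calling thread's arena,
 * ask for a best-fit block, and release the bucket. A thread can be pinned
 * to an explicit arena beforehand with heap_set_arena_thread(heap, id).
 */
static int
example_bestfit_from_thread_arena(struct palloc_heap *heap, uint8_t class_id,
				  struct memory_block *m)
{
	/* HEAP_ARENA_PER_THREAD (0) selects the arena assigned to this thread */
	struct bucket *b = heap_bucket_acquire(heap, class_id,
					       HEAP_ARENA_PER_THREAD);
	int rc = heap_get_bestfit_block(heap, b, m);

	heap_bucket_release(b);
	return rc;
}
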
diff --git a/src/common/dav/obj.h b/src/common/dav/obj.h
index 470323da1ef..3140235d105 100644
--- a/src/common/dav/obj.h
+++ b/src/common/dav/obj.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2014-2023, Intel Corporation */
+/* Copyright 2014-2021, Intel Corporation */
 
 /*
  * obj.h -- internal definitions for obj module
@@ -45,6 +45,7 @@ typedef uint64_t type_num_t;
 #define CLASS_ID_FROM_FLAG(flag)\
 ((uint16_t)((flag) >> 48))
 
-#define EZONE_ID_FROM_FLAG(flag) ((uint32_t)((flag) >> 16))
+#define ARENA_ID_FROM_FLAG(flag)\
+((uint16_t)((flag) >> 32))
 
 #endif /* __DAOS_COMMON_OBJ_H */
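
The allocation flags word now carries an arena id in place of the evictable-zone id. Below is a small standalone sketch of the bit layout implied by CLASS_ID_FROM_FLAG and ARENA_ID_FROM_FLAG; the composed flags value and the low-order bit are hypothetical illustrations, not an API from the patch.

#include <assert.h>
#include <stdint.h>

/* copies of the extraction macros shown above */
#define CLASS_ID_FROM_FLAG(flag) ((uint16_t)((flag) >> 48))
#define ARENA_ID_FROM_FLAG(flag) ((uint16_t)((flag) >> 32))

int
main(void)
{
	uint16_t class_id = 7;
	uint16_t arena_id = 3;
	uint64_t low_bits = 0x1;	/* hypothetical low-order flag bits */

	/* class id occupies bits 48-63, arena id occupies bits 32-47 */
	uint64_t flags = ((uint64_t)class_id << 48) |
			 ((uint64_t)arena_id << 32) | low_bits;

	assert(CLASS_ID_FROM_FLAG(flags) == class_id);
	assert(ARENA_ID_FROM_FLAG(flags) == arena_id);
	return 0;
}
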
diff --git a/src/common/dav/palloc.c b/src/common/dav/palloc.c
index 255303de4a2..a7b5424576f 100644
--- a/src/common/dav/palloc.c
+++ b/src/common/dav/palloc.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2015-2023, Intel Corporation */
+/* Copyright 2015-2022, Intel Corporation */
 
 /*
  * palloc.c -- implementation of pmalloc POSIX-like API
@@ -178,13 +178,15 @@ alloc_prep_block(struct palloc_heap *heap, const struct memory_block *m,
  * (best-fit, next-fit, ...) varies depending on the bucket container.
  */
 static int
-palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr constructor,
-			  void *arg, uint64_t extra_field, uint16_t object_flags, uint16_t class_id,
-			  uint32_t zset_id, struct dav_action_internal *out)
+palloc_reservation_create(struct palloc_heap *heap, size_t size,
+	palloc_constr constructor, void *arg,
+	uint64_t extra_field, uint16_t object_flags,
+	uint16_t class_id, uint16_t arena_id,
+	struct dav_action_internal *out)
 {
-	int                  err       = 0;
+	int err = 0;
+
 	struct memory_block *new_block = &out->m;
-	struct zoneset      *zset;
 
 	out->type = DAV_ACTION_TYPE_HEAP;
 
@@ -200,12 +202,6 @@ palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr c
 		return -1;
 	}
 
-	zset = heap_get_zoneset(heap, zset_id);
-	if (zset == NULL) {
-		errno = EINVAL;
-		return -1;
-	}
-
 	/*
 	 * The caller provided size in bytes, but buckets operate in
 	 * 'size indexes' which are multiples of the block size in the
@@ -226,7 +222,7 @@ palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr c
 	*new_block = MEMORY_BLOCK_NONE;
 	new_block->size_idx = (uint32_t)size_idx;
 
-	struct bucket *b = zoneset_bucket_acquire(zset, c->id);
+	struct bucket *b = heap_bucket_acquire(heap, c->id, arena_id);
 
 	err = heap_get_bestfit_block(heap, b, new_block);
 	if (err != 0)
@@ -258,7 +254,7 @@ palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr c
 	out->new_state = MEMBLOCK_ALLOCATED;
 
 out:
-	zoneset_bucket_release(b);
+	heap_bucket_release(b);
 
 	if (err == 0)
 		return 0;
@@ -300,17 +296,17 @@ static void
 palloc_restore_free_chunk_state(struct palloc_heap *heap,
 	struct memory_block *m)
 {
-	struct zoneset *zset = heap_get_zoneset(heap, m->zone_id);
-
 	if (m->type == MEMORY_BLOCK_HUGE) {
-		struct bucket *b = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID);
+		struct bucket *b = heap_bucket_acquire(heap,
+			DEFAULT_ALLOC_CLASS_ID,
+			HEAP_ARENA_PER_THREAD);
 		if (heap_free_chunk_reuse(heap, b, m) != 0) {
 			if (errno == EEXIST)
 				FATAL("duplicate runtime chunk state, possible double free");
 			else
 				D_CRIT("unable to track runtime chunk state\n");
 		}
-		zoneset_bucket_release(b);
+		heap_bucket_release(b);
 	}
 }
 
@@ -577,15 +573,18 @@ palloc_exec_actions(struct palloc_heap *heap,
  * palloc_reserve -- creates a single reservation
  */
 int
-palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg,
-	       uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id,
-	       struct dav_action *act)
+palloc_reserve(struct palloc_heap *heap, size_t size,
+	palloc_constr constructor, void *arg,
+	uint64_t extra_field, uint16_t object_flags,
+	uint16_t class_id, uint16_t arena_id,
+	struct dav_action *act)
 {
 	COMPILE_ERROR_ON(sizeof(struct dav_action) !=
 		sizeof(struct dav_action_internal));
 
-	return palloc_reservation_create(heap, size, constructor, arg, extra_field, object_flags,
-					 class_id, zset_id, (struct dav_action_internal *)act);
+	return palloc_reservation_create(heap, size, constructor, arg,
+		extra_field, object_flags, class_id, arena_id,
+		(struct dav_action_internal *)act);
 }
 
 /*
@@ -729,7 +728,7 @@ palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt
 int
 palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size,
 		 palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags,
-		 uint16_t class_id, uint32_t zset_id, struct operation_context *ctx)
+		 uint16_t class_id, uint16_t arena_id, struct operation_context *ctx)
 {
 	size_t user_size = 0;
 
@@ -759,8 +758,9 @@ palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, siz
 	/* alloc or realloc */
 	if (size != 0) {
 		alloc = &ops[nops++];
-		if (palloc_reservation_create(heap, size, constructor, arg, extra_field,
-					      object_flags, class_id, zset_id, alloc) != 0) {
+		if (palloc_reservation_create(heap, size, constructor, arg,
+			extra_field, object_flags,
+			class_id, arena_id, alloc) != 0) {
 			operation_cancel(ctx);
 			return -1;
 		}
@@ -907,6 +907,15 @@ palloc_boot(struct palloc_heap *heap, void *heap_start,
 		base, p_ops, stats, set);
 }
 
+/*
+ * palloc_buckets_init -- initializes the buckets
+ */
+int
+palloc_buckets_init(struct palloc_heap *heap)
+{
+	return heap_buckets_init(heap);
+}
+
 /*
  * palloc_init -- initializes palloc heap
  */
@@ -944,6 +953,15 @@ palloc_heap_check_remote(void *heap_start, uint64_t heap_size,
 	return heap_check_remote(heap_start, heap_size, ops);
 }
 
+/*
+ * palloc_heap_cleanup -- cleans up the volatile heap state
+ */
+void
+palloc_heap_cleanup(struct palloc_heap *heap)
+{
+	heap_cleanup(heap);
+}
+
 #if VG_MEMCHECK_ENABLED
 /*
  * palloc_vg_register_alloc -- (internal) registers allocation header
diff --git a/src/common/dav/palloc.h b/src/common/dav/palloc.h
index 8c630e999e6..9c7560f1aaa 100644
--- a/src/common/dav/palloc.h
+++ b/src/common/dav/palloc.h
@@ -20,86 +20,85 @@
 #define PALLOC_CTL_DEBUG_NO_PATTERN (-1)
 
 struct palloc_heap {
-	struct mo_ops       p_ops;
+	struct mo_ops p_ops;
 	struct heap_layout *layout;
-	struct heap_rt     *rt;
-	uint64_t           *sizep;
-	uint64_t            growsize;
-	struct stats       *stats;
-	struct pool_set    *set;
-	void               *base;
-	int                 alloc_pattern;
+	struct heap_rt *rt;
+	uint64_t *sizep;
+	uint64_t growsize;
+
+	struct stats *stats;
+	struct pool_set *set;
+
+	void *base;
+
+	int alloc_pattern;
 };
 
 struct memory_block;
-struct zoneset;
 
-typedef int (*palloc_constr)(void *base, void *ptr, size_t usable_size, void *arg);
+typedef int (*palloc_constr)(void *base, void *ptr,
+		size_t usable_size, void *arg);
 
-int
-palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size,
-		 palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags,
-		 uint16_t class_id, uint32_t zset_id, struct operation_context *ctx);
+int palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off,
+	size_t size, palloc_constr constructor, void *arg,
+	uint64_t extra_field, uint16_t object_flags,
+	uint16_t class_id, uint16_t arena_id,
+	struct operation_context *ctx);
 
 int
-palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg,
-	       uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id,
+palloc_reserve(struct palloc_heap *heap, size_t size,
+	       palloc_constr constructor, void *arg,
+	       uint64_t extra_field, uint16_t object_flags,
+	       uint16_t class_id, uint16_t arena_id,
 	       struct dav_action *act);
 
-int
-palloc_action_isalloc(struct dav_action *act);
-void
-palloc_get_prange(struct dav_action *act, uint64_t *const off, uint64_t *const size,
-		  int persist_udata);
-uint64_t
-palloc_get_realoffset(struct palloc_heap *heap, uint64_t off);
+int palloc_action_isalloc(struct dav_action *act);
+void palloc_get_prange(struct dav_action *act, uint64_t *const off, uint64_t *const size,
+		       int persist_udata);
+uint64_t palloc_get_realoffset(struct palloc_heap *heap, uint64_t off);
 
 void
-palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act);
+palloc_defer_free(struct palloc_heap *heap, uint64_t off,
+	struct dav_action *act);
 
 void
-palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt);
+palloc_cancel(struct palloc_heap *heap,
+	struct dav_action *actv, size_t actvcnt);
 
 void
-palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt,
-	       struct operation_context *ctx);
+palloc_publish(struct palloc_heap *heap,
+	struct dav_action *actv, size_t actvcnt,
+	struct operation_context *ctx);
 
 void
-palloc_set_value(struct palloc_heap *heap, struct dav_action *act, uint64_t *ptr, uint64_t value);
+palloc_set_value(struct palloc_heap *heap, struct dav_action *act,
+	uint64_t *ptr, uint64_t value);
 
-uint64_t
-palloc_first(struct palloc_heap *heap);
-uint64_t
-palloc_next(struct palloc_heap *heap, uint64_t off);
+uint64_t palloc_first(struct palloc_heap *heap);
+uint64_t palloc_next(struct palloc_heap *heap, uint64_t off);
 
-size_t
-palloc_usable_size(struct palloc_heap *heap, uint64_t off);
-uint64_t
-palloc_extra(struct palloc_heap *heap, uint64_t off);
-uint16_t
-palloc_flags(struct palloc_heap *heap, uint64_t off);
+size_t palloc_usable_size(struct palloc_heap *heap, uint64_t off);
+uint64_t palloc_extra(struct palloc_heap *heap, uint64_t off);
+uint16_t palloc_flags(struct palloc_heap *heap, uint64_t off);
 
-int
-palloc_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, uint64_t *sizep,
-	    void *base, struct mo_ops *p_ops, struct stats *stats, struct pool_set *set);
+int palloc_boot(struct palloc_heap *heap, void *heap_start,
+		uint64_t heap_size, uint64_t *sizep,
+		void *base, struct mo_ops *p_ops,
+		struct stats *stats, struct pool_set *set);
 
-int
-palloc_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops);
-void *
-palloc_heap_end(struct palloc_heap *h);
-int
-palloc_heap_check(void *heap_start, uint64_t heap_size);
-int
-palloc_heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops);
-size_t
-palloc_heap(void *heap_start);
+int palloc_buckets_init(struct palloc_heap *heap);
+int palloc_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops);
+void *palloc_heap_end(struct palloc_heap *h);
+int palloc_heap_check(void *heap_start, uint64_t heap_size);
+int palloc_heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops);
+void palloc_heap_cleanup(struct palloc_heap *heap);
+size_t palloc_heap(void *heap_start);
 
 /* foreach callback, terminates iteration if return value is non-zero */
 typedef int (*object_callback)(const struct memory_block *m, void *arg);
 
 #if VG_MEMCHECK_ENABLED
-void
-palloc_heap_vg_open(struct palloc_heap *heap, int objects);
+void palloc_heap_vg_open(struct palloc_heap *heap, int objects);
 #endif
 
 #endif /* __DAOS_COMMON_PALLOC_H */
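
Since palloc_reserve() and palloc_operation() now take an arena id, here is a hedged sketch of the reserve/publish/cancel lifecycle. It is illustrative only and assumes the in-tree dav.h/palloc.h headers; passing 0 for class_id and arena_id mirrors the dav_reserve() call in tx.c, and the operation_context setup is elided.

#include "dav.h"
#include "palloc.h"

static int
example_reserve_then_publish(struct palloc_heap *heap, size_t size,
			     uint64_t type_num, struct operation_context *ctx)
{
	struct dav_action act;

	/* constructor/arg are NULL, extra_field carries the type number */
	if (palloc_reserve(heap, size, NULL, NULL, type_num, 0, 0, 0, &act) != 0)
		return -1;

	if (ctx == NULL) {
		/* nowhere to persist the action; undo the reservation */
		palloc_cancel(heap, &act, 1);
		return -1;
	}

	palloc_publish(heap, &act, 1, ctx);
	return 0;
}
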
diff --git a/src/common/dav/recycler.c b/src/common/dav/recycler.c
index be26d9d7114..07537a44bd4 100644
--- a/src/common/dav/recycler.c
+++ b/src/common/dav/recycler.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2016-2023, Intel Corporation */
+/* Copyright 2016-2022, Intel Corporation */
 
 /*
  * recycler.c -- implementation of run recycler
@@ -49,7 +49,6 @@ recycler_element_cmp(const void *lhs, const void *rhs)
 struct recycler {
 	struct ravl *runs;
 	struct palloc_heap *heap;
-	struct zoneset     *zset;
 
 	/*
 	 * How many unaccounted units there *might* be inside of the memory
@@ -62,7 +61,8 @@ struct recycler {
 	 */
 	size_t unaccounted_units[MAX_CHUNK];
 	size_t unaccounted_total;
-	size_t              nallocs;
+	size_t nallocs;
+	size_t *peak_arenas;
 
 	VEC(, struct recycler_element) recalc;
 
@@ -73,7 +73,7 @@ struct recycler {
  * recycler_new -- creates new recycler instance
  */
 struct recycler *
-recycler_new(struct palloc_heap *heap, size_t nallocs, struct zoneset *zset)
+recycler_new(struct palloc_heap *heap, size_t nallocs, size_t *peak_arenas)
 {
 	struct recycler *r;
 
@@ -88,7 +88,7 @@ recycler_new(struct palloc_heap *heap, size_t nallocs, struct zoneset *zset)
 
 	r->heap = heap;
 	r->nallocs = nallocs;
-	r->zset              = zset;
+	r->peak_arenas = peak_arenas;
 	r->unaccounted_total = 0;
 	memset(&r->unaccounted_units, 0, sizeof(r->unaccounted_units));
 
@@ -219,7 +219,12 @@ recycler_recalc(struct recycler *r, int force)
 
 	uint64_t units = r->unaccounted_total;
 
-	uint64_t recalc_threshold = THRESHOLD_MUL * r->nallocs;
+	size_t peak_arenas;
+
+	util_atomic_load64(r->peak_arenas, &peak_arenas);
+
+	uint64_t recalc_threshold =
+		THRESHOLD_MUL * peak_arenas * r->nallocs;
 
 	if (!force && units < recalc_threshold)
 		return runs;
@@ -313,12 +318,3 @@ recycler_inc_unaccounted(struct recycler *r, const struct memory_block *m)
 	util_fetch_and_add64(&r->unaccounted_units[m->chunk_id],
 		m->size_idx);
 }
-
-/*
- * Return the zoneset associated with the recycler.
- */
-struct zoneset *
-recycler_get_zoneset(struct recycler *r)
-{
-	return r->zset;
-}
diff --git a/src/common/dav/recycler.h b/src/common/dav/recycler.h
index 7904289937d..2d68d8d70fc 100644
--- a/src/common/dav/recycler.h
+++ b/src/common/dav/recycler.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2016-2023, Intel Corporation */
+/* Copyright 2016-2021, Intel Corporation */
 
 /*
  * recycler.h -- internal definitions of run recycler
@@ -25,8 +25,8 @@ struct recycler_element {
 	uint32_t zone_id;
 };
 
-struct recycler      *
-recycler_new(struct palloc_heap *layout, size_t nallocs, struct zoneset *zset);
+struct recycler *recycler_new(struct palloc_heap *layout,
+	size_t nallocs, size_t *peak_arenas);
 void recycler_delete(struct recycler *r);
 struct recycler_element recycler_element_new(struct palloc_heap *heap,
 	const struct memory_block *m);
@@ -40,7 +40,4 @@ struct empty_runs recycler_recalc(struct recycler *r, int force);
 void recycler_inc_unaccounted(struct recycler *r,
 	const struct memory_block *m);
 
-struct zoneset *
-recycler_get_zoneset(struct recycler *r);
-
 #endif /* __DAOS_COMMON_RECYCLER_H */
diff --git a/src/common/dav/tx.c b/src/common/dav/tx.c
index c3bef536451..189dd073036 100644
--- a/src/common/dav/tx.c
+++ b/src/common/dav/tx.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause */
-/* Copyright 2015-2023, Intel Corporation */
+/* Copyright 2015-2022, Intel Corporation */
 
 /*
  * tx.c -- transactions implementation
@@ -354,8 +354,8 @@ tx_alloc_common(struct tx *tx, size_t size, type_num_t type_num,
 		return obj_tx_fail_null(ENOMEM, args.flags);
 
 	if (palloc_reserve(pop->do_heap, size, constructor, &args, type_num, 0,
-			   CLASS_ID_FROM_FLAG(args.flags), EZONE_ID_FROM_FLAG(args.flags),
-			   action) != 0)
+		CLASS_ID_FROM_FLAG(args.flags),
+		ARENA_ID_FROM_FLAG(args.flags), action) != 0)
 		goto err_oom;
 
 	palloc_get_prange(action, &off, &size, 1);
@@ -1265,7 +1265,68 @@ dav_tx_xadd_range(uint64_t hoff, size_t size, uint64_t flags)
  * dav_tx_alloc -- allocates a new object
  */
 uint64_t
-dav_tx_alloc(size_t size, uint64_t type_num, uint64_t flags)
+dav_tx_alloc(size_t size, uint64_t type_num)
+{
+	uint64_t off;
+
+	DAV_API_START();
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	uint64_t flags = tx_abort_on_failure_flag(tx);
+
+	if (size == 0) {
+		ERR("allocation with size 0");
+		off = obj_tx_fail_null(EINVAL, flags);
+		DAV_API_END();
+		return off;
+	}
+
+	off = tx_alloc_common(tx, size, (type_num_t)type_num,
+			constructor_tx_alloc, ALLOC_ARGS(flags));
+
+	DAV_API_END();
+	return off;
+}
+
+/*
+ * dav_tx_zalloc -- allocates a new zeroed object
+ */
+uint64_t
+dav_tx_zalloc(size_t size, uint64_t type_num)
+{
+	uint64_t off;
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	uint64_t flags = DAV_FLAG_ZERO;
+
+	flags |= tx_abort_on_failure_flag(tx);
+
+	DAV_API_START();
+	if (size == 0) {
+		ERR("allocation with size 0");
+		off = obj_tx_fail_null(EINVAL, flags);
+		DAV_API_END();
+		return off;
+	}
+
+	off = tx_alloc_common(tx, size, (type_num_t)type_num,
+			constructor_tx_alloc, ALLOC_ARGS(flags));
+
+	DAV_API_END();
+	return off;
+}
+
+/*
+ * dav_tx_xalloc -- allocates a new object
+ */
+uint64_t
+dav_tx_xalloc(size_t size, uint64_t type_num, uint64_t flags)
 {
 	uint64_t off;
 	struct tx *tx = get_tx();
@@ -1424,6 +1485,121 @@ dav_tx_off2ptr(uint64_t off)
 	return (void *)OBJ_OFF_TO_PTR(tx->pop, off);
 }
 
+/*
+ * dav_reserve -- reserves a single object
+ */
+uint64_t
+dav_reserve(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num)
+{
+	DAV_DBG("pop %p act %p size %zu type_num %llx",
+		pop, act, size,
+		(unsigned long long)type_num);
+
+	DAV_API_START();
+	if (pop->do_utx == NULL && dav_umem_wtx_new(pop) == NULL)
+		return 0;
+
+	if (palloc_reserve(pop->do_heap, size, NULL, NULL, type_num,
+		0, 0, 0, act) != 0) {
+		DAV_API_END();
+		return 0;
+	}
+
+	DAV_API_END();
+	return act->heap.offset;
+}
+
+/*
+ * dav_defer_free -- creates a deferred free action
+ */
+void
+dav_defer_free(dav_obj_t *pop, uint64_t off, struct dav_action *act)
+{
+	ASSERT(off != 0);
+	ASSERT(OBJ_OFF_IS_VALID(pop, off));
+	palloc_defer_free(pop->do_heap, off, act);
+}
+
+#if	0
+/*
+ * dav_publish -- publishes a collection of actions
+ */
+int
+dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt)
+{
+	DAV_API_START();
+	struct operation_context *ctx = pmalloc_operation_hold(pop);
+
+	size_t entries_size = actvcnt * sizeof(struct ulog_entry_val);
+
+	if (operation_reserve(ctx, entries_size) != 0) {
+		DAV_API_END();
+		return -1;
+	}
+
+	palloc_publish(&pop->do_heap, actv, actvcnt, ctx);
+
+	pmalloc_operation_release(pop);
+
+	DAV_API_END();
+	return 0;
+}
+#endif
+
+/*
+ * dav_cancel -- cancels collection of actions
+ */
+void
+dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt)
+{
+	DAV_DBG("actvcnt=%zu", actvcnt);
+	DAV_API_START();
+	palloc_cancel(pop->do_heap, actv, actvcnt);
+	DAV_API_END();
+}
+
+
+/*
+ * dav_tx_publish -- publishes actions inside of a transaction,
+ * with no_abort option
+ */
+int
+dav_tx_publish(struct dav_action *actv, size_t actvcnt)
+{
+	struct tx *tx = get_tx();
+	uint64_t flags = 0;
+	uint64_t off, size;
+	int ret;
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	flags |= tx_abort_on_failure_flag(tx);
+
+	DAV_API_START();
+
+	if (tx_action_reserve(tx, actvcnt) != 0) {
+		ret = obj_tx_fail_err(ENOMEM, flags);
+
+		DAV_API_END();
+		return ret;
+	}
+
+	for (size_t i = 0; i < actvcnt; ++i) {
+		VEC_PUSH_BACK(&tx->actions, actv[i]);
+		if (palloc_action_isalloc(&actv[i])) {
+			palloc_get_prange(&actv[i], &off, &size, 1);
+			struct tx_range_def r = {off, size, DAV_XADD_NO_SNAPSHOT|DAV_XADD_WAL_CPTR};
+
+			ret = dav_tx_add_common(tx, &r);
+			D_ASSERT(ret == 0);
+		}
+	}
+
+	DAV_API_END();
+	return 0;
+}
+
 /* arguments for constructor_alloc */
 struct constr_args {
 	int zero_init;
@@ -1431,6 +1607,7 @@ struct constr_args {
 	void *arg;
 };
 
+
 /* arguments for constructor_alloc_root */
 struct carg_root {
 	size_t size;
@@ -1608,8 +1785,10 @@ obj_alloc_construct(dav_obj_t *pop, uint64_t *offp, size_t size,
 	ctx = pop->external;
 	operation_start(ctx);
 
-	int ret = palloc_operation(pop->do_heap, 0, offp, size, constructor_alloc, &carg, type_num,
-				   0, CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), ctx);
+	int ret = palloc_operation(pop->do_heap, 0, offp, size,
+			constructor_alloc, &carg, type_num, 0,
+			CLASS_ID_FROM_FLAG(flags), ARENA_ID_FROM_FLAG(flags),
+			ctx);
 
 	lw_tx_end(pop, NULL);
 	return ret;
@@ -1619,12 +1798,11 @@ obj_alloc_construct(dav_obj_t *pop, uint64_t *offp, size_t size,
  * dav_alloc -- allocates a new object
  */
 int
-dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags,
-	   dav_constr constructor, void *arg)
+dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size,
+	  uint64_t type_num, dav_constr constructor, void *arg)
 {
-	DAV_DBG(3, "pop %p offp %p size %zu type_num %llx flags %llx constructor %p arg %p", pop,
-		offp, size, (unsigned long long)type_num, (unsigned long long)flags, constructor,
-		arg);
+	DAV_DBG("pop %p offp %p size %zu type_num %llx constructor %p arg %p",
+		pop, offp, size, (unsigned long long)type_num, constructor, arg);
 
 	if (size == 0) {
 		ERR("allocation with size 0");
@@ -1632,14 +1810,15 @@ dav_alloc(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64
 		return -1;
 	}
 
-	if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) {
-		ERR("unknown flags 0x%" PRIx64, flags & ~DAV_TX_XALLOC_VALID_FLAGS);
+	if (offp == NULL) {
+		ERR("allocation offp is NULL");
 		errno = EINVAL;
 		return -1;
 	}
 
 	DAV_API_START();
-	int ret = obj_alloc_construct(pop, offp, size, type_num, flags, constructor, arg);
+	int ret = obj_alloc_construct(pop, offp, size, type_num,
+			0, constructor, arg);
 
 	DAV_API_END();
 	return ret;
@@ -1710,146 +1889,3 @@ dav_memcpy_persist_relaxed(dav_obj_t *pop, void *dest, const void *src,
 	DAV_API_END();
 	return ptr;
 }
-
-/*
- * dav_reserve -- reserves a single object
- */
-uint64_t
-dav_reserve(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num, uint64_t flags)
-{
-	struct constr_args carg;
-
-	DAV_DBG(3, "pop %p act %p size %zu type_num %llx flags %llx", pop, act, size,
-		(unsigned long long)type_num, (unsigned long long)flags);
-
-	if (flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS) {
-		ERR("unknown flags 0x%" PRIx64, flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS);
-		errno = EINVAL;
-		return 0;
-	}
-
-	DAV_API_START();
-
-	if (pop->do_utx == NULL && dav_umem_wtx_new(pop) == NULL)
-		return 0;
-
-	carg.zero_init   = flags & DAV_FLAG_ZERO;
-	carg.constructor = NULL;
-	carg.arg         = NULL;
-
-	if (palloc_reserve(pop->do_heap, size, constructor_alloc, &carg, type_num, 0,
-			   CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), act) != 0) {
-		DAV_API_END();
-		return 0;
-	}
-
-	DAV_API_END();
-	return act->heap.offset;
-}
-
-/*
- * dav_defer_free -- creates a deferred free action
- */
-void
-dav_defer_free(dav_obj_t *pop, uint64_t off, struct dav_action *act)
-{
-	ASSERT(off != 0);
-	ASSERT(OBJ_OFF_IS_VALID(pop, off));
-	palloc_defer_free(pop->do_heap, off, act);
-}
-
-#if 0
-/*
- * dav_publish -- publishes a collection of actions
- */
-int
-dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt)
-{
-	DAV_API_START();
-	struct operation_context *ctx = pmalloc_operation_hold(pop);
-
-	size_t entries_size = actvcnt * sizeof(struct ulog_entry_val);
-
-	if (operation_reserve(ctx, entries_size) != 0) {
-		DAV_API_END();
-		return -1;
-	}
-
-	palloc_publish(&pop->do_heap, actv, actvcnt, ctx);
-
-	pmalloc_operation_release(pop);
-
-	DAV_API_END();
-	return 0;
-}
-#endif
-
-/*
- * dav_cancel -- cancels collection of actions
- */
-void
-dav_cancel(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt)
-{
-	DAV_DBG("actvcnt=%zu", actvcnt);
-	DAV_API_START();
-	palloc_cancel(pop->do_heap, actv, actvcnt);
-	DAV_API_END();
-}
-
-/*
- * dav_tx_publish -- publishes actions inside of a transaction,
- * with no_abort option
- */
-int
-dav_tx_publish(struct dav_action *actv, size_t actvcnt)
-{
-	struct tx *tx    = get_tx();
-	uint64_t   flags = 0;
-	uint64_t   off, size;
-	int        ret;
-
-	ASSERT_IN_TX(tx);
-	ASSERT_TX_STAGE_WORK(tx);
-
-	flags |= tx_abort_on_failure_flag(tx);
-
-	DAV_API_START();
-
-	if (tx_action_reserve(tx, actvcnt) != 0) {
-		ret = obj_tx_fail_err(ENOMEM, flags);
-
-		DAV_API_END();
-		return ret;
-	}
-
-	for (size_t i = 0; i < actvcnt; ++i) {
-		VEC_PUSH_BACK(&tx->actions, actv[i]);
-		if (palloc_action_isalloc(&actv[i])) {
-			palloc_get_prange(&actv[i], &off, &size, 1);
-			struct tx_range_def r = {off, size,
-						 DAV_XADD_NO_SNAPSHOT | DAV_XADD_WAL_CPTR};
-
-			ret = dav_tx_add_common(tx, &r);
-			D_ASSERT(ret == 0);
-		}
-	}
-
-	DAV_API_END();
-	return 0;
-}
-
-/*
- * dav_get_zone_evictable -- Returns an evictable zone id that can be used for
- * allocations. If there are no evictable zone with sufficient free space then
- * zero is returned which maps to non-evictable zone.
- */
-uint32_t
-dav_get_zone_evictable(dav_obj_t *pop, int flags)
-{
-	D_ASSERT(flags == 0);
-	/* REVISIT: TBD
-	 * Return evictable zone that is currently marked as in-use and has sufficient free space.
-	 * Else, find an evictable zone that has more that x% of free memory and mark it as in-use.
-	 */
-	return 0;
-}
diff --git a/src/common/dav_v2/README.md b/src/common/dav_v2/README.md
new file mode 100644
index 00000000000..42616df7e6a
--- /dev/null
+++ b/src/common/dav_v2/README.md
@@ -0,0 +1,6 @@
+# DAOS Allocator for VOS
+
+The DAV allocator for md_on_ssd phase 2 now supports evictable zones. This introduces a change in
+the heap layout and is not compatible with the DAV allocator of phase 1. In order to support both
+layouts, the new allocator is packaged as a separate library and linked to the daos_common_pmem
+library.
diff --git a/src/common/dav_v2/SConscript b/src/common/dav_v2/SConscript
new file mode 100644
index 00000000000..8fd6c05ecd0
--- /dev/null
+++ b/src/common/dav_v2/SConscript
@@ -0,0 +1,30 @@
+"""Build dav_v2 libraries"""
+
+
+SRC = ['alloc_class.c', 'bucket.c', 'container_ravl.c', 'container_seglists.c', 'critnib.c',
+       'dav_clogs.c', 'dav_iface.c', 'heap.c', 'memblock.c', 'memops.c', 'palloc.c', 'ravl.c',
+       'ravl_interval.c', 'recycler.c', 'stats.c', 'tx.c', 'ulog.c', 'util.c', 'wal_tx.c']
+
+
+def scons():
+    """Scons function"""
+
+    Import('env', 'base_env')
+
+    env.AppendUnique(LIBPATH=[Dir('.')])
+    base_env.AppendUnique(LIBPATH=[Dir('.')])
+    base_env.d_add_build_rpath()
+    env.d_add_build_rpath()
+
+    denv = env.Clone()
+
+    denv.AppendUnique(LIBS=['pthread', 'gurt'])
+    denv.Append(CPPDEFINES=['-DDAOS_PMEM_BUILD', '-DDAV_V2_BUILD'])
+    denv.AppendUnique(CFLAGS=['-fvisibility=hidden'])
+
+    dav_v2 = denv.d_library('dav_v2', SRC)
+    denv.Install('$PREFIX/lib64/', dav_v2)
+
+
+if __name__ == "SCons.Script":
+    scons()
diff --git a/src/common/dav_v2/alloc_class.c b/src/common/dav_v2/alloc_class.c
new file mode 100644
index 00000000000..3dc5745db6a
--- /dev/null
+++ b/src/common/dav_v2/alloc_class.c
@@ -0,0 +1,647 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2022, Intel Corporation */
+
+/*
+ * alloc_class.c -- implementation of allocation classes
+ */
+
+#include <float.h>
+#include <string.h>
+
+#include "alloc_class.h"
+#include "heap_layout.h"
+#include "util.h"
+#include "out.h"
+#include "bucket.h"
+#include "critnib.h"
+
+#define RUN_CLASS_KEY_PACK(map_idx_s, flags_s, size_idx_s)\
+((uint64_t)(map_idx_s) << 32 |\
+(uint64_t)(flags_s) << 16 |\
+(uint64_t)(size_idx_s))
+
+/*
+ * Value used to mark a reserved spot in the bucket array.
+ */
+#define ACLASS_RESERVED ((void *)0xFFFFFFFFULL)
+
+/*
+ * The last size that is handled by runs.
+ */
+#define MAX_RUN_SIZE (CHUNKSIZE * 10)
+
+/*
+ * Maximum number of bytes the allocation class generation algorithm can decide
+ * to waste in a single run chunk.
+ */
+#define MAX_RUN_WASTED_BYTES 1024
+
+/*
+ * Allocation categories are used for allocation class generation. Each one
+ * defines the biggest handled size (in bytes) and the step percentage of the
+ * generation process. The step percentage defines the maximum allowed external
+ * fragmentation for the category.
+ */
+#define MAX_ALLOC_CATEGORIES 9
+
+/*
+ * The first size (in bytes) which is actually used in the allocation
+ * class generation algorithm. All smaller sizes use the first predefined bucket
+ * with the smallest run unit size.
+ */
+#define FIRST_GENERATED_CLASS_SIZE 128
+
+/*
+ * The granularity of the allocation class generation algorithm.
+ */
+#define ALLOC_BLOCK_SIZE_GEN 64
+
+/*
+ * The first predefined allocation class size
+ */
+#define MIN_UNIT_SIZE 128
+
+static const struct {
+	size_t size;
+	float step;
+} categories[MAX_ALLOC_CATEGORIES] = {
+	/* dummy category - the first allocation class is predefined */
+	{FIRST_GENERATED_CLASS_SIZE, 0.05f},
+	{1024, 0.05f},
+	{2048, 0.05f},
+	{4096, 0.05f},
+	{8192, 0.05f},
+	{16384, 0.05f},
+	{32768, 0.05f},
+	{131072, 0.05f},
+	{393216, 0.05f},
+};
+
+#define RUN_UNIT_MAX_ALLOC 8U
+
+/*
+ * Every allocation has to be a multiple of at least 8 because we need to
+ * ensure proper alignment of every persistent structure.
+ */
+#define ALLOC_BLOCK_SIZE 16
+
+/*
+ * Converts size (in bytes) to number of allocation blocks.
+ */
+#define SIZE_TO_CLASS_MAP_INDEX(_s, _g) (1 + (((_s) - 1) / (_g)))
+
+/*
+ * Target number of allocations per run instance.
+ */
+#define RUN_MIN_NALLOCS 200
+
+/*
+ * Hard limit of chunks per single run.
+ */
+#define RUN_SIZE_IDX_CAP (16)
+
+#define ALLOC_CLASS_DEFAULT_FLAGS CHUNK_FLAG_FLEX_BITMAP
+
+struct alloc_class_collection {
+	size_t granularity;
+
+	struct alloc_class *aclasses[MAX_ALLOCATION_CLASSES];
+
+	/*
+	 * The last size (in bytes) that is handled by runs, everything bigger
+	 * uses the default class.
+	 */
+	size_t last_run_max_size;
+
+	/* maps allocation sizes (excluding the header) to allocation class ids */
+	uint8_t *class_map_by_alloc_size;
+
+	/* maps packed run unit sizes to allocation classes */
+	struct critnib *class_map_by_unit_size;
+
+	int fail_on_missing_class;
+	int autogenerate_on_missing_class;
+};
+
+/*
+ * alloc_class_find_first_free_slot -- searches for the
+ *	first available allocation class slot
+ *
+ * This function must be thread-safe because allocation classes can be created
+ * at runtime.
+ */
+int
+alloc_class_find_first_free_slot(struct alloc_class_collection *ac,
+	uint8_t *slot)
+{
+	for (int n = 0; n < MAX_ALLOCATION_CLASSES; ++n) {
+		if (util_bool_compare_and_swap64(&ac->aclasses[n],
+				NULL, ACLASS_RESERVED)) {
+			*slot = (uint8_t)n;
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * alloc_class_reserve -- reserve the specified class id
+ */
+int
+alloc_class_reserve(struct alloc_class_collection *ac, uint8_t id)
+{
+	return util_bool_compare_and_swap64(&ac->aclasses[id],
+			NULL, ACLASS_RESERVED) ? 0 : -1;
+}
+
+/*
+ * alloc_class_reservation_clear -- removes the reservation on class id
+ */
+static void
+alloc_class_reservation_clear(struct alloc_class_collection *ac, int id)
+{
+	int ret = util_bool_compare_and_swap64(&ac->aclasses[id],
+		ACLASS_RESERVED, NULL);
+	ASSERT(ret);
+}
+
+/*
+ * alloc_class_new -- creates a new allocation class
+ */
+struct alloc_class *
+alloc_class_new(int id, struct alloc_class_collection *ac,
+	enum alloc_class_type type, enum header_type htype,
+	size_t unit_size, size_t alignment,
+	uint32_t size_idx)
+{
+	DAV_DBG("alloc_class_new id:%d\n",
+		  (type == CLASS_HUGE) ? DEFAULT_ALLOC_CLASS_ID : id);
+
+	struct alloc_class *c;
+
+	D_ALLOC_PTR_NZ(c);
+
+	if (c == NULL)
+		goto error_class_alloc;
+
+	c->unit_size = unit_size;
+	c->header_type = htype;
+	c->type = type;
+	c->flags = (uint16_t)
+		(header_type_to_flag[c->header_type] |
+		(alignment ? CHUNK_FLAG_ALIGNED : 0)) |
+		ALLOC_CLASS_DEFAULT_FLAGS;
+
+	switch (type) {
+	case CLASS_HUGE:
+		id = DEFAULT_ALLOC_CLASS_ID;
+		break;
+	case CLASS_RUN:
+		c->rdsc.alignment = alignment;
+		memblock_run_bitmap(&size_idx, c->flags, unit_size,
+			alignment, NULL, &c->rdsc.bitmap);
+		c->rdsc.nallocs = c->rdsc.bitmap.nbits;
+		c->rdsc.size_idx = size_idx;
+
+		/* these two fields are duplicated from class */
+		c->rdsc.unit_size = c->unit_size;
+		c->rdsc.flags = c->flags;
+
+		uint8_t slot = (uint8_t)id;
+
+		if (id < 0 && alloc_class_find_first_free_slot(ac,
+				&slot) != 0)
+			goto error_map_insert;
+		id = slot;
+
+		size_t map_idx = SIZE_TO_CLASS_MAP_INDEX(c->unit_size,
+			ac->granularity);
+		ASSERT(map_idx <= UINT32_MAX);
+		uint32_t map_idx_s = (uint32_t)map_idx;
+		uint16_t size_idx_s = (uint16_t)size_idx;
+		uint16_t flags_s = (uint16_t)c->flags;
+		uint64_t k = RUN_CLASS_KEY_PACK(map_idx_s,
+			flags_s, size_idx_s);
+
+		if (critnib_insert(ac->class_map_by_unit_size,
+		    k, c) != 0) {
+			ERR("unable to register allocation class");
+			goto error_map_insert;
+		}
+
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	c->id = (uint8_t)id;
+	ac->aclasses[c->id] = c;
+	return c;
+
+error_map_insert:
+	D_FREE(c);
+error_class_alloc:
+	if (id >= 0)
+		alloc_class_reservation_clear(ac, id);
+
+	D_CRIT("alloc_class_new failed\n");
+	return NULL;
+}
+
+/*
+ * alloc_class_delete -- (internal) deletes an allocation class
+ */
+void
+alloc_class_delete(struct alloc_class_collection *ac,
+	struct alloc_class *c)
+{
+	DAV_DBG("alloc_class_delete: %d\n", c->id);
+
+	ac->aclasses[c->id] = NULL;
+	D_FREE(c);
+}
+
+/*
+ * alloc_class_find_or_create -- (internal) searches for the
+ * biggest allocation class whose unit_size evenly divides n.
+ * If no such class exists, create one.
+ */
+static struct alloc_class *
+alloc_class_find_or_create(struct alloc_class_collection *ac, size_t n)
+{
+	COMPILE_ERROR_ON(MAX_ALLOCATION_CLASSES > UINT8_MAX);
+	uint64_t required_size_bytes = n * RUN_MIN_NALLOCS;
+	uint32_t required_size_idx = 1;
+
+	if (required_size_bytes > RUN_DEFAULT_SIZE) {
+		required_size_bytes -= RUN_DEFAULT_SIZE;
+		required_size_idx +=
+			CALC_SIZE_IDX(CHUNKSIZE, required_size_bytes);
+		if (required_size_idx > RUN_SIZE_IDX_CAP)
+			required_size_idx = RUN_SIZE_IDX_CAP;
+	}
+
+	for (int i = MAX_ALLOCATION_CLASSES - 1; i >= 0; --i) {
+		struct alloc_class *c = ac->aclasses[i];
+
+		if (c == NULL || c->type == CLASS_HUGE ||
+				c->rdsc.size_idx < required_size_idx)
+			continue;
+
+		if (n % c->unit_size == 0 &&
+			n / c->unit_size <= RUN_UNIT_MAX_ALLOC)
+			return c;
+	}
+
+	/*
+	 * In order to minimize the wasted space at the end of the run the
+	 * run data size must be divisible by the allocation class unit size
+	 * with the smallest possible remainder, preferably 0.
+	 */
+	struct run_bitmap b;
+	size_t runsize_bytes = 0;
+
+	do {
+		if (runsize_bytes != 0) /* don't increase on first iteration */
+			n += ALLOC_BLOCK_SIZE_GEN;
+
+		uint32_t size_idx = required_size_idx;
+
+		memblock_run_bitmap(&size_idx, ALLOC_CLASS_DEFAULT_FLAGS, n, 0,
+			NULL, &b);
+
+		runsize_bytes = RUN_CONTENT_SIZE_BYTES(size_idx) - b.size;
+	} while ((runsize_bytes % n) > MAX_RUN_WASTED_BYTES);
+
+	/*
+	 * Now that the desired unit size is found the existing classes need
+	 * to be searched for possible duplicates. If a class that can handle
+	 * the calculated size already exists, simply return that.
+	 */
+	for (int i = 1; i < MAX_ALLOCATION_CLASSES; ++i) {
+		struct alloc_class *c = ac->aclasses[i];
+
+		if (c == NULL || c->type == CLASS_HUGE)
+			continue;
+		if (n / c->unit_size <= RUN_UNIT_MAX_ALLOC &&
+			n % c->unit_size == 0)
+			return c;
+		if (c->unit_size == n)
+			return c;
+	}
+
+	return alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT, n, 0,
+		required_size_idx);
+}
+
+/*
+ * alloc_class_find_min_frag -- searches for an existing allocation
+ * class that will provide the smallest internal fragmentation for the given
+ * size.
+ */
+static struct alloc_class *
+alloc_class_find_min_frag(struct alloc_class_collection *ac, size_t n)
+{
+	struct alloc_class *best_c = NULL;
+	size_t lowest_waste = SIZE_MAX;
+
+	ASSERTne(n, 0);
+
+	/*
+	 * Start from the largest buckets in order to minimize unit size of
+	 * allocated memory blocks.
+	 */
+	for (int i = MAX_ALLOCATION_CLASSES - 1; i >= 0; --i) {
+		struct alloc_class *c = ac->aclasses[i];
+
+		/* can't use alloc classes w/ no headers by default */
+		if (c == NULL || c->header_type == HEADER_NONE)
+			continue;
+
+		size_t real_size = n + header_type_to_size[c->header_type];
+
+		size_t units = CALC_SIZE_IDX(c->unit_size, real_size);
+
+		/* can't exceed the maximum allowed run unit max */
+		if (c->type == CLASS_RUN && units > RUN_UNIT_MAX_ALLOC)
+			continue;
+
+		if (c->unit_size * units == real_size)
+			return c;
+
+		size_t waste = (c->unit_size * units) - real_size;
+
+		/*
+		 * If we assume that the allocation class is only ever going to
+		 * be used with exactly one size, the effective internal
+		 * fragmentation would be increased by the leftover
+		 * memory at the end of the run.
+		 */
+		if (c->type == CLASS_RUN) {
+			size_t wasted_units = c->rdsc.nallocs % units;
+			size_t wasted_bytes = wasted_units * c->unit_size;
+			size_t waste_avg_per_unit = wasted_bytes /
+				c->rdsc.nallocs;
+
+			waste += waste_avg_per_unit;
+		}
+
+		if (best_c == NULL || lowest_waste > waste) {
+			best_c = c;
+			lowest_waste = waste;
+		}
+	}
+
+	ASSERTne(best_c, NULL);
+	return best_c;
+}
+
+/*
+ * alloc_class_collection_new -- creates a new collection of allocation classes
+ */
+struct alloc_class_collection *
+alloc_class_collection_new()
+{
+	struct alloc_class_collection *ac;
+
+	D_ALLOC_PTR(ac);
+	if (ac == NULL)
+		return NULL;
+
+	ac->granularity = ALLOC_BLOCK_SIZE;
+	ac->last_run_max_size = MAX_RUN_SIZE;
+	ac->fail_on_missing_class = 0;
+	ac->autogenerate_on_missing_class = 1;
+
+	size_t maps_size = (MAX_RUN_SIZE / ac->granularity) + 1;
+
+	D_ALLOC_NZ(ac->class_map_by_alloc_size, maps_size);
+	if (ac->class_map_by_alloc_size == NULL)
+		goto error;
+	ac->class_map_by_unit_size = critnib_new();
+	if (ac->class_map_by_unit_size == NULL)
+		goto error;
+
+	memset(ac->class_map_by_alloc_size, 0xFF, maps_size);
+
+	if (alloc_class_new(-1, ac, CLASS_HUGE, HEADER_COMPACT,
+		CHUNKSIZE, 0, 1) == NULL)
+		goto error;
+
+	struct alloc_class *predefined_class =
+		alloc_class_new(-1, ac, CLASS_RUN, HEADER_COMPACT,
+			MIN_UNIT_SIZE, 0, 1);
+	if (predefined_class == NULL)
+		goto error;
+
+	for (size_t i = 0; i < FIRST_GENERATED_CLASS_SIZE / ac->granularity;
+		++i) {
+		ac->class_map_by_alloc_size[i] = predefined_class->id;
+	}
+
+	/*
+	 * Based on the defined categories, a set of allocation classes is
+	 * created. The unit size of those classes is dependent on the category
+	 * initial size and step.
+	 */
+	size_t granularity_mask = ALLOC_BLOCK_SIZE_GEN - 1;
+
+	for (int c = 1; c < MAX_ALLOC_CATEGORIES; ++c) {
+		size_t n = categories[c - 1].size + ALLOC_BLOCK_SIZE_GEN;
+
+		do {
+			if (alloc_class_find_or_create(ac, n) == NULL)
+				goto error;
+
+			float stepf = (float)n * categories[c].step;
+			size_t stepi = (size_t)stepf;
+
+			stepi = (stepf - (float)stepi < FLT_EPSILON) ?
+				stepi : stepi + 1;
+
+			n += (stepi + (granularity_mask)) & ~granularity_mask;
+		} while (n <= categories[c].size);
+	}
+
+	/*
+	 * Find the largest alloc class and use its unit size as run allocation
+	 * threshold.
+	 */
+	uint8_t largest_aclass_slot;
+
+	for (largest_aclass_slot = MAX_ALLOCATION_CLASSES - 1;
+			largest_aclass_slot > 0 &&
+			ac->aclasses[largest_aclass_slot] == NULL;
+			--largest_aclass_slot) {
+		/* intentional NOP */
+	}
+
+	struct alloc_class *c = ac->aclasses[largest_aclass_slot];
+
+	/*
+	 * The actual run might contain fewer unit blocks than the theoretical
+	 * unit max variable. This may be the case for very large unit sizes.
+	 */
+	size_t real_unit_max = (c->rdsc.nallocs < RUN_UNIT_MAX_ALLOC) ?
+		c->rdsc.nallocs : RUN_UNIT_MAX_ALLOC;
+
+	size_t theoretical_run_max_size = c->unit_size * real_unit_max;
+
+	ac->last_run_max_size = theoretical_run_max_size <= MAX_RUN_SIZE ?
+		theoretical_run_max_size : MAX_RUN_SIZE;
+
+#ifdef DAV_EXTRA_DEBUG
+	/*
+	 * Verify that each bucket's unit size points back to the bucket by the
+	 * bucket map. This must be true for the default allocation classes,
+	 * otherwise duplicate buckets will be created.
+	 */
+	for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		struct alloc_class *cl = ac->aclasses[i];
+
+		if (cl != NULL && cl->type == CLASS_RUN) {
+			ASSERTeq(i, cl->id);
+			ASSERTeq(alloc_class_by_run(ac, cl->unit_size,
+				cl->flags, cl->rdsc.size_idx), cl);
+		}
+	}
+#endif
+
+	return ac;
+
+error:
+	alloc_class_collection_delete(ac);
+
+	return NULL;
+}
+
+/*
+ * alloc_class_collection_delete -- deletes the allocation class collection and
+ *	all of the classes within it
+ */
+void
+alloc_class_collection_delete(struct alloc_class_collection *ac)
+{
+	for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		struct alloc_class *c = ac->aclasses[i];
+
+		if (c != NULL)
+			alloc_class_delete(ac, c);
+	}
+
+	if (ac->class_map_by_unit_size)
+		critnib_delete(ac->class_map_by_unit_size);
+	D_FREE(ac->class_map_by_alloc_size);
+	D_FREE(ac);
+}
+
+/*
+ * alloc_class_assign_by_size -- (internal) chooses the allocation class that
+ *	best approximates the provided size
+ */
+static struct alloc_class *
+alloc_class_assign_by_size(struct alloc_class_collection *ac,
+	size_t size)
+{
+	size_t class_map_index = SIZE_TO_CLASS_MAP_INDEX(size,
+		ac->granularity);
+	struct alloc_class *c = alloc_class_find_min_frag(ac,
+		class_map_index * ac->granularity);
+
+	ASSERTne(c, NULL);
+
+	/*
+	 * We don't lock this array because locking this section here and then
+	 * bailing out if someone else was faster would still be slower than
+	 * just calculating the class and failing to assign the variable.
+	 * We are using a compare and swap so that helgrind/drd don't complain.
+	 */
+	util_bool_compare_and_swap64(
+		&ac->class_map_by_alloc_size[class_map_index],
+		MAX_ALLOCATION_CLASSES, c->id);
+
+	DAV_DBG("alloc_class_assign_by_size: %zu id:%d",
+		  size, c->id);
+
+	return c;
+}
+
+/*
+ * alloc_class_by_alloc_size -- returns allocation class that is assigned
+ *	to handle an allocation of the provided size
+ */
+struct alloc_class *
+alloc_class_by_alloc_size(struct alloc_class_collection *ac, size_t size)
+{
+	if (size < ac->last_run_max_size) {
+		uint8_t class_id = ac->class_map_by_alloc_size[
+			SIZE_TO_CLASS_MAP_INDEX(size, ac->granularity)];
+
+		if (class_id == MAX_ALLOCATION_CLASSES) {
+			if (ac->fail_on_missing_class)
+				return NULL;
+			else if (ac->autogenerate_on_missing_class)
+				return alloc_class_assign_by_size(ac, size);
+			else
+				return ac->aclasses[DEFAULT_ALLOC_CLASS_ID];
+		}
+
+		return ac->aclasses[class_id];
+	} else {
+		return ac->aclasses[DEFAULT_ALLOC_CLASS_ID];
+	}
+}
+
+/*
+ * alloc_class_by_run -- returns the allocation class that has the given
+ *	unit size
+ */
+struct alloc_class *
+alloc_class_by_run(struct alloc_class_collection *ac,
+	size_t unit_size, uint16_t flags, uint32_t size_idx)
+{
+	size_t map_idx = SIZE_TO_CLASS_MAP_INDEX(unit_size, ac->granularity);
+
+	ASSERT(map_idx <= UINT32_MAX);
+
+	uint32_t map_idx_s = (uint32_t)map_idx;
+
+	ASSERT(size_idx <= UINT16_MAX);
+
+	uint16_t size_idx_s = (uint16_t)size_idx;
+	uint16_t flags_s = (uint16_t)flags;
+
+	return critnib_get(ac->class_map_by_unit_size,
+		RUN_CLASS_KEY_PACK(map_idx_s, flags_s, size_idx_s));
+}
+
+/*
+ * alloc_class_by_id -- returns the allocation class with an id
+ */
+struct alloc_class *
+alloc_class_by_id(struct alloc_class_collection *ac, uint8_t id)
+{
+	return ac->aclasses[id];
+}
+
+/*
+ * alloc_class_calc_size_idx -- calculates how many units the size requires
+ */
+ssize_t
+alloc_class_calc_size_idx(struct alloc_class *c, size_t size)
+{
+	uint32_t size_idx = CALC_SIZE_IDX(c->unit_size,
+		size + header_type_to_size[c->header_type]);
+
+	if (c->type == CLASS_RUN) {
+		if (c->header_type == HEADER_NONE && size_idx != 1)
+			return -1;
+		else if (size_idx > RUN_UNIT_MAX)
+			return -1;
+		else if (size_idx > c->rdsc.nallocs)
+			return -1;
+	}
+
+	return size_idx;
+}
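
To make the size-to-class map arithmetic above concrete, here is a tiny standalone check. It is illustrative only; 16 is the ALLOC_BLOCK_SIZE granularity defined in this file, and every request size that lands in the same map slot is served by the same allocation class.

#include <assert.h>
#include <stddef.h>

/* copy of the mapping macro defined above */
#define SIZE_TO_CLASS_MAP_INDEX(_s, _g) (1 + (((_s) - 1) / (_g)))

int
main(void)
{
	size_t granularity = 16;	/* ALLOC_BLOCK_SIZE */

	/* a 300-byte request lands in slot 1 + (299 / 16) = 19 */
	assert(SIZE_TO_CLASS_MAP_INDEX(300, granularity) == 19);

	/* all sizes in (288, 304] share that slot, hence the same class */
	assert(SIZE_TO_CLASS_MAP_INDEX(289, granularity) == 19);
	assert(SIZE_TO_CLASS_MAP_INDEX(304, granularity) == 19);
	assert(SIZE_TO_CLASS_MAP_INDEX(288, granularity) == 18);
	return 0;
}
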
diff --git a/src/common/dav_v2/alloc_class.h b/src/common/dav_v2/alloc_class.h
new file mode 100644
index 00000000000..676c064d975
--- /dev/null
+++ b/src/common/dav_v2/alloc_class.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2020, Intel Corporation */
+
+/*
+ * alloc_class.h -- internal definitions for allocation classes
+ */
+
+#ifndef __DAOS_COMMON_ALLOC_CLASS_H
+#define __DAOS_COMMON_ALLOC_CLASS_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include "heap_layout.h"
+#include "memblock.h"
+
+#define MAX_ALLOCATION_CLASSES (UINT8_MAX)
+#define DEFAULT_ALLOC_CLASS_ID (0)
+#define RUN_UNIT_MAX RUN_BITS_PER_VALUE
+
+struct alloc_class_collection;
+
+enum alloc_class_type {
+	CLASS_UNKNOWN,
+	CLASS_HUGE,
+	CLASS_RUN,
+
+	MAX_ALLOC_CLASS_TYPES
+};
+
+struct alloc_class {
+	uint8_t id;
+	uint16_t flags;
+
+	size_t unit_size;
+
+	enum header_type header_type;
+	enum alloc_class_type type;
+
+	/* run-specific data */
+	struct run_descriptor rdsc;
+};
+
+struct alloc_class_collection *alloc_class_collection_new(void);
+void alloc_class_collection_delete(struct alloc_class_collection *ac);
+
+struct alloc_class *alloc_class_by_run(
+	struct alloc_class_collection *ac,
+	size_t unit_size, uint16_t flags, uint32_t size_idx);
+struct alloc_class *alloc_class_by_alloc_size(
+	struct alloc_class_collection *ac, size_t size);
+struct alloc_class *alloc_class_by_id(
+	struct alloc_class_collection *ac, uint8_t id);
+
+int alloc_class_reserve(struct alloc_class_collection *ac, uint8_t id);
+int alloc_class_find_first_free_slot(struct alloc_class_collection *ac,
+	uint8_t *slot);
+
+ssize_t
+alloc_class_calc_size_idx(struct alloc_class *c, size_t size);
+
+struct alloc_class *
+alloc_class_new(int id, struct alloc_class_collection *ac,
+	enum alloc_class_type type, enum header_type htype,
+	size_t unit_size, size_t alignment,
+	uint32_t size_idx);
+
+void alloc_class_delete(struct alloc_class_collection *ac,
+	struct alloc_class *c);
+
+#endif /* __DAOS_COMMON_ALLOC_CLASS_H */
diff --git a/src/common/dav_v2/bucket.c b/src/common/dav_v2/bucket.c
new file mode 100644
index 00000000000..33aba6167c5
--- /dev/null
+++ b/src/common/dav_v2/bucket.c
@@ -0,0 +1,275 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * bucket.c -- bucket implementation
+ *
+ * Buckets manage volatile state of the heap. They are the abstraction layer
+ * between the heap-managed chunks/runs and memory allocations.
+ *
+ * Each bucket instance can have a different underlying container that is
+ * responsible for selecting blocks - which means that whether the allocator
+ * serves memory blocks in best/first/next -fit manner is decided during bucket
+ * creation.
+ */
+
+#include "alloc_class.h"
+#include "bucket.h"
+#include "heap.h"
+#include "memblock.h"
+#include "out.h"
+#include "sys_util.h"
+#include "valgrind_internal.h"
+
+struct bucket {
+	/* this struct is both the lock guard and the locked state */
+	struct bucket_locked             *locked;
+	struct alloc_class               *aclass;
+	struct block_container           *container;
+	const struct block_container_ops *c_ops;
+	struct memory_block_reserved     *active_memory_block;
+	struct zoneset                   *zset;
+	int                               is_active;
+};
+
+struct bucket_locked {
+	struct bucket bucket;
+	pthread_mutex_t lock;
+};
+
+/*
+ * bucket_init -- initializes the bucket's runtime state
+ */
+static int
+bucket_init(struct bucket *b, struct block_container *c,
+	struct alloc_class *aclass)
+{
+	b->container = c;
+	b->c_ops = c->c_ops;
+
+	b->is_active = 0;
+	b->active_memory_block = NULL;
+	if (aclass && aclass->type == CLASS_RUN) {
+		D_ALLOC_PTR(b->active_memory_block);
+
+		if (b->active_memory_block == NULL)
+			return -1;
+	}
+	b->aclass = aclass;
+
+	return 0;
+}
+
+/*
+ * bucket_fini -- destroys the bucket's runtime state
+ */
+static void
+bucket_fini(struct bucket *b)
+{
+	if (b->active_memory_block)
+		D_FREE(b->active_memory_block);
+	b->c_ops->destroy(b->container);
+}
+
+/*
+ * bucket_locked_new -- creates a new locked bucket instance
+ */
+struct bucket_locked *
+bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct zoneset *zset)
+{
+	ASSERTne(c, NULL);
+
+	struct bucket_locked *b;
+
+	D_ALLOC_PTR_NZ(b);
+	if (b == NULL)
+		return NULL;
+
+	if (bucket_init(&b->bucket, c, aclass) != 0)
+		goto err_bucket_init;
+
+	util_mutex_init(&b->lock);
+	b->bucket.locked = b;
+	b->bucket.zset   = zset;
+
+	return b;
+
+err_bucket_init:
+	D_FREE(b);
+	return NULL;
+}
+
+/*
+ * bucket_locked_delete -- cleans up and deallocates a locked bucket instance
+ */
+void
+bucket_locked_delete(struct bucket_locked *b)
+{
+	bucket_fini(&b->bucket);
+	util_mutex_destroy(&b->lock);
+	D_FREE(b);
+}
+
+/*
+ * bucket_acquire -- acquires a usable bucket struct
+ */
+struct bucket *
+bucket_acquire(struct bucket_locked *b)
+{
+	util_mutex_lock(&b->lock);
+	return &b->bucket;
+}
+
+/*
+ * bucket_release -- releases a bucket struct
+ */
+void
+bucket_release(struct bucket *b)
+{
+	util_mutex_unlock(&b->locked->lock);
+}
+
+/*
+ * bucket_try_insert_attached_block -- tries to return a previously allocated
+ *	memory block back to the original bucket
+ */
+void
+bucket_try_insert_attached_block(struct bucket *b, const struct memory_block *m)
+{
+	struct memory_block *active = &b->active_memory_block->m;
+
+	if (b->is_active &&
+	    m->chunk_id == active->chunk_id &&
+	    m->zone_id == active->zone_id) {
+		bucket_insert_block(b, m);
+	}
+}
+
+/*
+ * bucket_alloc_class -- returns the bucket's alloc class
+ */
+struct alloc_class *
+bucket_alloc_class(struct bucket *b)
+{
+	return b->aclass;
+}
+
+/*
+ * bucket_insert_block -- inserts a block into the bucket
+ */
+int
+bucket_insert_block(struct bucket *b, const struct memory_block *m)
+{
+#if VG_MEMCHECK_ENABLED || VG_HELGRIND_ENABLED || VG_DRD_ENABLED
+	if (On_memcheck || On_drd_or_hg) {
+		size_t size = m->m_ops->get_real_size(m);
+		void *data = m->m_ops->get_real_data(m);
+
+		VALGRIND_DO_MAKE_MEM_NOACCESS(data, size);
+		VALGRIND_ANNOTATE_NEW_MEMORY(data, size);
+	}
+#endif
+	return b->c_ops->insert(b->container, m);
+}
+
+/*
+ * bucket_remove_block -- removes an exact block from the bucket
+ */
+int
+bucket_remove_block(struct bucket *b, const struct memory_block *m)
+{
+	return b->c_ops->get_rm_exact(b->container, m);
+}
+
+/*
+ * bucket_alloc_block -- allocates a block from the bucket
+ */
+int
+bucket_alloc_block(struct bucket *b, struct memory_block *m_out)
+{
+	return b->c_ops->get_rm_bestfit(b->container, m_out);
+}
+
+/*
+ * bucket_memblock_insert_block -- (internal) bucket insert wrapper
+ *	for callbacks
+ */
+static int
+bucket_memblock_insert_block(const struct memory_block *m, void *b)
+{
+	return bucket_insert_block(b, m);
+}
+
+/*
+ * bucket_attach_run - attaches a run to a bucket, making it active
+ */
+int
+bucket_attach_run(struct bucket *b, const struct memory_block *m)
+{
+	pthread_mutex_t *lock = m->m_ops->get_lock(m);
+
+	util_mutex_lock(lock);
+
+	int ret = m->m_ops->iterate_free(m, bucket_memblock_insert_block, b);
+
+	util_mutex_unlock(lock);
+
+	if (ret == 0) {
+		b->active_memory_block->m = *m;
+		b->active_memory_block->bucket = b->locked;
+		b->is_active = 1;
+		util_fetch_and_add64(&b->active_memory_block->nresv, 1);
+	} else {
+		b->c_ops->rm_all(b->container);
+	}
+	return 0;
+}
+
+/*
+ * bucket_detach_run - gets rid of the active block in the bucket
+ */
+int
+bucket_detach_run(struct bucket *b, struct memory_block *m_out, int *empty)
+{
+	*empty = 0;
+
+	struct memory_block_reserved **active = &b->active_memory_block;
+
+	if (b->is_active) {
+		b->c_ops->rm_all(b->container);
+		if (util_fetch_and_sub64(&(*active)->nresv, 1) == 1) {
+			*m_out = (*active)->m;
+			*empty = 1;
+
+			VALGRIND_ANNOTATE_HAPPENS_AFTER(&(*active)->nresv);
+			(*active)->m = MEMORY_BLOCK_NONE;
+		} else {
+			VALGRIND_ANNOTATE_HAPPENS_BEFORE(&(*active)->nresv);
+			*active = NULL;
+		}
+		b->is_active = 0;
+	}
+
+	if (*active == NULL) {
+		D_ALLOC_PTR(*active);
+		if (*active == NULL)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * bucket_active_block -- returns the bucket's active block
+ */
+struct memory_block_reserved *
+bucket_active_block(struct bucket *b)
+{
+	return b->is_active ? b->active_memory_block : NULL;
+}
+
+struct zoneset *
+bucket_get_zoneset(struct bucket *b)
+{
+	return b->zset;
+}
diff --git a/src/common/dav_v2/bucket.h b/src/common/dav_v2/bucket.h
new file mode 100644
index 00000000000..b0d92b66995
--- /dev/null
+++ b/src/common/dav_v2/bucket.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * bucket.h -- internal definitions for bucket
+ */
+
+#ifndef __DAOS_COMMON_BUCKET_H
+#define __DAOS_COMMON_BUCKET_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "alloc_class.h"
+#include "container.h"
+#include "memblock.h"
+
+#define CALC_SIZE_IDX(_unit_size, _size)\
+	((_size) == 0 ? 0 : (uint32_t)((((_size)-1) / (_unit_size)) + 1))
+
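+/*
+ * For example (illustrative), with a 128-byte unit size:
+ *	CALC_SIZE_IDX(128, 0)   == 0
+ *	CALC_SIZE_IDX(128, 128) == 1
+ *	CALC_SIZE_IDX(128, 129) == 2
+ * i.e. the macro is a ceiling division yielding the number of units needed
+ * to satisfy a request of '_size' bytes.
+ */
+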
+struct bucket_locked;
+struct bucket;
+
+struct bucket_locked *
+bucket_locked_new(struct block_container *c, struct alloc_class *aclass, struct zoneset *zset);
+
+struct bucket *bucket_acquire(struct bucket_locked *b);
+void bucket_release(struct bucket *b);
+
+struct alloc_class *bucket_alloc_class(struct bucket *b);
+int bucket_insert_block(struct bucket *b, const struct memory_block *m);
+void bucket_try_insert_attached_block(struct bucket *b,
+	const struct memory_block *m);
+int bucket_remove_block(struct bucket *b, const struct memory_block *m);
+int bucket_alloc_block(struct bucket *b, struct memory_block *m_out);
+
+int bucket_attach_run(struct bucket *b, const struct memory_block *m);
+int bucket_detach_run(struct bucket *b,
+	struct memory_block *m_out, int *empty);
+
+struct memory_block_reserved *bucket_active_block(struct bucket *b);
+
+void bucket_locked_delete(struct bucket_locked *b);
+struct zoneset *
+bucket_get_zoneset(struct bucket *b);
+
+#endif /* __DAOS_COMMON_BUCKET_H */
diff --git a/src/common/dav_v2/container.h b/src/common/dav_v2/container.h
new file mode 100644
index 00000000000..2ec71e88243
--- /dev/null
+++ b/src/common/dav_v2/container.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2020, Intel Corporation */
+
+/*
+ * container.h -- internal definitions for block containers
+ */
+
+#ifndef __DAOS_COMMON_CONTAINER_H
+#define __DAOS_COMMON_CONTAINER_H 1
+
+#include "memblock.h"
+
+struct block_container {
+	const struct block_container_ops *c_ops;
+	struct palloc_heap *heap;
+};
+
+struct block_container_ops {
+	/* inserts a new memory block into the container */
+	int (*insert)(struct block_container *c, const struct memory_block *m);
+
+	/* removes exact match memory block */
+	int (*get_rm_exact)(struct block_container *c,
+		const struct memory_block *m);
+
+	/* removes and returns the best-fit memory block for size */
+	int (*get_rm_bestfit)(struct block_container *c,
+		struct memory_block *m);
+
+	/* checks whether the container is empty */
+	int (*is_empty)(struct block_container *c);
+
+	/* removes all elements from the container */
+	void (*rm_all)(struct block_container *c);
+
+	/* deletes the container */
+	void (*destroy)(struct block_container *c);
+};
+
+struct palloc_heap;
+struct block_container *container_new_ravl(struct palloc_heap *heap);
+struct block_container *container_new_seglists(struct palloc_heap *heap);
+
+#endif /* __DAOS_COMMON_CONTAINER_H */
diff --git a/src/common/dav_v2/container_ravl.c b/src/common/dav_v2/container_ravl.c
new file mode 100644
index 00000000000..8cf5033c44d
--- /dev/null
+++ b/src/common/dav_v2/container_ravl.c
@@ -0,0 +1,194 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2018-2022, Intel Corporation */
+
+/*
+ * container_ravl.c -- implementation of ravl-based block container
+ */
+
+#include "container.h"
+#include "ravl.h"
+#include "out.h"
+#include "sys_util.h"
+
+struct block_container_ravl {
+	struct block_container super;
+	struct ravl *tree;
+};
+
+/*
+ * container_compare_memblocks -- (internal) compares two memory blocks
+ */
+static int
+container_compare_memblocks(const void *lhs, const void *rhs)
+{
+	const struct memory_block *l = lhs;
+	const struct memory_block *r = rhs;
+
+	int64_t diff = (int64_t)l->size_idx - (int64_t)r->size_idx;
+
+	if (diff != 0)
+		return diff > 0 ? 1 : -1;
+
+	diff = (int64_t)l->zone_id - (int64_t)r->zone_id;
+	if (diff != 0)
+		return diff > 0 ? 1 : -1;
+
+	diff = (int64_t)l->chunk_id - (int64_t)r->chunk_id;
+	if (diff != 0)
+		return diff > 0 ? 1 : -1;
+
+	diff = (int64_t)l->block_off - (int64_t)r->block_off;
+	if (diff != 0)
+		return diff > 0 ? 1 : -1;
+
+	return 0;
+}
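+
+/*
+ * Example of the resulting order (illustrative): with free blocks
+ * A = {size_idx 4, zone 0, chunk 7} and B = {size_idx 4, zone 0, chunk 2},
+ * B sorts before A -- the size indexes are equal, so the comparison falls
+ * through to zone_id/chunk_id/block_off.  A lookup with
+ * RAVL_PREDICATE_GREATER_EQUAL therefore yields the smallest block that is
+ * large enough and, among equally sized blocks, the lowest-addressed one.
+ */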
+
+/*
+ * container_ravl_insert_block -- (internal) inserts a new memory block
+ *	into the container
+ */
+static int
+container_ravl_insert_block(struct block_container *bc,
+	const struct memory_block *m)
+{
+	struct block_container_ravl *c =
+		(struct block_container_ravl *)bc;
+
+	struct memory_block *e = m->m_ops->get_user_data(m);
+
+	VALGRIND_DO_MAKE_MEM_DEFINED(e, sizeof(*e));
+	VALGRIND_ADD_TO_TX(e, sizeof(*e));
+	*e = *m;
+	VALGRIND_SET_CLEAN(e, sizeof(*e));
+	VALGRIND_REMOVE_FROM_TX(e, sizeof(*e));
+
+	return ravl_insert(c->tree, e);
+}
+
+/*
+ * container_ravl_get_rm_block_bestfit -- (internal) removes and returns the
+ *	best-fit memory block for size
+ */
+static int
+container_ravl_get_rm_block_bestfit(struct block_container *bc,
+	struct memory_block *m)
+{
+	struct block_container_ravl *c =
+		(struct block_container_ravl *)bc;
+
+	struct ravl_node *n = ravl_find(c->tree, m,
+		RAVL_PREDICATE_GREATER_EQUAL);
+
+	if (n == NULL)
+		return ENOMEM;
+
+	struct memory_block *e = ravl_data(n);
+	*m = *e;
+	ravl_remove(c->tree, n);
+
+	return 0;
+}
+
+/*
+ * container_ravl_get_rm_block_exact --
+ *	(internal) removes exact match memory block
+ */
+static int
+container_ravl_get_rm_block_exact(struct block_container *bc,
+	const struct memory_block *m)
+{
+	struct block_container_ravl *c =
+		(struct block_container_ravl *)bc;
+
+	struct ravl_node *n = ravl_find(c->tree, m, RAVL_PREDICATE_EQUAL);
+
+	if (n == NULL)
+		return ENOMEM;
+
+	ravl_remove(c->tree, n);
+
+	return 0;
+}
+
+/*
+ * container_ravl_is_empty -- (internal) checks whether the container is empty
+ */
+static int
+container_ravl_is_empty(struct block_container *bc)
+{
+	struct block_container_ravl *c =
+		(struct block_container_ravl *)bc;
+
+	return ravl_empty(c->tree);
+}
+
+/*
+ * container_ravl_rm_all -- (internal) removes all elements from the tree
+ */
+static void
+container_ravl_rm_all(struct block_container *bc)
+{
+	struct block_container_ravl *c =
+		(struct block_container_ravl *)bc;
+
+	ravl_clear(c->tree);
+}
+
+/*
+ * container_ravl_destroy -- (internal) deletes the container
+ */
+static void
+container_ravl_destroy(struct block_container *bc)
+{
+	struct block_container_ravl *c =
+		(struct block_container_ravl *)bc;
+
+	ravl_delete(c->tree);
+
+	D_FREE(bc);
+}
+
+/*
+ * Tree-based block container used to provide best-fit functionality to the
+ * bucket. The time complexity of an insert or lookup in this container is
+ * O(log n), where n is the number of free blocks it holds.
+ *
+ * The get methods also guarantee that the block with lowest possible address
+ * that best matches the requirements is provided.
+ */
+static const struct block_container_ops container_ravl_ops = {
+	.insert = container_ravl_insert_block,
+	.get_rm_exact = container_ravl_get_rm_block_exact,
+	.get_rm_bestfit = container_ravl_get_rm_block_bestfit,
+	.is_empty = container_ravl_is_empty,
+	.rm_all = container_ravl_rm_all,
+	.destroy = container_ravl_destroy,
+};
+
+/*
+ * container_new_ravl -- allocates and initializes a ravl container
+ */
+struct block_container *
+container_new_ravl(struct palloc_heap *heap)
+{
+	struct block_container_ravl *bc;
+
+	D_ALLOC_PTR_NZ(bc);
+	if (bc == NULL)
+		goto error_container_malloc;
+
+	bc->super.heap = heap;
+	bc->super.c_ops = &container_ravl_ops;
+	bc->tree = ravl_new(container_compare_memblocks);
+	if (bc->tree == NULL)
+		goto error_ravl_new;
+
+	return (struct block_container *)&bc->super;
+
+error_ravl_new:
+	D_FREE(bc);
+
+error_container_malloc:
+	return NULL;
+}
diff --git a/src/common/dav_v2/container_seglists.c b/src/common/dav_v2/container_seglists.c
new file mode 100644
index 00000000000..943d70ad87d
--- /dev/null
+++ b/src/common/dav_v2/container_seglists.c
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2022, Intel Corporation */
+
+/*
+ * container_seglists.c -- implementation of segregated lists block container
+ *
+ * This container is constructed from N (up to 64) intrusive lists and a
+ * single 8 byte bitmap that stores the information whether a given list is
+ * empty or not.
+ */
+
+#include "container.h"
+#include "out.h"
+#include "sys_util.h"
+#include "util.h"
+#include "valgrind_internal.h"
+#include "vecq.h"
+
+#define SEGLIST_BLOCK_LISTS 64U
+
+struct block_container_seglists {
+	struct block_container super;
+	struct memory_block m;
+
+	VECQ(, uint32_t) blocks[SEGLIST_BLOCK_LISTS];
+	uint64_t nonempty_lists;
+};
+
+/*
+ * container_seglists_insert_block -- (internal) inserts a new memory block
+ *	into the container
+ */
+static int
+container_seglists_insert_block(struct block_container *bc,
+	const struct memory_block *m)
+{
+	ASSERT(m->chunk_id < MAX_CHUNK);
+	ASSERT(m->zone_id < UINT16_MAX);
+	ASSERTne(m->size_idx, 0);
+
+	struct block_container_seglists *c =
+		(struct block_container_seglists *)bc;
+
+	if (c->nonempty_lists == 0)
+		c->m = *m;
+
+	ASSERT(m->size_idx <= SEGLIST_BLOCK_LISTS);
+	ASSERT(m->chunk_id == c->m.chunk_id);
+	ASSERT(m->zone_id == c->m.zone_id);
+
+	if (VECQ_ENQUEUE(&c->blocks[m->size_idx - 1], m->block_off) != 0)
+		return -1;
+
+	/* marks the list as nonempty */
+	c->nonempty_lists |= 1ULL << (m->size_idx - 1);
+
+	return 0;
+}
+
+/*
+ * container_seglists_get_rm_block_bestfit -- (internal) removes and returns the
+ *	best-fit memory block for size
+ */
+static int
+container_seglists_get_rm_block_bestfit(struct block_container *bc,
+	struct memory_block *m)
+{
+	struct block_container_seglists *c =
+		(struct block_container_seglists *)bc;
+
+	ASSERT(m->size_idx <= SEGLIST_BLOCK_LISTS);
+	uint32_t i = 0;
+
+	/* applicable lists */
+	uint64_t size_mask = (1ULL << (m->size_idx - 1)) - 1;
+	uint64_t v = c->nonempty_lists & ~size_mask;
+
+	if (v == 0)
+		return ENOMEM;
+
+	/* finds the list that serves the smallest applicable size */
+	i = util_lssb_index64(v);
+
+	uint32_t block_offset = VECQ_DEQUEUE(&c->blocks[i]);
+
+	if (VECQ_SIZE(&c->blocks[i]) == 0) /* marks the list as empty */
+		c->nonempty_lists &= ~(1ULL << (i));
+
+	*m = c->m;
+	m->block_off = block_offset;
+	m->size_idx = i + 1;
+
+	return 0;
+}
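+
+/*
+ * Worked example (illustrative): for a request with size_idx == 3 and
+ * nonempty_lists == 0b10110 (lists serving sizes 2, 3 and 5 are nonempty),
+ * size_mask == 0b011 and v == 0b10100; util_lssb_index64(v) == 2, which
+ * selects blocks[2] -- the list serving exactly 3 units, i.e. the smallest
+ * applicable size.
+ */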
+
+/*
+ * container_seglists_is_empty -- (internal) checks whether the container is
+ * empty
+ */
+static int
+container_seglists_is_empty(struct block_container *bc)
+{
+	struct block_container_seglists *c =
+		(struct block_container_seglists *)bc;
+
+	return c->nonempty_lists == 0;
+}
+
+/*
+ * container_seglists_rm_all -- (internal) removes all elements from the container
+ */
+static void
+container_seglists_rm_all(struct block_container *bc)
+{
+	struct block_container_seglists *c =
+		(struct block_container_seglists *)bc;
+
+	for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i)
+		VECQ_CLEAR(&c->blocks[i]);
+
+	c->nonempty_lists = 0;
+}
+
+/*
+ * container_seglists_destroy -- (internal) deletes the container
+ */
+static void
+container_seglists_destroy(struct block_container *bc)
+{
+	struct block_container_seglists *c =
+		(struct block_container_seglists *)bc;
+
+	for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i)
+		VECQ_DELETE(&c->blocks[i]);
+
+	D_FREE(c);
+}
+
+/*
+ * This container does not support retrieval of exact memory blocks, but it
+ * provides best-fit in O(1) time for unit sizes that do not exceed 64.
+ */
+static const struct block_container_ops container_seglists_ops = {
+	.insert = container_seglists_insert_block,
+	.get_rm_exact = NULL,
+	.get_rm_bestfit = container_seglists_get_rm_block_bestfit,
+	.is_empty = container_seglists_is_empty,
+	.rm_all = container_seglists_rm_all,
+	.destroy = container_seglists_destroy,
+};
+
+/*
+ * container_new_seglists -- allocates and initializes a seglists container
+ */
+struct block_container *
+container_new_seglists(struct palloc_heap *heap)
+{
+	struct block_container_seglists *bc;
+
+	D_ALLOC_PTR_NZ(bc);
+	if (bc == NULL)
+		goto error_container_malloc;
+
+	bc->super.heap = heap;
+	bc->super.c_ops = &container_seglists_ops;
+
+	for (unsigned i = 0; i < SEGLIST_BLOCK_LISTS; ++i)
+		VECQ_INIT(&bc->blocks[i]);
+	bc->nonempty_lists = 0;
+
+	return (struct block_container *)&bc->super;
+
+error_container_malloc:
+	return NULL;
+}
diff --git a/src/common/dav_v2/critnib.c b/src/common/dav_v2/critnib.c
new file mode 100644
index 00000000000..8a33d7d883d
--- /dev/null
+++ b/src/common/dav_v2/critnib.c
@@ -0,0 +1,678 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2018-2022, Intel Corporation */
+
+/*
+ * critnib.c -- implementation of critnib tree
+ *
+ * It offers identity lookup (like a hashmap) and <= lookup (like a search
+ * tree).  Unlike some hashing algorithms (cuckoo hash, perfect hashing) the
+ * complexity isn't constant, but for data sizes we expect it's several
+ * times as fast as cuckoo, and has no "stop the world" cases that would
+ * cause latency (ie, better worst case behavior).
+ */
+
+/*
+ * STRUCTURE DESCRIPTION
+ *
+ * Critnib is a hybrid between a radix tree and DJ Bernstein's critbit:
+ * it skips nodes for uninteresting radix nodes (ie, ones that would have
+ * exactly one child), this requires adding to every node a field that
+ * describes the slice (4-bit in our case) that this radix level is for.
+ *
+ * This implementation also stores each node's path (ie, bits that are
+ * common to every key in that subtree) -- this doesn't help with lookups
+ * at all (unused in == match, could be reconstructed at no cost in <=
+ * after first dive) but simplifies inserts and removes.  If we ever want
+ * that piece of memory it's easy to trim it down.
+ */
+
+/*
+ * CONCURRENCY ISSUES
+ *
+ * Reads are completely lock-free sync-free, but only almost wait-free:
+ * if for some reason a read thread gets pathologically stalled, it will
+ * notice the data being stale and restart the work.  In usual cases,
+ * the structure having been modified does _not_ cause a restart.
+ *
+ * Writes could be easily made lock-free as well (with only a cmpxchg
+ * sync), but this leads to problems with removes.  A possible solution
+ * would be doing removes by overwriting by NULL w/o freeing -- yet this
+ * would lead to the structure growing without bounds.  Complex per-node
+ * locks would increase concurrency but they slow down individual writes
+ * enough that in practice a simple global write lock works faster.
+ *
+ * Removes are the only operation that can break reads.  The structure
+ * can do local RCU well -- the problem being knowing when it's safe to
+ * free.  Any synchronization with reads would kill their speed, thus
+ * instead we have a remove count.  The grace period is DELETED_LIFE,
+ * after which any read will notice staleness and restart its work.
+ */
+#include <errno.h>
+#include <stdbool.h>
+
+#include "critnib.h"
+#include "out.h"
+#include "sys_util.h"
+#include "valgrind_internal.h"
+#include "util.h"
+
+/*
+ * A node that has been deleted is left untouched for this many delete
+ * cycles.  Reads have guaranteed correctness if they took no longer than
+ * DELETED_LIFE concurrent deletes, otherwise they notice something is
+ * wrong and restart.  The memory of deleted nodes is never freed to
+ * malloc nor their pointers lead anywhere wrong, thus a stale read will
+ * (temporarily) get a wrong answer but won't crash.
+ *
+ * There's no need to count writes as they never interfere with reads.
+ *
+ * Allowing stale reads (of arbitrarily old writes or of deletes less than
+ * DELETED_LIFE old) might sound counterintuitive, but it doesn't affect
+ * semantics in any way: the thread could have been stalled just after
+ * returning from our code.  Thus, the guarantee is: the result of get() or
+ * find_le() is a value that was current at any point between the call
+ * start and end.
+ */
+#define DELETED_LIFE 16
+
+#define SLICE 4
+#define NIB ((1ULL << SLICE) - 1)
+#define SLNODES (1 << SLICE)
+
+typedef unsigned char sh_t;
+
+struct critnib_node {
+	/*
+	 * path is the part of a tree that's already traversed (be it through
+	 * explicit nodes or collapsed links) -- ie, any subtree below has all
+	 * those bits set to this value.
+	 *
+	 * nib is a 4-bit slice that's an index into the node's children.
+	 *
+	 * shift is the length (in bits) of the part of the key below this node.
+	 *
+	 *            nib
+	 * |XXXXXXXXXX|?|*****|
+	 *    path      ^
+	 *              +-----+
+	 *               shift
+	 */
+	struct critnib_node *child[SLNODES];
+	uint64_t path;
+	sh_t shift;
+};
+
+struct critnib_leaf {
+	uint64_t key;
+	void *value;
+};
+
+struct critnib {
+	struct critnib_node *root;
+
+	/* pool of freed nodes: singly linked list, next at child[0] */
+	struct critnib_node *deleted_node;
+	struct critnib_leaf *deleted_leaf;
+
+	/* nodes removed but not yet eligible for reuse */
+	struct critnib_node *pending_del_nodes[DELETED_LIFE];
+	struct critnib_leaf *pending_del_leaves[DELETED_LIFE];
+
+	uint64_t remove_count;
+
+	pthread_mutex_t mutex; /* writes/removes */
+};
+
+/*
+ * atomic load
+ */
+static void
+load(void *src, void *dst)
+{
+	util_atomic_load_explicit64((uint64_t *)src, (uint64_t *)dst,
+		memory_order_acquire);
+}
+
+/*
+ * atomic store
+ */
+static void
+store(void *dst, void *src)
+{
+	util_atomic_store_explicit64((uint64_t *)dst, (uint64_t)src,
+		memory_order_release);
+}
+
+/*
+ * internal: is_leaf -- check tagged pointer for leafness
+ */
+static inline bool
+is_leaf(struct critnib_node *n)
+{
+	return (uint64_t)n & 1;
+}
+
+/*
+ * internal: to_leaf -- untag a leaf pointer
+ */
+static inline struct critnib_leaf *
+to_leaf(struct critnib_node *n)
+{
+	return (void *)((uint64_t)n & ~1ULL);
+}
+
+/*
+ * internal: path_mask -- return bit mask of a path above a subtree [shift]
+ * bits tall
+ */
+static inline uint64_t
+path_mask(sh_t shift)
+{
+	return ~NIB << shift;
+}
+
+/*
+ * internal: slice_index -- return index of child at the given nib
+ */
+static inline unsigned
+slice_index(uint64_t key, sh_t shift)
+{
+	return (unsigned)((key >> shift) & NIB);
+}
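+
+/*
+ * Worked example (illustrative): for key == 0x1234 and shift == 8,
+ * slice_index(key, 8) == (0x1234 >> 8) & 0xF == 0x2, while
+ * path_mask(8) == ~0xFULL << 8 clears the nib and everything below it,
+ * so key & path_mask(8) == 0x1000 -- the bits shared by every key in
+ * that subtree.
+ */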
+
+/*
+ * critnib_new -- allocates a new critnib structure
+ */
+struct critnib *
+critnib_new(void)
+{
+	struct critnib *c;
+
+	D_ALLOC_PTR(c);
+	if (!c)
+		return NULL;
+
+	util_mutex_init(&c->mutex);
+
+	VALGRIND_HG_DRD_DISABLE_CHECKING(&c->root, sizeof(c->root));
+	VALGRIND_HG_DRD_DISABLE_CHECKING(&c->remove_count,
+					sizeof(c->remove_count));
+
+	return c;
+}
+
+/*
+ * internal: delete_node -- recursively free (to malloc) a subtree
+ */
+static void
+delete_node(struct critnib_node *__restrict n)
+{
+	if (!is_leaf(n)) {
+		for (int i = 0; i < SLNODES; i++) {
+			if (n->child[i])
+				delete_node(n->child[i]);
+		}
+
+		D_FREE(n);
+	} else {
+		void *ptr;
+
+		ptr = (void *)to_leaf(n);
+		D_FREE(ptr);
+	}
+}
+
+/*
+ * critnib_delete -- destroy and free a critnib struct
+ */
+void
+critnib_delete(struct critnib *c)
+{
+	if (c->root)
+		delete_node(c->root);
+
+	util_mutex_destroy(&c->mutex);
+
+	for (struct critnib_node *m = c->deleted_node; m; ) {
+		struct critnib_node *mm = m->child[0];
+
+		D_FREE(m);
+		m = mm;
+	}
+
+	for (struct critnib_leaf *k = c->deleted_leaf; k; ) {
+		struct critnib_leaf *kk = k->value;
+
+		D_FREE(k);
+		k = kk;
+	}
+
+	for (int i = 0; i < DELETED_LIFE; i++) {
+		D_FREE(c->pending_del_nodes[i]);
+		D_FREE(c->pending_del_leaves[i]);
+	}
+
+	D_FREE(c);
+}
+
+/*
+ * internal: free_node -- free (to internal pool, not malloc) a node.
+ *
+ * We cannot free them to malloc as a stalled reader thread may still walk
+ * through such nodes; it will notice the result being bogus but only after
+ * completing the walk, thus we need to ensure any freed nodes still point
+ * to within the critnib structure.
+ */
+static void
+free_node(struct critnib *__restrict c, struct critnib_node *__restrict n)
+{
+	if (!n)
+		return;
+
+	ASSERT(!is_leaf(n));
+	n->child[0] = c->deleted_node;
+	c->deleted_node = n;
+}
+
+/*
+ * internal: alloc_node -- allocate a node from our pool or from malloc
+ */
+static struct critnib_node *
+alloc_node(struct critnib *__restrict c)
+{
+	if (!c->deleted_node) {
+		struct critnib_node *n;
+
+		D_ALLOC_PTR_NZ(n);
+		if (n == NULL)
+			D_CRIT("Malloc!\n");
+
+		return n;
+	}
+
+	struct critnib_node *n = c->deleted_node;
+
+	c->deleted_node = n->child[0];
+	VALGRIND_ANNOTATE_NEW_MEMORY(n, sizeof(*n));
+
+	return n;
+}
+
+/*
+ * internal: free_leaf -- free (to internal pool, not malloc) a leaf.
+ *
+ * See free_node().
+ */
+static void
+free_leaf(struct critnib *__restrict c, struct critnib_leaf *__restrict k)
+{
+	if (!k)
+		return;
+
+	k->value = c->deleted_leaf;
+	c->deleted_leaf = k;
+}
+
+/*
+ * internal: alloc_leaf -- allocate a leaf from our pool or from malloc
+ */
+static struct critnib_leaf *
+alloc_leaf(struct critnib *__restrict c)
+{
+	if (!c->deleted_leaf) {
+		struct critnib_leaf *k;
+
+		D_ALLOC_PTR_NZ(k);
+		if (k == NULL)
+			D_CRIT("Malloc!\n");
+
+		return k;
+	}
+
+	struct critnib_leaf *k = c->deleted_leaf;
+
+	c->deleted_leaf = k->value;
+	VALGRIND_ANNOTATE_NEW_MEMORY(k, sizeof(*k));
+
+	return k;
+}
+
+/*
+ * critnib_insert -- write a key:value pair to the critnib structure
+ *
+ * Returns:
+ *  - 0 on success
+ *  - EEXIST if such a key already exists
+ *  - ENOMEM if we're out of memory
+ *
+ * Takes a global write lock but doesn't stall any readers.
+ */
+int
+critnib_insert(struct critnib *c, uint64_t key, void *value)
+{
+	util_mutex_lock(&c->mutex);
+
+	struct critnib_leaf *k = alloc_leaf(c);
+
+	if (!k) {
+		util_mutex_unlock(&c->mutex);
+
+		return ENOMEM;
+	}
+
+	VALGRIND_HG_DRD_DISABLE_CHECKING(k, sizeof(struct critnib_leaf));
+
+	k->key = key;
+	k->value = value;
+
+	struct critnib_node *kn = (void *)((uint64_t)k | 1);
+
+	struct critnib_node *n = c->root;
+
+	if (!n) {
+		c->root = kn;
+
+		util_mutex_unlock(&c->mutex);
+
+		return 0;
+	}
+
+	struct critnib_node **parent = &c->root;
+	struct critnib_node *prev = c->root;
+
+	while (n && !is_leaf(n) && (key & path_mask(n->shift)) == n->path) {
+		prev = n;
+		parent = &n->child[slice_index(key, n->shift)];
+		n = *parent;
+	}
+
+	if (!n) {
+		n = prev;
+		store(&n->child[slice_index(key, n->shift)], kn);
+
+		util_mutex_unlock(&c->mutex);
+
+		return 0;
+	}
+
+	uint64_t path = is_leaf(n) ? to_leaf(n)->key : n->path;
+	/* Find where the path differs from our key. */
+	uint64_t at = path ^ key;
+
+	if (!at) {
+		ASSERT(is_leaf(n));
+		free_leaf(c, to_leaf(kn));
+		/* fail instead of replacing */
+
+		util_mutex_unlock(&c->mutex);
+
+		return EEXIST;
+	}
+
+	/* and convert that to an index. */
+	sh_t sh = util_mssb_index64(at) & (sh_t)~(SLICE - 1);
+
+	struct critnib_node *m = alloc_node(c);
+
+	if (!m) {
+		free_leaf(c, to_leaf(kn));
+
+		util_mutex_unlock(&c->mutex);
+
+		return ENOMEM;
+	}
+	VALGRIND_HG_DRD_DISABLE_CHECKING(m, sizeof(struct critnib_node));
+
+	for (int i = 0; i < SLNODES; i++)
+		m->child[i] = NULL;
+
+	m->child[slice_index(key, sh)] = kn;
+	m->child[slice_index(path, sh)] = n;
+	m->shift = sh;
+	m->path = key & path_mask(sh);
+	store(parent, m);
+
+	util_mutex_unlock(&c->mutex);
+
+	return 0;
+}
+
+/*
+ * critnib_remove -- delete a key from the critnib structure, return its value
+ */
+void *
+critnib_remove(struct critnib *c, uint64_t key)
+{
+	struct critnib_leaf *k;
+	void *value = NULL;
+
+	util_mutex_lock(&c->mutex);
+
+	struct critnib_node *n = c->root;
+
+	if (!n)
+		goto not_found;
+
+	uint64_t del = util_fetch_and_add64(&c->remove_count, 1) % DELETED_LIFE;
+
+	free_node(c, c->pending_del_nodes[del]);
+	free_leaf(c, c->pending_del_leaves[del]);
+	c->pending_del_nodes[del] = NULL;
+	c->pending_del_leaves[del] = NULL;
+
+	if (is_leaf(n)) {
+		k = to_leaf(n);
+		if (k->key == key) {
+			store(&c->root, NULL);
+			goto del_leaf;
+		}
+
+		goto not_found;
+	}
+	/*
+	 * n and k are a parent:child pair (after the first iteration); k is the
+	 * leaf that holds the key we're deleting.
+	 */
+	struct critnib_node **k_parent = &c->root;
+	struct critnib_node **n_parent = &c->root;
+	struct critnib_node *kn = n;
+
+	while (!is_leaf(kn)) {
+		n_parent = k_parent;
+		n = kn;
+		k_parent = &kn->child[slice_index(key, kn->shift)];
+		kn = *k_parent;
+
+		if (!kn)
+			goto not_found;
+	}
+
+	k = to_leaf(kn);
+	if (k->key != key)
+		goto not_found;
+
+	store(&n->child[slice_index(key, n->shift)], NULL);
+
+	/* Remove the node if there's only one remaining child. */
+	int ochild = -1;
+
+	for (int i = 0; i < SLNODES; i++) {
+		if (n->child[i]) {
+			if (ochild != -1)
+				goto del_leaf;
+
+			ochild = i;
+		}
+	}
+
+	ASSERTne(ochild, -1);
+
+	store(n_parent, n->child[ochild]);
+	c->pending_del_nodes[del] = n;
+
+del_leaf:
+	value = k->value;
+	c->pending_del_leaves[del] = k;
+
+not_found:
+	util_mutex_unlock(&c->mutex);
+	return value;
+}
+
+/*
+ * critnib_get -- query for a key ("==" match), returns value or NULL
+ *
+ * Doesn't need a lock but if many deletes happened while our thread was
+ * somehow stalled the query is restarted (as freed nodes remain unused only
+ * for a grace period).
+ *
+ * Counterintuitively, it's pointless to return the most current answer;
+ * we need only one that was valid at any point after the call started.
+ */
+void *
+critnib_get(struct critnib *c, uint64_t key)
+{
+	uint64_t wrs1, wrs2;
+	void *res;
+
+	do {
+		struct critnib_node *n;
+
+		load(&c->remove_count, &wrs1);
+		load(&c->root, &n);
+
+		/*
+		 * critbit algorithm: dive into the tree, looking at nothing but
+		 * each node's critical bit^H^H^Hnibble.  This means we risk
+		 * going wrong way if our path is missing, but that's ok...
+		 */
+		while (n && !is_leaf(n))
+			load(&n->child[slice_index(key, n->shift)], &n);
+
+		/* ... as we check it at the end. */
+		struct critnib_leaf *k = to_leaf(n);
+
+		res = (n && k->key == key) ? k->value : NULL;
+		load(&c->remove_count, &wrs2);
+	} while (wrs1 + DELETED_LIFE <= wrs2);
+
+	return res;
+}
+
+/*
+ * internal: find_successor -- return the rightmost non-null node in a subtree
+ */
+static void *
+find_successor(struct critnib_node *__restrict n)
+{
+	while (1) {
+		int nib;
+
+		for (nib = NIB; nib >= 0; nib--)
+			if (n->child[nib])
+				break;
+
+		if (nib < 0)
+			return NULL;
+
+		n = n->child[nib];
+		if (is_leaf(n))
+			return to_leaf(n)->value;
+	}
+}
+
+/*
+ * internal: find_le -- recursively search <= in a subtree
+ */
+static void *
+find_le(struct critnib_node *__restrict n, uint64_t key)
+{
+	if (!n)
+		return NULL;
+
+	if (is_leaf(n)) {
+		struct critnib_leaf *k = to_leaf(n);
+
+		return (k->key <= key) ? k->value : NULL;
+	}
+
+	/*
+	 * is our key outside the subtree we're in?
+	 *
+	 * If we're inside, all bits above the nib will be identical; note
+	 * that shift points at the nib's lower rather than upper edge, so it
+	 * needs to be masked away as well.
+	 */
+	if ((key ^ n->path) >> (n->shift) & ~NIB) {
+		/*
+		 * subtree is too far to the left?
+		 * -> its rightmost value is good
+		 */
+		if (n->path < key)
+			return find_successor(n);
+
+		/*
+		 * subtree is too far to the right?
+		 * -> it has nothing of interest to us
+		 */
+		return NULL;
+	}
+
+	unsigned nib = slice_index(key, n->shift);
+
+	/* recursive call: follow the path */
+	{
+		struct critnib_node *m;
+
+		load(&n->child[nib], &m);
+
+		void *value = find_le(m, key);
+
+		if (value)
+			return value;
+	}
+
+	/*
+	 * nothing in that subtree?  We strayed from the path at this point,
+	 * thus need to search every subtree to our left in this node.  No
+	 * need to dive into any but the first non-null, though.
+	 */
+	for (; nib > 0; nib--) {
+		struct critnib_node *m;
+
+		load(&n->child[nib - 1], &m);
+		if (m) {
+			n = m;
+			if (is_leaf(n))
+				return to_leaf(n)->value;
+
+			return find_successor(n);
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * critnib_find_le -- query for a key ("<=" match), returns value or NULL
+ *
+ * Same guarantees as critnib_get().
+ */
+void *
+critnib_find_le(struct critnib *c, uint64_t key)
+{
+	uint64_t wrs1, wrs2;
+	void *res;
+
+	do {
+		load(&c->remove_count, &wrs1);
+
+		struct critnib_node *n; /* avoid a subtle TOCTOU */
+
+		load(&c->root, &n);
+		res = n ? find_le(n, key) : NULL;
+		load(&c->remove_count, &wrs2);
+	} while (wrs1 + DELETED_LIFE <= wrs2);
+
+	return res;
+}
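+
+/*
+ * Illustrative usage of the critnib API (a sketch only; objA/objB stand for
+ * arbitrary caller-owned pointers and error handling is elided):
+ *
+ *	struct critnib *c = critnib_new();
+ *
+ *	if (critnib_insert(c, 0x1000, objA) == 0 &&
+ *	    critnib_insert(c, 0x2000, objB) == 0) {
+ *		void *v  = critnib_get(c, 0x2000);     // == objB
+ *		void *le = critnib_find_le(c, 0x1fff); // == objA (largest key <= 0x1fff)
+ *
+ *		critnib_remove(c, 0x1000);             // returns objA
+ *	}
+ *	critnib_delete(c);
+ */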
diff --git a/src/common/dav_v2/critnib.h b/src/common/dav_v2/critnib.h
new file mode 100644
index 00000000000..b07815fba4c
--- /dev/null
+++ b/src/common/dav_v2/critnib.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2018-2020, Intel Corporation */
+
+/*
+ * critnib.h -- internal definitions for critnib tree
+ */
+
+#ifndef __DAOS_COMMON_CRITNIB_H
+#define __DAOS_COMMON_CRITNIB_H 1
+
+#include <stdint.h>
+
+struct critnib;
+
+struct critnib *critnib_new(void);
+void critnib_delete(struct critnib *c);
+
+int critnib_insert(struct critnib *c, uint64_t key, void *value);
+void *critnib_remove(struct critnib *c, uint64_t key);
+void *critnib_get(struct critnib *c, uint64_t key);
+void *critnib_find_le(struct critnib *c, uint64_t key);
+
+#endif /* __DAOS_COMMON_CRITNIB_H */
diff --git a/src/common/dav_v2/dav_clogs.c b/src/common/dav_v2/dav_clogs.c
new file mode 100644
index 00000000000..a27eabe02d6
--- /dev/null
+++ b/src/common/dav_v2/dav_clogs.c
@@ -0,0 +1,104 @@
+/**
+ * (C) Copyright 2015-2022 Intel Corporation.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "dav_internal.h"
+#include "memops.h"
+#include "tx.h"
+
+static void
+clogs_extend_free(struct ulog *redo)
+{
+	D_FREE(redo);
+}
+
+static int
+clogs_extend_redo(struct ulog **redo, uint64_t gen_num)
+{
+	size_t size = SIZEOF_ALIGNED_ULOG(LANE_REDO_EXTERNAL_SIZE);
+
+	D_ALIGNED_ALLOC_NZ(*redo, CACHELINE_SIZE, size);
+	if (*redo == NULL)
+		return -1;
+
+	size_t capacity = ALIGN_DOWN(size - sizeof(struct ulog), CACHELINE_SIZE);
+
+	ulog_construct_new(*redo, capacity, gen_num, 0);
+	return 0;
+}
+
+static int
+clogs_extend_undo(struct ulog **undo, uint64_t gen_num)
+{
+	size_t size = TX_DEFAULT_RANGE_CACHE_SIZE;
+
+	D_ALIGNED_ALLOC_NZ(*undo, CACHELINE_SIZE, size);
+	if (*undo == NULL)
+		return -1;
+
+	size_t capacity = ALIGN_DOWN(size - sizeof(struct ulog), CACHELINE_SIZE);
+
+	ulog_construct_new(*undo, capacity, gen_num, 0);
+	return 0;
+}
+
+int
+dav_create_clogs(dav_obj_t *hdl)
+{
+
+	ulog_construct_new((struct ulog *)&hdl->clogs.external,
+		LANE_REDO_EXTERNAL_SIZE, 0, 0);
+	ulog_construct_new((struct ulog *)&hdl->clogs.undo,
+		LANE_UNDO_SIZE, 0, 0);
+
+	hdl->external = operation_new((struct ulog *)&hdl->clogs.external,
+		LANE_REDO_EXTERNAL_SIZE, clogs_extend_redo, clogs_extend_free,
+		&hdl->p_ops, LOG_TYPE_REDO);
+	if (hdl->external == NULL)
+		return -1;
+	hdl->undo = operation_new((struct ulog *)&hdl->clogs.undo,
+		LANE_UNDO_SIZE, clogs_extend_undo, clogs_extend_free,
+		&hdl->p_ops, LOG_TYPE_UNDO);
+	if (hdl->undo == NULL) {
+		operation_delete(hdl->external);
+		return -1;
+	}
+	return 0;
+}
+
+void
+dav_destroy_clogs(dav_obj_t *hdl)
+{
+	operation_free_logs(hdl->external);
+	operation_delete(hdl->external);
+	operation_free_logs(hdl->undo);
+	operation_delete(hdl->undo);
+}
+
+int
+dav_hold_clogs(dav_obj_t *hdl)
+{
+	if (hdl->nested_tx++ == 0) {
+		operation_init(hdl->external);
+		operation_init(hdl->undo);
+	}
+	return 0;
+}
+
+int
+dav_release_clogs(dav_obj_t *hdl)
+{
+	if (hdl->nested_tx == 0)
+		FATAL("release clogs");
+	--hdl->nested_tx;
+	return 0;
+}
diff --git a/src/common/dav_v2/dav_clogs.h b/src/common/dav_v2/dav_clogs.h
new file mode 100644
index 00000000000..8c7af256ccc
--- /dev/null
+++ b/src/common/dav_v2/dav_clogs.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2022, Intel Corporation */
+
+/*
+ * dav_clogs.h -- DAV transaction redo/undo log (clogs) definitions
+ */
+
+#ifndef __DAOS_COMMON_DAV_CLOGS_H
+#define __DAOS_COMMON_DAV_CLOGS_H 1
+
+#include <stdint.h>
+#include <sys/types.h>
+#include "ulog.h"
+
+#define LANE_TOTAL_SIZE (3072) /* 3 * 1024 (sum of 3 old lane sections) */
+/*
+ * We have 3 kilobytes to be split between the transactional redo
+ * and undo logs.
+ * Since by far the most space consuming operations are transactional
+ * snapshots, most of the space, 2304 bytes, is assigned to the undo log.
+ * After that, the remainder, 640 bytes, or 40 ulog entries, is left for the
+ * transactional redo logs.
+ * Thanks to this distribution, all small and medium transactions should be
+ * entirely performed without allocating any additional metadata.
+ *
+ * These values must be cacheline-size aligned to be used for ulogs. They are
+ * therefore parametrized, because the size of struct ulog changes between
+ * platforms.
+ */
+#define LANE_UNDO_SIZE (LANE_TOTAL_SIZE \
+			- LANE_REDO_EXTERNAL_SIZE \
+			- 2 * sizeof(struct ulog)) /* 2304 for 64B ulog */
+#define LANE_REDO_EXTERNAL_SIZE ALIGN_UP(704 - sizeof(struct ulog), \
+					CACHELINE_SIZE) /* 640 for 64B ulog */
+
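+/*
+ * With the 64-byte cacheline and 64-byte struct ulog assumed by the byte
+ * counts quoted above, this works out to:
+ *	LANE_REDO_EXTERNAL_SIZE = ALIGN_UP(704 - 64, 64) = 640 bytes
+ *	LANE_UNDO_SIZE          = 3072 - 640 - 2 * 64    = 2304 bytes
+ * and the 640 redo bytes correspond to the 40 ulog entries (16 bytes each)
+ * mentioned above.
+ */
+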
+struct dav_clogs {
+	/*
+	 * Redo log for large operations/transactions.
+	 * Can be extended by the use of internal ulog.
+	 */
+	struct ULOG(LANE_REDO_EXTERNAL_SIZE) external;
+	/*
+	 * Undo log for snapshots done in a transaction.
+	 * Can be extended/shrunk by the use of internal ulog.
+	 */
+	struct ULOG(LANE_UNDO_SIZE) undo;
+};
+
+typedef struct dav_obj dav_obj_t;
+
+int dav_create_clogs(dav_obj_t *hdl);
+void dav_destroy_clogs(dav_obj_t *hdl);
+int dav_hold_clogs(dav_obj_t *hdl);
+int dav_release_clogs(dav_obj_t *hdl);
+
+#endif /* __DAOS_COMMON_DAV_CLOGS_H */
diff --git a/src/common/dav_v2/dav_iface.c b/src/common/dav_v2/dav_iface.c
new file mode 100644
index 00000000000..3879e56c2d4
--- /dev/null
+++ b/src/common/dav_v2/dav_iface.c
@@ -0,0 +1,434 @@
+/**
+ * (C) Copyright 2015-2023 Intel Corporation.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <uuid/uuid.h>
+
+#include <daos/mem.h>
+#include "dav_internal.h"
+#include "heap.h"
+#include "palloc.h"
+#include "mo_wal.h"
+#include "obj.h"
+
+#define	DAV_HEAP_INIT	0x1
+#define MEGABYTE	((uintptr_t)1 << 20)
+
+/*
+ * get_uuid_lo -- (internal) evaluates XOR sum of least significant
+ * 8 bytes with most significant 8 bytes.
+ */
+static inline uint64_t
+get_uuid_lo(uuid_t uuid)
+{
+	uint64_t uuid_lo = 0;
+
+	for (int i = 0; i < 8; i++)
+		uuid_lo = (uuid_lo << 8) | (uuid[i] ^ uuid[8 + i]);
+
+	return uuid_lo;
+}
+
+static void
+setup_dav_phdr(dav_obj_t *hdl)
+{
+	struct dav_phdr *hptr;
+	uuid_t	uuid;
+
+	ASSERT(hdl->do_base != NULL);
+	hptr = (struct dav_phdr *)(hdl->do_base);
+	uuid_generate(uuid);
+	hptr->dp_uuid_lo = get_uuid_lo(uuid);
+	hptr->dp_root_offset = 0;
+	hptr->dp_root_size = 0;
+	hptr->dp_heap_offset = sizeof(struct dav_phdr);
+	hptr->dp_heap_size = hdl->do_size - sizeof(struct dav_phdr);
+	hptr->dp_stats_persistent.heap_curr_allocated = 0;
+	hdl->do_phdr = hptr;
+}
+
+static void
+persist_dav_phdr(dav_obj_t *hdl)
+{
+	mo_wal_persist(&hdl->p_ops, hdl->do_phdr, offsetof(struct dav_phdr, dp_unused));
+}
+
+static dav_obj_t *
+dav_obj_open_internal(int fd, int flags, size_t sz, const char *path, struct umem_store *store)
+{
+	dav_obj_t *hdl = NULL;
+	void      *base;
+	char      *heap_base;
+	uint64_t   heap_size;
+	uint64_t   num_pages;
+	int        persist_hdr = 0;
+	int        err         = 0;
+	int        rc;
+
+	base = mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED)
+		return NULL;
+
+	D_ALIGNED_ALLOC(hdl, CACHELINE_SIZE, sizeof(dav_obj_t));
+	if (hdl == NULL) {
+		err = ENOMEM;
+		goto out0;
+	}
+
+	/* REVISIT: In future pass the meta instance as argument instead of fd */
+	hdl->do_fd = fd;
+	hdl->do_base = base;
+	hdl->do_size = sz;
+	hdl->p_ops.base = hdl;
+
+	hdl->do_store = store;
+	if (hdl->do_store->stor_priv == NULL) {
+		D_ERROR("meta context not defined. WAL commit disabled for %s\n", path);
+	} else {
+		rc = umem_cache_alloc(store, 0);
+		if (rc != 0) {
+			D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc));
+			err = rc;
+			goto out1;
+		}
+	}
+
+	D_STRNDUP(hdl->do_path, path, strlen(path));
+
+	num_pages = (sz + UMEM_CACHE_PAGE_SZ - 1) >> UMEM_CACHE_PAGE_SZ_SHIFT;
+	rc = umem_cache_map_range(hdl->do_store, 0, base, num_pages);
+	if (rc != 0) {
+		D_ERROR("Could not map pages to page cache: rc=" DF_RC "\n", DP_RC(rc));
+		err = rc;
+		goto out2;
+	}
+
+	if (flags & DAV_HEAP_INIT) {
+		setup_dav_phdr(hdl);
+		heap_base = (char *)hdl->do_base + hdl->do_phdr->dp_heap_offset;
+		heap_size = hdl->do_phdr->dp_heap_size;
+
+		rc = lw_tx_begin(hdl);
+		if (rc) {
+			err = ENOMEM;
+			goto out2;
+		}
+
+		rc = heap_init(heap_base, heap_size, &hdl->do_phdr->dp_heap_size,
+			       &hdl->p_ops);
+		if (rc) {
+			err = rc;
+			goto out2;
+		}
+		persist_hdr = 1;
+	} else {
+		hdl->do_phdr = hdl->do_base;
+
+		D_ASSERT(store != NULL);
+
+		rc = store->stor_ops->so_load(store, hdl->do_base);
+		if (rc) {
+			D_ERROR("Failed to read blob to vos file %s, rc = %d\n", path, rc);
+			err = rc;
+			goto out2;
+		}
+
+		rc = hdl->do_store->stor_ops->so_wal_replay(hdl->do_store, dav_wal_replay_cb, hdl);
+		if (rc) {
+			err = rc;
+			goto out2;
+		}
+
+		heap_base = (char *)hdl->do_base + hdl->do_phdr->dp_heap_offset;
+		heap_size = hdl->do_phdr->dp_heap_size;
+
+		rc = lw_tx_begin(hdl);
+		if (rc) {
+			err = ENOMEM;
+			goto out2;
+		}
+	}
+
+	hdl->do_stats = stats_new(hdl);
+	if (hdl->do_stats == NULL) {
+		err = ENOMEM;
+		goto out2;
+	}
+
+	D_ALLOC_PTR(hdl->do_heap);
+	if (hdl->do_heap == NULL) {
+		err = ENOMEM;
+		goto out2;
+	}
+
+	rc = heap_boot(hdl->do_heap, heap_base, heap_size,
+		&hdl->do_phdr->dp_heap_size, hdl->do_base,
+		&hdl->p_ops, hdl->do_stats, NULL);
+	if (rc) {
+		err = rc;
+		goto out2;
+	}
+
+#if VG_MEMCHECK_ENABLED
+	if (On_memcheck)
+		palloc_heap_vg_open(hdl->do_heap, 1);
+#endif
+
+	rc = dav_create_clogs(hdl);
+	if (rc) {
+		err = rc;
+		heap_cleanup(hdl->do_heap);
+		goto out2;
+	}
+
+	if (persist_hdr)
+		persist_dav_phdr(hdl);
+
+	lw_tx_end(hdl, NULL);
+
+#if VG_MEMCHECK_ENABLED
+	if (On_memcheck) {
+		/* mark unused part of the pool as not accessible */
+		void *end = palloc_heap_end(hdl->do_heap);
+
+		VALGRIND_DO_MAKE_MEM_NOACCESS(end,
+					      OBJ_OFF_TO_PTR(hdl, heap_size) - end);
+	}
+#endif
+	return hdl;
+
+out2:
+	if (hdl->do_stats)
+		stats_delete(hdl, hdl->do_stats);
+	if (hdl->do_heap)
+		D_FREE(hdl->do_heap);
+	if (hdl->do_utx) {
+		dav_umem_wtx_cleanup(hdl->do_utx);
+		D_FREE(hdl->do_utx);
+	}
+	D_FREE(hdl->do_path);
+	umem_cache_free(hdl->do_store);
+out1:
+	D_FREE(hdl);
+out0:
+	munmap(base, sz);
+	errno = err;
+	return NULL;
+
+}
+
+DAV_FUNC_EXPORT dav_obj_t *
+dav_obj_create_v2(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store)
+{
+	int fd;
+	dav_obj_t *hdl;
+	struct stat statbuf;
+
+	SUPPRESS_UNUSED(flags);
+
+	if (sz == 0) {
+		/* Open the file and obtain the size */
+		fd = open(path, O_RDWR|O_CLOEXEC);
+		if (fd == -1)
+			return NULL;
+
+		if (fstat(fd, &statbuf) != 0) {
+			close(fd);
+			return NULL;
+		}
+		sz = statbuf.st_size;
+	} else {
+		fd = open(path, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, mode);
+		if (fd == -1)
+			return NULL;
+
+		if (fallocate(fd, 0, 0, (off_t)sz) == -1) {
+			close(fd);
+			errno = ENOSPC;
+			return NULL;
+		}
+	}
+
+	if (!store->stor_size || (sz < store->stor_size)) {
+		ERR("Invalid umem_store size");
+		errno = EINVAL;
+		close(fd);
+		return NULL;
+	}
+
+	hdl = dav_obj_open_internal(fd, DAV_HEAP_INIT, store->stor_size, path, store);
+	if (hdl == NULL) {
+		close(fd);
+		return NULL;
+	}
+	DAV_DBG("pool %s created, size="DF_U64"", hdl->do_path, sz);
+	return hdl;
+}
+
+DAV_FUNC_EXPORT dav_obj_t *
+dav_obj_open_v2(const char *path, int flags, struct umem_store *store)
+{
+	size_t size;
+	int fd;
+	dav_obj_t *hdl;
+	struct stat statbuf;
+
+	SUPPRESS_UNUSED(flags);
+
+	fd = open(path, O_RDWR|O_CLOEXEC);
+	if (fd == -1)
+		return NULL;
+
+	if (fstat(fd, &statbuf) != 0) {
+		close(fd);
+		return NULL;
+	}
+	size = (size_t)statbuf.st_size;
+
+	if (!store->stor_size || (size < store->stor_size)) {
+		ERR("Invalid umem_store size");
+		errno = EINVAL;
+		close(fd);
+		return NULL;
+	}
+
+	hdl = dav_obj_open_internal(fd, 0, store->stor_size, path, store);
+	if (hdl == NULL) {
+		close(fd);
+		return NULL;
+	}
+	DAV_DBG("pool %s is open, size="DF_U64"", hdl->do_path, size);
+	return hdl;
+}
+
+DAV_FUNC_EXPORT void
+dav_obj_close_v2(dav_obj_t *hdl)
+{
+
+	if (hdl == NULL) {
+		ERR("NULL handle");
+		return;
+	}
+	dav_destroy_clogs(hdl);
+	heap_cleanup(hdl->do_heap);
+	D_FREE(hdl->do_heap);
+
+	stats_delete(hdl, hdl->do_stats);
+
+	munmap(hdl->do_base, hdl->do_size);
+	close(hdl->do_fd);
+	if (hdl->do_utx) {
+		dav_umem_wtx_cleanup(hdl->do_utx);
+		D_FREE(hdl->do_utx);
+	}
+	umem_cache_free(hdl->do_store);
+	DAV_DBG("pool %s is closed", hdl->do_path);
+	D_FREE(hdl->do_path);
+	D_FREE(hdl);
+}
+
+DAV_FUNC_EXPORT void *
+dav_get_base_ptr_v2(dav_obj_t *hdl)
+{
+	return hdl->do_base;
+}
+
+DAV_FUNC_EXPORT int
+dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p)
+{
+	uint8_t id = (uint8_t)p->class_id;
+	struct alloc_class_collection *ac = heap_alloc_classes(pop->do_heap);
+
+	if (p->unit_size <= 0 || p->unit_size > DAV_MAX_ALLOC_SIZE ||
+		p->units_per_block <= 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (p->alignment != 0 && p->unit_size % p->alignment != 0) {
+		ERR("unit size must be evenly divisible by alignment");
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (p->alignment > (MEGABYTE * 2)) {
+		ERR("alignment cannot be larger than 2 megabytes");
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (p->class_id >= MAX_ALLOCATION_CLASSES) {
+		ERR("class id outside of the allowed range");
+		errno = ERANGE;
+		return -1;
+	}
+
+	enum header_type lib_htype = MAX_HEADER_TYPES;
+
+	switch (p->header_type) {
+	case DAV_HEADER_LEGACY:
+		lib_htype = HEADER_LEGACY;
+		break;
+	case DAV_HEADER_COMPACT:
+		lib_htype = HEADER_COMPACT;
+		break;
+	case DAV_HEADER_NONE:
+		lib_htype = HEADER_NONE;
+		break;
+	case MAX_DAV_HEADER_TYPES:
+	default:
+		ERR("invalid header type");
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (id == 0) {
+		if (alloc_class_find_first_free_slot(ac, &id) != 0) {
+			ERR("no available free allocation class identifier");
+			errno = EINVAL;
+			return -1;
+		}
+	} else {
+		if (alloc_class_reserve(ac, id) != 0) {
+			ERR("attempted to overwrite an allocation class");
+			errno = EEXIST;
+			return -1;
+		}
+	}
+
+	size_t runsize_bytes =
+		CHUNK_ALIGN_UP((p->units_per_block * p->unit_size) +
+		RUN_BASE_METADATA_SIZE);
+
+	/* aligning the buffer might require up to 'alignment' bytes */
+	if (p->alignment != 0)
+		runsize_bytes += p->alignment;
+
+	uint32_t size_idx = (uint32_t)(runsize_bytes / CHUNKSIZE);
+
+	if (size_idx > UINT16_MAX)
+		size_idx = UINT16_MAX;
+
+	struct alloc_class *c = alloc_class_new(id,
+		heap_alloc_classes(pop->do_heap), CLASS_RUN,
+		lib_htype, p->unit_size, p->alignment, size_idx);
+	if (c == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (heap_create_alloc_class_buckets(pop->do_heap, c) != 0) {
+		alloc_class_delete(ac, c);
+		return -1;
+	}
+
+	p->class_id = c->id;
+	p->units_per_block = c->rdsc.nallocs;
+
+	return 0;
+}
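+
+/*
+ * Illustrative use of dav_class_register_v2() (a sketch only; the field
+ * values are arbitrary examples and 'pop' is an open DAV handle):
+ *
+ *	struct dav_alloc_class_desc desc = {
+ *		.class_id        = 0,                 // 0 == pick a free slot
+ *		.unit_size       = 256,
+ *		.alignment       = 0,
+ *		.units_per_block = 1024,
+ *		.header_type     = DAV_HEADER_COMPACT,
+ *	};
+ *
+ *	if (dav_class_register_v2(pop, &desc) == 0) {
+ *		// desc.class_id now holds the assigned id and
+ *		// desc.units_per_block the effective number of units per run
+ *	}
+ */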
diff --git a/src/common/dav_v2/dav_internal.h b/src/common/dav_v2/dav_internal.h
new file mode 100644
index 00000000000..408f03a01ae
--- /dev/null
+++ b/src/common/dav_v2/dav_internal.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2022, Intel Corporation */
+
+/*
+ * dav_internal.h -- internal definitions for the DAOS internal Allocator for VOS (DAV)
+ */
+
+#ifndef __DAOS_COMMON_DAV_INTERNAL_H
+#define __DAOS_COMMON_DAV_INTERNAL_H 1
+
+#include "dav_v2.h"
+#include "dav_clogs.h"
+#include "heap.h"
+#include "mo_wal.h"
+#include "wal_tx.h"
+
+#define DAV_FUNC_EXPORT __attribute__ ((visibility ("default")))
+
+#define DAV_MAX_ALLOC_SIZE ((size_t)0x3FFDFFFC0)
+
+enum dav_tx_failure_behavior {
+	DAV_TX_FAILURE_ABORT,
+	DAV_TX_FAILURE_RETURN,
+};
+
+enum dav_stats_enabled {
+	DAV_STATS_ENABLED_TRANSIENT,
+	DAV_STATS_ENABLED_BOTH,
+	DAV_STATS_ENABLED_PERSISTENT,
+	DAV_STATS_DISABLED,
+};
+
+#define	DAV_PHDR_SIZE	4096
+
+/* DAV header data that will be persisted */
+struct dav_phdr {
+	uint64_t		dp_uuid_lo;
+	uint64_t		dp_heap_offset;
+	uint64_t		dp_heap_size;
+	uint64_t		dp_root_offset;
+	uint64_t		dp_root_size;
+	struct stats_persistent dp_stats_persistent;
+	char	 dp_unused[DAV_PHDR_SIZE - sizeof(uint64_t)*5 -
+			sizeof(struct stats_persistent)];
+};
+
+/* DAV object handle */
+typedef struct dav_obj {
+	char				*do_path;
+	uint64_t			 do_size;
+	void				*do_base;
+	struct palloc_heap		*do_heap;
+	struct dav_phdr			*do_phdr;
+	struct operation_context	*external;
+	struct operation_context	*undo;
+	struct mo_ops			 p_ops;	/* REVISIT */
+	struct stats			*do_stats;
+	int				 do_fd;
+	int				 nested_tx;
+	struct umem_wal_tx		*do_utx;
+	struct umem_store               *do_store;
+
+	struct dav_clogs		 clogs __attribute__ ((__aligned__(CACHELINE_SIZE)));
+} dav_obj_t;
+
+static inline
+struct dav_tx *utx2wtx(struct umem_wal_tx *utx)
+{
+	return (struct dav_tx *)&utx->utx_private;
+}
+
+static inline
+struct umem_wal_tx *wtx2utx(struct dav_tx *wtx)
+{
+	return (struct umem_wal_tx *)((void *)wtx
+			- (ptrdiff_t)offsetof(struct umem_wal_tx, utx_private));
+}
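+
+/*
+ * The two helpers above are inverses of each other: since struct dav_tx is
+ * stored in the utx_private area of struct umem_wal_tx, utx2wtx(wtx2utx(w))
+ * == w and wtx2utx(utx2wtx(u)) == u for any valid transaction.
+ */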
+
+int lw_tx_begin(dav_obj_t *pop);
+int lw_tx_end(dav_obj_t *pop, void *data);
+
+#endif /* __DAOS_COMMON_DAV_INTERNAL_H */
diff --git a/src/common/dav_v2/dav_v2.h b/src/common/dav_v2/dav_v2.h
new file mode 100644
index 00000000000..4d5094ba195
--- /dev/null
+++ b/src/common/dav_v2/dav_v2.h
@@ -0,0 +1,307 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * dav_v2.h -- Interfaces exported by DAOS internal Allocator for VOS (DAV) v2
+ */
+
+#ifndef __DAOS_COMMON_DAV_V2_H
+#define __DAOS_COMMON_DAV_V2_H 1
+
+#include <setjmp.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include "../dav/dav.h"
+
+typedef struct dav_obj dav_obj_t;
+struct umem_store;
+
+/**
+ * Create and initialize a DAV object and return its handle.
+ *
+ * \param[in]	path	Path of the vos file.
+ *
+ * \param[in]	flags	additional flags (Future).
+ *
+ * \param[in]	sz	size of the file/heap.
+ *
+ * \param[in]	mode	permission to use while creating the file.
+ *
+ * \param[in]	store	backing umem store.
+ *
+ * \return		Returns the pointer to the object handle. Upon failure,
+ *			it returns NULL with errno set appropriately.
+ */
+dav_obj_t *
+dav_obj_create_v2(const char *path, int flags, size_t sz, mode_t mode, struct umem_store *store);
+
+/**
+ * Open and initialize a DAV object and return its handle.
+ *
+ * \param[in]	path	Path of the vos file.
+ *
+ * \param[in]	flags	additional flags (Future).
+ *
+ * \param[in]	store	backing umem store.
+ *
+ * \return		Returns the pointer to the object handle. Upon failure,
+ *			it returns NULL with errno set appropriately.
+ */
+dav_obj_t *
+dav_obj_open_v2(const char *path, int flags, struct umem_store *store);
+
+/**
+ * Close the DAV object
+ *
+ * \param[in]	hdl	DAV handle
+ */
+void
+dav_obj_close_v2(dav_obj_t *hdl);
+
+/**
+ * Return the pointer to the base of the heap.
+ *
+ * \param[in]	hdl	DAV handle
+ *
+ * \return		Returns the pointer to the base of the heap pointed to
+ *			by hdl.
+ */
+void *
+dav_get_base_ptr_v2(dav_obj_t *hdl);
+
+typedef int (*dav_constr)(dav_obj_t *pop, void *ptr, void *arg);
+
+/*
+ * Allocates a new object from the pool and calls a constructor function before
+ * returning. It is guaranteed that allocated object is either properly
+ * initialized, or if it's interrupted before the constructor completes, the
+ * memory reserved for the object is automatically reclaimed.
+ */
+int
+dav_alloc_v2(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags,
+	   dav_constr constructor, void *arg);
+
+/**
+ * Frees the memory at specified offset within the DAV object pointed to by hdl.
+ *
+ * \param[in]	hdl	DAV handle.
+ *
+ * \param[in]	off	offset to the memory location. off should correspond
+ *			to the offset returned by previous call to dav_malloc().
+ */
+void
+dav_free_v2(dav_obj_t *pop, uint64_t off);
+
+/*
+ * DAV version of memcpy. Data copied is made persistent in blob.
+ */
+void *
+dav_memcpy_persist_v2(dav_obj_t *pop, void *dest, const void *src, size_t len);
+/*
+ * DAV version of memcpy with deferred commit to blob.
+ */
+void *
+dav_memcpy_persist_relaxed_v2(dav_obj_t *pop, void *dest, const void *src, size_t len);
+
+/*
+ * If called for the first time on a newly created dav heap, the root object
+ * of the given size is allocated.  Otherwise, it returns the existing root object.
+ * In such a case, the size must not be less than the actual root object size
+ * stored in the pool.  If it's larger, the root object is automatically
+ * resized.
+ *
+ * This function is currently *not* thread-safe.
+ */
+uint64_t
+dav_root_v2(dav_obj_t *pop, size_t size);
+
+/*
+ * Starts a new transaction in the current thread.
+ * If called within an open transaction, starts a nested transaction.
+ *
+ * If successful, transaction stage changes to TX_STAGE_WORK and function
+ * returns zero. Otherwise, stage changes to TX_STAGE_ONABORT and an error
+ * number is returned.
+ */
+int
+dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...);
+
+/*
+ * Aborts current transaction
+ *
+ * Causes transition to TX_STAGE_ONABORT.
+ *
+ * This function must be called during TX_STAGE_WORK.
+ */
+void
+dav_tx_abort_v2(int errnum);
+
+/*
+ * Commits current transaction
+ *
+ * This function must be called during TX_STAGE_WORK.
+ */
+void
+dav_tx_commit_v2(void);
+
+/*
+ * Cleans up the current transaction. Must always be called after dav_tx_begin_v2,
+ * even if starting the transaction failed.
+ *
+ * If called during TX_STAGE_NONE, has no effect.
+ *
+ * Always causes transition to TX_STAGE_NONE.
+ *
+ * If the transaction was successful, returns 0. Otherwise returns the error
+ * code set by dav_tx_abort_v2.
+ *
+ * This function must *not* be called during TX_STAGE_WORK.
+ */
+int
+dav_tx_end_v2(void *data);
+
+/*
+ * Returns the current stage of the transaction.
+ */
+enum dav_tx_stage
+dav_tx_stage_v2(void);
+
+/*
+ * Returns last transaction error code.
+ */
+int
+dav_tx_errno_v2(void);
+
+/*
+ * Transactionally allocates a new object.
+ *
+ * If successful, returns offset of the object in the heap.
+ * Otherwise, stage changes to TX_STAGE_ONABORT and zero is returned.
+ * 'Flags' is a bitmask of the following values:
+ *  - POBJ_XALLOC_ZERO - zero the allocated object
+ *  - POBJ_XALLOC_NO_FLUSH - skip flush on commit
+ *  - POBJ_XALLOC_NO_ABORT - if the function does not end successfully,
+ *    do not abort the transaction and return the error number.
+ *  - DAV_CLASS_ID(id) - id of the allocation class to use.
+ *  - DAV_EZONE_ID(id) - id of the zone to use.
+ *
+ * This function must be called during TX_STAGE_WORK.
+ */
+uint64_t
+dav_tx_alloc_v2(size_t size, uint64_t type_num, uint64_t flags);
+
+/*
+ * Transactionally frees an existing object.
+ *
+ * If successful, returns zero.
+ * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned.
+ *
+ * This function must be called during TX_STAGE_WORK.
+ */
+int
+dav_tx_free_v2(uint64_t off);
+
+/*
+ * Takes a "snapshot" of the memory block of the given size, located at the
+ * given offset 'off' within the pool, and saves it in the undo log.
+ * The application is then free to directly modify the object in that memory
+ * range. In case of failure or abort, all the changes within this range will
+ * be rolled-back automatically.
+ *
+ * If successful, returns zero.
+ * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned.
+ *
+ * This function must be called during TX_STAGE_WORK.
+ */
+int
+dav_tx_add_range_v2(uint64_t off, size_t size);
+
+/*
+ * Takes a "snapshot" of the given memory region and saves it in the undo log.
+ * The application is then free to directly modify the object in that memory
+ * range. In case of failure or abort, all the changes within this range will
+ * be rolled-back automatically. The supplied block of memory has to be within
+ * the given pool.
+ *
+ * If successful, returns zero.
+ * Otherwise, stage changes to TX_STAGE_ONABORT and an error number is returned.
+ *
+ * This function must be called during TX_STAGE_WORK.
+ */
+int
+dav_tx_add_range_direct_v2(const void *ptr, size_t size);
+
+/*
+ * Behaves exactly the same as dav_tx_add_range_v2 when 'flags' equals 0.
+ * 'Flags' is a bitmask of the following values:
+ *  - POBJ_XADD_NO_FLUSH - skips flush on commit
+ *  - POBJ_XADD_NO_SNAPSHOT - added range will not be snapshotted
+ *  - POBJ_XADD_ASSUME_INITIALIZED - added range is assumed to be initialized
+ *  - POBJ_XADD_NO_ABORT - if the function does not end successfully,
+ *  do not abort the transaction and return the error number.
+ */
+int
+dav_tx_xadd_range_v2(uint64_t off, size_t size, uint64_t flags);
+
+/*
+ * Behaves exactly the same as dav_tx_add_range_direct_v2 when 'flags' equals
+ * 0. 'Flags' is a bitmask of the following values:
+ *  - POBJ_XADD_NO_FLUSH - skips flush on commit
+ *  - POBJ_XADD_NO_SNAPSHOT - added range will not be snapshotted
+ *  - POBJ_XADD_ASSUME_INITIALIZED - added range is assumed to be initialized
+ *  - POBJ_XADD_NO_ABORT - if the function does not end successfully,
+ *  do not abort the transaction and return the error number.
+ */
+int
+dav_tx_xadd_range_direct_v2(const void *ptr, size_t size, uint64_t flags);
+
+/*
+ * Converts the offset to a pointer in the context of the heap associated
+ * with the current transaction.
+ */
+void *
+dav_tx_off2ptr_v2(uint64_t off);
+
+#define DAV_ACTION_XRESERVE_VALID_FLAGS						\
+	(DAV_XALLOC_CLASS_MASK | DAV_XALLOC_EZONE_MASK | DAV_XALLOC_ZERO)
+
+struct dav_action;
+uint64_t
+dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num,
+	     uint64_t flags);
+void
+dav_defer_free_v2(dav_obj_t *pop, uint64_t off, struct dav_action *act);
+void
+dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt);
+int
+dav_tx_publish_v2(struct dav_action *actv, size_t actvcnt);
+
+struct dav_alloc_class_desc;
+/*
+ * Registers an allocation class handle with the DAV object.
+ */
+int
+dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p);
+
+struct dav_heap_stats;
+/*
+ * Returns the heap allocation statistics associated with the
+ * DAV object.
+ */
+int
+dav_get_heap_stats_v2(dav_obj_t *pop, struct dav_heap_stats *st);
+
+/**
+ * Get an evictable zone with sufficient free space within.
+ *
+ * \param[in]           pop             pool handle
+ * \param[in]           flags           zone selection criteria.
+ *
+ * \return id >= 0. Zero indicates non-evictable zone and will be
+ *      returned if no evictable zone can be chosen.
+ */
+uint32_t
+dav_get_zone_evictable_v2(dav_obj_t *pop, int flags);
+
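+/*
+ * Illustrative transaction flow (a sketch only: error handling is minimal,
+ * the path and 'store' are placeholders, and any optional tx parameters to
+ * dav_tx_begin_v2() are omitted):
+ *
+ *	dav_obj_t *pop = dav_obj_open_v2("/path/to/vos-file", 0, store);
+ *
+ *	if (dav_tx_begin_v2(pop, NULL) == 0) {
+ *		uint64_t off = dav_tx_alloc_v2(128, 0, 0);
+ *
+ *		if (off != 0 && dav_tx_add_range_v2(off, 128) == 0) {
+ *			memset(dav_tx_off2ptr_v2(off), 0, 128);
+ *			dav_tx_commit_v2();
+ *		}
+ *		// otherwise the transaction is already in TX_STAGE_ONABORT
+ *	}
+ *	(void) dav_tx_end_v2(NULL);
+ *
+ *	dav_obj_close_v2(pop);
+ */
+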
+#endif /* __DAOS_COMMON_DAV_V2_H */
diff --git a/src/common/dav_v2/heap.c b/src/common/dav_v2/heap.c
new file mode 100644
index 00000000000..9c2ed954d5d
--- /dev/null
+++ b/src/common/dav_v2/heap.c
@@ -0,0 +1,1398 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * heap.c -- heap implementation
+ */
+
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <float.h>
+
+#include "bucket.h"
+#include "dav_internal.h"
+#include "memblock.h"
+#include "queue.h"
+#include "heap.h"
+#include "out.h"
+#include "util.h"
+#include "sys_util.h"
+#include "valgrind_internal.h"
+#include "recycler.h"
+#include "container.h"
+#include "alloc_class.h"
+
+#define MAX_RUN_LOCKS MAX_CHUNK
+#define MAX_RUN_LOCKS_VG 1024 /* avoid perf issues /w drd */
+
+/*
+ * This is the value by which the heap might grow once we hit an OOM.
+ */
+#define HEAP_DEFAULT_GROW_SIZE (1 << 27) /* 128 megabytes */
+
+/*
+ * zoneset stores the collection of buckets and recyclers for allocation classes.
+ * Each evictable zone is assigned a zoneset during first allocation.
+ */
+struct zoneset {
+	uint32_t              zset_id;
+	uint32_t              padding;
+	struct bucket_locked *default_bucket;                  /* bucket for free chunks */
+	struct bucket_locked *buckets[MAX_ALLOCATION_CLASSES]; /* one bucket per allocation class */
+	struct recycler      *recyclers[MAX_ALLOCATION_CLASSES];
+};
+
+struct heap_rt {
+	struct alloc_class_collection *alloc_classes;
+	struct zoneset                *default_zset;
+	struct zoneset               **evictable_zsets;
+	pthread_mutex_t                run_locks[MAX_RUN_LOCKS];
+	unsigned                       nlocks;
+	unsigned                       nzones;
+	unsigned                       zones_exhausted;
+};
+
+/*
+ * heap_get_zoneset -- returns a reference to the zoneset for the given
+ *		       zone or zoneset id.
+ */
+struct zoneset *
+heap_get_zoneset(struct palloc_heap *heap, uint32_t zone_id)
+{
+	/* REVISIT:
+	 * Implement the code for evictable zonesets.
+	 */
+	return heap->rt->default_zset;
+}
+
+/*
+ * heap_get_recycler - (internal) retrieves the recycler instance from the zoneset with
+ *	the corresponding class id. Initializes the recycler if needed.
+ */
+static struct recycler *
+heap_get_recycler(struct palloc_heap *heap, struct zoneset *zset, size_t id, size_t nallocs)
+{
+	struct recycler *r;
+
+	D_ASSERT(zset != NULL);
+	util_atomic_load_explicit64(&zset->recyclers[id], &r, memory_order_acquire);
+	if (r != NULL)
+		return r;
+
+	r = recycler_new(heap, nallocs, zset);
+	if (r && !util_bool_compare_and_swap64(&zset->recyclers[id], NULL, r)) {
+		/*
+		 * If a different thread succeeded in assigning the recycler
+		 * first, the recycler this thread created needs to be deleted.
+		 */
+		recycler_delete(r);
+
+		return heap_get_recycler(heap, zset, id, nallocs);
+	}
+
+	return r;
+}
+
+/*
+ * heap_alloc_classes -- returns the allocation classes collection
+ */
+struct alloc_class_collection *
+heap_alloc_classes(struct palloc_heap *heap)
+{
+	return heap->rt ? heap->rt->alloc_classes : NULL;
+}
+
+/*
+ * heap_get_best_class -- returns the alloc class that best fits the
+ *	requested size
+ */
+struct alloc_class *
+heap_get_best_class(struct palloc_heap *heap, size_t size)
+{
+	return alloc_class_by_alloc_size(heap->rt->alloc_classes, size);
+}
+
+/*
+ * zoneset_bucket_acquire -- fetches a bucket from the zoneset by class id,
+ * exclusive to the thread until zoneset_bucket_release is called
+ */
+struct bucket *
+zoneset_bucket_acquire(struct zoneset *zset, uint8_t class_id)
+{
+	struct bucket_locked *b;
+
+	D_ASSERT(zset != NULL);
+
+	if (class_id == DEFAULT_ALLOC_CLASS_ID)
+		b = zset->default_bucket;
+	else
+		b = zset->buckets[class_id];
+
+	return bucket_acquire(b);
+}
+
+/*
+ * zoneset_bucket_release -- puts the bucket back into the heap
+ */
+void
+zoneset_bucket_release(struct bucket *b)
+{
+	bucket_release(b);
+}
+
+/*
+ * heap_get_run_lock -- returns the lock associated with memory block
+ */
+pthread_mutex_t *
+heap_get_run_lock(struct palloc_heap *heap, uint32_t chunk_id)
+{
+	return &heap->rt->run_locks[chunk_id % heap->rt->nlocks];
+}
+
+/*
+ * heap_max_zone -- (internal) calculates how many zones the heap can fit
+ */
+static unsigned
+heap_max_zone(size_t size)
+{
+	unsigned max_zone = 0;
+
+	size -= sizeof(struct heap_header);
+
+	while (size >= ZONE_MIN_SIZE) {
+		max_zone++;
+		size -= size <= ZONE_MAX_SIZE ? size : ZONE_MAX_SIZE;
+	}
+
+	return max_zone;
+}
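+
+/*
+ * Worked example for heap_max_zone() (illustrative): with 256 KiB chunks and
+ * MAX_CHUNK chunks per zone, ZONE_MAX_SIZE is roughly 16 GiB, so a 20 GiB
+ * heap yields two zones -- one full-sized zone and a trailing partial zone
+ * of about 4 GiB covering whatever remains after the heap header.
+ */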
+
+/*
+ * zone_calc_size_idx -- (internal) calculates zone size index
+ */
+static uint32_t
+zone_calc_size_idx(uint32_t zone_id, unsigned max_zone, size_t heap_size)
+{
+	ASSERT(max_zone > 0);
+	if (zone_id < max_zone - 1)
+		return MAX_CHUNK;
+
+	ASSERT(heap_size >= zone_id * ZONE_MAX_SIZE);
+	size_t zone_raw_size = heap_size - zone_id * ZONE_MAX_SIZE;
+
+	ASSERT(zone_raw_size >= (sizeof(struct zone_header) +
+			sizeof(struct chunk_header) * MAX_CHUNK) +
+			sizeof(struct heap_header));
+	zone_raw_size -= sizeof(struct zone_header) +
+		sizeof(struct chunk_header) * MAX_CHUNK +
+		sizeof(struct heap_header);
+
+	size_t zone_size_idx = zone_raw_size / CHUNKSIZE;
+
+	ASSERT(zone_size_idx <= UINT32_MAX);
+
+	return (uint32_t)zone_size_idx;
+}
+
+/*
+ * heap_zone_init -- (internal) writes zone's first chunk and header
+ */
+static void
+heap_zone_init(struct palloc_heap *heap, uint32_t zone_id,
+	uint32_t first_chunk_id)
+{
+	struct zone *z = ZID_TO_ZONE(heap->layout, zone_id);
+	uint32_t size_idx = zone_calc_size_idx(zone_id, heap->rt->nzones,
+			*heap->sizep);
+
+	ASSERT(size_idx > first_chunk_id);
+	memblock_huge_init(heap, first_chunk_id, zone_id,
+		size_idx - first_chunk_id);
+
+	struct zone_header nhdr = {
+		.size_idx = size_idx,
+		.magic = ZONE_HEADER_MAGIC,
+	};
+
+	z->header = nhdr; /* write the entire header (8 bytes) at once */
+	mo_wal_persist(&heap->p_ops, &z->header, sizeof(z->header));
+}
+
+/*
+ * heap_get_adjacent_free_block -- locates adjacent free memory block in heap
+ */
+static int
+heap_get_adjacent_free_block(struct palloc_heap *heap,
+	const struct memory_block *in, struct memory_block *out, int prev)
+{
+	struct zone *z = ZID_TO_ZONE(heap->layout, in->zone_id);
+	struct chunk_header *hdr = &z->chunk_headers[in->chunk_id];
+
+	out->zone_id = in->zone_id;
+
+	if (prev) {
+		if (in->chunk_id == 0)
+			return ENOENT;
+
+		struct chunk_header *prev_hdr =
+			&z->chunk_headers[in->chunk_id - 1];
+		out->chunk_id = in->chunk_id - prev_hdr->size_idx;
+
+		if (z->chunk_headers[out->chunk_id].type != CHUNK_TYPE_FREE)
+			return ENOENT;
+
+		out->size_idx = z->chunk_headers[out->chunk_id].size_idx;
+	} else { /* next */
+		if (in->chunk_id + hdr->size_idx == z->header.size_idx)
+			return ENOENT;
+
+		out->chunk_id = in->chunk_id + hdr->size_idx;
+
+		if (z->chunk_headers[out->chunk_id].type != CHUNK_TYPE_FREE)
+			return ENOENT;
+
+		out->size_idx = z->chunk_headers[out->chunk_id].size_idx;
+	}
+	memblock_rebuild_state(heap, out);
+
+	return 0;
+}
+
+/*
+ * heap_coalesce -- (internal) merges adjacent memory blocks
+ */
+static struct memory_block
+heap_coalesce(struct palloc_heap *heap,
+	const struct memory_block *blocks[], int n)
+{
+	struct memory_block ret = MEMORY_BLOCK_NONE;
+
+	const struct memory_block *b = NULL;
+
+	ret.size_idx = 0;
+	for (int i = 0; i < n; ++i) {
+		if (blocks[i] == NULL)
+			continue;
+		b = b ? b : blocks[i];
+		ret.size_idx += blocks[i]->size_idx;
+	}
+
+	ASSERTne(b, NULL);
+
+	ret.chunk_id = b->chunk_id;
+	ret.zone_id = b->zone_id;
+	ret.block_off = b->block_off;
+	memblock_rebuild_state(heap, &ret);
+
+	return ret;
+}
+
+/*
+ * heap_coalesce_huge -- finds neighbors of a huge block, removes them from the
+ *	volatile state and returns the resulting block
+ */
+static struct memory_block
+heap_coalesce_huge(struct palloc_heap *heap, struct bucket *b,
+	const struct memory_block *m)
+{
+	const struct memory_block *blocks[3] = {NULL, m, NULL};
+
+	struct memory_block prev = MEMORY_BLOCK_NONE;
+
+	if (heap_get_adjacent_free_block(heap, m, &prev, 1) == 0 &&
+		bucket_remove_block(b, &prev) == 0) {
+		blocks[0] = &prev;
+	}
+
+	struct memory_block next = MEMORY_BLOCK_NONE;
+
+	if (heap_get_adjacent_free_block(heap, m, &next, 0) == 0 &&
+		bucket_remove_block(b, &next) == 0) {
+		blocks[2] = &next;
+	}
+
+	return heap_coalesce(heap, blocks, 3);
+}
+
+/*
+ * heap_free_chunk_reuse -- reuses existing free chunk
+ */
+int
+heap_free_chunk_reuse(struct palloc_heap *heap,
+	struct bucket *bucket,
+	struct memory_block *m)
+{
+	/*
+	 * Perform coalescing just in case there
+	 * are any neighboring free chunks.
+	 */
+	struct memory_block nm = heap_coalesce_huge(heap, bucket, m);
+
+	if (nm.size_idx != m->size_idx)
+		m->m_ops->prep_hdr(&nm, MEMBLOCK_FREE, NULL);
+
+	*m = nm;
+
+	return bucket_insert_block(bucket, m);
+}
+
+/*
+ * heap_run_into_free_chunk -- (internal) creates a new free chunk in place of
+ *	a run.
+ */
+static void
+heap_run_into_free_chunk(struct palloc_heap *heap,
+	struct bucket *bucket,
+	struct memory_block *m)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(heap, m);
+
+	m->block_off = 0;
+	m->size_idx = hdr->size_idx;
+
+	STATS_SUB(heap->stats, transient, heap_run_active,
+		m->size_idx * CHUNKSIZE);
+
+	/*
+	 * The only thing this could race with is heap_memblock_on_free()
+	 * because that function is called after processing the operation,
+	 * which means that a different thread might immediately call this
+	 * function if the free() made the run empty.
+	 * We could forgo this lock if it weren't for helgrind which needs it
+	 * to establish happens-before relation for the chunk metadata.
+	 */
+	pthread_mutex_t *lock = m->m_ops->get_lock(m);
+
+	util_mutex_lock(lock);
+
+	*m = memblock_huge_init(heap, m->chunk_id, m->zone_id, m->size_idx);
+
+	heap_free_chunk_reuse(heap, bucket, m);
+
+	util_mutex_unlock(lock);
+}
+
+/*
+ * heap_reclaim_run -- checks the run for available memory if unclaimed.
+ *
+ * Returns 1 if the run is entirely free and its chunk can be reclaimed,
+ * 0 otherwise.
+ */
+static int
+heap_reclaim_run(struct palloc_heap *heap, struct memory_block *m, int startup)
+{
+	struct chunk_run    *run  = heap_get_chunk_run(heap, m);
+	struct chunk_header *hdr = heap_get_chunk_hdr(heap, m);
+	struct zoneset      *zset = heap_get_zoneset(heap, m->zone_id);
+
+	struct alloc_class *c = alloc_class_by_run(
+		heap->rt->alloc_classes,
+		run->hdr.block_size, hdr->flags, m->size_idx);
+
+	struct recycler_element e = recycler_element_new(heap, m);
+
+	if (c == NULL) {
+		uint32_t size_idx = m->size_idx;
+		struct run_bitmap b;
+
+		m->m_ops->get_bitmap(m, &b);
+
+		ASSERTeq(size_idx, m->size_idx);
+
+		return e.free_space == b.nbits;
+	}
+
+	if (e.free_space == c->rdsc.nallocs)
+		return 1;
+
+	if (startup) {
+		STATS_INC(heap->stats, transient, heap_run_active,
+			m->size_idx * CHUNKSIZE);
+		STATS_INC(heap->stats, transient, heap_run_allocated,
+			(c->rdsc.nallocs - e.free_space) * run->hdr.block_size);
+	}
+	struct recycler *recycler = heap_get_recycler(heap, zset, c->id, c->rdsc.nallocs);
+
+	if (recycler == NULL || recycler_put(recycler, e) < 0)
+		ERR("lost runtime tracking info of %u run due to OOM", c->id);
+
+	return 0;
+}
+
+/*
+ * heap_reclaim_zone_garbage -- (internal) creates volatile state of unused runs
+ */
+static void
+heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket,
+	uint32_t zone_id)
+{
+	struct zone *z = ZID_TO_ZONE(heap->layout, zone_id);
+
+	for (uint32_t i = 0; i < z->header.size_idx; ) {
+		struct chunk_header *hdr = &z->chunk_headers[i];
+
+		ASSERT(hdr->size_idx != 0);
+
+		struct memory_block m = MEMORY_BLOCK_NONE;
+
+		m.zone_id = zone_id;
+		m.chunk_id = i;
+		m.size_idx = hdr->size_idx;
+
+		memblock_rebuild_state(heap, &m);
+		m.m_ops->reinit_chunk(&m);
+
+		switch (hdr->type) {
+		case CHUNK_TYPE_RUN:
+			if (heap_reclaim_run(heap, &m, 1) != 0)
+				heap_run_into_free_chunk(heap, bucket, &m);
+			break;
+		case CHUNK_TYPE_FREE:
+			heap_free_chunk_reuse(heap, bucket, &m);
+			break;
+		case CHUNK_TYPE_USED:
+			break;
+		default:
+			ASSERT(0);
+		}
+
+		i = m.chunk_id + m.size_idx; /* hdr might have changed */
+	}
+}
+
+/*
+ * heap_populate_bucket -- (internal) creates volatile state of memory blocks
+ */
+static int
+heap_populate_bucket(struct palloc_heap *heap, struct bucket *bucket)
+{
+	struct heap_rt *h = heap->rt;
+
+	/* at this point we are sure that there's no more memory in the heap */
+	if (h->zones_exhausted == h->nzones)
+		return ENOMEM;
+
+	uint32_t zone_id = h->zones_exhausted++;
+	struct zone *z = ZID_TO_ZONE(heap->layout, zone_id);
+
+	/* ignore zone and chunk headers */
+	VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(z, sizeof(z->header) +
+		sizeof(z->chunk_headers));
+
+	if (z->header.magic != ZONE_HEADER_MAGIC)
+		heap_zone_init(heap, zone_id, 0);
+
+	heap_reclaim_zone_garbage(heap, bucket, zone_id);
+
+	/*
+	 * It doesn't matter that this function might not have found any
+	 * free blocks because there is still potential that subsequent calls
+	 * will find something in later zones.
+	 */
+	return 0;
+}
+
+/*
+ * heap_recycle_unused -- recalculate scores in the recycler and turn any
+ *	empty runs into free chunks
+ *
+ * If force is not set, this function might effectively be a no-op if not
+ * enough space was freed.
+ */
+static int
+heap_recycle_unused(struct palloc_heap *heap, struct recycler *recycler,
+	struct bucket *defb, int force)
+{
+	struct zoneset      *zset;
+	struct memory_block *nm;
+	struct empty_runs    r = recycler_recalc(recycler, force);
+	struct bucket       *nb;
+
+	if (VEC_SIZE(&r) == 0)
+		return ENOMEM;
+
+	zset = recycler_get_zoneset(recycler);
+	D_ASSERT(zset != NULL);
+
+	nb = defb == NULL ? zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID) : NULL;
+
+	ASSERT(defb != NULL || nb != NULL);
+
+	VEC_FOREACH_BY_PTR(nm, &r) {
+		heap_run_into_free_chunk(heap, defb ? defb : nb, nm);
+	}
+
+	if (nb != NULL)
+		zoneset_bucket_release(nb);
+
+	VEC_DELETE(&r);
+
+	return 0;
+}
+
+/*
+ * heap_reclaim_garbage -- (internal) creates volatile state of unused runs
+ */
+static int
+heap_reclaim_garbage(struct palloc_heap *heap, struct bucket *bucket)
+{
+	int              ret = ENOMEM;
+	struct recycler *r;
+	struct zoneset  *zset = bucket_get_zoneset(bucket);
+
+	for (size_t i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		r = zset->recyclers[i];
+		if (r == NULL)
+			continue;
+
+		if (heap_recycle_unused(heap, r, bucket, 1) == 0)
+			ret = 0;
+	}
+
+	return ret;
+}
+
+/*
+ * heap_ensure_huge_bucket_filled --
+ *	(internal) refills the default bucket if needed
+ */
+static int
+heap_ensure_huge_bucket_filled(struct palloc_heap *heap,
+	struct bucket *bucket)
+{
+	if (heap_reclaim_garbage(heap, bucket) == 0)
+		return 0;
+
+	if (heap_populate_bucket(heap, bucket) == 0)
+		return 0;
+
+#if	0	/*REVISIT: heap extend not supported*/
+	int extend;
+
+	extend = heap_extend(heap, bucket, heap->growsize);
+	if (extend < 0)
+		return ENOMEM;
+
+	if (extend == 1)
+		return 0;
+#endif
+
+	/*
+	 * Extending the pool does not automatically add the chunks into the
+	 * runtime state of the bucket - we need to traverse the new zone if
+	 * it was created.
+	 */
+	if (heap_populate_bucket(heap, bucket) == 0)
+		return 0;
+
+	return ENOMEM;
+}
+
+/*
+ * heap_discard_run -- puts the memory block back into the global heap.
+ */
+void
+heap_discard_run(struct palloc_heap *heap, struct memory_block *m)
+{
+	struct zoneset *zset = heap_get_zoneset(heap, m->zone_id);
+
+	D_ASSERT(zset != NULL);
+	if (heap_reclaim_run(heap, m, 0)) {
+		struct bucket *b = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID);
+
+		heap_run_into_free_chunk(heap, b, m);
+
+		zoneset_bucket_release(b);
+	}
+}
+
+/*
+ * heap_detach_and_try_discard_run -- detaches the active run from a bucket
+ *	tries to discard the run if it is completely empty (has no allocations)
+ */
+static int
+heap_detach_and_try_discard_run(struct palloc_heap *heap, struct bucket *b)
+{
+	int empty = 0;
+	struct memory_block m;
+
+	if (bucket_detach_run(b, &m, &empty) != 0)
+		return -1;
+
+	if (empty)
+		heap_discard_run(heap, &m);
+
+	return 0;
+}
+
+/*
+ * heap_reuse_from_recycler -- (internal) tries to reuse runs that are
+ *	currently in the recycler
+ */
+static int
+heap_reuse_from_recycler(struct palloc_heap *heap,
+	struct bucket *b, uint32_t units, int force)
+{
+	struct zoneset     *zset = bucket_get_zoneset(b);
+	struct memory_block m = MEMORY_BLOCK_NONE;
+
+	m.size_idx = units;
+
+	struct alloc_class *aclass = bucket_alloc_class(b);
+
+	struct recycler *recycler = heap_get_recycler(heap, zset, aclass->id, aclass->rdsc.nallocs);
+
+	if (recycler == NULL) {
+		ERR("lost runtime tracking info of %u run due to OOM",
+			aclass->id);
+		return 0;
+	}
+
+	if (!force && recycler_get(recycler, &m) == 0)
+		return bucket_attach_run(b, &m);
+
+	heap_recycle_unused(heap, recycler, NULL, force);
+
+	if (recycler_get(recycler, &m) == 0)
+		return bucket_attach_run(b, &m);
+
+	return ENOMEM;
+}
+
+/*
+ * heap_run_create -- (internal) initializes a new run on an existing free chunk
+ */
+static int
+heap_run_create(struct palloc_heap *heap, struct bucket *b,
+	struct memory_block *m)
+{
+	struct alloc_class *aclass = bucket_alloc_class(b);
+	*m = memblock_run_init(heap, m->chunk_id, m->zone_id, &aclass->rdsc);
+
+	bucket_attach_run(b, m);
+
+	STATS_INC(heap->stats, transient, heap_run_active,
+		m->size_idx * CHUNKSIZE);
+
+	return 0;
+}
+
+/*
+ * heap_ensure_run_bucket_filled -- (internal) refills the bucket if needed
+ */
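+/*
+ * Refill strategy, as implemented below: first try runs already sitting in
+ * the recycler; failing that, populate volatile state from the next
+ * unprocessed zone and retry the recycler; then carve a brand new run out of
+ * a free huge chunk; finally fall back to the recycler one last time.
+ */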
+static int
+heap_ensure_run_bucket_filled(struct palloc_heap *heap, struct bucket *b,
+	uint32_t units)
+{
+	int ret = 0;
+	struct alloc_class *aclass = bucket_alloc_class(b);
+	struct zoneset     *zset   = bucket_get_zoneset(b);
+
+	D_ASSERT(zset != NULL);
+	ASSERTeq(aclass->type, CLASS_RUN);
+
+	if (heap_detach_and_try_discard_run(heap, b) != 0)
+		return ENOMEM;
+
+	if (heap_reuse_from_recycler(heap, b, units, 0) == 0)
+		goto out;
+
+	/* search in the next zone before attempting to create a new run */
+	struct bucket *defb = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID);
+
+	heap_populate_bucket(heap, defb);
+	zoneset_bucket_release(defb);
+
+	if (heap_reuse_from_recycler(heap, b, units, 0) == 0)
+		goto out;
+
+	struct memory_block m = MEMORY_BLOCK_NONE;
+
+	m.size_idx = aclass->rdsc.size_idx;
+
+	defb = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID);
+
+	/* cannot reuse an existing run, create a new one */
+	if (heap_get_bestfit_block(heap, defb, &m) == 0) {
+		ASSERTeq(m.block_off, 0);
+		if (heap_run_create(heap, b, &m) != 0) {
+			zoneset_bucket_release(defb);
+			return ENOMEM;
+		}
+
+		zoneset_bucket_release(defb);
+
+		goto out;
+	}
+	zoneset_bucket_release(defb);
+
+	if (heap_reuse_from_recycler(heap, b, units, 0) == 0)
+		goto out;
+
+	ret = ENOMEM;
+out:
+
+	return ret;
+}
+
+/*
+ * heap_memblock_on_free -- bookkeeping actions executed at every free of a
+ *	block
+ */
+void
+heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m)
+{
+	struct zoneset *zset = heap_get_zoneset(heap, m->zone_id);
+
+	if (m->type != MEMORY_BLOCK_RUN)
+		return;
+
+	struct chunk_header *hdr = heap_get_chunk_hdr(heap, m);
+	struct chunk_run *run = heap_get_chunk_run(heap, m);
+
+	ASSERTeq(hdr->type, CHUNK_TYPE_RUN);
+
+	struct alloc_class *c = alloc_class_by_run(
+		heap->rt->alloc_classes,
+		run->hdr.block_size, hdr->flags, hdr->size_idx);
+
+	if (c == NULL)
+		return;
+
+	struct recycler *recycler = heap_get_recycler(heap, zset, c->id, c->rdsc.nallocs);
+
+	if (recycler == NULL) {
+		ERR("lost runtime tracking info of %u run due to OOM",
+			c->id);
+	} else {
+		recycler_inc_unaccounted(recycler, m);
+	}
+}
+
+/*
+ * heap_split_block -- (internal) splits unused part of the memory block
+ */
+static void
+heap_split_block(struct palloc_heap *heap, struct bucket *b,
+		struct memory_block *m, uint32_t units)
+{
+	struct alloc_class *aclass = bucket_alloc_class(b);
+
+	ASSERT(units <= UINT16_MAX);
+	ASSERT(units > 0);
+
+	if (aclass->type == CLASS_RUN) {
+		ASSERT((uint64_t)m->block_off + (uint64_t)units <= UINT32_MAX);
+		struct memory_block r = {m->chunk_id, m->zone_id,
+			m->size_idx - units, (uint32_t)(m->block_off + units),
+			NULL, NULL, 0, 0, NULL};
+		memblock_rebuild_state(heap, &r);
+		if (bucket_insert_block(b, &r) != 0)
+			D_CRIT("failed to allocate memory block runtime tracking info\n");
+	} else {
+		uint32_t new_chunk_id = m->chunk_id + units;
+		uint32_t new_size_idx = m->size_idx - units;
+
+		struct memory_block n = memblock_huge_init(heap,
+			new_chunk_id, m->zone_id, new_size_idx);
+
+		*m = memblock_huge_init(heap, m->chunk_id, m->zone_id, units);
+
+		if (bucket_insert_block(b, &n) != 0)
+			D_CRIT("failed to allocate memory block runtime tracking info\n");
+	}
+
+	m->size_idx = units;
+}
+
+/*
+ * heap_get_bestfit_block --
+ *	extracts a memory block of the requested size index, splitting a
+ *	larger block if necessary
+ */
+int
+heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b,
+	struct memory_block *m)
+{
+	struct alloc_class *aclass = bucket_alloc_class(b);
+	uint32_t units = m->size_idx;
+
+	while (bucket_alloc_block(b, m) != 0) {
+		if (aclass->type == CLASS_HUGE) {
+			if (heap_ensure_huge_bucket_filled(heap, b) != 0)
+				return ENOMEM;
+		} else {
+			if (heap_ensure_run_bucket_filled(heap, b, units) != 0)
+				return ENOMEM;
+		}
+	}
+
+	ASSERT(m->size_idx >= units);
+
+	if (units != m->size_idx)
+		heap_split_block(heap, b, m, units);
+
+	m->m_ops->ensure_header_type(m, aclass->header_type);
+	m->header_type = aclass->header_type;
+
+	return 0;
+}
+
+/*
+ * heap_end -- returns first address after heap
+ */
+void *
+heap_end(struct palloc_heap *h)
+{
+	ASSERT(h->rt->nzones > 0);
+
+	struct zone *last_zone = ZID_TO_ZONE(h->layout, h->rt->nzones - 1);
+
+	return &last_zone->chunks[last_zone->header.size_idx];
+}
+
+/*
+ * heap_default_zoneset_init -- (internal) initializes the default zoneset
+ */
+static int
+heap_default_zoneset_init(struct palloc_heap *heap)
+{
+	struct heap_rt     *h = heap->rt;
+	struct zoneset     *default_zset;
+	struct alloc_class *c;
+	uint8_t             i;
+
+	D_ALLOC_PTR(default_zset);
+	if (default_zset == NULL)
+		return -1;
+
+	for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		c = alloc_class_by_id(h->alloc_classes, i);
+
+		if (c == NULL)
+			continue;
+
+		default_zset->buckets[c->id] =
+		    bucket_locked_new(container_new_seglists(heap), c, default_zset);
+		if (default_zset->buckets[c->id] == NULL)
+			goto error_bucket_create;
+	}
+
+	default_zset->default_bucket = bucket_locked_new(
+	    container_new_ravl(heap), alloc_class_by_id(h->alloc_classes, DEFAULT_ALLOC_CLASS_ID),
+	    default_zset);
+
+	if (default_zset->default_bucket == NULL)
+		goto error_bucket_create;
+
+	heap->rt->default_zset = default_zset;
+	return 0;
+
+error_bucket_create:
+	for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		c = alloc_class_by_id(h->alloc_classes, i);
+		if (c != NULL) {
+			if (default_zset->buckets[c->id] != NULL)
+				bucket_locked_delete(default_zset->buckets[c->id]);
+		}
+	}
+	D_FREE(default_zset);
+	return -1;
+}
+
+static void
+heap_default_zoneset_cleanup(struct palloc_heap *heap)
+{
+	struct zoneset  *default_zset = heap->rt->default_zset;
+	uint8_t          i;
+
+	for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		if (default_zset->buckets[i] == NULL)
+			continue;
+		bucket_locked_delete(default_zset->buckets[i]);
+	}
+	bucket_locked_delete(default_zset->default_bucket);
+
+	for (i = 0; i < MAX_ALLOCATION_CLASSES; ++i) {
+		if (default_zset->recyclers[i] == NULL)
+			continue;
+		recycler_delete(default_zset->recyclers[i]);
+	}
+	D_FREE(default_zset);
+	heap->rt->default_zset = NULL;
+}
+
+/*
+ * heap_create_alloc_class_buckets -- allocates all cache bucket
+ * instances of the specified type
+ */
+int
+heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c)
+{
+	struct zoneset *default_zset = heap->rt->default_zset;
+
+	if (default_zset->buckets[c->id] == NULL) {
+		default_zset->buckets[c->id] =
+		    bucket_locked_new(container_new_seglists(heap), c, default_zset);
+		if (default_zset->buckets[c->id] == NULL)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * heap_zone_update_if_needed -- updates the zone metadata if the pool has been
+ *	extended.
+ */
+static void
+heap_zone_update_if_needed(struct palloc_heap *heap)
+{
+	struct zone *z;
+
+	for (uint32_t i = 0; i < heap->rt->nzones; ++i) {
+		z = ZID_TO_ZONE(heap->layout, i);
+		if (z->header.magic != ZONE_HEADER_MAGIC)
+			continue;
+
+		size_t size_idx = zone_calc_size_idx(i, heap->rt->nzones,
+			*heap->sizep);
+
+		if (size_idx == z->header.size_idx)
+			continue;
+
+		heap_zone_init(heap, i, z->header.size_idx);
+	}
+}
+
+/*
+ * heap_boot -- opens the heap region of the dav_obj pool
+ *
+ * If successful, returns zero. Otherwise, an error number is returned.
+ */
+int
+heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size,
+	  uint64_t *sizep, void *base, struct mo_ops *p_ops,
+	  struct stats *stats, struct pool_set *set)
+{
+	struct heap_rt *h;
+	int err;
+
+	/*
+	 * The size can be 0 if interrupted during heap_init or this is the
+	 * first time booting the heap with the persistent size field.
+	 */
+	if (*sizep == 0) {
+		*sizep = heap_size;
+
+		mo_wal_persist(p_ops, sizep, sizeof(*sizep));
+	}
+
+	if (heap_size < *sizep) {
+		ERR("mapped region smaller than the heap size");
+		return EINVAL;
+	}
+
+	D_ALLOC_PTR_NZ(h);
+	if (h == NULL) {
+		err = ENOMEM;
+		goto error_heap_malloc;
+	}
+
+	h->alloc_classes = alloc_class_collection_new();
+	if (h->alloc_classes == NULL) {
+		err = ENOMEM;
+		goto error_alloc_classes_new;
+	}
+
+	h->nzones = heap_max_zone(heap_size);
+
+	h->zones_exhausted = 0;
+
+	h->nlocks = On_valgrind ? MAX_RUN_LOCKS_VG : MAX_RUN_LOCKS;
+	for (unsigned i = 0; i < h->nlocks; ++i)
+		util_mutex_init(&h->run_locks[i]);
+
+	heap->p_ops = *p_ops;
+	heap->layout = heap_start;
+	heap->rt = h;
+	heap->sizep = sizep;
+	heap->base = base;
+	heap->stats = stats;
+	heap->set = set;
+	heap->growsize = HEAP_DEFAULT_GROW_SIZE;
+	heap->alloc_pattern = PALLOC_CTL_DEBUG_NO_PATTERN;
+	VALGRIND_DO_CREATE_MEMPOOL(heap->layout, 0, 0);
+
+	if (heap_default_zoneset_init(heap) != 0) {
+		err = ENOMEM;
+		goto error_zoneset_init;
+	}
+
+	heap_zone_update_if_needed(heap);
+
+	return 0;
+
+error_zoneset_init:
+	alloc_class_collection_delete(h->alloc_classes);
+error_alloc_classes_new:
+	D_FREE(h);
+	heap->rt = NULL;
+error_heap_malloc:
+	return err;
+}
+
+/*
+ * heap_write_header -- (internal) creates a clean header
+ */
+static void
+heap_write_header(struct heap_header *hdr)
+{
+	struct heap_header newhdr = {
+		.signature = HEAP_SIGNATURE,
+		.major = HEAP_MAJOR,
+		.minor = HEAP_MINOR,
+		.unused = 0,
+		.chunksize = CHUNKSIZE,
+		.chunks_per_zone = MAX_CHUNK,
+		.reserved = {0},
+		.checksum = 0
+	};
+
+	util_checksum(&newhdr, sizeof(newhdr), &newhdr.checksum, 1, 0);
+	*hdr = newhdr;
+}
+
+/*
+ * heap_init -- initializes the heap
+ *
+ * If successful, returns zero. Otherwise, an error number is returned.
+ */
+int
+heap_init(void *heap_start, uint64_t heap_size, uint64_t *sizep,
+	  struct mo_ops *p_ops)
+{
+	if (heap_size < HEAP_MIN_SIZE)
+		return EINVAL;
+
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(heap_start, heap_size);
+
+	struct heap_layout *layout = heap_start;
+
+	heap_write_header(&layout->header);
+	mo_wal_persist(p_ops, &layout->header, sizeof(struct heap_header));
+
+	unsigned zones = heap_max_zone(heap_size);
+
+	for (unsigned i = 0; i < zones; ++i) {
+		struct zone *zone = ZID_TO_ZONE(layout, i);
+
+		mo_wal_memset(p_ops, &zone->header, 0,
+			      sizeof(struct zone_header), 0);
+		mo_wal_memset(p_ops, &zone->chunk_headers, 0,
+			      sizeof(struct chunk_header), 0);
+
+		/* only explicitly allocated chunks should be accessible */
+		VALGRIND_DO_MAKE_MEM_NOACCESS(&zone->chunk_headers,
+			sizeof(struct chunk_header));
+	}
+	*sizep = heap_size;
+	mo_wal_persist(p_ops, sizep, sizeof(*sizep));
+
+	return 0;
+}
+
+/*
+ * heap_cleanup -- cleans up the volatile heap state
+ */
+void
+heap_cleanup(struct palloc_heap *heap)
+{
+	struct heap_rt *rt = heap->rt;
+
+	alloc_class_collection_delete(rt->alloc_classes);
+
+	heap_default_zoneset_cleanup(heap);
+
+	for (unsigned i = 0; i < rt->nlocks; ++i)
+		util_mutex_destroy(&rt->run_locks[i]);
+
+	VALGRIND_DO_DESTROY_MEMPOOL(heap->layout);
+
+	D_FREE(rt);
+	heap->rt = NULL;
+}
+
+/*
+ * heap_verify_header -- (internal) verifies if the heap header is consistent
+ */
+static int
+heap_verify_header(struct heap_header *hdr)
+{
+	if (util_checksum(hdr, sizeof(*hdr), &hdr->checksum, 0, 0) != 1) {
+		D_CRIT("heap: invalid header's checksum\n");
+		return -1;
+	}
+
+	if (memcmp(hdr->signature, HEAP_SIGNATURE, HEAP_SIGNATURE_LEN) != 0) {
+		D_CRIT("heap: invalid signature\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * heap_verify_zone_header --
+ *	(internal) verifies if the zone header is consistent
+ */
+static int
+heap_verify_zone_header(struct zone_header *hdr)
+{
+	if (hdr->magic != ZONE_HEADER_MAGIC) /* not initialized */
+		return 0;
+
+	if (hdr->size_idx == 0) {
+		D_CRIT("heap: invalid zone size\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * heap_verify_chunk_header --
+ *	(internal) verifies if the chunk header is consistent
+ */
+static int
+heap_verify_chunk_header(struct chunk_header *hdr)
+{
+	if (hdr->type == CHUNK_TYPE_UNKNOWN) {
+		D_CRIT("heap: invalid chunk type\n");
+		return -1;
+	}
+
+	if (hdr->type >= MAX_CHUNK_TYPE) {
+		D_CRIT("heap: unknown chunk type\n");
+		return -1;
+	}
+
+	if (hdr->flags & ~CHUNK_FLAGS_ALL_VALID) {
+		D_CRIT("heap: invalid chunk flags\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * heap_verify_zone -- (internal) verifies if the zone is consistent
+ */
+static int
+heap_verify_zone(struct zone *zone)
+{
+	if (zone->header.magic == 0)
+		return 0; /* not initialized, and that is OK */
+
+	if (zone->header.magic != ZONE_HEADER_MAGIC) {
+		D_CRIT("heap: invalid zone magic\n");
+		return -1;
+	}
+
+	if (heap_verify_zone_header(&zone->header))
+		return -1;
+
+	uint32_t i;
+
+	for (i = 0; i < zone->header.size_idx; ) {
+		if (heap_verify_chunk_header(&zone->chunk_headers[i]))
+			return -1;
+
+		i += zone->chunk_headers[i].size_idx;
+	}
+
+	if (i != zone->header.size_idx) {
+		D_CRIT("heap: chunk sizes mismatch\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * heap_check -- verifies if the heap is consistent and can be opened properly
+ *
+ * If successful, returns zero. Otherwise, -1 is returned.
+ */
+int
+heap_check(void *heap_start, uint64_t heap_size)
+{
+	if (heap_size < HEAP_MIN_SIZE) {
+		D_CRIT("heap: invalid heap size\n");
+		return -1;
+	}
+
+	struct heap_layout *layout = heap_start;
+
+	if (heap_verify_header(&layout->header))
+		return -1;
+
+	for (unsigned i = 0; i < heap_max_zone(heap_size); ++i) {
+		if (heap_verify_zone(ZID_TO_ZONE(layout, i)))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * heap_check_remote -- verifies if the heap of a remote pool is consistent
+ *	and can be opened properly
+ *
+ * If successful, returns zero. Otherwise, -1 is returned.
+ */
+int
+heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops)
+{
+	struct zone *zone_buff;
+
+	if (heap_size < HEAP_MIN_SIZE) {
+		D_CRIT("heap: invalid heap size\n");
+		return -1;
+	}
+
+	struct heap_layout *layout = heap_start;
+
+	struct heap_header header;
+
+	if (ops->read(ops->ctx, ops->base, &header, &layout->header,
+						sizeof(struct heap_header))) {
+		D_CRIT("heap: obj_read_remote error\n");
+		return -1;
+	}
+
+	if (heap_verify_header(&header))
+		return -1;
+
+	D_ALLOC_PTR_NZ(zone_buff);
+	if (zone_buff == NULL) {
+		D_CRIT("heap: zone_buff malloc error\n");
+		return -1;
+	}
+	for (unsigned i = 0; i < heap_max_zone(heap_size); ++i) {
+		if (ops->read(ops->ctx, ops->base, zone_buff,
+				ZID_TO_ZONE(layout, i), sizeof(struct zone))) {
+			D_CRIT("heap: obj_read_remote error\n");
+			goto out;
+		}
+
+		if (heap_verify_zone(zone_buff))
+			goto out;
+	}
+	D_FREE(zone_buff);
+	return 0;
+
+out:
+	D_FREE(zone_buff);
+	return -1;
+}
+
+/*
+ * heap_zone_foreach_object -- (internal) iterates through objects in a zone
+ */
+static int
+heap_zone_foreach_object(struct palloc_heap *heap, object_callback cb,
+	void *arg, struct memory_block *m)
+{
+	struct zone *zone = ZID_TO_ZONE(heap->layout, m->zone_id);
+
+	if (zone->header.magic == 0)
+		return 0;
+
+	for (; m->chunk_id < zone->header.size_idx; ) {
+		struct chunk_header *hdr = heap_get_chunk_hdr(heap, m);
+
+		memblock_rebuild_state(heap, m);
+		m->size_idx = hdr->size_idx;
+
+		if (m->m_ops->iterate_used(m, cb, arg) != 0)
+			return 1;
+
+		m->chunk_id += m->size_idx;
+		m->block_off = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * heap_foreach_object -- (internal) iterates through objects in the heap
+ */
+void
+heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg,
+	struct memory_block m)
+{
+	for (; m.zone_id < heap->rt->nzones; ++m.zone_id) {
+		if (heap_zone_foreach_object(heap, cb, arg, &m) != 0)
+			break;
+
+		m.chunk_id = 0;
+	}
+}
+
+#if VG_MEMCHECK_ENABLED
+/*
+ * heap_vg_open -- notifies Valgrind about heap layout
+ */
+void
+heap_vg_open(struct palloc_heap *heap, object_callback cb,
+	void *arg, int objects)
+{
+	ASSERTne(cb, NULL);
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(heap->layout, *heap->sizep);
+
+	struct heap_layout *layout = heap->layout;
+
+	VALGRIND_DO_MAKE_MEM_DEFINED(&layout->header, sizeof(layout->header));
+
+	unsigned zones = heap_max_zone(*heap->sizep);
+	struct memory_block m = MEMORY_BLOCK_NONE;
+
+	for (unsigned i = 0; i < zones; ++i) {
+		struct zone *z = ZID_TO_ZONE(layout, i);
+		uint32_t chunks;
+
+		m.zone_id = i;
+		m.chunk_id = 0;
+
+		VALGRIND_DO_MAKE_MEM_DEFINED(&z->header, sizeof(z->header));
+
+		if (z->header.magic != ZONE_HEADER_MAGIC)
+			continue;
+
+		chunks = z->header.size_idx;
+
+		for (uint32_t c = 0; c < chunks; ) {
+			struct chunk_header *hdr = &z->chunk_headers[c];
+
+			/* define the header before rebuilding state */
+			VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr));
+
+			m.chunk_id = c;
+			m.size_idx = hdr->size_idx;
+
+			memblock_rebuild_state(heap, &m);
+
+			m.m_ops->vg_init(&m, objects, cb, arg);
+			m.block_off = 0;
+
+			ASSERT(hdr->size_idx > 0);
+
+			c += hdr->size_idx;
+		}
+
+		/* mark all unused chunk headers after last as not accessible */
+		VALGRIND_DO_MAKE_MEM_NOACCESS(&z->chunk_headers[chunks],
+			(MAX_CHUNK - chunks) * sizeof(struct chunk_header));
+	}
+}
+#endif
diff --git a/src/common/dav_v2/heap.h b/src/common/dav_v2/heap.h
new file mode 100644
index 00000000000..21f6d0dfd0b
--- /dev/null
+++ b/src/common/dav_v2/heap.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * heap.h -- internal definitions for heap
+ */
+
+#ifndef __DAOS_COMMON_HEAP_H
+#define __DAOS_COMMON_HEAP_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "memblock.h"
+#include "bucket.h"
+#include "memops.h"
+#include "palloc.h"
+#include "dav_internal.h"
+
+#define HEAP_OFF_TO_PTR(heap, off) ((void *)((char *)((heap)->base) + (off)))
+#define HEAP_PTR_TO_OFF(heap, ptr) ((uintptr_t)(ptr) - (uintptr_t)((heap)->base))
+
+#define BIT_IS_CLR(a, i)           (!((a) & (1ULL << (i))))
+#define HEAP_ARENA_PER_THREAD      (0)
+
+int
+heap_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, uint64_t *sizep,
+	  void *base, struct mo_ops *p_ops, struct stats *stats, struct pool_set *set);
+int
+heap_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops);
+void
+heap_cleanup(struct palloc_heap *heap);
+int
+heap_check(void *heap_start, uint64_t heap_size);
+int
+heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops);
+int
+heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c);
+int
+heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size);
+
+struct alloc_class *
+heap_get_best_class(struct palloc_heap *heap, size_t size);
+
+struct bucket *
+zoneset_bucket_acquire(struct zoneset *zset, uint8_t class_id);
+void
+zoneset_bucket_release(struct bucket *b);
+
+int
+heap_get_bestfit_block(struct palloc_heap *heap, struct bucket *b, struct memory_block *m);
+pthread_mutex_t *
+heap_get_run_lock(struct palloc_heap *heap, uint32_t chunk_id);
+
+void
+heap_discard_run(struct palloc_heap *heap, struct memory_block *m);
+
+void
+heap_memblock_on_free(struct palloc_heap *heap, const struct memory_block *m);
+
+int
+heap_free_chunk_reuse(struct palloc_heap *heap, struct bucket *bucket, struct memory_block *m);
+
+void
+heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg,
+		    struct memory_block start);
+
+struct alloc_class_collection *
+heap_alloc_classes(struct palloc_heap *heap);
+
+void *
+heap_end(struct palloc_heap *heap);
+
+void
+heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int objects);
+
+static inline struct chunk_header *
+heap_get_chunk_hdr(struct palloc_heap *heap, const struct memory_block *m)
+{
+	return GET_CHUNK_HDR(heap->layout, m->zone_id, m->chunk_id);
+}
+
+static inline struct chunk *
+heap_get_chunk(struct palloc_heap *heap, const struct memory_block *m)
+{
+	return GET_CHUNK(heap->layout, m->zone_id, m->chunk_id);
+}
+
+static inline struct chunk_run *
+heap_get_chunk_run(struct palloc_heap *heap, const struct memory_block *m)
+{
+	return GET_CHUNK_RUN(heap->layout, m->zone_id, m->chunk_id);
+}
+
+struct zoneset *
+heap_get_zoneset(struct palloc_heap *heap, uint32_t zone_id);
+
+#endif /* __DAOS_COMMON_HEAP_H */
diff --git a/src/common/dav_v2/heap_layout.h b/src/common/dav_v2/heap_layout.h
new file mode 100644
index 00000000000..c7209670103
--- /dev/null
+++ b/src/common/dav_v2/heap_layout.h
@@ -0,0 +1,198 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2020, Intel Corporation */
+
+/*
+ * heap_layout.h -- internal definitions for heap layout
+ */
+
+#ifndef __DAOS_COMMON_HEAP_LAYOUT_H
+#define __DAOS_COMMON_HEAP_LAYOUT_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define HEAP_MAJOR 1
+#define HEAP_MINOR 0
+
+#define MAX_CHUNK (UINT16_MAX - 7) /* has to be multiple of 8 */
+#define CHUNK_BASE_ALIGNMENT 1024
+#define CHUNKSIZE ((size_t)1024 * 256)	/* 256 kilobytes */
+#define MAX_MEMORY_BLOCK_SIZE (MAX_CHUNK * CHUNKSIZE)
+#define HEAP_SIGNATURE_LEN 16
+#define HEAP_SIGNATURE "MEMORY_HEAP_HDR\0"
+#define ZONE_HEADER_MAGIC 0xC3F0A2D2
+#define ZONE_MIN_SIZE (sizeof(struct zone) + sizeof(struct chunk))
+#define ZONE_MAX_SIZE (sizeof(struct zone) + sizeof(struct chunk) * MAX_CHUNK)
+#define HEAP_MIN_SIZE (sizeof(struct heap_layout) + ZONE_MIN_SIZE)
+
+/* Base bitmap values, relevant for both normal and flexible bitmaps */
+#define RUN_BITS_PER_VALUE 64U
+#define RUN_BASE_METADATA_VALUES\
+	((unsigned)(sizeof(struct chunk_run_header) / sizeof(uint64_t)))
+#define RUN_BASE_METADATA_SIZE (sizeof(struct chunk_run_header))
+
+#define RUN_CONTENT_SIZE (CHUNKSIZE - RUN_BASE_METADATA_SIZE)
+
+/*
+ * Calculates the size in bytes of a single run instance, including bitmap
+ */
+#define RUN_CONTENT_SIZE_BYTES(size_idx)\
+(RUN_CONTENT_SIZE + (((size_idx) - 1) * CHUNKSIZE))
+
+/* Default bitmap values, specific for old, non-flexible, bitmaps */
+#define RUN_DEFAULT_METADATA_VALUES 40 /* in 8 byte words, 320 bytes total */
+#define RUN_DEFAULT_BITMAP_VALUES \
+	(RUN_DEFAULT_METADATA_VALUES - RUN_BASE_METADATA_VALUES)
+#define RUN_DEFAULT_BITMAP_SIZE (sizeof(uint64_t) * RUN_DEFAULT_BITMAP_VALUES)
+#define RUN_DEFAULT_BITMAP_NBITS\
+	(RUN_BITS_PER_VALUE * RUN_DEFAULT_BITMAP_VALUES)
+#define RUN_DEFAULT_SIZE \
+	(CHUNKSIZE - RUN_BASE_METADATA_SIZE - RUN_DEFAULT_BITMAP_SIZE)
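+
+/*
+ * With the 256 KiB chunk size above these evaluate to:
+ * RUN_BASE_METADATA_VALUES = 2 (the 16-byte chunk_run_header),
+ * RUN_DEFAULT_BITMAP_VALUES = 38, RUN_DEFAULT_BITMAP_SIZE = 304 bytes,
+ * RUN_DEFAULT_BITMAP_NBITS = 2432 and RUN_DEFAULT_SIZE = 262144 - 16 - 304 =
+ * 261824 bytes of allocatable space in the first chunk of a run.
+ */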
+
+/*
+ * Calculates the size in bytes of a single run instance, without bitmap,
+ * but only for the default fixed-bitmap algorithm
+ */
+#define RUN_DEFAULT_SIZE_BYTES(size_idx)\
+(RUN_DEFAULT_SIZE + (((size_idx) - 1) * CHUNKSIZE))
+
+#define CHUNK_MASK ((CHUNKSIZE) - 1)
+#define CHUNK_ALIGN_UP(value) ((((value) + CHUNK_MASK) & ~CHUNK_MASK))
+
+enum chunk_flags {
+	CHUNK_FLAG_COMPACT_HEADER	=	0x0001,
+	CHUNK_FLAG_HEADER_NONE		=	0x0002,
+	CHUNK_FLAG_ALIGNED		=	0x0004,
+	CHUNK_FLAG_FLEX_BITMAP		=	0x0008,
+};
+
+#define CHUNK_FLAGS_ALL_VALID (\
+	CHUNK_FLAG_COMPACT_HEADER |\
+	CHUNK_FLAG_HEADER_NONE |\
+	CHUNK_FLAG_ALIGNED |\
+	CHUNK_FLAG_FLEX_BITMAP\
+)
+
+enum chunk_type {
+	CHUNK_TYPE_UNKNOWN,
+	CHUNK_TYPE_FOOTER, /* not actual chunk type */
+	CHUNK_TYPE_FREE,
+	CHUNK_TYPE_USED,
+	CHUNK_TYPE_RUN,
+	CHUNK_TYPE_RUN_DATA,
+
+	MAX_CHUNK_TYPE
+};
+
+struct chunk {
+	uint8_t data[CHUNKSIZE];
+};
+
+struct chunk_run_header {
+	uint64_t block_size;
+	uint64_t alignment; /* valid only /w CHUNK_FLAG_ALIGNED */
+};
+
+struct chunk_run {
+	struct chunk_run_header hdr;
+	uint8_t content[RUN_CONTENT_SIZE]; /* bitmap + data */
+};
+
+struct chunk_header {
+	uint16_t type;
+	uint16_t flags;
+	uint32_t size_idx;
+};
+
+struct zone_header {
+	uint32_t magic;
+	uint32_t size_idx;
+	uint8_t reserved[56];
+};
+
+struct zone {
+	struct zone_header header;
+	struct chunk_header chunk_headers[MAX_CHUNK];
+	struct chunk chunks[];
+};
+
+struct heap_header {
+	char signature[HEAP_SIGNATURE_LEN];
+	uint64_t major;
+	uint64_t minor;
+	uint64_t unused; /* might be garbage */
+	uint64_t chunksize;
+	uint64_t chunks_per_zone;
+	uint8_t reserved[960];
+	uint64_t checksum;
+};
+
+struct heap_layout {
+	struct heap_header header;
+	struct zone zone0;	/* first element of zones array */
+};
+
+#define ALLOC_HDR_SIZE_SHIFT (48ULL)
+#define ALLOC_HDR_FLAGS_MASK (((1ULL) << ALLOC_HDR_SIZE_SHIFT) - 1)
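+
+/*
+ * The compact header packs the allocation size and the flags into a single
+ * 64-bit word: the low 48 bits hold the size and the top 16 bits hold the
+ * flags, so ALLOC_HDR_FLAGS_MASK strips the flag bits when the size is read
+ * back.
+ */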
+
+struct allocation_header_legacy {
+	uint8_t unused[8];
+	uint64_t size;
+	uint8_t unused2[32];
+	uint64_t root_size;
+	uint64_t type_num;
+};
+
+#define ALLOC_HDR_COMPACT_SIZE sizeof(struct allocation_header_compact)
+
+struct allocation_header_compact {
+	uint64_t size;
+	uint64_t extra;
+};
+
+enum header_type {
+	HEADER_LEGACY,
+	HEADER_COMPACT,
+	HEADER_NONE,
+
+	MAX_HEADER_TYPES
+};
+
+static const size_t header_type_to_size[MAX_HEADER_TYPES] = {
+	sizeof(struct allocation_header_legacy),
+	sizeof(struct allocation_header_compact),
+	0
+};
+
+static const enum chunk_flags header_type_to_flag[MAX_HEADER_TYPES] = {
+	(enum chunk_flags)0,
+	CHUNK_FLAG_COMPACT_HEADER,
+	CHUNK_FLAG_HEADER_NONE
+};
+
+static inline struct zone *
+ZID_TO_ZONE(struct heap_layout *layout, size_t zone_id)
+{
+	return (struct zone *)
+		((uintptr_t)&layout->zone0 + ZONE_MAX_SIZE * zone_id);
+}
+
+static inline struct chunk_header *
+GET_CHUNK_HDR(struct heap_layout *layout, size_t zone_id, unsigned chunk_id)
+{
+	return &ZID_TO_ZONE(layout, zone_id)->chunk_headers[chunk_id];
+}
+
+static inline struct chunk *
+GET_CHUNK(struct heap_layout *layout, size_t zone_id, unsigned chunk_id)
+{
+	return &ZID_TO_ZONE(layout, zone_id)->chunks[chunk_id];
+}
+
+static inline struct chunk_run *
+GET_CHUNK_RUN(struct heap_layout *layout, size_t zone_id, unsigned chunk_id)
+{
+	return (struct chunk_run *)GET_CHUNK(layout, zone_id, chunk_id);
+}
+
+#endif /* __DAOS_COMMON_HEAP_LAYOUT_H */
diff --git a/src/common/dav_v2/memblock.c b/src/common/dav_v2/memblock.c
new file mode 100644
index 00000000000..cf3204432b1
--- /dev/null
+++ b/src/common/dav_v2/memblock.c
@@ -0,0 +1,1615 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2021, Intel Corporation */
+
+/*
+ * memblock.c -- implementation of memory block
+ *
+ * A memory block is a representation of a persistent object that resides in
+ * the heap. A valid memory block must be either a huge (free or used) chunk
+ * or a block inside a run.
+ *
+ * Huge blocks are 1:1 correlated with the chunk headers in the zone, whereas
+ * run blocks are represented by bits in the corresponding chunk's bitmap.
+ *
+ * This file contains implementations of abstract operations on memory blocks.
+ * Instead of storing the mbops structure inside each memory block the correct
+ * method implementation is chosen at runtime.
+ */
+
+#include <string.h>
+
+#include "obj.h"
+#include "heap.h"
+#include "memblock.h"
+#include "out.h"
+#include "valgrind_internal.h"
+#include "alloc_class.h"
+
+/* calculates the size of the entire run, including any additional chunks */
+#define SIZEOF_RUN(runp, size_idx)\
+	(sizeof(*(runp)) + (((size_idx) - 1) * CHUNKSIZE))
+
+/*
+ * memblock_header_type -- determines the memory block's header type
+ */
+static enum header_type
+memblock_header_type(const struct memory_block *m)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+
+	if (hdr->flags & CHUNK_FLAG_COMPACT_HEADER)
+		return HEADER_COMPACT;
+
+	if (hdr->flags & CHUNK_FLAG_HEADER_NONE)
+		return HEADER_NONE;
+
+	return HEADER_LEGACY;
+}
+
+/*
+ * memblock_header_legacy_get_size --
+ *	(internal) returns the size stored in a legacy header
+ */
+static size_t
+memblock_header_legacy_get_size(const struct memory_block *m)
+{
+	struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m);
+
+	return hdr->size;
+}
+
+/*
+ * memblock_header_compact_get_size --
+ *	(internal) returns the size stored in a compact header
+ */
+static size_t
+memblock_header_compact_get_size(const struct memory_block *m)
+{
+	struct allocation_header_compact *hdr = m->m_ops->get_real_data(m);
+
+	return hdr->size & ALLOC_HDR_FLAGS_MASK;
+}
+
+/*
+ * memblock_header_none_get_size --
+ *	(internal) determines the size of an object without a header
+ */
+static size_t
+memblock_header_none_get_size(const struct memory_block *m)
+{
+	return m->m_ops->block_size(m);
+}
+
+/*
+ * memblock_header_legacy_get_extra --
+ *	(internal) returns the extra field stored in a legacy header
+ */
+static uint64_t
+memblock_header_legacy_get_extra(const struct memory_block *m)
+{
+	struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m);
+
+	return hdr->type_num;
+}
+
+/*
+ * memblock_header_compact_get_extra --
+ *	(internal) returns the extra field stored in a compact header
+ */
+static uint64_t
+memblock_header_compact_get_extra(const struct memory_block *m)
+{
+	struct allocation_header_compact *hdr = m->m_ops->get_real_data(m);
+
+	return hdr->extra;
+}
+
+/*
+ * memblock_header_none_get_extra --
+ *	(internal) objects without a header don't have an extra field
+ */
+static uint64_t
+memblock_header_none_get_extra(const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m);
+
+	return 0;
+}
+
+/*
+ * memblock_header_legacy_get_flags --
+ *	(internal) returns the flags stored in a legacy header
+ */
+static uint16_t
+memblock_header_legacy_get_flags(const struct memory_block *m)
+{
+	struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m);
+
+	return (uint16_t)(hdr->root_size >> ALLOC_HDR_SIZE_SHIFT);
+}
+
+/*
+ * memblock_header_compact_get_flags --
+ *	(internal) returns the flags stored in a compact header
+ */
+static uint16_t
+memblock_header_compact_get_flags(const struct memory_block *m)
+{
+	struct allocation_header_compact *hdr = m->m_ops->get_real_data(m);
+
+	return (uint16_t)(hdr->size >> ALLOC_HDR_SIZE_SHIFT);
+}
+
+/*
+ * memblock_header_none_get_flags --
+ *	(internal) objects without a header do not support flags
+ */
+static uint16_t
+memblock_header_none_get_flags(const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m);
+
+	return 0;
+}
+
+/*
+ * memblock_header_legacy_write --
+ *	(internal) writes a legacy header of an object
+ */
+static void
+memblock_header_legacy_write(const struct memory_block *m,
+	size_t size, uint64_t extra, uint16_t flags)
+{
+	struct allocation_header_legacy hdr;
+
+	hdr.size = size;
+	hdr.type_num = extra;
+	hdr.root_size = ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT);
+
+	struct allocation_header_legacy *hdrp = m->m_ops->get_real_data(m);
+
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp));
+
+	VALGRIND_ADD_TO_TX(hdrp, sizeof(*hdrp));
+	memcpy(hdrp, &hdr, sizeof(hdr)); /* legacy header is 64 bytes in size */
+	VALGRIND_REMOVE_FROM_TX(hdrp, sizeof(*hdrp));
+
+	/* unused fields of the legacy headers are used as a red zone */
+	VALGRIND_DO_MAKE_MEM_NOACCESS(hdrp->unused, sizeof(hdrp->unused));
+}
+
+/*
+ * memblock_header_compact_write --
+ *	(internal) writes a compact header of an object
+ */
+static void
+memblock_header_compact_write(const struct memory_block *m,
+	size_t size, uint64_t extra, uint16_t flags)
+{
+	COMPILE_ERROR_ON(ALLOC_HDR_COMPACT_SIZE > CACHELINE_SIZE);
+
+	struct {
+		struct allocation_header_compact hdr;
+		uint8_t padding[CACHELINE_SIZE - ALLOC_HDR_COMPACT_SIZE];
+	} padded;
+
+	/*
+	 * REVISIT:
+	 * Below memset is added to prevent valgrind propagating the
+	 * cleared V-Bits of the padding field all the way till DMA buffer
+	 * as part of logging by WAL.
+	 * This code needs to be revisited when valgrind macros are
+	 * enabled within DAV.
+	 */
+	memset(&padded, 0, sizeof(padded));
+	padded.hdr.size = size | ((uint64_t)flags << ALLOC_HDR_SIZE_SHIFT);
+	padded.hdr.extra = extra;
+
+	struct allocation_header_compact *hdrp = m->m_ops->get_real_data(m);
+
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdrp, sizeof(*hdrp));
+
+	/*
+	 * If possible write the entire header with a single memcpy, this allows
+	 * the copy implementation to avoid a cache miss on a partial cache line
+	 * write.
+	 */
+	size_t hdr_size = ALLOC_HDR_COMPACT_SIZE;
+
+	if ((uintptr_t)hdrp % CACHELINE_SIZE == 0 && size >= sizeof(padded))
+		hdr_size = sizeof(padded);
+
+	VALGRIND_ADD_TO_TX(hdrp, hdr_size);
+
+	memcpy(hdrp, &padded, hdr_size);
+	VALGRIND_DO_MAKE_MEM_UNDEFINED((char *)hdrp + ALLOC_HDR_COMPACT_SIZE,
+		hdr_size - ALLOC_HDR_COMPACT_SIZE);
+
+	VALGRIND_REMOVE_FROM_TX(hdrp, hdr_size);
+}
+
+/*
+ * memblock_header_none_write --
+ *	(internal) nothing to write
+ */
+static void
+memblock_header_none_write(const struct memory_block *m,
+	size_t size, uint64_t extra, uint16_t flags)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m, size, extra, flags);
+
+	/* NOP */
+}
+
+/*
+ * memblock_header_legacy_invalidate --
+ *	(internal) invalidates a legacy header
+ */
+static void
+memblock_header_legacy_invalidate(const struct memory_block *m)
+{
+	struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m);
+
+	VALGRIND_SET_CLEAN(hdr, sizeof(*hdr));
+}
+
+/*
+ * memblock_header_compact_invalidate --
+ *	(internal) invalidates a compact header
+ */
+static void
+memblock_header_compact_invalidate(const struct memory_block *m)
+{
+	struct allocation_header_compact *hdr = m->m_ops->get_real_data(m);
+
+	VALGRIND_SET_CLEAN(hdr, sizeof(*hdr));
+}
+
+/*
+ * memblock_header_none_invalidate --
+ *	(internal) nothing to invalidate
+ */
+static void
+memblock_header_none_invalidate(const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m);
+
+	/* NOP */
+}
+
+/*
+ * memblock_header_legacy_reinit --
+ *	(internal) reinitializes a legacy header after a heap restart
+ */
+static void
+memblock_header_legacy_reinit(const struct memory_block *m)
+{
+	struct allocation_header_legacy *hdr = m->m_ops->get_real_data(m);
+
+	VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr));
+
+	/* unused fields of the legacy headers are used as a red zone */
+	VALGRIND_DO_MAKE_MEM_NOACCESS(hdr->unused, sizeof(hdr->unused));
+}
+
+/*
+ * memblock_header_compact_reinit --
+ *	(internal) reinitializes a compact header after a heap restart
+ */
+static void
+memblock_header_compact_reinit(const struct memory_block *m)
+{
+	struct allocation_header_compact *hdr = m->m_ops->get_real_data(m);
+
+	VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr));
+}
+
+/*
+ * memblock_header_none_reinit --
+ *	(internal) nothing to reinitialize
+ */
+static void
+memblock_header_none_reinit(const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m);
+
+	/* NOP */
+}
+
+static const struct {
+	/* determines the size of an object */
+	size_t (*get_size)(const struct memory_block *m);
+
+	/* returns the extra field (if available, 0 if not) */
+	uint64_t (*get_extra)(const struct memory_block *m);
+
+	/* returns the flags stored in a header (if available, 0 if not) */
+	uint16_t (*get_flags)(const struct memory_block *m);
+
+	/*
+	 * Stores size, extra info and flags in header of an object
+	 * (if available, does nothing otherwise).
+	 */
+	void (*write)(const struct memory_block *m,
+		size_t size, uint64_t extra, uint16_t flags);
+	void (*invalidate)(const struct memory_block *m);
+
+	/*
+	 * Reinitializes a header after a heap restart (if available, does
+	 * nothing otherwise) (VG).
+	 */
+	void (*reinit)(const struct memory_block *m);
+} memblock_header_ops[MAX_HEADER_TYPES] = {
+	[HEADER_LEGACY] = {
+		memblock_header_legacy_get_size,
+		memblock_header_legacy_get_extra,
+		memblock_header_legacy_get_flags,
+		memblock_header_legacy_write,
+		memblock_header_legacy_invalidate,
+		memblock_header_legacy_reinit,
+	},
+	[HEADER_COMPACT] = {
+		memblock_header_compact_get_size,
+		memblock_header_compact_get_extra,
+		memblock_header_compact_get_flags,
+		memblock_header_compact_write,
+		memblock_header_compact_invalidate,
+		memblock_header_compact_reinit,
+	},
+	[HEADER_NONE] = {
+		memblock_header_none_get_size,
+		memblock_header_none_get_extra,
+		memblock_header_none_get_flags,
+		memblock_header_none_write,
+		memblock_header_none_invalidate,
+		memblock_header_none_reinit,
+	}
+};
+
+/*
+ * memblock_run_default_nallocs -- returns the number of memory blocks
+ *	available in a run with the given parameters using the default
+ *	fixed-bitmap algorithm
+ */
+static unsigned
+memblock_run_default_nallocs(uint32_t *size_idx, uint16_t flags,
+	uint64_t unit_size, uint64_t alignment)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(flags);
+
+	unsigned nallocs = (unsigned)
+		(RUN_DEFAULT_SIZE_BYTES(*size_idx) / unit_size);
+
+	while (nallocs > RUN_DEFAULT_BITMAP_NBITS) {
+		/* trying to create a run with number of units exceeding the bitmap size */
+		DAV_DBG("run:%lu number of units %u exceeds bitmap size (%u)",
+			  unit_size, nallocs, RUN_DEFAULT_BITMAP_NBITS);
+		if (*size_idx > 1) {
+			*size_idx -= 1;
+			/* recalculate the number of allocations */
+			nallocs = (uint32_t)
+				(RUN_DEFAULT_SIZE_BYTES(*size_idx) / unit_size);
+			/* run was constructed with fewer chunks (minus one) */
+			D_INFO("run:%lu constructed with fewer chunks:%u\n",
+				  unit_size, *size_idx);
+		} else {
+			/*
+			 * run was constructed with fewer units than optimal,
+			 * this might lead to inefficient memory utilization!
+			 */
+			D_INFO("run:%lu constructed with fewer units:%u than optimal:%u\n",
+				unit_size, RUN_DEFAULT_BITMAP_NBITS, nallocs);
+
+			nallocs = RUN_DEFAULT_BITMAP_NBITS;
+		}
+	}
+
+	return nallocs - (alignment ? 1 : 0);
+}
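+
+/*
+ * Worked example (illustrative): for a single-chunk run (*size_idx == 1),
+ * RUN_DEFAULT_SIZE_BYTES(1) is 261824 bytes.  With 128-byte units that gives
+ * 2045 blocks, which fits within the 2432-bit default bitmap; with 64-byte
+ * units the raw count would be 4091, so nallocs is clamped to the 2432 bits
+ * available.
+ */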
+
+/*
+ * memblock_run_bitmap -- calculate bitmap parameters for given arguments
+ */
+void
+memblock_run_bitmap(uint32_t *size_idx, uint16_t flags,
+	uint64_t unit_size, uint64_t alignment, void *content,
+	struct run_bitmap *b)
+{
+	ASSERTne(*size_idx, 0);
+
+	/*
+	 * Flexible bitmaps have a variably sized values array. The size varies
+	 * depending on:
+	 *	alignment - initial run alignment might require up to a unit
+	 *	size idx - the larger the run, the more units it carries
+	 *	unit_size - the smaller the unit size, the more units per run
+	 *
+	 * The size of the bitmap also has to be calculated in such a way that
+	 * the beginning of allocations data is cacheline aligned. This is
+	 * required to perform many optimizations throughout the codebase.
+	 * This alignment requirement means that some of the bitmap values might
+	 * remain unused and will serve only as a padding for data.
+	 */
+	if (flags & CHUNK_FLAG_FLEX_BITMAP) {
+		/*
+		 * First calculate the number of values without accounting for
+		 * the bitmap size.
+		 */
+		size_t content_size = RUN_CONTENT_SIZE_BYTES(*size_idx);
+
+		b->nbits = (unsigned)(content_size / unit_size);
+		b->nvalues = util_div_ceil(b->nbits, RUN_BITS_PER_VALUE);
+
+		/*
+		 * Then, align the number of values up, so that the cacheline
+		 * alignment is preserved.
+		 */
+		b->nvalues = ALIGN_UP(b->nvalues + RUN_BASE_METADATA_VALUES,
+			(unsigned)(CACHELINE_SIZE / sizeof(*b->values)))
+			- RUN_BASE_METADATA_VALUES;
+
+		/*
+		 * This is the total number of bytes needed for the bitmap AND
+		 * padding.
+		 */
+		b->size = b->nvalues * sizeof(*b->values);
+
+		/*
+		 * Calculate the number of allocations again, but this time
+		 * accounting for the bitmap/padding.
+		 */
+		b->nbits = (unsigned)((content_size - b->size) / unit_size)
+			- (alignment ? 1U : 0U);
+
+		/*
+		 * The last step is to calculate how much of the padding
+		 * is left at the end of the bitmap.
+		 */
+		unsigned unused_bits = (b->nvalues * RUN_BITS_PER_VALUE)
+			- b->nbits;
+		unsigned unused_values = unused_bits / RUN_BITS_PER_VALUE;
+
+		b->nvalues -= unused_values;
+		b->values = (uint64_t *)content;
+
+		return;
+	}
+
+	b->size = RUN_DEFAULT_BITMAP_SIZE;
+	b->nbits = memblock_run_default_nallocs(size_idx, flags,
+		unit_size, alignment);
+
+	unsigned unused_bits = RUN_DEFAULT_BITMAP_NBITS - b->nbits;
+	unsigned unused_values = unused_bits / RUN_BITS_PER_VALUE;
+
+	b->nvalues = RUN_DEFAULT_BITMAP_VALUES - unused_values;
+	b->values = (uint64_t *)content;
+}
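+
+/*
+ * Illustrative example of the flexible-bitmap sizing above, with hypothetical
+ * values (assume RUN_BITS_PER_VALUE == 64, CACHELINE_SIZE == 64 and
+ * RUN_BASE_METADATA_VALUES == 2): for content_size == 16384, unit_size == 128
+ * and no alignment, the first pass gives nbits == 128 and nvalues == 2;
+ * aligning up yields ALIGN_UP(2 + 2, 8) - 2 == 6 values, i.e. size == 48
+ * bytes.  Recomputing with the bitmap accounted for gives
+ * nbits == (16384 - 48) / 128 == 127, unused_bits == 6 * 64 - 127 == 257,
+ * unused_values == 4, and the final nvalues == 2 (128 bits >= 127 units).
+ */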
+
+/*
+ * run_get_bitmap -- initializes run bitmap information
+ */
+static void
+run_get_bitmap(const struct memory_block *m, struct run_bitmap *b)
+{
+	struct chunk_run *run = heap_get_chunk_run(m->heap, m);
+
+	if (m->cached_bitmap != NULL) {
+		*b = *m->cached_bitmap;
+		b->values = (uint64_t *)run->content;
+	} else {
+		struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+		uint32_t size_idx = hdr->size_idx;
+
+		memblock_run_bitmap(&size_idx, hdr->flags, run->hdr.block_size,
+			run->hdr.alignment, run->content, b);
+		ASSERTeq(size_idx, hdr->size_idx);
+	}
+}
+
+/*
+ * huge_block_size -- returns the compile-time constant which defines the
+ *	huge memory block size.
+ */
+static size_t
+huge_block_size(const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m);
+
+	return CHUNKSIZE;
+}
+
+/*
+ * run_block_size -- looks for the right chunk and returns the block size
+ *	information that is attached to the run block metadata.
+ */
+static size_t
+run_block_size(const struct memory_block *m)
+{
+	struct chunk_run *run = heap_get_chunk_run(m->heap, m);
+
+	return run->hdr.block_size;
+}
+
+/*
+ * huge_get_real_data -- returns pointer to the beginning data of a huge block
+ */
+static void *
+huge_get_real_data(const struct memory_block *m)
+{
+	return heap_get_chunk(m->heap, m)->data;
+}
+
+/*
+ * run_get_data_start -- (internal) returns the pointer to the beginning of
+ *	allocations in a run
+ */
+static char *
+run_get_data_start(const struct memory_block *m)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+	struct chunk_run *run = heap_get_chunk_run(m->heap, m);
+	struct run_bitmap b;
+
+	run_get_bitmap(m, &b);
+
+	if (hdr->flags & CHUNK_FLAG_ALIGNED) {
+		/*
+		 * Alignment is a property of the user data in allocations. And
+		 * since objects have headers, we need to take them into
+		 * account when calculating the address.
+		 */
+		uintptr_t hsize = header_type_to_size[m->header_type];
+		uintptr_t base = (uintptr_t)run->content +
+			b.size + hsize;
+		return (char *)(ALIGN_UP(base, run->hdr.alignment) - hsize);
+	} else {
+		return (char *)&run->content + b.size;
+	}
+}
+
+/*
+ * run_get_data_offset -- (internal) returns the number of bytes between
+ *	run base metadata and data
+ */
+static size_t
+run_get_data_offset(const struct memory_block *m)
+{
+	struct chunk_run *run = heap_get_chunk_run(m->heap, m);
+
+	return (size_t)run_get_data_start(m) - (size_t)&run->content;
+}
+
+/*
+ * run_get_real_data -- returns pointer to the beginning data of a run block
+ */
+static void *
+run_get_real_data(const struct memory_block *m)
+{
+	struct chunk_run *run = heap_get_chunk_run(m->heap, m);
+
+	ASSERT(run->hdr.block_size != 0);
+
+	return run_get_data_start(m) + (run->hdr.block_size * m->block_off);
+}
+
+/*
+ * block_get_user_data -- returns pointer to the data of a block
+ */
+static void *
+block_get_user_data(const struct memory_block *m)
+{
+	return (char *)m->m_ops->get_real_data(m) +
+		header_type_to_size[m->header_type];
+}
+
+/*
+ * chunk_get_chunk_hdr_value -- (internal) get value of a header for redo log
+ */
+static uint64_t
+chunk_get_chunk_hdr_value(uint16_t type, uint16_t flags, uint32_t size_idx)
+{
+	uint64_t val;
+	struct chunk_header hdr;
+
+	COMPILE_ERROR_ON(sizeof(struct chunk_header) != sizeof(uint64_t));
+
+	hdr.type = type;
+	hdr.flags = flags;
+	hdr.size_idx = size_idx;
+	memcpy(&val, &hdr, sizeof(val));
+
+	return val;
+}
+
+/*
+ * huge_prep_operation_hdr -- prepares the new value of a chunk header that will
+ *	be set after the operation concludes.
+ */
+static void
+huge_prep_operation_hdr(const struct memory_block *m, enum memblock_state op,
+	struct operation_context *ctx)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+
+	/*
+	 * Depending on the operation that needs to be performed a new chunk
+	 * header needs to be prepared with the new chunk state.
+	 */
+	uint64_t val = chunk_get_chunk_hdr_value(
+		op == MEMBLOCK_ALLOCATED ? CHUNK_TYPE_USED : CHUNK_TYPE_FREE,
+		hdr->flags,
+		m->size_idx);
+
+	if (ctx == NULL) {
+		util_atomic_store_explicit64((uint64_t *)hdr, val,
+			memory_order_relaxed);
+		mo_wal_persist(&m->heap->p_ops, hdr, sizeof(*hdr));
+	} else {
+		operation_add_entry(ctx, hdr, val, ULOG_OPERATION_SET);
+	}
+
+	VALGRIND_DO_MAKE_MEM_NOACCESS(hdr + 1,
+		(hdr->size_idx - 1) * sizeof(struct chunk_header));
+
+	/*
+	 * In the case of chunks larger than one unit the footer must be
+	 * created immediately AFTER the persistent state is safely updated.
+	 */
+	if (m->size_idx == 1)
+		return;
+
+	struct chunk_header *footer = hdr + m->size_idx - 1;
+
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(footer, sizeof(*footer));
+
+	val = chunk_get_chunk_hdr_value(CHUNK_TYPE_FOOTER, 0, m->size_idx);
+
+	/*
+	 * It's only safe to write the footer AFTER the persistent part of
+	 * the operation has been successfully processed because the footer
+	 * pointer might point to a currently valid persistent state
+	 * of a different chunk.
+	 * The footer entry change is updated as transient because it will
+	 * be recreated at heap boot regardless - it's just needed for runtime
+	 * operations.
+	 */
+	if (ctx == NULL) {
+		util_atomic_store_explicit64((uint64_t *)footer, val,
+			memory_order_relaxed);
+		VALGRIND_SET_CLEAN(footer, sizeof(*footer));
+	} else {
+		operation_add_typed_entry(ctx,
+			footer, val, ULOG_OPERATION_SET, LOG_TRANSIENT);
+	}
+}
+
+/*
+ * run_prep_operation_hdr -- prepares the new value for a select few bytes of
+ *	a run bitmap that will be set after the operation concludes.
+ *
+ * It's VERY important to keep in mind that the particular value of the
+ * bitmap this method is modifying must not be changed after this function
+ * is called and before the operation is processed.
+ */
+static void
+run_prep_operation_hdr(const struct memory_block *m, enum memblock_state op,
+	struct operation_context *ctx)
+{
+	ASSERT(m->size_idx <= RUN_BITS_PER_VALUE);
+	ASSERT(m->size_idx > 0);
+
+	/*
+	 * Free blocks are represented by clear bits and used blocks by set
+	 * bits - which is the reverse of the commonly used scheme.
+	 *
+	 * Here a bit mask is prepared that flips the bits that represent the
+	 * memory block provided by the caller - because both the size index and
+	 * the block offset are tied 1:1 to the bitmap this operation is
+	 * relatively simple.
+	 */
+	uint64_t bmask;
+
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+	if (m->size_idx == RUN_BITS_PER_VALUE) {
+		ASSERTeq(m->block_off % RUN_BITS_PER_VALUE, 0);
+		bmask = UINT64_MAX;
+	} else {
+		bmask = ((1ULL << m->size_idx) - 1ULL) <<
+				(m->block_off % RUN_BITS_PER_VALUE);
+	}
+#else
+	uint16_t num = m->size_idx;
+	uint32_t pos = m->block_off % RUN_BITS_PER_VALUE;
+
+	ASSERT_rt(num > 0 && num <= RUN_BITS_PER_VALUE);
+	bmask = ULOG_ENTRY_TO_VAL(pos, num);
+#endif
+
+	/*
+	 * The run bitmap is composed of several 8 byte values, so a proper
+	 * element of the bitmap array must be selected.
+	 */
+	unsigned bpos = m->block_off / RUN_BITS_PER_VALUE;
+	struct run_bitmap b;
+
+	run_get_bitmap(m, &b);
+
+	/* the bit mask is applied immediately by the add entry operations */
+	if (op == MEMBLOCK_ALLOCATED) {
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+		operation_add_entry(ctx, &b.values[bpos],
+				    bmask, ULOG_OPERATION_OR);
+#else
+		operation_add_entry(ctx, &b.values[bpos],
+				    bmask, ULOG_OPERATION_SET_BITS);
+#endif
+	} else if (op == MEMBLOCK_FREE) {
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+		operation_add_entry(ctx, &b.values[bpos],
+				    ~bmask, ULOG_OPERATION_AND);
+#else
+		operation_add_entry(ctx, &b.values[bpos],
+				    bmask, ULOG_OPERATION_CLR_BITS);
+#endif
+	} else {
+		ASSERT(0);
+	}
+}
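+
+/*
+ * Example of the bitmask construction above (illustrative only): for a block
+ * with block_off == 70 and size_idx == 3, and 64 bits per bitmap value,
+ * bpos == 70 / 64 == 1 and the in-value position is 70 % 64 == 6.  With
+ * WAL_SUPPORTS_AND_OR_OPS the mask is ((1 << 3) - 1) << 6 == 0x1c0, OR-ed
+ * into b.values[1] on allocation and AND-ed out (~0x1c0) on free; without it,
+ * the same range is encoded as ULOG_ENTRY_TO_VAL(6, 3) and applied with the
+ * SET_BITS/CLR_BITS ulog operations.
+ */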
+
+/*
+ * huge_get_lock -- because huge memory blocks are always allocated from a
+ *	single bucket there's no reason to lock them - the bucket itself is
+ *	protected.
+ */
+static pthread_mutex_t *
+huge_get_lock(const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m);
+
+	return NULL;
+}
+
+/*
+ * run_get_lock -- gets the runtime mutex from the heap.
+ */
+static pthread_mutex_t *
+run_get_lock(const struct memory_block *m)
+{
+	return heap_get_run_lock(m->heap, m->chunk_id);
+}
+
+/*
+ * huge_get_state -- returns whether a huge block is allocated or not
+ */
+static enum memblock_state
+huge_get_state(const struct memory_block *m)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+
+	if (hdr->type == CHUNK_TYPE_USED)
+		return MEMBLOCK_ALLOCATED;
+
+	if (hdr->type == CHUNK_TYPE_FREE)
+		return MEMBLOCK_FREE;
+
+	return MEMBLOCK_STATE_UNKNOWN;
+}
+
+/*
+ * run_get_state -- returns whether a block from a run is allocated or not
+ */
+static enum memblock_state
+run_get_state(const struct memory_block *m)
+{
+	struct run_bitmap b;
+
+	run_get_bitmap(m, &b);
+
+	unsigned v = m->block_off / RUN_BITS_PER_VALUE;
+	uint64_t bitmap = b.values[v];
+	unsigned bit = m->block_off % RUN_BITS_PER_VALUE;
+
+	unsigned bit_last = bit + m->size_idx;
+
+	ASSERT(bit_last <= RUN_BITS_PER_VALUE);
+
+	for (unsigned i = bit; i < bit_last; ++i) {
+		if (!BIT_IS_CLR(bitmap, i))
+			return MEMBLOCK_ALLOCATED;
+	}
+
+	return MEMBLOCK_FREE;
+}
+
+/*
+ * huge_ensure_header_type -- checks the header type of a chunk and modifies
+ *	it if necessary. This is fail-safe atomic.
+ */
+static void
+huge_ensure_header_type(const struct memory_block *m,
+	enum header_type t)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+
+	ASSERTeq(hdr->type, CHUNK_TYPE_FREE);
+
+	if ((hdr->flags & header_type_to_flag[t]) == 0) {
+		VALGRIND_ADD_TO_TX(hdr, sizeof(*hdr));
+		uint16_t f = ((uint16_t)header_type_to_flag[t]);
+		uint64_t nhdr = chunk_get_chunk_hdr_value(hdr->type,
+			hdr->flags | f, hdr->size_idx);
+		util_atomic_store_explicit64((uint64_t *)hdr,
+			nhdr, memory_order_relaxed);
+		mo_wal_persist(&m->heap->p_ops, hdr, sizeof(*hdr));
+		VALGRIND_REMOVE_FROM_TX(hdr, sizeof(*hdr));
+	}
+}
+
+/*
+ * run_ensure_header_type -- runs must be created with appropriate header type.
+ */
+static void
+run_ensure_header_type(const struct memory_block *m,
+	enum header_type t)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m, t);
+
+#ifdef DAV_EXTRA_DEBUG
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+
+	ASSERTeq(hdr->type, CHUNK_TYPE_RUN);
+	ASSERT((hdr->flags & header_type_to_flag[t]) == header_type_to_flag[t]);
+#endif
+}
+
+/*
+ * block_get_real_size -- returns the size of a memory block that includes all
+ *	of the overhead (headers)
+ */
+static size_t
+block_get_real_size(const struct memory_block *m)
+{
+	/*
+	 * There are two valid ways to get a size. If the memory block
+	 * was initialized properly and the size index is set, the chunk unit size
+	 * can be simply multiplied by that index, otherwise we need to look at
+	 * the allocation header.
+	 */
+	if (m->size_idx != 0)
+		return m->m_ops->block_size(m) * m->size_idx;
+	else
+		return memblock_header_ops[m->header_type].get_size(m);
+}
+
+/*
+ * block_get_user_size -- returns the size of a memory block without overheads,
+ *	this is the size of a data block that can be used.
+ */
+static size_t
+block_get_user_size(const struct memory_block *m)
+{
+	return block_get_real_size(m) - header_type_to_size[m->header_type];
+}
+
+/*
+ * block_write_header -- writes a header of an allocation
+ */
+static void
+block_write_header(const struct memory_block *m,
+	uint64_t extra_field, uint16_t flags)
+{
+	memblock_header_ops[m->header_type].write(m,
+		block_get_real_size(m), extra_field, flags);
+}
+
+/*
+ * block_invalidate -- invalidates allocation data and header
+ */
+static void
+block_invalidate(const struct memory_block *m)
+{
+	void *data = m->m_ops->get_user_data(m);
+	size_t size = m->m_ops->get_user_size(m);
+
+	VALGRIND_SET_CLEAN(data, size);
+
+	memblock_header_ops[m->header_type].invalidate(m);
+}
+
+/*
+ * block_reinit_header -- reinitializes a block after a heap restart
+ */
+static void
+block_reinit_header(const struct memory_block *m)
+{
+	memblock_header_ops[m->header_type].reinit(m);
+}
+
+/*
+ * block_get_extra -- returns the extra field of an allocation
+ */
+static uint64_t
+block_get_extra(const struct memory_block *m)
+{
+	return memblock_header_ops[m->header_type].get_extra(m);
+}
+
+/*
+ * block_get_flags -- returns the flags of an allocation
+ */
+static uint16_t
+block_get_flags(const struct memory_block *m)
+{
+	return memblock_header_ops[m->header_type].get_flags(m);
+}
+
+/*
+ * run_process_bitmap_value -- (internal) looks for unset bits in the
+ * value, creates a valid memory block out of them and invokes the
+ * given callback on that block.
+ */
+static int
+run_process_bitmap_value(const struct memory_block *m,
+	uint64_t value, uint32_t base_offset, object_callback cb, void *arg)
+{
+	int ret = 0;
+	uint64_t shift = 0; /* already processed bits */
+	struct memory_block s = *m;
+
+	do {
+		/*
+		 * Shift the value so that the next memory block starts on the
+		 * least significant position:
+		 *	..............0 (free block)
+		 * or	..............1 (used block)
+		 */
+		uint64_t shifted = value >> shift;
+
+		/* all clear or set bits indicate the end of traversal */
+		if (shifted == 0) {
+			/*
+			 * Insert the remaining blocks as free. Remember that
+			 * unsigned values are always zero-filled, so we must
+			 * take the current shift into account.
+			 */
+			s.block_off = (uint32_t)(base_offset + shift);
+			s.size_idx = (uint32_t)(RUN_BITS_PER_VALUE - shift);
+
+			ret = cb(&s, arg);
+			if (ret != 0)
+				return ret;
+
+			break;
+		} else if (shifted == UINT64_MAX) {
+			break;
+		}
+
+		/*
+		 * Offset and size of the next free block, either of these
+		 * can be zero depending on where the free block is located
+		 * in the value.
+		 */
+		unsigned off = (unsigned)util_lssb_index64(~shifted);
+		unsigned size = (unsigned)util_lssb_index64(shifted);
+
+		shift += off + size;
+
+		if (size != 0) { /* zero size means skip to the next value */
+			s.block_off = (uint32_t)(base_offset + (shift - size));
+			s.size_idx = (uint32_t)(size);
+
+			memblock_rebuild_state(m->heap, &s);
+			ret = cb(&s, arg);
+			if (ret != 0)
+				return ret;
+		}
+	} while (shift != RUN_BITS_PER_VALUE);
+
+	return 0;
+}
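+
+/*
+ * Worked example for the traversal above (illustrative only): take a value in
+ * which only bits 2 and 3 are clear (free) and every other bit is set.  Pass
+ * one sees two used blocks at the bottom (off == 2, size == 0), reports
+ * nothing and advances shift to 2.  Pass two sees off == 0, size == 2 and
+ * invokes the callback with block_off == base_offset + 2 and size_idx == 2,
+ * advancing shift to 4.  Pass three finds a run of 60 used bits
+ * (off == 60, size == 0), so shift reaches RUN_BITS_PER_VALUE and the loop
+ * terminates after reporting exactly one free block.
+ */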
+
+/*
+ * run_iterate_free -- iterates over free blocks in a run
+ */
+static int
+run_iterate_free(const struct memory_block *m, object_callback cb, void *arg)
+{
+	int ret = 0;
+	uint32_t block_off = 0;
+	struct run_bitmap b;
+
+	run_get_bitmap(m, &b);
+
+	struct memory_block nm = *m;
+
+	for (unsigned i = 0; i < b.nvalues; ++i) {
+		uint64_t v = b.values[i];
+
+		ASSERT((uint64_t)RUN_BITS_PER_VALUE * (uint64_t)i
+			<= UINT32_MAX);
+		block_off = RUN_BITS_PER_VALUE * i;
+		ret = run_process_bitmap_value(&nm, v, block_off, cb, arg);
+		if (ret != 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * run_iterate_used -- iterates over used blocks in a run
+ */
+static int
+run_iterate_used(const struct memory_block *m, object_callback cb, void *arg)
+{
+	uint32_t i = m->block_off / RUN_BITS_PER_VALUE;
+	uint32_t block_start = m->block_off % RUN_BITS_PER_VALUE;
+	uint32_t block_off;
+
+	struct chunk_run *run = heap_get_chunk_run(m->heap, m);
+
+	struct memory_block iter = *m;
+	struct run_bitmap b;
+
+	run_get_bitmap(m, &b);
+
+	for (; i < b.nvalues; ++i) {
+		uint64_t v = b.values[i];
+
+		block_off = (uint32_t)(RUN_BITS_PER_VALUE * i);
+
+		for (uint32_t j = block_start; j < RUN_BITS_PER_VALUE; ) {
+			if (block_off + j >= (uint32_t)b.nbits)
+				break;
+
+			if (!BIT_IS_CLR(v, j)) {
+				iter.block_off = (uint32_t)(block_off + j);
+
+				/*
+				 * The size index of this memory block cannot be
+				 * retrieved at this time because the header
+				 * might not be initialized in valgrind yet.
+				 */
+				iter.size_idx = 0;
+
+				if (cb(&iter, arg) != 0)
+					return 1;
+
+				iter.size_idx = CALC_SIZE_IDX(
+					run->hdr.block_size,
+					iter.m_ops->get_real_size(&iter));
+				j = (uint32_t)(j + iter.size_idx);
+			} else {
+				++j;
+			}
+		}
+		block_start = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * huge_iterate_free -- calls cb on memory block if it's free
+ */
+static int
+huge_iterate_free(const struct memory_block *m, object_callback cb, void *arg)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+
+	return hdr->type == CHUNK_TYPE_FREE ? cb(m, arg) : 0;
+}
+
+/*
+ * huge_iterate_used -- calls cb on memory block if it's used
+ */
+static int
+huge_iterate_used(const struct memory_block *m, object_callback cb, void *arg)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+
+	return hdr->type == CHUNK_TYPE_USED ? cb(m, arg) : 0;
+}
+
+/*
+ * huge_vg_init -- initializes chunk metadata in memcheck state
+ */
+static void
+huge_vg_init(const struct memory_block *m, int objects,
+	object_callback cb, void *arg)
+{
+	struct zone *z = ZID_TO_ZONE(m->heap->layout, m->zone_id);
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+	struct chunk *chunk = heap_get_chunk(m->heap, m);
+
+	VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr));
+
+	/*
+	 * Mark unused chunk headers as not accessible.
+	 */
+	VALGRIND_DO_MAKE_MEM_NOACCESS(
+		&z->chunk_headers[m->chunk_id + 1],
+		(m->size_idx - 1) *
+		sizeof(struct chunk_header));
+
+	size_t size = block_get_real_size(m);
+
+	VALGRIND_DO_MAKE_MEM_NOACCESS(chunk, size);
+
+	if (objects && huge_get_state(m) == MEMBLOCK_ALLOCATED) {
+		if (cb(m, arg) != 0)
+			FATAL("failed to initialize valgrind state");
+	}
+}
+
+/*
+ * run_vg_init -- initializes run metadata in memcheck state
+ */
+static void
+run_vg_init(const struct memory_block *m, int objects,
+	object_callback cb, void *arg)
+{
+	struct zone *z = ZID_TO_ZONE(m->heap->layout, m->zone_id);
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+	struct chunk_run *run = heap_get_chunk_run(m->heap, m);
+
+	VALGRIND_DO_MAKE_MEM_DEFINED(hdr, sizeof(*hdr));
+
+	/* set the run metadata as defined */
+	VALGRIND_DO_MAKE_MEM_DEFINED(run, RUN_BASE_METADATA_SIZE);
+
+	struct run_bitmap b;
+
+	run_get_bitmap(m, &b);
+
+	/*
+	 * Mark run data headers as defined.
+	 */
+	for (unsigned j = 1; j < m->size_idx; ++j) {
+		struct chunk_header *data_hdr =
+			&z->chunk_headers[m->chunk_id + j];
+		VALGRIND_DO_MAKE_MEM_DEFINED(data_hdr,
+			sizeof(struct chunk_header));
+		ASSERTeq(data_hdr->type, CHUNK_TYPE_RUN_DATA);
+	}
+
+	VALGRIND_DO_MAKE_MEM_NOACCESS(run, SIZEOF_RUN(run, m->size_idx));
+
+	/* set the run bitmap as defined */
+	VALGRIND_DO_MAKE_MEM_DEFINED(run, b.size + RUN_BASE_METADATA_SIZE);
+
+	if (objects) {
+		if (run_iterate_used(m, cb, arg) != 0)
+			FATAL("failed to initialize valgrind state");
+	}
+}
+
+/*
+ * run_reinit_chunk -- run reinitialization on first zone traversal
+ */
+static void
+run_reinit_chunk(const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m);
+
+	/* noop */
+}
+
+/*
+ * huge_write_footer -- (internal) writes a chunk footer
+ */
+static void
+huge_write_footer(struct chunk_header *hdr, uint32_t size_idx)
+{
+	if (size_idx == 1) /* that would overwrite the header */
+		return;
+
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr + size_idx - 1, sizeof(*hdr));
+
+	struct chunk_header f = *hdr;
+
+	f.type = CHUNK_TYPE_FOOTER;
+	f.size_idx = size_idx;
+	*(hdr + size_idx - 1) = f;
+	/* no need to persist, footers are recreated in heap_populate_buckets */
+	VALGRIND_SET_CLEAN(hdr + size_idx - 1, sizeof(f));
+}
+
+/*
+ * huge_reinit_chunk -- chunk reinitialization on first zone traversal
+ */
+static void
+huge_reinit_chunk(const struct memory_block *m)
+{
+	struct chunk_header *hdr = heap_get_chunk_hdr(m->heap, m);
+
+	if (hdr->type == CHUNK_TYPE_USED)
+		huge_write_footer(hdr, hdr->size_idx);
+}
+
+/*
+ * run_calc_free -- calculates the number of free units in a run
+ */
+static void
+run_calc_free(const struct memory_block *m,
+	uint32_t *free_space, uint32_t *max_free_block)
+{
+	struct run_bitmap b;
+
+	run_get_bitmap(m, &b);
+	for (unsigned i = 0; i < b.nvalues; ++i) {
+		uint64_t value = ~b.values[i];
+
+		if (value == 0)
+			continue;
+
+		uint32_t free_in_value = util_popcount64(value);
+
+		*free_space = *free_space + free_in_value;
+
+		/*
+		 * If this value has fewer free blocks than the already-found max,
+		 * there's no point in calculating.
+		 */
+		if (free_in_value < *max_free_block)
+			continue;
+
+		/* if the entire value is empty, no point in calculating */
+		if (free_in_value == RUN_BITS_PER_VALUE) {
+			*max_free_block = RUN_BITS_PER_VALUE;
+			continue;
+		}
+
+		/* if already at max, no point in calculating */
+		if (*max_free_block == RUN_BITS_PER_VALUE)
+			continue;
+
+		/*
+		 * Calculate the biggest free block in the bitmap.
+		 * This algorithm is not the most clever imaginable, but it's
+		 * easy to implement and fast enough.
+		 */
+		uint16_t n = 0;
+
+		while (value != 0) {
+			value &= (value << 1ULL);
+			n++;
+		}
+
+		if (n > *max_free_block)
+			*max_free_block = n;
+	}
+}
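+
+/*
+ * Illustrative example of the longest-run loop above: for free bits
+ * value == 0b011101 (bits 0, 2, 3 and 4 free), successive iterations of
+ * value &= (value << 1) produce 0b011000, 0b010000 and 0, so n ends up as 3,
+ * which is exactly the length of the longest run of consecutive free units
+ * (bits 2..4).
+ */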
+
+/*
+ * huge_fill_pct -- huge blocks by definition use the entirety of a chunk
+ */
+static unsigned
+huge_fill_pct(const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(m);
+
+	return 100;
+}
+
+/*
+ * run_fill_pct -- calculates the percentage of allocated units inside of a run
+ */
+static unsigned
+run_fill_pct(const struct memory_block *m)
+{
+	struct run_bitmap b;
+	unsigned clearbits = 0;
+
+	run_get_bitmap(m, &b);
+	for (unsigned i = 0; i < b.nvalues; ++i) {
+		uint64_t value = ~b.values[i];
+
+		if (value == 0)
+			continue;
+
+		clearbits += util_popcount64(value);
+	}
+	ASSERT(b.nbits >= clearbits);
+	unsigned setbits = b.nbits - clearbits;
+
+	return (100 * setbits) / b.nbits;
+}
+
+static const struct memory_block_ops mb_ops[MAX_MEMORY_BLOCK] = {
+	[MEMORY_BLOCK_HUGE] = {
+		.block_size = huge_block_size,
+		.prep_hdr = huge_prep_operation_hdr,
+		.get_lock = huge_get_lock,
+		.get_state = huge_get_state,
+		.get_user_data = block_get_user_data,
+		.get_real_data = huge_get_real_data,
+		.get_user_size = block_get_user_size,
+		.get_real_size = block_get_real_size,
+		.write_header = block_write_header,
+		.invalidate = block_invalidate,
+		.ensure_header_type = huge_ensure_header_type,
+		.reinit_header = block_reinit_header,
+		.vg_init = huge_vg_init,
+		.get_extra = block_get_extra,
+		.get_flags = block_get_flags,
+		.iterate_free = huge_iterate_free,
+		.iterate_used = huge_iterate_used,
+		.reinit_chunk = huge_reinit_chunk,
+		.calc_free = NULL,
+		.get_bitmap = NULL,
+		.fill_pct = huge_fill_pct,
+	},
+	[MEMORY_BLOCK_RUN] = {
+		.block_size = run_block_size,
+		.prep_hdr = run_prep_operation_hdr,
+		.get_lock = run_get_lock,
+		.get_state = run_get_state,
+		.get_user_data = block_get_user_data,
+		.get_real_data = run_get_real_data,
+		.get_user_size = block_get_user_size,
+		.get_real_size = block_get_real_size,
+		.write_header = block_write_header,
+		.invalidate = block_invalidate,
+		.ensure_header_type = run_ensure_header_type,
+		.reinit_header = block_reinit_header,
+		.vg_init = run_vg_init,
+		.get_extra = block_get_extra,
+		.get_flags = block_get_flags,
+		.iterate_free = run_iterate_free,
+		.iterate_used = run_iterate_used,
+		.reinit_chunk = run_reinit_chunk,
+		.calc_free = run_calc_free,
+		.get_bitmap = run_get_bitmap,
+		.fill_pct = run_fill_pct,
+	}
+};
+
+/*
+ * memblock_huge_init -- initializes a new huge memory block
+ */
+struct memory_block
+memblock_huge_init(struct palloc_heap *heap,
+	uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx)
+{
+	struct memory_block m = MEMORY_BLOCK_NONE;
+
+	m.chunk_id = chunk_id;
+	m.zone_id = zone_id;
+	m.size_idx = size_idx;
+	m.heap = heap;
+
+	struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m);
+
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(hdr, sizeof(*hdr));
+	VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr));
+
+	uint64_t nhdr = chunk_get_chunk_hdr_value(CHUNK_TYPE_FREE,
+		0, size_idx);
+	util_atomic_store_explicit64((uint64_t *)hdr,
+		nhdr, memory_order_relaxed);
+
+	mo_wal_persist(&heap->p_ops, hdr, sizeof(*hdr));
+
+	huge_write_footer(hdr, size_idx);
+
+	memblock_rebuild_state(heap, &m);
+
+	return m;
+}
+
+/*
+ * memblock_run_init -- initializes a new run memory block
+ */
+struct memory_block
+memblock_run_init(struct palloc_heap *heap,
+	uint32_t chunk_id, uint32_t zone_id, struct run_descriptor *rdsc)
+{
+	uint32_t size_idx = rdsc->size_idx;
+
+	ASSERTne(size_idx, 0);
+
+	struct memory_block m = MEMORY_BLOCK_NONE;
+
+	m.chunk_id = chunk_id;
+	m.zone_id = zone_id;
+	m.size_idx = size_idx;
+	m.heap = heap;
+
+	struct zone *z = ZID_TO_ZONE(heap->layout, zone_id);
+	struct chunk_run *run = heap_get_chunk_run(heap, &m);
+	size_t runsize = SIZEOF_RUN(run, size_idx);
+
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(run, runsize);
+
+	/* add/remove chunk_run and chunk_header to valgrind transaction */
+	VALGRIND_ADD_TO_TX(run, runsize);
+	run->hdr.block_size = rdsc->unit_size;
+	run->hdr.alignment = rdsc->alignment;
+
+	struct run_bitmap b = rdsc->bitmap;
+
+	b.values = (uint64_t *)run->content;
+
+	size_t bitmap_size = b.size;
+
+	/* set all the bits */
+	memset(b.values, 0xFF, bitmap_size);
+
+	/* clear only the bits available for allocations from this bucket */
+	memset(b.values, 0, sizeof(*b.values) * (b.nvalues - 1));
+
+	unsigned trailing_bits = b.nbits % RUN_BITS_PER_VALUE;
+	uint64_t last_value = UINT64_MAX << trailing_bits;
+
+	b.values[b.nvalues - 1] = last_value;
+
+	VALGRIND_REMOVE_FROM_TX(run, runsize);
+
+	mo_wal_flush(&heap->p_ops, run,
+		sizeof(struct chunk_run_header) +
+		bitmap_size, 0);
+
+	struct chunk_header run_data_hdr;
+
+	run_data_hdr.type = CHUNK_TYPE_RUN_DATA;
+	run_data_hdr.flags = 0;
+
+	VALGRIND_ADD_TO_TX(&z->chunk_headers[chunk_id],
+		sizeof(struct chunk_header) * size_idx);
+
+	struct chunk_header *data_hdr;
+
+	for (unsigned i = 1; i < size_idx; ++i) {
+		data_hdr = &z->chunk_headers[chunk_id + i];
+		VALGRIND_DO_MAKE_MEM_UNDEFINED(data_hdr, sizeof(*data_hdr));
+		VALGRIND_ANNOTATE_NEW_MEMORY(data_hdr, sizeof(*data_hdr));
+		run_data_hdr.size_idx = i;
+		*data_hdr = run_data_hdr;
+	}
+	mo_wal_persist(&heap->p_ops,
+		&z->chunk_headers[chunk_id + 1],
+		sizeof(struct chunk_header) * (size_idx - 1));
+
+	struct chunk_header *hdr = &z->chunk_headers[chunk_id];
+
+	ASSERT(hdr->type == CHUNK_TYPE_FREE);
+
+	VALGRIND_ANNOTATE_NEW_MEMORY(hdr, sizeof(*hdr));
+
+	uint64_t run_hdr = chunk_get_chunk_hdr_value(CHUNK_TYPE_RUN,
+		rdsc->flags, hdr->size_idx);
+	util_atomic_store_explicit64((uint64_t *)hdr,
+		run_hdr, memory_order_relaxed);
+	mo_wal_persist(&heap->p_ops, hdr, sizeof(*hdr));
+
+	VALGRIND_REMOVE_FROM_TX(&z->chunk_headers[chunk_id],
+		sizeof(struct chunk_header) * size_idx);
+
+	memblock_rebuild_state(heap, &m);
+	m.cached_bitmap = &rdsc->bitmap;
+
+	return m;
+}
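+
+/*
+ * Illustrative example of the bitmap initialization above: suppose
+ * rdsc->bitmap describes nbits == 200 and nvalues == 4 (64 bits per value).
+ * The first three values (192 bits) are zeroed, and the last value becomes
+ * UINT64_MAX << (200 % 64), i.e. bits 0-7 clear and bits 8-63 permanently
+ * set, so exactly 200 units can ever be allocated from the run.
+ */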
+
+/*
+ * memblock_detect_type -- looks for the corresponding chunk header and
+ *	depending on the chunks type returns the right memory block type
+ */
+static enum memory_block_type
+memblock_detect_type(struct palloc_heap *heap, const struct memory_block *m)
+{
+	enum memory_block_type ret = MEMORY_BLOCK_HUGE;
+
+	switch (heap_get_chunk_hdr(heap, m)->type) {
+	case CHUNK_TYPE_RUN:
+	case CHUNK_TYPE_RUN_DATA:
+		ret = MEMORY_BLOCK_RUN;
+		break;
+	case CHUNK_TYPE_FREE:
+	case CHUNK_TYPE_USED:
+	case CHUNK_TYPE_FOOTER:
+		ret = MEMORY_BLOCK_HUGE;
+		break;
+	default:
+		/* unreachable */
+		FATAL("possible zone chunks metadata corruption");
+	}
+	return ret;
+}
+
+/*
+ * memblock_from_offset_opt -- resolves memory block data from an offset that
+ *	originates from the heap
+ */
+struct memory_block
+memblock_from_offset_opt(struct palloc_heap *heap, uint64_t off, int size)
+{
+	struct memory_block m = MEMORY_BLOCK_NONE;
+
+	m.heap = heap;
+
+	off -= HEAP_PTR_TO_OFF(heap, &heap->layout->zone0);
+	m.zone_id = (uint32_t)(off / ZONE_MAX_SIZE);
+
+	off -= (ZONE_MAX_SIZE * m.zone_id) + sizeof(struct zone);
+	m.chunk_id = (uint32_t)(off / CHUNKSIZE);
+
+	struct chunk_header *hdr = heap_get_chunk_hdr(heap, &m);
+
+	if (hdr->type == CHUNK_TYPE_RUN_DATA)
+		m.chunk_id -= hdr->size_idx;
+
+	off -= CHUNKSIZE * m.chunk_id;
+
+	m.header_type = memblock_header_type(&m);
+
+	off -= header_type_to_size[m.header_type];
+
+	m.type = off != 0 ? MEMORY_BLOCK_RUN : MEMORY_BLOCK_HUGE;
+	ASSERTeq(memblock_detect_type(heap, &m), m.type);
+
+	m.m_ops = &mb_ops[m.type];
+
+	uint64_t unit_size = m.m_ops->block_size(&m);
+
+	if (off != 0) { /* run */
+		off -= run_get_data_offset(&m);
+		off -= RUN_BASE_METADATA_SIZE;
+		m.block_off = (uint16_t)(off / unit_size);
+		off -= m.block_off * unit_size;
+	}
+
+	struct alloc_class_collection *acc = heap_alloc_classes(heap);
+
+	if (acc != NULL) {
+		struct alloc_class *ac = alloc_class_by_run(acc,
+			unit_size, hdr->flags, hdr->size_idx);
+		if (ac != NULL)
+			m.cached_bitmap = &ac->rdsc.bitmap;
+	}
+
+	m.size_idx = !size ? 0 : CALC_SIZE_IDX(unit_size,
+		memblock_header_ops[m.header_type].get_size(&m));
+
+	ASSERTeq(off, 0);
+
+	return m;
+}
+
+/*
+ * memblock_from_offset -- returns memory block with size
+ */
+struct memory_block
+memblock_from_offset(struct palloc_heap *heap, uint64_t off)
+{
+	return memblock_from_offset_opt(heap, off, 1);
+}
+
+/*
+ * memblock_rebuild_state -- fills in the runtime-state related fields of a
+ *	memory block structure
+ *
+ * This function must be called on all memory blocks that were created by hand
+ * (as opposed to retrieved from memblock_from_offset function).
+ */
+void
+memblock_rebuild_state(struct palloc_heap *heap, struct memory_block *m)
+{
+	m->heap = heap;
+	m->header_type = memblock_header_type(m);
+	m->type = memblock_detect_type(heap, m);
+	m->m_ops = &mb_ops[m->type];
+	m->cached_bitmap = NULL;
+}
diff --git a/src/common/dav_v2/memblock.h b/src/common/dav_v2/memblock.h
new file mode 100644
index 00000000000..f2fe3ee91be
--- /dev/null
+++ b/src/common/dav_v2/memblock.h
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2021, Intel Corporation */
+
+/*
+ * memblock.h -- internal definitions for memory block
+ */
+
+#ifndef __DAOS_COMMON_MEMBLOCK_H
+#define __DAOS_COMMON_MEMBLOCK_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "heap_layout.h"
+#include "memops.h"
+#include "palloc.h"
+
+#define MEMORY_BLOCK_NONE \
+(struct memory_block)\
+{0, 0, 0, 0, NULL, NULL, MAX_HEADER_TYPES, MAX_MEMORY_BLOCK, NULL}
+
+#define MEMORY_BLOCK_IS_NONE(_m)\
+((_m).heap == NULL)
+
+#define MEMORY_BLOCK_EQUALS(lhs, rhs)\
+((lhs).zone_id == (rhs).zone_id && (lhs).chunk_id == (rhs).chunk_id &&\
+(lhs).block_off == (rhs).block_off && (lhs).heap == (rhs).heap)
+
+enum memory_block_type {
+	/*
+	 * Huge memory blocks are directly backed by memory chunks. A single
+	 * huge block can consist of several chunks.
+	 * The persistent representation of huge memory blocks can be thought
+	 * of as a doubly linked list with variable length elements.
+	 * That list is stored in the chunk headers array where one element
+	 * directly corresponds to one chunk.
+	 *
+	 * U - used, F - free, R - footer, . - empty
+	 * |U| represents a used chunk with a size index of 1, with type
+	 * information (CHUNK_TYPE_USED) stored in the corresponding header
+	 * array element - chunk_headers[chunk_id].
+	 *
+	 * |F...R| represents a free chunk with size index of 5. The empty
+	 * chunk headers have undefined values and shouldn't be used. All
+	 * chunks with size larger than 1 must have a footer in the last
+	 * corresponding header array element - chunk_headers[chunk_id + size_idx - 1].
+	 *
+	 * The above representation of chunks will be used to describe the
+	 * way fail-safety is achieved during heap operations.
+	 *
+	 * Allocation of huge memory block with size index 5:
+	 * Initial heap state: |U| <> |F..R| <> |U| <> |F......R|
+	 *
+	 * The only block that matches that size is at the very end of the chunks
+	 * list: |F......R|
+	 *
+	 * As the request was for a memory block of size 5 and this one's size
+	 * is 7, the chunk first needs to be split in two.
+	 * 1) The last chunk header of the new allocation is marked as footer
+	 *	and the block after that one is marked as free: |F...RF.R|
+	 *	This is allowed and has no impact on the heap because this
+	 *	modification is to a chunk header that is otherwise unused, in
+	 *	other words the linked list didn't change.
+	 *
+	 * 2) The size index of the first header is changed from previous value
+	 *	of 7 to 5: |F...R||F.R|
+	 *	This is a single fail-safe atomic operation and this is the
+	 *	first change that is noticeable by the heap operations.
+	 *	A single linked list element is split into two new ones.
+	 *
+	 * 3) The allocation process either uses the redo log or directly
+	 *	changes the chunk header type from free to used: |U...R| <> |F.R|
+	 *
+	 * In a similar fashion the reverse operation, free, is performed:
+	 * Initial heap state: |U| <> |F..R| <> |F| <> |U...R| <> |F.R|
+	 *
+	 * This is the heap after the previous example with the single chunk
+	 * in between changed from used to free.
+	 *
+	 * 1) Determine the neighbors of the memory block which is being
+	 *	freed.
+	 *
+	 * 2) Update the footer (if needed) information of the last chunk which
+	 *	is the memory block being freed or its neighbor to the right.
+	 *	|F| <> |U...R| <> |F.R << this one|
+	 *
+	 * 3) Update the size index and type of the left-most chunk header.
+	 *	And so this: |F << this one| <> |U...R| <> |F.R|
+	 *	becomes this: |F.......R|
+	 *	The entire chunk header can be updated in a single fail-safe
+	 *	atomic operation because its size is only 8 bytes (one 64-bit word).
+	 */
+	MEMORY_BLOCK_HUGE,
+	/*
+	 * Run memory blocks are chunks with CHUNK_TYPE_RUN and size index of 1.
+	 * The entire chunk is subdivided into smaller blocks and has an
+	 * additional metadata attached in the form of a bitmap - each bit
+	 * corresponds to a single block.
+	 * In this case there's no need to perform any coalescing or splitting
+	 * on the persistent metadata.
+	 * The bitmap is stored on a variable number of 64 bit values and
+	 * because of the requirement of allocation fail-safe atomicity the
+	 * maximum size index of a memory block from a run is 64 - since that's
+	 * the limit of atomic write guarantee.
+	 *
+	 * The allocation/deallocation process is a single 8 byte write that
+	 * sets/clears the corresponding bits. Depending on the user choice
+	 * it can either be made atomically or using redo-log when grouped with
+	 * other operations.
+	 * It's also important to note that in a case of realloc it might so
+	 * happen that a single 8 byte bitmap value has its bits both set and
+	 * cleared - that's why the run memory block metadata changes operate
+	 * on AND'ing or OR'ing a bitmask instead of directly setting the value.
+	 */
+	MEMORY_BLOCK_RUN,
+
+	MAX_MEMORY_BLOCK
+};
+
+enum memblock_state {
+	MEMBLOCK_STATE_UNKNOWN,
+	MEMBLOCK_ALLOCATED,
+	MEMBLOCK_FREE,
+
+	MAX_MEMBLOCK_STATE,
+};
+
+/* runtime bitmap information for a run */
+struct run_bitmap {
+	unsigned nvalues; /* number of 8 byte values - size of values array */
+	unsigned nbits; /* number of valid bits */
+
+	size_t size; /* total size of the bitmap in bytes */
+
+	uint64_t *values; /* pointer to the bitmap's values array */
+};
+
+/* runtime information necessary to create a run */
+struct run_descriptor {
+	uint16_t flags; /* chunk flags for the run */
+	size_t unit_size; /* the size of a single unit in a run */
+	uint32_t size_idx; /* size index of a single run instance */
+	size_t alignment; /* required alignment of objects */
+	unsigned nallocs; /* number of allocs per run */
+	struct run_bitmap bitmap;
+};
+
+struct memory_block_ops {
+	/* returns memory block size */
+	size_t (*block_size)(const struct memory_block *m);
+
+	/* prepares header modification operation */
+	void (*prep_hdr)(const struct memory_block *m,
+		enum memblock_state dest_state, struct operation_context *ctx);
+
+	/* returns lock associated with memory block */
+	pthread_mutex_t *(*get_lock)(const struct memory_block *m);
+
+	/* returns whether a block is allocated or not */
+	enum memblock_state (*get_state)(const struct memory_block *m);
+
+	/* returns pointer to the data of a block */
+	void *(*get_user_data)(const struct memory_block *m);
+
+	/*
+	 * Returns the size of a memory block without overhead.
+	 * This is the size of a data block that can be used.
+	 */
+	size_t (*get_user_size)(const struct memory_block *m);
+
+	/* returns pointer to the beginning of data of a run block */
+	void *(*get_real_data)(const struct memory_block *m);
+
+	/* returns the size of a memory block, including headers */
+	size_t (*get_real_size)(const struct memory_block *m);
+
+	/* writes a header of an allocation */
+	void (*write_header)(const struct memory_block *m,
+		uint64_t extra_field, uint16_t flags);
+	void (*invalidate)(const struct memory_block *m);
+
+	/*
+	 * Checks the header type of a chunk matches the expected type and
+	 * modifies it if necessary. This is fail-safe atomic.
+	 */
+	void (*ensure_header_type)(const struct memory_block *m,
+		enum header_type t);
+
+	/*
+	 * Reinitializes a block after a heap restart.
+	 * This is called for EVERY allocation, but *only* under Valgrind.
+	 */
+	void (*reinit_header)(const struct memory_block *m);
+
+	/* returns the extra field of an allocation */
+	uint64_t (*get_extra)(const struct memory_block *m);
+
+	/* returns the flags of an allocation */
+	uint16_t (*get_flags)(const struct memory_block *m);
+
+	/* initializes memblock in valgrind */
+	void (*vg_init)(const struct memory_block *m, int objects,
+		object_callback cb, void *arg);
+
+	/* iterates over every free block */
+	int (*iterate_free)(const struct memory_block *m,
+		object_callback cb, void *arg);
+
+	/* iterates over every used block */
+	int (*iterate_used)(const struct memory_block *m,
+		object_callback cb, void *arg);
+
+	/* calculates number of free units, valid only for runs */
+	void (*calc_free)(const struct memory_block *m,
+		uint32_t *free_space, uint32_t *max_free_block);
+
+	/* this is called exactly once for every existing chunk */
+	void (*reinit_chunk)(const struct memory_block *m);
+
+	/*
+	 * Initializes bitmap data for a run.
+	 * Do *not* use this function unless absolutely necessary, it breaks
+	 * the abstraction layer by exposing implementation details.
+	 */
+	void (*get_bitmap)(const struct memory_block *m, struct run_bitmap *b);
+
+	/* calculates the ratio between occupied and unoccupied space */
+	unsigned (*fill_pct)(const struct memory_block *m);
+};
+
+struct memory_block {
+	uint32_t chunk_id; /* index of the memory block in its zone */
+	uint32_t zone_id; /* index of this block zone in the heap */
+
+	/*
+	 * Size index of the memory block represented in either multiple of
+	 * CHUNKSIZE in the case of a huge chunk or in multiple of a run
+	 * block size.
+	 */
+	uint32_t size_idx;
+
+	/*
+	 * Used only for run chunks, must be zeroed for huge.
+	 * Number of preceding blocks in the chunk. In other words, the
+	 * position of this memory block in run bitmap.
+	 */
+	uint32_t block_off;
+
+	/*
+	 * The variables below are associated with the memory block and are
+	 * stored here for convenience. Those fields are filled by either the
+	 * memblock_from_offset or memblock_rebuild_state, and they should not
+	 * be modified manually.
+	 */
+	const struct memory_block_ops *m_ops;
+	struct palloc_heap *heap;
+	enum header_type header_type;
+	enum memory_block_type type;
+	struct run_bitmap *cached_bitmap;
+};
+
+/*
+ * This is a representation of a run memory block that is active in a bucket or
+ * is on a pending list in the recycler.
+ * This structure should never be passed around by value because the address of
+ * the nresv variable can be referenced by reservations made through
+ * palloc_reserve(). The structure can be moved/freed only once the number of
+ * reservations drops to 0.
+ */
+struct memory_block_reserved {
+	struct memory_block m;
+
+	struct bucket_locked *bucket;
+	/*
+	 * Number of reservations made from this run, the pointer to this value
+	 * is stored in a user facing pobj_action structure. Decremented once
+	 * the reservation is published or canceled.
+	 */
+	int nresv;
+};
+
+struct memory_block memblock_from_offset(struct palloc_heap *heap,
+	uint64_t off);
+struct memory_block memblock_from_offset_opt(struct palloc_heap *heap,
+	uint64_t off, int size);
+void memblock_rebuild_state(struct palloc_heap *heap, struct memory_block *m);
+
+struct memory_block memblock_huge_init(struct palloc_heap *heap,
+	uint32_t chunk_id, uint32_t zone_id, uint32_t size_idx);
+
+struct memory_block memblock_run_init(struct palloc_heap *heap,
+	uint32_t chunk_id, uint32_t zone_id, struct run_descriptor *rdsc);
+
+void memblock_run_bitmap(uint32_t *size_idx, uint16_t flags,
+	uint64_t unit_size, uint64_t alignment, void *content,
+	struct run_bitmap *b);
+
+#endif /* __DAOS_COMMON_MEMBLOCK_H */
diff --git a/src/common/dav_v2/memops.c b/src/common/dav_v2/memops.c
new file mode 100644
index 00000000000..a137ac28836
--- /dev/null
+++ b/src/common/dav_v2/memops.c
@@ -0,0 +1,677 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2022, Intel Corporation */
+
+/*
+ * memops.c -- aggregated memory operations helper implementation
+ *
+ * The operation collects all of the required memory modifications that
+ * need to happen in an atomic way (all of them or none), and abstracts
+ * away the storage type (transient/persistent) and the underlying
+ * implementation of how it's actually performed - in some cases using
+ * the redo log is unnecessary and the allocation process can be sped up
+ * a bit by completely omitting that whole machinery.
+ *
+ * The modifications are not visible until the context is processed.
+ */
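+
+/*
+ * Typical lifecycle of a context, as a sketch (the names 'hdr' and 'new_val'
+ * below are placeholders, not part of this module):
+ *
+ *	operation_start(ctx);
+ *	operation_add_entry(ctx, &hdr, new_val, ULOG_OPERATION_SET);
+ *	operation_process(ctx);
+ *	operation_finish(ctx, 0);
+ *
+ * Nothing is applied until operation_process(); until then the context only
+ * accumulates entries in its shadow and transient logs.
+ */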
+
+#include "memops.h"
+#include "obj.h"
+#include "out.h"
+#include "ravl.h"
+#include "valgrind_internal.h"
+#include "vecq.h"
+#include "sys_util.h"
+#include "dav_internal.h"
+#include "tx.h"
+
+static inline int
+OBJ_OFF_IS_VALID_FROM_CTX(void *ctx, uint64_t offset)
+{
+	dav_obj_t *dav_hdl = (dav_obj_t *)ctx;
+
+	return OBJ_OFF_IS_VALID(dav_hdl, offset);
+}
+
+#define ULOG_BASE_SIZE 1024
+#define OP_MERGE_SEARCH 64
+
+enum operation_state {
+	OPERATION_IDLE,
+	OPERATION_IN_PROGRESS,
+	OPERATION_CLEANUP,
+};
+
+struct operation_log {
+	size_t capacity; /* capacity of the ulog log */
+	size_t offset; /* data offset inside of the log */
+	struct ulog *ulog; /* DRAM allocated log of modifications */
+};
+
+/*
+ * operation_context -- context of an ongoing palloc operation
+ */
+struct operation_context {
+	enum log_type type;
+
+	ulog_extend_fn extend; /* function to allocate next ulog */
+	ulog_free_fn ulog_free; /* function to free next ulogs */
+
+	const struct mo_ops *p_ops;
+	struct mo_ops t_ops; /* used for transient data processing */
+	struct mo_ops s_ops; /* used for shadow copy data processing */
+
+	size_t ulog_curr_offset; /* offset in the log for buffer stores */
+	size_t ulog_curr_capacity; /* capacity of the current log */
+	size_t ulog_curr_gen_num; /* transaction counter in the current log */
+	struct ulog *ulog_curr; /* current persistent log */
+	size_t total_logged; /* total amount of buffer stores in the logs */
+
+	struct ulog *ulog; /* pointer to the ulog used by context for undo ops */
+	size_t ulog_base_nbytes; /* available bytes in initial ulog log */
+	size_t ulog_capacity; /* sum of capacity, incl all next ulog logs */
+	int ulog_auto_reserve; /* whether to automatically reserve ulog capacity */
+
+	struct ulog_next next; /* vector of 'next' fields of persistent ulog */
+
+	enum operation_state state; /* operation sanity check */
+
+	struct operation_log pshadow_ops; /* used by context for redo ops */
+	struct operation_log transient_ops; /* log of transient changes */
+
+	/* collection used to look for potential merge candidates */
+	VECQ(, struct ulog_entry_val *) merge_entries;
+};
+
+/*
+ * operation_log_transient_init -- (internal) initialize operation log
+ *	containing transient memory resident changes
+ */
+static int
+operation_log_transient_init(struct operation_log *log)
+{
+	struct ulog *src;
+
+	log->capacity = ULOG_BASE_SIZE;
+	log->offset = 0;
+
+	D_ALLOC(src, (sizeof(struct ulog) + ULOG_BASE_SIZE));
+	if (src == NULL) {
+		D_CRIT("Zalloc!\n");
+		return -1;
+	}
+
+	/* initialize underlying redo log structure */
+	src->capacity = ULOG_BASE_SIZE;
+
+	log->ulog = src;
+
+	return 0;
+}
+
+/*
+ * operation_log_persistent_init -- (internal) initialize operation log
+ *	containing persistent memory resident changes
+ */
+static int
+operation_log_persistent_init(struct operation_log *log,
+	size_t ulog_base_nbytes)
+{
+	struct ulog *src;
+
+	log->capacity = ULOG_BASE_SIZE;
+	log->offset = 0;
+
+	D_ALLOC(src, (sizeof(struct ulog) + ULOG_BASE_SIZE));
+	if (src == NULL) {
+		D_CRIT("Zalloc!\n");
+		return -1;
+	}
+
+	/* initialize underlying redo log structure */
+	src->capacity = ULOG_BASE_SIZE;
+	memset(src->unused, 0, sizeof(src->unused));
+
+	log->ulog = src;
+
+	return 0;
+}
+
+/*
+ * operation_transient_clean -- cleans pmemcheck address state
+ */
+static int
+operation_transient_clean(void *base, const void *addr, size_t len,
+	unsigned flags)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(base, flags);
+
+	VALGRIND_SET_CLEAN(addr, len);
+
+	return 0;
+}
+
+/*
+ * operation_transient_drain -- noop
+ */
+static void
+operation_transient_drain(void *base)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(base);
+}
+
+/*
+ * operation_transient_memcpy -- transient memcpy wrapper
+ */
+static void *
+operation_transient_memcpy(void *base, void *dest, const void *src, size_t len,
+	unsigned flags)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(base, flags);
+
+	return memcpy(dest, src, len);
+}
+
+/*
+ * operation_new -- creates new operation context
+ */
+struct operation_context *
+operation_new(struct ulog *ulog, size_t ulog_base_nbytes,
+	ulog_extend_fn extend, ulog_free_fn ulog_free,
+	const struct mo_ops *p_ops, enum log_type type)
+{
+
+	SUPPRESS_UNUSED(p_ops);
+
+	struct operation_context *ctx;
+
+	D_ALLOC_PTR(ctx);
+	if (ctx == NULL) {
+		D_CRIT("Zalloc!\n");
+		goto error_ctx_alloc;
+	}
+
+	ctx->ulog = ulog;
+	ctx->ulog_base_nbytes = ulog_base_nbytes;
+	ctx->ulog_capacity = ulog_capacity(ulog,
+		ulog_base_nbytes);
+	ctx->extend = extend;
+	ctx->ulog_free = ulog_free;
+	ctx->state = OPERATION_IDLE;
+	VEC_INIT(&ctx->next);
+	ulog_rebuild_next_vec(ulog, &ctx->next);
+	ctx->p_ops = p_ops;
+	ctx->type = type;
+
+	ctx->ulog_curr_offset = 0;
+	ctx->ulog_curr_capacity = 0;
+	ctx->ulog_curr = NULL;
+
+	ctx->t_ops.base = NULL;
+	ctx->t_ops.flush = operation_transient_clean;
+	ctx->t_ops.memcpy = operation_transient_memcpy;
+	ctx->t_ops.drain = operation_transient_drain;
+
+	ctx->s_ops.base = p_ops->base;
+	ctx->s_ops.flush = operation_transient_clean;
+	ctx->s_ops.memcpy = operation_transient_memcpy;
+	ctx->s_ops.drain = operation_transient_drain;
+
+	VECQ_INIT(&ctx->merge_entries);
+
+	if (operation_log_transient_init(&ctx->transient_ops) != 0)
+		goto error_ulog_alloc;
+
+	if (operation_log_persistent_init(&ctx->pshadow_ops,
+	    ulog_base_nbytes) != 0)
+		goto error_ulog_alloc;
+
+	return ctx;
+
+error_ulog_alloc:
+	operation_delete(ctx);
+error_ctx_alloc:
+	return NULL;
+}
+
+/*
+ * operation_delete -- deletes operation context
+ */
+void
+operation_delete(struct operation_context *ctx)
+{
+	VECQ_DELETE(&ctx->merge_entries);
+	VEC_DELETE(&ctx->next);
+	D_FREE(ctx->pshadow_ops.ulog);
+	D_FREE(ctx->transient_ops.ulog);
+	D_FREE(ctx);
+}
+
+/*
+ * operation_free_logs -- free all logs except first
+ */
+void
+operation_free_logs(struct operation_context *ctx)
+{
+	int freed = ulog_free_next(ctx->ulog, ctx->ulog_free);
+
+	if (freed) {
+		ctx->ulog_capacity = ulog_capacity(ctx->ulog,
+			ctx->ulog_base_nbytes);
+		VEC_CLEAR(&ctx->next);
+		ulog_rebuild_next_vec(ctx->ulog, &ctx->next);
+	}
+
+	ASSERTeq(VEC_SIZE(&ctx->next), 0);
+}
+
+/*
+ * operation_merge -- (internal) performs operation on a field
+ */
+static inline int
+operation_merge(struct ulog_entry_base *entry, uint64_t value,
+	ulog_operation_type type)
+{
+	struct ulog_entry_val *e = (struct ulog_entry_val *)entry;
+	uint16_t num, num1, num2;
+	uint32_t pos, pos1, pos2;
+
+	switch (type) {
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+	case ULOG_OPERATION_AND:
+		e->value &= value;
+		break;
+	case ULOG_OPERATION_OR:
+		e->value |= value;
+		break;
+#else
+	case ULOG_OPERATION_SET_BITS:
+	case ULOG_OPERATION_CLR_BITS:
+		num1 = ULOG_ENTRY_VAL_TO_BITS(e->value);
+		pos1 = ULOG_ENTRY_VAL_TO_POS(e->value);
+		num2 = ULOG_ENTRY_VAL_TO_BITS(value);
+		pos2 = ULOG_ENTRY_VAL_TO_POS(value);
+
+		if ((pos2 > pos1 + num1) || (pos1 > pos2 + num2))
+			return 0; /* there is a gap, no merge */
+
+		pos = MIN(pos1, pos2);
+		num = MAX(pos1 + num1, pos2 + num2) - pos;
+
+		e->value = ULOG_ENTRY_TO_VAL(pos, num);
+		break;
+#endif
+	case ULOG_OPERATION_SET:
+		e->value = value;
+		break;
+	default:
+		ASSERT(0); /* unreachable */
+	}
+	return 1;
+}
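+
+/*
+ * Example of the SET_BITS/CLR_BITS merge above (illustrative only): an
+ * existing entry covering pos1 == 4, num1 == 8 (bits 4..11) and an incoming
+ * range pos2 == 10, num2 == 4 (bits 10..13) overlap, so the entry is
+ * rewritten as pos == 4, num == MAX(12, 14) - 4 == 10 (bits 4..13).  Had
+ * pos2 been 13, i.e. beyond pos1 + num1 == 12, the gap check would reject
+ * the merge and a separate entry would be appended instead.
+ */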
+
+/*
+ * operation_try_merge_entry -- tries to merge the incoming log entry with
+ *	existing entries
+ *
+ * Because this requires a reverse foreach, it cannot be implemented using
+ * the on-media ulog log structure since there's no way to find the
+ * previous entry in the log. Instead, the last N entries are stored
+ * in a collection and traversed backwards.
+ */
+static int
+operation_try_merge_entry(struct operation_context *ctx,
+	void *ptr, uint64_t value, ulog_operation_type type)
+{
+	int ret = 0;
+	uint64_t offset = OBJ_PTR_TO_OFF(ctx->p_ops->base, ptr);
+
+	struct ulog_entry_val *e;
+
+	VECQ_FOREACH_REVERSE(e, &ctx->merge_entries) {
+		if (ulog_entry_offset(&e->base) == offset) {
+			if (ulog_entry_type(&e->base) == type) {
+				if (operation_merge(&e->base, value, type))
+					return 1;
+			}
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * operation_merge_entry_add -- adds a new entry to the merge collection,
+ *	keeps capacity at OP_MERGE_SEARCH. Removes old entries in FIFO fashion.
+ */
+static void
+operation_merge_entry_add(struct operation_context *ctx,
+	struct ulog_entry_val *entry)
+{
+	if (VECQ_SIZE(&ctx->merge_entries) == OP_MERGE_SEARCH)
+		(void) VECQ_DEQUEUE(&ctx->merge_entries);
+
+	if (VECQ_ENQUEUE(&ctx->merge_entries, entry) != 0) {
+		/* this is fine, only runtime perf will get slower */
+		D_CRIT("out of memory - unable to track entries\n");
+	}
+}
+
+/*
+ * operation_add_typed_entry -- adds a new entry to the current operation; if
+ *	an entry for the same ptr address and operation type already exists,
+ *	the new value is merged into it instead of appending a new entry.
+ */
+int
+operation_add_typed_entry(struct operation_context *ctx,
+	void *ptr, uint64_t value,
+	ulog_operation_type type, enum operation_log_type log_type)
+{
+	struct operation_log *oplog = log_type == LOG_PERSISTENT ?
+		&ctx->pshadow_ops : &ctx->transient_ops;
+
+	/*
+	 * Always make sure to have one extra spare cacheline so that the
+	 * ulog log entry creation has enough room for zeroing.
+	 */
+	if (oplog->offset + CACHELINE_SIZE == oplog->capacity) {
+		size_t ncapacity = oplog->capacity + ULOG_BASE_SIZE;
+		struct ulog *ulog;
+
+		D_REALLOC_NZ(ulog, oplog->ulog, SIZEOF_ULOG(ncapacity));
+		if (ulog == NULL)
+			return -1;
+		oplog->capacity += ULOG_BASE_SIZE;
+		oplog->ulog = ulog;
+		oplog->ulog->capacity = oplog->capacity;
+
+		/*
+		 * Realloc invalidated the ulog entries that are inside of this
+		 * vector, need to clear it to avoid use after free.
+		 */
+		VECQ_CLEAR(&ctx->merge_entries);
+	}
+
+	if (log_type == LOG_PERSISTENT &&
+		operation_try_merge_entry(ctx, ptr, value, type) != 0)
+		return 0;
+
+	struct ulog_entry_val *entry = ulog_entry_val_create(
+		oplog->ulog, oplog->offset, ptr, value, type,
+		log_type == LOG_TRANSIENT ? &ctx->t_ops : &ctx->s_ops);
+
+	if (log_type == LOG_PERSISTENT)
+		operation_merge_entry_add(ctx, entry);
+
+	oplog->offset += ulog_entry_size(&entry->base);
+
+	return 0;
+}
+
+
+/*
+ * operation_add_entry -- adds a new entry to the current operation, with the
+ *	log type (persistent or transient) autodetected based on the memory location
+ */
+int
+operation_add_entry(struct operation_context *ctx, void *ptr, uint64_t value,
+	ulog_operation_type type)
+{
+	const struct mo_ops *p_ops = ctx->p_ops;
+	dav_obj_t *pop = (dav_obj_t *)p_ops->base;
+
+	int from_pool = OBJ_PTR_IS_VALID(pop, ptr);
+
+	return operation_add_typed_entry(ctx, ptr, value, type,
+		from_pool ? LOG_PERSISTENT : LOG_TRANSIENT);
+}
+
+/*
+ * operation_add_buffer -- adds a buffer operation to the log
+ */
+int
+operation_add_buffer(struct operation_context *ctx,
+	void *dest, void *src, size_t size, ulog_operation_type type)
+{
+	size_t real_size = size + sizeof(struct ulog_entry_buf);
+
+	/* if there's no space left in the log, reserve some more */
+	if (ctx->ulog_curr_capacity == 0) {
+		ctx->ulog_curr_gen_num = ctx->ulog->gen_num;
+		if (operation_reserve(ctx, ctx->total_logged + real_size) != 0)
+			return -1;
+
+		ctx->ulog_curr = ctx->ulog_curr == NULL ? ctx->ulog :
+			ulog_next(ctx->ulog_curr);
+		ASSERTne(ctx->ulog_curr, NULL);
+		ctx->ulog_curr_offset = 0;
+		ctx->ulog_curr_capacity = ctx->ulog_curr->capacity;
+	}
+
+	size_t curr_size = MIN(real_size, ctx->ulog_curr_capacity);
+	size_t data_size = curr_size - sizeof(struct ulog_entry_buf);
+	size_t entry_size = ALIGN_UP(curr_size, CACHELINE_SIZE);
+
+	/*
+	 * To make sure that the log is consistent and contiguous, we need to
+	 * make sure that the header of the entry that would be located
+	 * immediately after this one is zeroed.
+	 */
+	struct ulog_entry_base *next_entry = NULL;
+
+	if (entry_size == ctx->ulog_curr_capacity) {
+		struct ulog *u = ulog_next(ctx->ulog_curr);
+
+		if (u != NULL)
+			next_entry = (struct ulog_entry_base *)u->data;
+	} else {
+		size_t next_entry_offset = ctx->ulog_curr_offset + entry_size;
+
+		next_entry = (struct ulog_entry_base *)(ctx->ulog_curr->data +
+			next_entry_offset);
+	}
+	if (next_entry != NULL)
+		ulog_clobber_entry(next_entry);
+
+	/* create a persistent log entry */
+	struct ulog_entry_buf *e = ulog_entry_buf_create(ctx->ulog_curr,
+		ctx->ulog_curr_offset,
+		ctx->ulog_curr_gen_num,
+		dest, src, data_size,
+		type, ctx->p_ops);
+	ASSERT(entry_size == ulog_entry_size(&e->base));
+	ASSERT(entry_size <= ctx->ulog_curr_capacity);
+
+	ctx->total_logged += entry_size;
+	ctx->ulog_curr_offset += entry_size;
+	ctx->ulog_curr_capacity -= entry_size;
+
+	/*
+	 * Recursively add the data to the log until the entire buffer is
+	 * processed.
+	 */
+	return size - data_size == 0 ? 0 : operation_add_buffer(ctx,
+			(char *)dest + data_size,
+			(char *)src + data_size,
+			size - data_size, type);
+}
+
+/*
+ * operation_set_auto_reserve -- set auto reserve value for context
+ */
+void
+operation_set_auto_reserve(struct operation_context *ctx, int auto_reserve)
+{
+	ctx->ulog_auto_reserve = auto_reserve;
+}
+
+/*
+ * operation_process_persistent_redo -- (internal) process using ulog
+ */
+static void
+operation_process_persistent_redo(struct operation_context *ctx)
+{
+	ASSERTeq(ctx->pshadow_ops.capacity % CACHELINE_SIZE, 0);
+
+	/* Copy the redo log to wal redo */
+	ulog_foreach_entry(ctx->pshadow_ops.ulog, tx_create_wal_entry,
+			   NULL, ctx->p_ops);
+
+	ulog_process(ctx->pshadow_ops.ulog, OBJ_OFF_IS_VALID_FROM_CTX,
+		ctx->p_ops);
+
+	ulog_clobber(ctx->ulog, &ctx->next);
+}
+
+/*
+ * operation_reserve -- (internal) reserves new capacity in persistent ulog log
+ */
+int
+operation_reserve(struct operation_context *ctx, size_t new_capacity)
+{
+	if ((ctx->type == LOG_TYPE_UNDO) && (new_capacity > ctx->ulog_capacity)) {
+		if (ctx->extend == NULL) {
+			ERR("no extend function present");
+			return -1;
+		}
+
+		if (ulog_reserve(ctx->ulog,
+		    ctx->ulog_base_nbytes,
+		    ctx->ulog_curr_gen_num,
+		    ctx->ulog_auto_reserve,
+		    &new_capacity, ctx->extend,
+		    &ctx->next) != 0)
+			return -1;
+		ctx->ulog_capacity = new_capacity;
+	}
+
+	return 0;
+}
+
+/*
+ * operation_init -- initializes runtime state of an operation
+ */
+void
+operation_init(struct operation_context *ctx)
+{
+	struct operation_log *plog = &ctx->pshadow_ops;
+	struct operation_log *tlog = &ctx->transient_ops;
+
+	VALGRIND_ANNOTATE_NEW_MEMORY(ctx, sizeof(*ctx));
+	VALGRIND_ANNOTATE_NEW_MEMORY(tlog->ulog, sizeof(struct ulog) +
+		tlog->capacity);
+	VALGRIND_ANNOTATE_NEW_MEMORY(plog->ulog, sizeof(struct ulog) +
+		plog->capacity);
+	tlog->offset = 0;
+	plog->offset = 0;
+	VECQ_REINIT(&ctx->merge_entries);
+
+	ctx->ulog_curr_offset = 0;
+	ctx->ulog_curr_capacity = 0;
+	ctx->ulog_curr_gen_num = 0;
+	ctx->ulog_curr = NULL;
+	ctx->total_logged = 0;
+	ctx->ulog_auto_reserve = 1;
+}
+
+/*
+ * operation_start -- initializes and starts a new operation
+ */
+void
+operation_start(struct operation_context *ctx)
+{
+	operation_init(ctx);
+	ASSERTeq(ctx->state, OPERATION_IDLE);
+	ctx->state = OPERATION_IN_PROGRESS;
+}
+
+/*
+ * operation_cancel -- cancels a running operation
+ */
+void
+operation_cancel(struct operation_context *ctx)
+{
+	ASSERTeq(ctx->state, OPERATION_IN_PROGRESS);
+	ctx->state = OPERATION_IDLE;
+}
+
+/*
+ * operation_process -- processes registered operations
+ *
+ * The order of processing is important: persistent, transient.
+ * This is because transient entries that reside on persistent memory might
+ * require a write to a location that is currently occupied by valid
+ * persistent state but becomes transient state after the operation is
+ * processed.
+ */
+void
+operation_process(struct operation_context *ctx)
+{
+	/*
+	 * If there's exactly one persistent entry, there's no need to involve
+	 * the redo log. We can simply assign the value, as the assignment
+	 * itself is atomic.
+	 */
+	int redo_process = ctx->type == LOG_TYPE_REDO &&
+		ctx->pshadow_ops.offset != 0;
+	if (redo_process &&
+	    ctx->pshadow_ops.offset == sizeof(struct ulog_entry_val)) {
+		struct ulog_entry_base *e = (struct ulog_entry_base *)
+			ctx->pshadow_ops.ulog->data;
+		ulog_operation_type t = ulog_entry_type(e);
+
+		if ((t == ULOG_OPERATION_SET) || ULOG_ENTRY_IS_BIT_OP(t)) {
+			tx_create_wal_entry(e, NULL, ctx->p_ops);
+			ulog_entry_apply(e, 1, ctx->p_ops);
+			redo_process = 0;
+		}
+	}
+
+	if (redo_process) {
+		operation_process_persistent_redo(ctx);
+		ctx->state = OPERATION_CLEANUP;
+	}
+	D_ASSERT(ctx->type != LOG_TYPE_UNDO);
+
+	/* process transient entries with transient memory ops */
+	if (ctx->transient_ops.offset != 0)
+		ulog_process(ctx->transient_ops.ulog, NULL, &ctx->t_ops);
+}
+
+/*
+ * operation_finish -- finalizes the operation
+ */
+void
+operation_finish(struct operation_context *ctx, unsigned flags)
+{
+	ASSERTne(ctx->state, OPERATION_IDLE);
+
+	if (ctx->type == LOG_TYPE_UNDO && ctx->total_logged != 0)
+		ctx->state = OPERATION_CLEANUP;
+
+	if (ctx->state != OPERATION_CLEANUP)
+		goto out;
+
+	if (ctx->type == LOG_TYPE_UNDO) {
+		int ret = ulog_clobber_data(ctx->ulog,
+			&ctx->next, ctx->ulog_free, flags);
+
+		if (ret == 0)
+			goto out;
+	} else if (ctx->type == LOG_TYPE_REDO) {
+		int ret = ulog_free_next(ctx->ulog, ctx->ulog_free);
+
+		if (ret == 0)
+			goto out;
+	}
+
+	/* clobbering shrunk the ulog */
+	ctx->ulog_capacity = ulog_capacity(ctx->ulog,
+		ctx->ulog_base_nbytes);
+	VEC_CLEAR(&ctx->next);
+	ulog_rebuild_next_vec(ctx->ulog, &ctx->next);
+
+out:
+	ctx->state = OPERATION_IDLE;
+}
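
The redo path above follows a fixed lifecycle: operation_start(), one or more operation_add_entry()/operation_add_buffer() calls, operation_process(), operation_finish(). A minimal sketch of that sequence, illustrative only and not part of the patch; it assumes ctx was created by operation_new() with LOG_TYPE_REDO and that dst points into the managed heap:

#include "memops.h"

/* Publish a single 8-byte store through the redo log. With exactly one
 * SET entry, operation_process() applies it directly instead of going
 * through the persistent redo path.
 */
static void
publish_single_word(struct operation_context *ctx, uint64_t *dst, uint64_t val)
{
	operation_start(ctx);                                   /* OPERATION_IDLE -> IN_PROGRESS */
	operation_add_entry(ctx, dst, val, ULOG_OPERATION_SET); /* queue the store */
	operation_process(ctx);                                 /* apply the queued entries */
	operation_finish(ctx, 0);                               /* back to OPERATION_IDLE */
}
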
diff --git a/src/common/dav_v2/memops.h b/src/common/dav_v2/memops.h
new file mode 100644
index 00000000000..035105de0c5
--- /dev/null
+++ b/src/common/dav_v2/memops.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2020, Intel Corporation */
+
+/*
+ * memops.h -- aggregated memory operations helper definitions
+ */
+
+#ifndef __DAOS_COMMON_MEMOPS_H
+#define __DAOS_COMMON_MEMOPS_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "vec.h"
+#include "mo_wal.h"
+#include "ulog.h"
+
+enum operation_log_type {
+	LOG_PERSISTENT, /* log of persistent modifications */
+	LOG_TRANSIENT, /* log of transient memory modifications */
+
+	MAX_OPERATION_LOG_TYPE
+};
+
+enum log_type {
+	LOG_TYPE_UNDO,
+	LOG_TYPE_REDO,
+
+	MAX_LOG_TYPE,
+};
+
+struct user_buffer_def {
+	void *addr;
+	size_t size;
+};
+
+struct operation_context;
+
+struct operation_context *
+operation_new(struct ulog *redo, size_t ulog_base_nbytes,
+	ulog_extend_fn extend, ulog_free_fn ulog_free,
+	const struct mo_ops *p_ops, enum log_type type);
+
+void operation_init(struct operation_context *ctx);
+void operation_start(struct operation_context *ctx);
+
+void operation_delete(struct operation_context *ctx);
+void operation_free_logs(struct operation_context *ctx);
+
+int operation_add_buffer(struct operation_context *ctx,
+	void *dest, void *src, size_t size, ulog_operation_type type);
+
+int operation_add_entry(struct operation_context *ctx,
+	void *ptr, uint64_t value, ulog_operation_type type);
+int operation_add_typed_entry(struct operation_context *ctx,
+	void *ptr, uint64_t value,
+	ulog_operation_type type, enum operation_log_type log_type);
+void operation_set_auto_reserve(struct operation_context *ctx,
+		int auto_reserve);
+
+int operation_reserve(struct operation_context *ctx, size_t new_capacity);
+void operation_process(struct operation_context *ctx);
+void operation_finish(struct operation_context *ctx, unsigned flags);
+void operation_cancel(struct operation_context *ctx);
+
+#endif /* __DAOS_COMMON_MEMOPS_H */
diff --git a/src/common/dav_v2/mo_wal.h b/src/common/dav_v2/mo_wal.h
new file mode 100644
index 00000000000..9f05eca72a9
--- /dev/null
+++ b/src/common/dav_v2/mo_wal.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2020, Intel Corporation */
+
+#ifndef __DAOS_COMMON_MO_WAL_H
+#define __DAOS_COMMON_MO_WAL_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#include "out.h"
+#include "wal_tx.h"
+
+typedef int (*persist_fn)(void *base, const void *, size_t, unsigned);
+typedef int (*flush_fn)(void *base, const void *, size_t, unsigned);
+typedef void (*drain_fn)(void *base);
+
+typedef void *(*memcpy_fn)(void *base, void *dest, const void *src, size_t len,
+		unsigned flags);
+typedef void *(*memmove_fn)(void *base, void *dest, const void *src, size_t len,
+		unsigned flags);
+typedef void *(*memset_fn)(void *base, void *dest, int c, size_t len,
+		unsigned flags);
+
+typedef int (*remote_read_fn)(void *ctx, uintptr_t base, void *dest, void *addr,
+		size_t length);
+
+struct mo_ops {
+	/* for 'master' replica: with or without data replication */
+	persist_fn persist;	/* persist function */
+	flush_fn flush;		/* flush function */
+	drain_fn drain;		/* drain function */
+	memcpy_fn memcpy; /* persistent memcpy function */
+	memmove_fn memmove; /* persistent memmove function */
+	memset_fn memset; /* persistent memset function */
+	void *base;
+
+	struct remote_ops {
+		remote_read_fn read;
+
+		void *ctx;
+		uintptr_t base;
+	} remote;
+};
+
+static force_inline void
+mo_wal_persist(const struct mo_ops *p_ops, void *d, size_t s)
+{
+	dav_wal_tx_snap(p_ops->base, d, s, d, 0);
+}
+
+static force_inline void
+mo_wal_flush(const struct mo_ops *p_ops, void *d, size_t s, int flags)
+{
+	dav_wal_tx_snap(p_ops->base, d, s, d, flags);
+}
+
+static force_inline void
+mo_wal_drain(const struct mo_ops *p_ops)
+{
+	SUPPRESS_UNUSED(p_ops);
+}
+
+static force_inline void *
+mo_wal_memcpy(const struct mo_ops *p_ops, void *dest,
+		const void *src, size_t len, unsigned flags)
+{
+	SUPPRESS_UNUSED(p_ops);
+	memcpy(dest, src, len);
+	mo_wal_flush(p_ops, dest, len, 0);
+	return dest;
+}
+
+static force_inline void *
+mo_wal_memmove(const struct mo_ops *p_ops, void *dest,
+		const void *src, size_t len, unsigned flags)
+{
+	SUPPRESS_UNUSED(p_ops);
+	memmove(dest, src, len);
+	mo_wal_flush(p_ops, dest, len, 0);
+	return dest;
+}
+
+static force_inline void *
+mo_wal_memset(const struct mo_ops *p_ops, void *dest, int c,
+		size_t len, unsigned flags)
+{
+	SUPPRESS_UNUSED(p_ops);
+	memset(dest, c, len);
+	dav_wal_tx_set(p_ops->base, dest, c, len);
+	return dest;
+}
+
+#endif /* __DAOS_COMMON_MO_WAL_H */
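
The mo_wal_* helpers above route writes through dav_wal_tx_snap()/dav_wal_tx_set(), while the callback slots in struct mo_ops are filled in by the heap setup code. A hedged sketch of populating the table with no-op callbacks; the example_* names are invented for illustration:

#include "mo_wal.h"

/* No-op callbacks matching the typedefs above; a real table is built by
 * the dav_v2 heap bring-up code with a pool-specific handle in "base".
 */
static int
example_persist(void *base, const void *addr, size_t len, unsigned flags)
{
	(void)base; (void)addr; (void)len; (void)flags;
	return 0;
}

static void
example_drain(void *base)
{
	(void)base;
}

static struct mo_ops example_mo_ops = {
	.persist = example_persist,
	.flush   = example_persist, /* flush_fn has the same signature as persist_fn */
	.drain   = example_drain,
	.base    = NULL,            /* in practice, the pool/heap handle */
};
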
diff --git a/src/common/dav_v2/obj.h b/src/common/dav_v2/obj.h
new file mode 100644
index 00000000000..470323da1ef
--- /dev/null
+++ b/src/common/dav_v2/obj.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2023, Intel Corporation */
+
+/*
+ * obj.h -- internal definitions for obj module
+ */
+
+#ifndef __DAOS_COMMON_OBJ_H
+#define __DAOS_COMMON_OBJ_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "dav_internal.h"
+#include "stats.h"
+
+#define OBJ_OFF_TO_PTR(pop, off) ((void *)((uintptr_t)(((dav_obj_t *)(pop))->do_base) + (off)))
+#define OBJ_PTR_TO_OFF(pop, ptr) ((uintptr_t)(ptr) - (uintptr_t)(((dav_obj_t *)(pop))->do_base))
+#define OBJ_OFF_FROM_HEAP(pop, off)\
+	((off) >= ((dav_obj_t *)(pop))->do_phdr->dp_heap_offset &&\
+	(off) < ((dav_obj_t *)(pop))->do_phdr->dp_heap_offset +\
+		((dav_obj_t *)(pop))->do_phdr->dp_heap_size)
+
+#define OBJ_OFF_IS_VALID(pop, off)\
+	(OBJ_OFF_FROM_HEAP(pop, off) ||\
+	(OBJ_PTR_TO_OFF(pop, &((dav_obj_t *)(pop))->do_phdr->dp_root_offset) == (off)) ||\
+	(OBJ_PTR_TO_OFF(pop, &((dav_obj_t *)(pop))->do_phdr->dp_root_size) == (off)))
+
+#define OBJ_PTR_IS_VALID(pop, ptr)\
+	OBJ_OFF_IS_VALID(pop, OBJ_PTR_TO_OFF(pop, ptr))
+
+#define OBJ_PTR_FROM_POOL(pop, ptr)\
+	((uintptr_t)(ptr) >= (uintptr_t)(((dav_obj_t *)pop)->do_base) &&\
+	(uintptr_t)(ptr) < (uintptr_t)(((dav_obj_t *)pop)->do_base) +\
+		(((dav_obj_t *)pop)->do_phdr->dp_heap_offset +\
+		 ((dav_obj_t *)pop)->do_phdr->dp_heap_size))
+
+#define	OBJ_OFFRANGE_FROM_HEAP(pop, start, end)\
+	(((start) >= ((dav_obj_t *)pop)->do_phdr->dp_heap_offset) &&\
+	 ((end) <=  (((dav_obj_t *)pop)->do_phdr->dp_heap_offset + \
+		     ((dav_obj_t *)pop)->do_phdr->dp_heap_size)))
+
+typedef uint64_t type_num_t;
+
+#define CLASS_ID_FROM_FLAG(flag)\
+((uint16_t)((flag) >> 48))
+
+#define EZONE_ID_FROM_FLAG(flag) ((uint32_t)((flag) >> 16))
+
+#endif /* __DAOS_COMMON_OBJ_H */
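
A small sketch of how the offset macros above are meant to be used; illustrative only, assuming pop is an open dav_obj_t with do_base and do_phdr already set up:

#include "obj.h"
#include "out.h"

/* Round-trip an offset through the pointer macros and check that it lies
 * within the heap area described by the pool header.
 */
static int
example_offset_in_heap(dav_obj_t *pop, uint64_t off)
{
	void *ptr = OBJ_OFF_TO_PTR(pop, off);

	ASSERTeq(OBJ_PTR_TO_OFF(pop, ptr), off); /* inverse mapping */

	return OBJ_OFF_FROM_HEAP(pop, off) && OBJ_PTR_FROM_POOL(pop, ptr);
}
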
diff --git a/src/common/dav_v2/out.h b/src/common/dav_v2/out.h
new file mode 100644
index 00000000000..ebe12044db4
--- /dev/null
+++ b/src/common/dav_v2/out.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2021, Intel Corporation */
+
+/*
+ * out.h -- definitions for "out" module
+ */
+
+#ifndef __DAOS_COMMON_OUT_H
+#define __DAOS_COMMON_OUT_H 1
+
+#include <daos/debug.h>
+#include "util.h"
+
+#define DAV_LOG_FAC DB_TRACE
+
+/* enable extra debug messages and extra checks */
+/*#define DAV_EXTRA_DEBUG*/
+
+#ifndef EVALUATE_DBG_EXPRESSIONS
+#if defined(DAV_EXTRA_DEBUG) || defined(__clang_analyzer__) || defined(__COVERITY__) ||\
+	defined(__KLOCWORK__)
+#define EVALUATE_DBG_EXPRESSIONS 1
+#else
+#define EVALUATE_DBG_EXPRESSIONS 0
+#endif
+#endif
+
+#define TEST_ALWAYS_TRUE_EXPR(cnd) do {	\
+	if (__builtin_constant_p(cnd))	\
+		COMPILE_ERROR_ON(cnd);	\
+} while (0)
+#define TEST_ALWAYS_EQ_EXPR(lhs, rhs) do {				\
+	if (__builtin_constant_p(lhs) && __builtin_constant_p(rhs))	\
+		COMPILE_ERROR_ON((lhs) == (rhs));			\
+} while (0)
+#define TEST_ALWAYS_NE_EXPR(lhs, rhs) do {				\
+	if (__builtin_constant_p(lhs) && __builtin_constant_p(rhs))	\
+		COMPILE_ERROR_ON((lhs) != (rhs));			\
+} while (0)
+
+/* produce debug/trace output */
+#if defined(DAV_EXTRA_DEBUG)
+#define DAV_DBG(fmt, ...) D_DEBUG(DAV_LOG_FAC, fmt "\n", ##__VA_ARGS__)
+#else
+#define DAV_DBG(fmt, ...) SUPPRESS_UNUSED(__VA_ARGS__)
+#endif
+
+/* produce output and exit */
+#define FATAL(fmt, ...)					\
+	D_ASSERTF(0, fmt "\n", ## __VA_ARGS__)
+
+/* assert a condition is true at runtime */
+#define ASSERT_rt(cnd) do {				\
+	if (!EVALUATE_DBG_EXPRESSIONS || (cnd))		\
+		break;					\
+	D_ASSERT(cnd);					\
+} while (0)
+
+/* assert two integer values are equal at runtime */
+#define ASSERTeq_rt(lhs, rhs) do {			\
+	if (!EVALUATE_DBG_EXPRESSIONS || ((lhs) == (rhs)))\
+		break; \
+	D_ASSERTF(((lhs) == (rhs)),			\
+	"assertion failure: %s (0x%llx) == %s (0x%llx)", #lhs,\
+	(unsigned long long)(lhs), #rhs, (unsigned long long)(rhs)); \
+} while (0)
+
+/* assert two integer values are not equal at runtime */
+#define ASSERTne_rt(lhs, rhs) do {			\
+	if (!EVALUATE_DBG_EXPRESSIONS || ((lhs) != (rhs)))\
+		break;					\
+	D_ASSERTF(((lhs) != (rhs)),			\
+	"assertion failure: %s (0x%llx) != %s (0x%llx)", #lhs,\
+	(unsigned long long)(lhs), #rhs, (unsigned long long)(rhs)); \
+} while (0)
+
+/*
+ * Detect useless asserts on always true expression. Please use
+ * COMPILE_ERROR_ON(!cnd) or ASSERT_rt(cnd) in such cases.
+ */
+/* assert a condition is true */
+#define ASSERT(cnd) do {\
+		TEST_ALWAYS_TRUE_EXPR(cnd);\
+		ASSERT_rt(cnd);\
+	} while (0)
+
+/* assert two integer values are equal */
+#define ASSERTeq(lhs, rhs) do {\
+		/* See comment in ASSERT. */\
+		TEST_ALWAYS_EQ_EXPR(lhs, rhs);\
+		ASSERTeq_rt(lhs, rhs);\
+	} while (0)
+
+/* assert two integer values are not equal */
+#define ASSERTne(lhs, rhs) do {\
+		/* See comment in ASSERT. */\
+		TEST_ALWAYS_NE_EXPR(lhs, rhs);\
+		ASSERTne_rt(lhs, rhs);\
+	} while (0)
+
+#define ERR(fmt, ...)\
+	D_ERROR(fmt "\n", ## __VA_ARGS__)
+
+#endif /* __DAOS_COMMON_OUT_H */
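
A short sketch of the assertion macros in use; illustrative only. They expand to no-ops unless EVALUATE_DBG_EXPRESSIONS is enabled, e.g. by defining DAV_EXTRA_DEBUG:

#include "out.h"

static void
example_checks(size_t used, size_t capacity)
{
	ASSERT(used <= capacity); /* plain boolean condition */
	ASSERTne(capacity, 0);    /* prints both values on failure */
	if (used > capacity)
		ERR("used (%zu) exceeds capacity (%zu)", used, capacity);
}
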
diff --git a/src/common/dav_v2/palloc.c b/src/common/dav_v2/palloc.c
new file mode 100644
index 00000000000..cf73303d655
--- /dev/null
+++ b/src/common/dav_v2/palloc.c
@@ -0,0 +1,977 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * palloc.c -- implementation of pmalloc POSIX-like API
+ *
+ * This is the front-end part of the persistent memory allocator. It uses both
+ * transient and persistent representation of the heap to provide memory blocks
+ * in a reasonable time and with an acceptable common-case fragmentation.
+ *
+ * Lock ordering in the entirety of the allocator is simple, but might be hard
+ * to follow at times because locks are, by necessity, externalized.
+ * There are two sets of locks that need to be taken into account:
+ *	- runtime state locks, represented by buckets.
+ *	- persistent state locks, represented by memory block mutexes.
+ *
+ * To properly use them, follow these rules:
+ *	- When nesting, always lock runtime state first.
+ *	Doing the reverse might cause deadlocks in other parts of the code.
+ *
+ *	- When introducing functions that would require runtime state locks,
+ *	always try to move lock acquisition to the uppermost layer. This
+ *	usually means that the functions will simply take "struct bucket" as
+ *	their argument. By doing so most of the locking can happen in
+ *	the frontend part of the allocator and it's easier to follow the first
+ *	rule because all functions in the backend can safely use the persistent
+ *	state locks - the runtime lock, if it is needed, will be already taken
+ *	by the upper layer.
+ *
+ * General lock ordering:
+ *	1. arenas.lock
+ *	2. buckets (sorted by ID)
+ *	3. memory blocks (sorted by lock address)
+ */
+
+#include "bucket.h"
+#include "valgrind_internal.h"
+#include "heap_layout.h"
+#include "heap.h"
+#include "alloc_class.h"
+#include "out.h"
+#include "sys_util.h"
+#include "palloc.h"
+#include "ravl.h"
+#include "vec.h"
+
+struct dav_action_internal {
+	/* type of operation (alloc/free vs set) */
+	enum dav_action_type type;
+
+	uint32_t padding;
+
+	/*
+	 * Action-specific lock that needs to be taken for the duration of
+	 * an action.
+	 */
+	pthread_mutex_t *lock;
+
+	/* action-specific data */
+	union {
+		/* valid only when type == DAV_ACTION_TYPE_HEAP */
+		struct {
+			uint64_t offset;
+			uint64_t usable_size;
+			enum memblock_state new_state;
+			struct memory_block m;
+			struct memory_block_reserved *mresv;
+		};
+
+		/* valid only when type == DAV_ACTION_TYPE_MEM */
+		struct {
+			uint64_t *ptr;
+			uint64_t value;
+		};
+
+		/* padding, not used */
+		uint64_t data2[14];
+	};
+};
+D_CASSERT(offsetof(struct dav_action_internal, data2) == offsetof(struct dav_action, data2),
+	  "struct dav_action misaligned!");
+
+/*
+ * palloc_set_value -- creates a new set memory action
+ */
+void
+palloc_set_value(struct palloc_heap *heap, struct dav_action *act,
+	uint64_t *ptr, uint64_t value)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(heap);
+
+	act->type = DAV_ACTION_TYPE_MEM;
+
+	struct dav_action_internal *actp = (struct dav_action_internal *)act;
+
+	actp->ptr = ptr;
+	actp->value = value;
+	actp->lock = NULL;
+}
+
+/*
+ * alloc_prep_block -- (internal) prepares a memory block for allocation
+ *
+ * Once the block is fully reserved and it's guaranteed that no one else will
+ * be able to write to this memory region it is safe to write the allocation
+ * header and call the object construction function.
+ *
+ * Because the memory block at this stage is only reserved in transient state
+ * there's no need to worry about fail-safety of this method because in case
+ * of a crash the memory will be back in the free blocks collection.
+ */
+static int
+alloc_prep_block(struct palloc_heap *heap, const struct memory_block *m,
+	palloc_constr constructor, void *arg,
+	uint64_t extra_field, uint16_t object_flags,
+	struct dav_action_internal *out)
+{
+	void *uptr = m->m_ops->get_user_data(m);
+	size_t usize = m->m_ops->get_user_size(m);
+
+	VALGRIND_DO_MEMPOOL_ALLOC(heap->layout, uptr, usize);
+	VALGRIND_DO_MAKE_MEM_UNDEFINED(uptr, usize);
+	VALGRIND_ANNOTATE_NEW_MEMORY(uptr, usize);
+
+	m->m_ops->write_header(m, extra_field, object_flags);
+
+	/*
+	 * Fill the allocated memory with a pattern if the
+	 * debug.heap.alloc_pattern CTL parameter has been set.
+	 */
+	if (unlikely(heap->alloc_pattern > PALLOC_CTL_DEBUG_NO_PATTERN)) {
+		mo_wal_memset(&heap->p_ops, uptr, heap->alloc_pattern,
+			usize, 0);
+		VALGRIND_DO_MAKE_MEM_UNDEFINED(uptr, usize);
+	}
+
+	int ret;
+
+	if (constructor != NULL) {
+		ret = constructor(heap->p_ops.base, uptr, usize, arg);
+		if (ret  != 0) {
+			/*
+			 * If canceled, revert the block back to the free
+			 * state in vg machinery.
+			 */
+			VALGRIND_DO_MEMPOOL_FREE(heap->layout, uptr);
+			return ret;
+		}
+	}
+
+	/*
+	 * To avoid determining the user data pointer twice, this method is also
+	 * responsible for calculating the offset of the object in the pool that
+	 * will be used to set the offset destination pointer provided by the
+	 * caller.
+	 */
+	out->offset = HEAP_PTR_TO_OFF(heap, uptr);
+	out->usable_size = usize;
+
+	return 0;
+}
+
+/*
+ * palloc_reservation_create -- creates a volatile reservation of a
+ *	memory block.
+ *
+ * The first step in the allocation of a new block is reserving it in
+ * the transient heap - which is represented by the bucket abstraction.
+ *
+ * To provide optimal scaling for multi-threaded applications and reduce
+ * fragmentation, the appropriate bucket is chosen depending on the
+ * current thread context and on the allocation class the requested
+ * size falls into.
+ *
+ * Once the bucket is selected, just enough memory is reserved for the
+ * requested size. The underlying block allocation algorithm
+ * (best-fit, next-fit, ...) varies depending on the bucket container.
+ */
+static int
+palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr constructor,
+			  void *arg, uint64_t extra_field, uint16_t object_flags, uint16_t class_id,
+			  uint32_t zset_id, struct dav_action_internal *out)
+{
+	int                  err       = 0;
+	struct memory_block *new_block = &out->m;
+	struct zoneset      *zset;
+
+	out->type = DAV_ACTION_TYPE_HEAP;
+
+	ASSERT(class_id < UINT8_MAX);
+	struct alloc_class *c = class_id == 0 ?
+		heap_get_best_class(heap, size) :
+		alloc_class_by_id(heap_alloc_classes(heap),
+			(uint8_t)class_id);
+
+	if (c == NULL) {
+		ERR("no allocation class for size %lu bytes", size);
+		errno = EINVAL;
+		return -1;
+	}
+
+	zset = heap_get_zoneset(heap, zset_id);
+	if (zset == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * The caller provided size in bytes, but buckets operate in
+	 * 'size indexes' which are multiples of the block size in the
+	 * bucket.
+	 *
+	 * For example, to allocate 500 bytes from a bucket that
+	 * provides 256-byte blocks, two memory 'units' are required.
+	 */
+	ssize_t size_idx = alloc_class_calc_size_idx(c, size);
+
+	if (size_idx < 0) {
+		ERR("allocation class not suitable for size %lu bytes",
+			size);
+		errno = EINVAL;
+		return -1;
+	}
+	ASSERT(size_idx <= UINT32_MAX);
+	*new_block = MEMORY_BLOCK_NONE;
+	new_block->size_idx = (uint32_t)size_idx;
+
+	struct bucket *b = zoneset_bucket_acquire(zset, c->id);
+
+	err = heap_get_bestfit_block(heap, b, new_block);
+	if (err != 0)
+		goto out;
+
+	if (alloc_prep_block(heap, new_block, constructor, arg,
+		extra_field, object_flags, out) != 0) {
+		/*
+		 * Constructor returned non-zero value which means
+		 * the memory block reservation has to be rolled back.
+		 */
+		if (new_block->type == MEMORY_BLOCK_HUGE)
+			bucket_insert_block(b, new_block);
+		err = ECANCELED;
+		goto out;
+	}
+
+	/*
+	 * Each as of yet unfulfilled reservation needs to be tracked in the
+	 * runtime state.
+	 * The memory block cannot be put back into the global state unless
+	 * there are no active reservations.
+	 */
+	out->mresv = bucket_active_block(b);
+	if (out->mresv != NULL)
+		util_fetch_and_add64(&out->mresv->nresv, 1);
+
+	out->lock = new_block->m_ops->get_lock(new_block);
+	out->new_state = MEMBLOCK_ALLOCATED;
+
+out:
+	zoneset_bucket_release(b);
+
+	if (err == 0)
+		return 0;
+
+	errno = err;
+	return -1;
+}
+
+/*
+ * palloc_heap_action_exec -- executes a single heap action (alloc, free)
+ */
+static void
+palloc_heap_action_exec(struct palloc_heap *heap,
+	const struct dav_action_internal *act,
+	struct operation_context *ctx)
+{
+#ifdef DAV_EXTRA_DEBUG
+	if (act->m.m_ops->get_state(&act->m) == act->new_state) {
+		D_CRIT("invalid operation or heap corruption\n");
+		ASSERT(0);
+	}
+#endif
+
+	/*
+	 * The actual required metadata modifications are chunk-type
+	 * dependent, but it is always a modification of a single 8-byte
+	 * value - either modification of a few bits in a bitmap or
+	 * changing a chunk type from free to used or vice versa.
+	 */
+	act->m.m_ops->prep_hdr(&act->m, act->new_state, ctx);
+}
+
+/*
+ * palloc_restore_free_chunk_state -- updates the runtime state of a free chunk.
+ *
+ * This function also takes care of coalescing of huge chunks.
+ */
+static void
+palloc_restore_free_chunk_state(struct palloc_heap *heap,
+	struct memory_block *m)
+{
+	struct zoneset *zset = heap_get_zoneset(heap, m->zone_id);
+
+	if (m->type == MEMORY_BLOCK_HUGE) {
+		struct bucket *b = zoneset_bucket_acquire(zset, DEFAULT_ALLOC_CLASS_ID);
+
+		if (heap_free_chunk_reuse(heap, b, m) != 0) {
+			if (errno == EEXIST)
+				FATAL("duplicate runtime chunk state, possible double free");
+			else
+				D_CRIT("unable to track runtime chunk state\n");
+		}
+		zoneset_bucket_release(b);
+	}
+}
+
+/*
+ * palloc_mem_action_noop -- empty handler for unused memory action funcs
+ */
+static void
+palloc_mem_action_noop(struct palloc_heap *heap,
+	struct dav_action_internal *act)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(heap, act);
+}
+
+/*
+ * palloc_reservation_clear -- clears the reservation state of the block,
+ *	discards the associated memory block if possible
+ */
+static void
+palloc_reservation_clear(struct palloc_heap *heap,
+	struct dav_action_internal *act, int publish)
+{
+	if (act->mresv == NULL)
+		return;
+
+	struct memory_block_reserved *mresv = act->mresv;
+	struct bucket_locked *locked = mresv->bucket;
+
+	if (!publish) {
+		/*
+		 * If a memory block used for the action is the currently active
+		 * memory block of the bucket it can be returned back to the
+		 * bucket. This way it will be available for future allocation
+		 * requests, improving performance.
+		 */
+		struct bucket *b = bucket_acquire(locked);
+
+		bucket_try_insert_attached_block(b, &act->m);
+		bucket_release(b);
+	}
+
+	if (util_fetch_and_sub64(&mresv->nresv, 1) == 1) {
+		VALGRIND_ANNOTATE_HAPPENS_AFTER(&mresv->nresv);
+		/*
+		 * If the memory block used for the action is not currently used
+		 * in any bucket or action, it can be discarded (given back to
+		 * the heap).
+		 */
+		heap_discard_run(heap, &mresv->m);
+		D_FREE(mresv);
+	} else {
+		VALGRIND_ANNOTATE_HAPPENS_BEFORE(&mresv->nresv);
+	}
+}
+
+/*
+ * palloc_heap_action_on_cancel -- restores the state of the heap
+ */
+static void
+palloc_heap_action_on_cancel(struct palloc_heap *heap,
+	struct dav_action_internal *act)
+{
+	if (act->new_state == MEMBLOCK_FREE)
+		return;
+
+	VALGRIND_DO_MEMPOOL_FREE(heap->layout,
+		act->m.m_ops->get_user_data(&act->m));
+
+	act->m.m_ops->invalidate(&act->m);
+	palloc_restore_free_chunk_state(heap, &act->m);
+
+	palloc_reservation_clear(heap, act, 0 /* publish */);
+}
+
+/*
+ * palloc_heap_action_on_process -- performs finalization steps under a lock
+ *	on the persistent state
+ */
+static void
+palloc_heap_action_on_process(struct palloc_heap *heap,
+	struct dav_action_internal *act)
+{
+	if (act->new_state == MEMBLOCK_ALLOCATED) {
+		STATS_INC(heap->stats, persistent, heap_curr_allocated,
+			act->m.m_ops->get_real_size(&act->m));
+		if (act->m.type == MEMORY_BLOCK_RUN) {
+			STATS_INC(heap->stats, transient, heap_run_allocated,
+				act->m.m_ops->get_real_size(&act->m));
+		}
+	} else if (act->new_state == MEMBLOCK_FREE) {
+		if (On_memcheck) {
+			void *ptr = act->m.m_ops->get_user_data(&act->m);
+
+			VALGRIND_DO_MEMPOOL_FREE(heap->layout, ptr);
+		}
+
+		STATS_SUB(heap->stats, persistent, heap_curr_allocated,
+			act->m.m_ops->get_real_size(&act->m));
+		if (act->m.type == MEMORY_BLOCK_RUN) {
+			STATS_SUB(heap->stats, transient, heap_run_allocated,
+				act->m.m_ops->get_real_size(&act->m));
+		}
+		heap_memblock_on_free(heap, &act->m);
+	}
+}
+
+/*
+ * palloc_heap_action_on_unlock -- performs finalization steps that need to be
+ *	performed without a lock on persistent state
+ */
+static void
+palloc_heap_action_on_unlock(struct palloc_heap *heap,
+	struct dav_action_internal *act)
+{
+	if (act->new_state == MEMBLOCK_ALLOCATED)
+		palloc_reservation_clear(heap, act, 1 /* publish */);
+	else if (act->new_state == MEMBLOCK_FREE)
+		palloc_restore_free_chunk_state(heap, &act->m);
+}
+
+/*
+ * palloc_mem_action_exec -- executes a single memory action (set, and, or)
+ */
+static void
+palloc_mem_action_exec(struct palloc_heap *heap,
+	const struct dav_action_internal *act,
+	struct operation_context *ctx)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(heap);
+
+	operation_add_entry(ctx, act->ptr, act->value, ULOG_OPERATION_SET);
+}
+
+static const struct {
+	/*
+	 * Translate the action into some number of operation entries.
+	 */
+	void (*exec)(struct palloc_heap *heap,
+		const struct dav_action_internal *act,
+		struct operation_context *ctx);
+
+	/*
+	 * Cancel any runtime state changes. Can be called only when action has
+	 * not been translated to persistent operation yet.
+	 */
+	void (*on_cancel)(struct palloc_heap *heap,
+		struct dav_action_internal *act);
+
+	/*
+	 * Final steps after persistent state has been modified. Performed
+	 * under action-specific lock.
+	 */
+	void (*on_process)(struct palloc_heap *heap,
+		struct dav_action_internal *act);
+
+	/*
+	 * Final steps after persistent state has been modified. Performed
+	 * after action-specific lock has been dropped.
+	 */
+	void (*on_unlock)(struct palloc_heap *heap,
+		struct dav_action_internal *act);
+} action_funcs[DAV_MAX_ACTION_TYPE] = {
+	[DAV_ACTION_TYPE_HEAP] = {
+		.exec = palloc_heap_action_exec,
+		.on_cancel = palloc_heap_action_on_cancel,
+		.on_process = palloc_heap_action_on_process,
+		.on_unlock = palloc_heap_action_on_unlock,
+	},
+	[DAV_ACTION_TYPE_MEM] = {
+		.exec = palloc_mem_action_exec,
+		.on_cancel = palloc_mem_action_noop,
+		.on_process = palloc_mem_action_noop,
+		.on_unlock = palloc_mem_action_noop,
+	}
+};
+
+/*
+ * palloc_action_compare -- compares two actions based on lock address
+ */
+static int
+palloc_action_compare(const void *lhs, const void *rhs)
+{
+	const struct dav_action_internal *mlhs = lhs;
+	const struct dav_action_internal *mrhs = rhs;
+	uintptr_t vlhs = (uintptr_t)(mlhs->lock);
+	uintptr_t vrhs = (uintptr_t)(mrhs->lock);
+
+	if (vlhs < vrhs)
+		return -1;
+	if (vlhs > vrhs)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * palloc_exec_actions -- perform the provided free/alloc operations
+ */
+static void
+palloc_exec_actions(struct palloc_heap *heap,
+	struct operation_context *ctx,
+	struct dav_action_internal *actv,
+	size_t actvcnt)
+{
+	/*
+	 * The operations array is sorted so that proper lock ordering is
+	 * ensured.
+	 */
+	if (actv)
+		qsort(actv, actvcnt, sizeof(struct dav_action_internal),
+			palloc_action_compare);
+	else
+		ASSERTeq(actvcnt, 0);
+
+	struct dav_action_internal *act;
+
+	for (size_t i = 0; i < actvcnt; ++i) {
+		act = &actv[i];
+
+		/*
+		 * This lock must be held for the duration between the creation
+		 * of the allocation metadata updates in the operation context
+		 * and the operation processing. This is because a different
+		 * thread might operate on the same 8-byte value of the run
+		 * bitmap and override allocation performed by this thread.
+		 */
+		if (i == 0 || act->lock != actv[i - 1].lock) {
+			if (act->lock)
+				util_mutex_lock(act->lock);
+		}
+
+		/* translate the action into some number of operation entries */
+		action_funcs[act->type].exec(heap, act, ctx);
+	}
+
+	/* wait for all allocated object headers to be persistent */
+	mo_wal_drain(&heap->p_ops);
+
+	/* perform all persistent memory operations */
+	operation_process(ctx);
+
+	for (size_t i = 0; i < actvcnt; ++i) {
+		act = &actv[i];
+
+		action_funcs[act->type].on_process(heap, act);
+
+		if (i == actvcnt - 1 || act->lock != actv[i + 1].lock) {
+			if (act->lock)
+				util_mutex_unlock(act->lock);
+		}
+	}
+
+	for (size_t i = 0; i < actvcnt; ++i) {
+		act = &actv[i];
+
+		action_funcs[act->type].on_unlock(heap, act);
+	}
+
+	operation_finish(ctx, 0);
+}
+
+/*
+ * palloc_reserve -- creates a single reservation
+ */
+int
+palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg,
+	       uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id,
+	       struct dav_action *act)
+{
+	COMPILE_ERROR_ON(sizeof(struct dav_action) !=
+		sizeof(struct dav_action_internal));
+
+	return palloc_reservation_create(heap, size, constructor, arg, extra_field, object_flags,
+					 class_id, zset_id, (struct dav_action_internal *)act);
+}
+
+/*
+ * palloc_action_isalloc - action is a heap reservation
+ *			   created by palloc_reserve().
+ */
+int
+palloc_action_isalloc(struct dav_action *act)
+{
+	struct dav_action_internal *actp = (struct dav_action_internal *)act;
+
+	return ((actp->type == DAV_ACTION_TYPE_HEAP) &&
+		(actp->new_state == MEMBLOCK_ALLOCATED));
+}
+
+uint64_t
+palloc_get_realoffset(struct palloc_heap *heap, uint64_t off)
+{
+	struct memory_block m = memblock_from_offset(heap, off);
+
+	return HEAP_PTR_TO_OFF(m.heap, m.m_ops->get_real_data(&m));
+}
+
+/*
+ * palloc_get_prange -- get the start offset and size of allocated memory that
+ *			needs to be persisted.
+ *
+ * persist_udata - if true, include the user data in the returned range.
+ */
+void
+palloc_get_prange(struct dav_action *act, uint64_t *const offp, uint64_t *const sizep,
+		  int persist_udata)
+{
+	struct dav_action_internal *act_in = (struct dav_action_internal *)act;
+
+	D_ASSERT(act_in->type == DAV_ACTION_TYPE_HEAP);
+	/* we need to persist the header if present */
+	*offp = HEAP_PTR_TO_OFF(act_in->m.heap, act_in->m.m_ops->get_real_data(&act_in->m));
+	*sizep = header_type_to_size[act_in->m.header_type];
+
+	D_ASSERT(act_in->offset == *offp + header_type_to_size[act_in->m.header_type]);
+	/* persist the user data */
+	if (persist_udata)
+		*sizep += act_in->usable_size;
+}
+
+/*
+ * palloc_defer_free_create -- (internal) creates a deferred free action
+ */
+static void
+palloc_defer_free_create(struct palloc_heap *heap, uint64_t off,
+			 struct dav_action_internal *out)
+{
+	COMPILE_ERROR_ON(sizeof(struct dav_action) !=
+		sizeof(struct dav_action_internal));
+
+	out->type = DAV_ACTION_TYPE_HEAP;
+	out->offset = off;
+	out->m = memblock_from_offset(heap, off);
+
+	/*
+	 * For the duration of free we may need to protect surrounding
+	 * metadata from being modified.
+	 */
+	out->lock = out->m.m_ops->get_lock(&out->m);
+	out->mresv = NULL;
+	out->new_state = MEMBLOCK_FREE;
+}
+
+/*
+ * palloc_defer_free -- creates a deferred free action
+ */
+void
+palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act)
+{
+	COMPILE_ERROR_ON(sizeof(struct dav_action) !=
+		sizeof(struct dav_action_internal));
+
+	palloc_defer_free_create(heap, off, (struct dav_action_internal *)act);
+}
+
+/*
+ * palloc_cancel -- cancels all reservations in the array
+ */
+void
+palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt)
+{
+	struct dav_action_internal *act;
+
+	for (size_t i = 0; i < actvcnt; ++i) {
+		act = (struct dav_action_internal *)&actv[i];
+		action_funcs[act->type].on_cancel(heap, act);
+	}
+}
+
+/*
+ * palloc_publish -- publishes all reservations in the array
+ */
+void
+palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt,
+	       struct operation_context *ctx)
+{
+	palloc_exec_actions(heap, ctx,
+		(struct dav_action_internal *)actv, actvcnt);
+}
+
+/*
+ * palloc_operation -- persistent memory operation. Takes a NULL pointer
+ *	or an existing memory block and modifies it to occupy, at least, 'size'
+ *	number of bytes.
+ *
+ * The malloc, free and realloc routines are implemented in the context of this
+ * common operation which encompasses all of the functionality usually done
+ * separately in those methods.
+ *
+ * The first thing that needs to be done is determining which memory blocks
+ * will be affected by the operation - this varies depending on whether the
+ * operation will need to modify or free an existing block and/or allocate
+ * a new one.
+ *
+ * Simplified allocation process flow is as follows:
+ *	- reserve a new block in the transient heap
+ *	- prepare the new block
+ *	- create redo log of required modifications
+ *		- chunk metadata
+ *		- offset of the new object
+ *	- commit and process the redo log
+ *
+ * And similarly, the deallocation process:
+ *	- create redo log of required modifications
+ *		- reverse the chunk metadata back to the 'free' state
+ *		- set the destination of the object offset to zero
+ *	- commit and process the redo log
+ * There's an important distinction in the deallocation process - it does not
+ * return the memory block to the transient container. That is done once no more
+ * memory is available.
+ *
+ * Reallocation is a combination of the above, with one additional step
+ * of copying the old content.
+ */
+int
+palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size,
+		 palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags,
+		 uint16_t class_id, uint32_t zset_id, struct operation_context *ctx)
+{
+	size_t user_size = 0;
+
+	size_t nops = 0;
+	uint64_t aoff;
+	uint64_t asize;
+	struct dav_action_internal ops[2];
+	struct dav_action_internal *alloc = NULL;
+	struct dav_action_internal *dealloc = NULL;
+
+	/*
+	 * The offset of an existing block can be nonzero, which means this
+	 * operation is either a free or a realloc - either way the offset of the
+	 * object needs to be translated into a memory block, which is a structure
+	 * that all of the heap methods expect.
+	 */
+	if (off != 0) {
+		dealloc = &ops[nops++];
+		palloc_defer_free_create(heap, off, dealloc);
+		user_size = dealloc->m.m_ops->get_user_size(&dealloc->m);
+		if (user_size == size) {
+			operation_cancel(ctx);
+			return 0;
+		}
+	}
+
+	/* alloc or realloc */
+	if (size != 0) {
+		alloc = &ops[nops++];
+		if (palloc_reservation_create(heap, size, constructor, arg, extra_field,
+					      object_flags, class_id, zset_id, alloc) != 0) {
+			operation_cancel(ctx);
+			return -1;
+		}
+
+		palloc_get_prange((struct dav_action *)alloc, &aoff, &asize, 0);
+		if (asize) /* != CHUNK_FLAG_HEADER_NONE */
+			dav_wal_tx_snap(heap->p_ops.base, HEAP_OFF_TO_PTR(heap, aoff),
+					asize, HEAP_OFF_TO_PTR(heap, aoff), 0);
+	}
+
+	/* realloc */
+	if (alloc != NULL && dealloc != NULL) {
+		/* copy data to newly allocated memory */
+		size_t old_size = user_size;
+		size_t to_cpy = old_size > size ? size : old_size;
+
+		VALGRIND_ADD_TO_TX(
+			HEAP_OFF_TO_PTR(heap, alloc->offset),
+			to_cpy);
+		mo_wal_memcpy(&heap->p_ops,
+			HEAP_OFF_TO_PTR(heap, alloc->offset),
+			HEAP_OFF_TO_PTR(heap, off),
+			to_cpy,
+			0);
+		VALGRIND_REMOVE_FROM_TX(
+			HEAP_OFF_TO_PTR(heap, alloc->offset),
+			to_cpy);
+	}
+
+	/*
+	 * If the caller provided a destination value to update, it needs to be
+	 * modified atomically alongside the heap metadata, and so the operation
+	 * context must be used.
+	 */
+	if (dest_off) {
+		operation_add_entry(ctx, dest_off,
+			alloc ? alloc->offset : 0, ULOG_OPERATION_SET);
+	}
+
+	/* and now actually perform the requested operation! */
+	palloc_exec_actions(heap, ctx, ops, nops);
+
+	return 0;
+}
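
For reference, a realloc-style call through palloc_operation() would look roughly like the sketch below. Illustrative only: heap and ctx are assumed to come from the dav_v2 boot path, class_id 0 selects the best-fit class and zset_id 0 the default zoneset:

/* Frees the block currently referenced by *dest_off (if any), allocates a
 * new block of new_size bytes, copies the overlapping contents and installs
 * the new offset into *dest_off atomically with the heap metadata updates.
 */
static int
example_realloc(struct palloc_heap *heap, struct operation_context *ctx,
		uint64_t *dest_off, size_t new_size)
{
	return palloc_operation(heap, *dest_off, dest_off, new_size,
				NULL /* constructor */, NULL /* arg */,
				0 /* extra_field */, 0 /* object_flags */,
				0 /* class_id */, 0 /* zset_id */, ctx);
}
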
+
+/*
+ * palloc_usable_size -- returns the number of bytes in the memory block
+ */
+size_t
+palloc_usable_size(struct palloc_heap *heap, uint64_t off)
+{
+	struct memory_block m = memblock_from_offset(heap, off);
+
+	return m.m_ops->get_user_size(&m);
+}
+
+/*
+ * palloc_extra -- returns allocation extra field
+ */
+uint64_t
+palloc_extra(struct palloc_heap *heap, uint64_t off)
+{
+	struct memory_block m = memblock_from_offset(heap, off);
+
+	return m.m_ops->get_extra(&m);
+}
+
+/*
+ * palloc_flags -- returns allocation flags
+ */
+uint16_t
+palloc_flags(struct palloc_heap *heap, uint64_t off)
+{
+	struct memory_block m = memblock_from_offset(heap, off);
+
+	return m.m_ops->get_flags(&m);
+}
+
+/*
+ * pmalloc_search_cb -- (internal) foreach callback.
+ */
+static int
+pmalloc_search_cb(const struct memory_block *m, void *arg)
+{
+	struct memory_block *out = arg;
+
+	if (MEMORY_BLOCK_EQUALS(*m, *out))
+		return 0; /* skip the same object */
+
+	*out = *m;
+
+	return 1;
+}
+
+/*
+ * palloc_first -- returns the first object from the heap.
+ */
+uint64_t
+palloc_first(struct palloc_heap *heap)
+{
+	struct memory_block search = MEMORY_BLOCK_NONE;
+
+	heap_foreach_object(heap, pmalloc_search_cb,
+		&search, MEMORY_BLOCK_NONE);
+
+	if (MEMORY_BLOCK_IS_NONE(search))
+		return 0;
+
+	void *uptr = search.m_ops->get_user_data(&search);
+
+	return HEAP_PTR_TO_OFF(heap, uptr);
+}
+
+/*
+ * palloc_next -- returns the next object relative to 'off'.
+ */
+uint64_t
+palloc_next(struct palloc_heap *heap, uint64_t off)
+{
+	struct memory_block m = memblock_from_offset(heap, off);
+	struct memory_block search = m;
+
+	heap_foreach_object(heap, pmalloc_search_cb, &search, m);
+
+	if (MEMORY_BLOCK_IS_NONE(search) ||
+		MEMORY_BLOCK_EQUALS(search, m))
+		return 0;
+
+	void *uptr = search.m_ops->get_user_data(&search);
+
+	return HEAP_PTR_TO_OFF(heap, uptr);
+}
+
+/*
+ * palloc_boot -- initializes allocator section
+ */
+int
+palloc_boot(struct palloc_heap *heap, void *heap_start,
+	    uint64_t heap_size, uint64_t *sizep,
+	    void *base, struct mo_ops *p_ops, struct stats *stats,
+	    struct pool_set *set)
+{
+	return heap_boot(heap, heap_start, heap_size, sizep,
+		base, p_ops, stats, set);
+}
+
+/*
+ * palloc_init -- initializes palloc heap
+ */
+int
+palloc_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops)
+{
+	return heap_init(heap_start, heap_size, sizep, p_ops);
+}
+
+/*
+ * palloc_heap_end -- returns first address after heap
+ */
+void *
+palloc_heap_end(struct palloc_heap *h)
+{
+	return heap_end(h);
+}
+
+/*
+ * palloc_heap_check -- verifies heap state
+ */
+int
+palloc_heap_check(void *heap_start, uint64_t heap_size)
+{
+	return heap_check(heap_start, heap_size);
+}
+
+/*
+ * palloc_heap_check_remote -- verifies state of remote replica
+ */
+int
+palloc_heap_check_remote(void *heap_start, uint64_t heap_size,
+	struct remote_ops *ops)
+{
+	return heap_check_remote(heap_start, heap_size, ops);
+}
+
+#if VG_MEMCHECK_ENABLED
+/*
+ * palloc_vg_register_alloc -- (internal) registers allocation header
+ * in Valgrind
+ */
+static int
+palloc_vg_register_alloc(const struct memory_block *m, void *arg)
+{
+	struct palloc_heap *heap = arg;
+
+	m->m_ops->reinit_header(m);
+
+	void *uptr = m->m_ops->get_user_data(m);
+	size_t usize = m->m_ops->get_user_size(m);
+
+	VALGRIND_DO_MEMPOOL_ALLOC(heap->layout, uptr, usize);
+	VALGRIND_DO_MAKE_MEM_DEFINED(uptr, usize);
+
+	return 0;
+}
+
+/*
+ * palloc_heap_vg_open -- notifies Valgrind about heap layout
+ */
+void
+palloc_heap_vg_open(struct palloc_heap *heap, int objects)
+{
+	heap_vg_open(heap, palloc_vg_register_alloc, heap, objects);
+}
+#endif
diff --git a/src/common/dav_v2/palloc.h b/src/common/dav_v2/palloc.h
new file mode 100644
index 00000000000..0560cd97890
--- /dev/null
+++ b/src/common/dav_v2/palloc.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2020, Intel Corporation */
+
+/*
+ * palloc.h -- internal definitions for persistent allocator
+ */
+
+#ifndef __DAOS_COMMON_PALLOC_H
+#define __DAOS_COMMON_PALLOC_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "memops.h"
+#include "ulog.h"
+#include "valgrind_internal.h"
+#include "stats.h"
+#include "dav_v2.h"
+
+#define PALLOC_CTL_DEBUG_NO_PATTERN (-1)
+
+struct palloc_heap {
+	struct mo_ops       p_ops;
+	struct heap_layout *layout;
+	struct heap_rt     *rt;
+	uint64_t           *sizep;
+	uint64_t            growsize;
+	struct stats       *stats;
+	struct pool_set    *set;
+	void               *base;
+	int                 alloc_pattern;
+};
+
+struct memory_block;
+struct zoneset;
+
+typedef int (*palloc_constr)(void *base, void *ptr, size_t usable_size, void *arg);
+
+int
+palloc_operation(struct palloc_heap *heap, uint64_t off, uint64_t *dest_off, size_t size,
+		 palloc_constr constructor, void *arg, uint64_t extra_field, uint16_t object_flags,
+		 uint16_t class_id, uint32_t zset_id, struct operation_context *ctx);
+
+int
+palloc_reserve(struct palloc_heap *heap, size_t size, palloc_constr constructor, void *arg,
+	       uint64_t extra_field, uint16_t object_flags, uint16_t class_id, uint32_t zset_id,
+	       struct dav_action *act);
+
+int
+palloc_action_isalloc(struct dav_action *act);
+void
+palloc_get_prange(struct dav_action *act, uint64_t *const off, uint64_t *const size,
+		  int persist_udata);
+uint64_t
+palloc_get_realoffset(struct palloc_heap *heap, uint64_t off);
+
+void
+palloc_defer_free(struct palloc_heap *heap, uint64_t off, struct dav_action *act);
+
+void
+palloc_cancel(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt);
+
+void
+palloc_publish(struct palloc_heap *heap, struct dav_action *actv, size_t actvcnt,
+	       struct operation_context *ctx);
+
+void
+palloc_set_value(struct palloc_heap *heap, struct dav_action *act, uint64_t *ptr, uint64_t value);
+
+uint64_t
+palloc_first(struct palloc_heap *heap);
+uint64_t
+palloc_next(struct palloc_heap *heap, uint64_t off);
+
+size_t
+palloc_usable_size(struct palloc_heap *heap, uint64_t off);
+uint64_t
+palloc_extra(struct palloc_heap *heap, uint64_t off);
+uint16_t
+palloc_flags(struct palloc_heap *heap, uint64_t off);
+
+int
+palloc_boot(struct palloc_heap *heap, void *heap_start, uint64_t heap_size, uint64_t *sizep,
+	    void *base, struct mo_ops *p_ops, struct stats *stats, struct pool_set *set);
+
+int
+palloc_init(void *heap_start, uint64_t heap_size, uint64_t *sizep, struct mo_ops *p_ops);
+void *
+palloc_heap_end(struct palloc_heap *h);
+int
+palloc_heap_check(void *heap_start, uint64_t heap_size);
+int
+palloc_heap_check_remote(void *heap_start, uint64_t heap_size, struct remote_ops *ops);
+size_t
+palloc_heap(void *heap_start);
+
+/* foreach callback, terminates iteration if return value is non-zero */
+typedef int (*object_callback)(const struct memory_block *m, void *arg);
+
+#if VG_MEMCHECK_ENABLED
+void
+palloc_heap_vg_open(struct palloc_heap *heap, int objects);
+#endif
+
+#endif /* __DAOS_COMMON_PALLOC_H */
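
A hedged usage sketch of the action-based API declared above: a reservation stays transient until it is published together with any deferred frees, or is dropped with palloc_cancel(). Illustrative only; heap and ctx are assumed to be set up by the dav_v2 runtime:

#include "palloc.h"

/* Atomically replace the object at old_off with a fresh "size"-byte block:
 * both the allocation and the deferred free are redo-logged by a single
 * palloc_publish() call.
 */
static int
example_replace(struct palloc_heap *heap, struct operation_context *ctx,
		uint64_t old_off, size_t size)
{
	struct dav_action acts[2];

	if (palloc_reserve(heap, size, NULL, NULL, 0, 0, 0, 0, &acts[0]) != 0)
		return -1;

	palloc_defer_free(heap, old_off, &acts[1]);
	palloc_publish(heap, acts, 2, ctx);

	/* on an error path before publish, palloc_cancel(heap, acts, 1)
	 * would roll the reservation back instead
	 */
	return 0;
}
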
diff --git a/src/common/dav_v2/queue.h b/src/common/dav_v2/queue.h
new file mode 100644
index 00000000000..654c60cec9b
--- /dev/null
+++ b/src/common/dav_v2/queue.h
@@ -0,0 +1,112 @@
+/*
+ * Source: glibc 2.24 (git://sourceware.org/glibc.git /misc/sys/queue.h)
+ *
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2016, Microsoft Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)queue.h	8.5 (Berkeley) 8/20/94
+ */
+
+#ifndef	__DAOS_COMMON_QUEUE_H_
+#define	__DAOS_COMMON_QUEUE_H_
+
+/*
+ * The original header defines five types of data structures: singly-linked
+ * lists, lists, simple queues, tail queues, and circular queues; only the
+ * singly-linked list macros are retained here.
+ *
+ * A singly-linked list is headed by a single forward pointer. The
+ * elements are singly linked for minimum space and pointer manipulation
+ * overhead at the expense of O(n) removal for arbitrary elements. New
+ * elements can be added to the list after an existing element or at the
+ * head of the list.  Elements being removed from the head of the list
+ * should use the explicit macro for this purpose for optimum
+ * efficiency. A singly-linked list may only be traversed in the forward
+ * direction.  Singly-linked lists are ideal for applications with large
+ * datasets and few or no removals or for implementing a LIFO queue.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ */
+
+/*
+ * Singly-linked List definitions.
+ */
+#define	DAV_SLIST_HEAD(name, type)					\
+struct name {								\
+	struct type *slh_first;	/* first element */			\
+}
+
+#define	DAV_SLIST_HEAD_INITIALIZER(head)				\
+	{ NULL }
+
+#define	DAV_SLIST_ENTRY(type)						\
+struct {								\
+	struct type *sle_next;	/* next element */			\
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define	DAV_SLIST_INIT(head) ((head)->slh_first = NULL)
+
+#define	DAV_SLIST_INSERT_AFTER(slistelm, elm, field) do {		\
+	(elm)->field.sle_next = (slistelm)->field.sle_next;		\
+	(slistelm)->field.sle_next = (elm);				\
+} while (/*CONSTCOND*/0)
+
+#define	DAV_SLIST_INSERT_HEAD(head, elm, field) do {			\
+	(elm)->field.sle_next = (head)->slh_first;			\
+	(head)->slh_first = (elm);					\
+} while (/*CONSTCOND*/0)
+
+#define	DAV_SLIST_REMOVE_HEAD(head, field)				\
+	((head)->slh_first = (head)->slh_first->field.sle_next)
+
+#define	DAV_SLIST_REMOVE(head, elm, type, field) do {			\
+	if ((head)->slh_first == (elm)) {				\
+		DAV_SLIST_REMOVE_HEAD((head), field);			\
+	}								\
+	else {								\
+		struct type *curelm = (head)->slh_first;		\
+		while (curelm->field.sle_next != (elm))			\
+			curelm = curelm->field.sle_next;		\
+		curelm->field.sle_next =				\
+		    curelm->field.sle_next->field.sle_next;		\
+	}								\
+} while (/*CONSTCOND*/0)
+
+#define	DAV_SLIST_FOREACH(var, head, field)					\
+	for ((var) = (head)->slh_first; (var); (var) = (var)->field.sle_next)
+
+/*
+ * Singly-linked List access methods.
+ */
+#define	DAV_SLIST_EMPTY(head)	((head)->slh_first == NULL)
+#define	DAV_SLIST_FIRST(head)	((head)->slh_first)
+#define	DAV_SLIST_NEXT(elm, field)	((elm)->field.sle_next)
+
+#endif	/* __DAOS_COMMON_QUEUE_H_ */
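
A minimal example of the retained singly-linked list macros, illustrative only:

#include <stdio.h>
#include "queue.h"

struct item {
	int                   value;
	DAV_SLIST_ENTRY(item) link;
};

/* head type and variable in one definition, initialized to an empty list */
static DAV_SLIST_HEAD(item_head, item) items = DAV_SLIST_HEAD_INITIALIZER(items);

static void
example_slist(struct item *a, struct item *b)
{
	struct item *it;

	DAV_SLIST_INSERT_HEAD(&items, a, link);
	DAV_SLIST_INSERT_AFTER(a, b, link);

	DAV_SLIST_FOREACH(it, &items, link)
		printf("%d\n", it->value);

	DAV_SLIST_REMOVE(&items, b, item, link);
}
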
diff --git a/src/common/dav_v2/ravl.c b/src/common/dav_v2/ravl.c
new file mode 100644
index 00000000000..5192e2abbdb
--- /dev/null
+++ b/src/common/dav_v2/ravl.c
@@ -0,0 +1,613 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2018-2022, Intel Corporation */
+
+/*
+ * ravl.c -- implementation of a RAVL tree
+ * https://sidsen.azurewebsites.net//papers/ravl-trees-journal.pdf
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "out.h"
+#include "ravl.h"
+#include "util.h"
+
+#define RAVL_DEFAULT_DATA_SIZE (sizeof(void *))
+
+enum ravl_slot_type {
+	RAVL_LEFT,
+	RAVL_RIGHT,
+
+	MAX_SLOTS,
+
+	RAVL_ROOT
+};
+
+struct ravl_node {
+	struct ravl_node *parent;
+	struct ravl_node *slots[MAX_SLOTS];
+	int32_t rank; /* cannot be greater than height of the subtree */
+	int32_t pointer_based;
+	char data[];
+};
+
+struct ravl {
+	struct ravl_node *root;
+	ravl_compare *compare;
+	size_t data_size;
+};
+
+/*
+ * ravl_new_sized -- creates a new ravl tree instance with a custom data size
+ */
+struct ravl *
+ravl_new_sized(ravl_compare *compare, size_t data_size)
+{
+	struct ravl *r;
+
+	D_ALLOC_PTR_NZ(r);
+	if (r == NULL) {
+		D_CRIT("Malloc!\n");
+		return r;
+	}
+
+	r->compare = compare;
+	r->root = NULL;
+	r->data_size = data_size;
+
+	return r;
+}
+
+/*
+ * ravl_new -- creates a new tree that stores data pointers
+ */
+struct ravl *
+ravl_new(ravl_compare *compare)
+{
+	return ravl_new_sized(compare, RAVL_DEFAULT_DATA_SIZE);
+}
+
+/*
+ * ravl_foreach_node -- (internal) recursively traverses the given subtree,
+ *	calling the callback in an in-order fashion. Optionally frees the
+ *	visited nodes.
+ */
+static void
+ravl_foreach_node(struct ravl_node *n, ravl_cb cb, void *arg, int free_node)
+{
+	if (n == NULL)
+		return;
+
+	ravl_foreach_node(n->slots[RAVL_LEFT], cb, arg, free_node);
+	if (cb)
+		cb((void *)n->data, arg);
+	ravl_foreach_node(n->slots[RAVL_RIGHT], cb, arg, free_node);
+
+	if (free_node)
+		D_FREE(n);
+}
+
+/*
+ * ravl_clear -- clears the entire tree, starting from the root
+ */
+void
+ravl_clear(struct ravl *ravl)
+{
+	ravl_foreach_node(ravl->root, NULL, NULL, 1);
+	ravl->root = NULL;
+}
+
+/*
+ * ravl_delete_cb -- clears and deletes the given ravl instance, calls callback
+ */
+void
+ravl_delete_cb(struct ravl *ravl, ravl_cb cb, void *arg)
+{
+	ravl_foreach_node(ravl->root, cb, arg, 1);
+	D_FREE(ravl);
+}
+
+/*
+ * ravl_delete -- clears and deletes the given ravl instance
+ */
+void
+ravl_delete(struct ravl *ravl)
+{
+	ravl_delete_cb(ravl, NULL, NULL);
+}
+
+/*
+ * ravl_foreach -- traverses the entire tree, calling callback for every node
+ */
+void
+ravl_foreach(struct ravl *ravl, ravl_cb cb, void *arg)
+{
+	ravl_foreach_node(ravl->root, cb, arg, 0);
+}
+
+/*
+ * ravl_empty -- checks whether the given tree is empty
+ */
+int
+ravl_empty(struct ravl *ravl)
+{
+	return ravl->root == NULL;
+}
+
+/*
+ * ravl_node_insert_constructor -- node data constructor for ravl_insert
+ */
+static void
+ravl_node_insert_constructor(void *data, size_t data_size, const void *arg)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(data_size);
+
+	/* copy only the 'arg' pointer */
+	memcpy(data, &arg, sizeof(arg));
+}
+
+/*
+ * ravl_node_copy_constructor -- node data constructor for ravl_emplace_copy
+ */
+static void
+ravl_node_copy_constructor(void *data, size_t data_size, const void *arg)
+{
+	memcpy(data, arg, data_size);
+}
+
+/*
+ * ravl_new_node -- (internal) allocates and initializes a new node
+ */
+static struct ravl_node *
+ravl_new_node(struct ravl *ravl, ravl_constr constr, const void *arg)
+{
+	struct ravl_node *n;
+
+	D_ALLOC_NZ(n, (sizeof(*n) + ravl->data_size));
+	if (n == NULL) {
+		D_CRIT("Malloc!\n");
+		return n;
+	}
+
+	n->parent = NULL;
+	n->slots[RAVL_LEFT] = NULL;
+	n->slots[RAVL_RIGHT] = NULL;
+	n->rank = 0;
+	n->pointer_based = constr == ravl_node_insert_constructor;
+	constr(n->data, ravl->data_size, arg);
+
+	return n;
+}
+
+/*
+ * ravl_slot_opposite -- (internal) returns the opposite slot type, cannot be
+ *	called for root type
+ */
+static enum ravl_slot_type
+ravl_slot_opposite(enum ravl_slot_type t)
+{
+	ASSERTne(t, RAVL_ROOT);
+
+	return t == RAVL_LEFT ? RAVL_RIGHT : RAVL_LEFT;
+}
+
+/*
+ * ravl_node_slot_type -- (internal) returns the type of the given node:
+ *	left child, right child or root
+ */
+static enum ravl_slot_type
+ravl_node_slot_type(struct ravl_node *n)
+{
+	if (n->parent == NULL)
+		return RAVL_ROOT;
+
+	return n->parent->slots[RAVL_LEFT] == n ? RAVL_LEFT : RAVL_RIGHT;
+}
+
+/*
+ * ravl_node_sibling -- (internal) returns the sibling of the given node,
+ *	NULL if the node is root (has no parent)
+ */
+static struct ravl_node *
+ravl_node_sibling(struct ravl_node *n)
+{
+	enum ravl_slot_type t = ravl_node_slot_type(n);
+
+	if (t == RAVL_ROOT)
+		return NULL;
+
+	return n->parent->slots[t == RAVL_LEFT ? RAVL_RIGHT : RAVL_LEFT];
+}
+
+/*
+ * ravl_node_ref -- (internal) returns the pointer to the memory location in
+ *	which the given node resides
+ */
+static struct ravl_node **
+ravl_node_ref(struct ravl *ravl, struct ravl_node *n)
+{
+	enum ravl_slot_type t = ravl_node_slot_type(n);
+
+	return t == RAVL_ROOT ? &ravl->root : &n->parent->slots[t];
+}
+
+/*
+ * ravl_rotate -- (internal) performs a rotation around a given node
+ *
+ * The node n swaps places with its parent. If n is the right child, the
+ * parent becomes the left child of n; otherwise the parent becomes the right
+ * child of n.
+ */
+static void
+ravl_rotate(struct ravl *ravl, struct ravl_node *n)
+{
+	ASSERTne(n->parent, NULL);
+	struct ravl_node *p = n->parent;
+	struct ravl_node **pref = ravl_node_ref(ravl, p);
+
+	enum ravl_slot_type t = ravl_node_slot_type(n);
+	enum ravl_slot_type t_opposite = ravl_slot_opposite(t);
+
+	n->parent = p->parent;
+	p->parent = n;
+	*pref = n;
+
+	p->slots[t] = n->slots[t_opposite];
+	if (p->slots[t] != NULL)
+		p->slots[t]->parent = p;
+	n->slots[t_opposite] = p;
+}
+
+/*
+ * ravl_node_rank -- (internal) returns the rank of the node
+ *
+ * For the purpose of balancing, NULL nodes have rank -1.
+ */
+static int
+ravl_node_rank(struct ravl_node *n)
+{
+	return n == NULL ? -1 : n->rank;
+}
+
+/*
+ * ravl_node_rank_difference_parent -- (internal) returns the rank difference
+ *	between parent node p and its child n
+ *
+ * Every rank difference must be positive.
+ *
+ * Either of these can be NULL.
+ */
+static int
+ravl_node_rank_difference_parent(struct ravl_node *p, struct ravl_node *n)
+{
+	return ravl_node_rank(p) - ravl_node_rank(n);
+}
+
+/*
+ * ravl_node_rank_difference -- (internal) returns the rank difference between
+ *	parent and its child
+ *
+ * Can be used to check if a given node is an i-child.
+ */
+static int
+ravl_node_rank_difference(struct ravl_node *n)
+{
+	return ravl_node_rank_difference_parent(n->parent, n);
+}
+
+/*
+ * ravl_node_is_i_j -- (internal) checks if a given node is strictly i,j-node
+ */
+static int
+ravl_node_is_i_j(struct ravl_node *n, int i, int j)
+{
+	return (ravl_node_rank_difference_parent(n, n->slots[RAVL_LEFT]) == i &&
+		ravl_node_rank_difference_parent(n, n->slots[RAVL_RIGHT]) == j);
+}
+
+/*
+ * ravl_node_is -- (internal) checks if a given node is i,j-node or j,i-node
+ */
+static int
+ravl_node_is(struct ravl_node *n, int i, int j)
+{
+	return ravl_node_is_i_j(n, i, j) || ravl_node_is_i_j(n, j, i);
+}
+
+/*
+ * ravl_node_promote -- promotes a given node by increasing its rank
+ */
+static void
+ravl_node_promote(struct ravl_node *n)
+{
+	n->rank += 1;
+}
+
+/*
+ * ravl_node_demote -- demotes a given node by decreasing its rank
+ */
+static void
+ravl_node_demote(struct ravl_node *n)
+{
+	ASSERT(n->rank > 0);
+	n->rank -= 1;
+}
+
+/*
+ * ravl_balance -- balances the tree after insert
+ *
+ * This function must restore the invariant that every rank
+ * difference is positive.
+ */
+static void
+ravl_balance(struct ravl *ravl, struct ravl_node *n)
+{
+	/* walk up the tree, promoting nodes */
+	while (n->parent && ravl_node_is(n->parent, 0, 1)) {
+		ravl_node_promote(n->parent);
+		n = n->parent;
+	}
+
+	/*
+	 * Either the rank rule holds or n is a 0-child whose sibling is an
+	 * i-child with i > 1.
+	 */
+	struct ravl_node *s = ravl_node_sibling(n);
+
+	if (!(ravl_node_rank_difference(n) == 0 &&
+	    ravl_node_rank_difference_parent(n->parent, s) > 1))
+		return;
+
+	struct ravl_node *y = n->parent;
+	/* if n is a left child, let z be n's right child and vice versa */
+	enum ravl_slot_type t = ravl_slot_opposite(ravl_node_slot_type(n));
+	struct ravl_node *z = n->slots[t];
+
+	if (z == NULL || ravl_node_rank_difference(z) == 2) {
+		ravl_rotate(ravl, n);
+		ravl_node_demote(y);
+	} else if (ravl_node_rank_difference(z) == 1) {
+		ravl_rotate(ravl, z);
+		ravl_rotate(ravl, z);
+		ravl_node_promote(z);
+		ravl_node_demote(n);
+		ravl_node_demote(y);
+	}
+}
+
+/*
+ * ravl_insert -- insert data into the tree
+ */
+int
+ravl_insert(struct ravl *ravl, const void *data)
+{
+	return ravl_emplace(ravl, ravl_node_insert_constructor, data);
+}
+
+/*
+ * ravl_emplace_copy -- copy constructs data inside a new tree node
+ */
+int
+ravl_emplace_copy(struct ravl *ravl, const void *data)
+{
+	return ravl_emplace(ravl, ravl_node_copy_constructor, data);
+}
+
+/*
+ * ravl_emplace -- construct data inside of a new tree node
+ */
+int
+ravl_emplace(struct ravl *ravl, ravl_constr constr, const void *arg)
+{
+	struct ravl_node *n = ravl_new_node(ravl, constr, arg);
+
+	if (n == NULL)
+		return -1;
+
+	/* walk down the tree and insert the new node into a missing slot */
+	struct ravl_node **dstp = &ravl->root;
+	struct ravl_node *dst = NULL;
+
+	while (*dstp != NULL) {
+		dst = (*dstp);
+		int cmp_result = ravl->compare(ravl_data(n), ravl_data(dst));
+
+		if (cmp_result == 0)
+			goto error_duplicate;
+
+		dstp = &dst->slots[cmp_result > 0];
+	}
+	n->parent = dst;
+	*dstp = n;
+
+	ravl_balance(ravl, n);
+
+	return 0;
+
+error_duplicate:
+	errno = EEXIST;
+	D_FREE(n);
+	return -1;
+}
+
+/*
+ * ravl_node_type_most -- (internal) returns left-most or right-most node in
+ *	the subtree
+ */
+static struct ravl_node *
+ravl_node_type_most(struct ravl_node *n, enum ravl_slot_type t)
+{
+	while (n->slots[t] != NULL)
+		n = n->slots[t];
+
+	return n;
+}
+
+/*
+ * ravl_node_cessor -- (internal) returns the successor or predecessor of the
+ *	node
+ */
+static struct ravl_node *
+ravl_node_cessor(struct ravl_node *n, enum ravl_slot_type t)
+{
+	/*
+	 * If t child is present, we are looking for t-opposite-most node
+	 * in t child subtree
+	 */
+	if (n->slots[t])
+		return ravl_node_type_most(n->slots[t], ravl_slot_opposite(t));
+
+	/* otherwise get the first parent on the t path */
+	while (n->parent != NULL && n == n->parent->slots[t])
+		n = n->parent;
+
+	return n->parent;
+}
+
+/*
+ * ravl_node_successor -- (internal) returns node's successor
+ *
+ * It's the first node larger than n.
+ */
+static struct ravl_node *
+ravl_node_successor(struct ravl_node *n)
+{
+	return ravl_node_cessor(n, RAVL_RIGHT);
+}
+
+/*
+ * ravl_node_predecessor -- (internal) returns node's predecessor
+ *
+ * It's the first node smaller than n.
+ */
+static struct ravl_node *
+ravl_node_predecessor(struct ravl_node *n)
+{
+	return ravl_node_cessor(n, RAVL_LEFT);
+}
+
+/*
+ * ravl_predicate_holds -- (internal) verifies the given predicate for
+ *	the current node in the search path
+ *
+ * If the predicate holds for the given node or a node that can be directly
+ * derived from it, returns 1. Otherwise returns 0.
+ */
+static int
+ravl_predicate_holds(int result, struct ravl_node **ret,
+	struct ravl_node *n, enum ravl_predicate flags)
+{
+	if (flags & RAVL_PREDICATE_EQUAL) {
+		if (result == 0) {
+			*ret = n;
+			return 1;
+		}
+	}
+	if (flags & RAVL_PREDICATE_GREATER) {
+		if (result < 0) { /* data < n->data */
+			*ret = n;
+			return 0;
+		} else if (result == 0) {
+			*ret = ravl_node_successor(n);
+			return 1;
+		}
+	}
+	if (flags & RAVL_PREDICATE_LESS) {
+		if (result > 0) { /* data > n->data */
+			*ret = n;
+			return 0;
+		} else if (result == 0) {
+			*ret = ravl_node_predecessor(n);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * ravl_find -- searches for the node in the tree
+ */
+struct ravl_node *
+ravl_find(struct ravl *ravl, const void *data, enum ravl_predicate flags)
+{
+	struct ravl_node *r = NULL;
+	struct ravl_node *n = ravl->root;
+
+	while (n) {
+		int result = ravl->compare(data, ravl_data(n));
+
+		if (ravl_predicate_holds(result, &r, n, flags))
+			return r;
+
+		n = n->slots[result > 0];
+	}
+
+	return r;
+}
+
+/*
+ * ravl_remove -- removes the given node from the tree
+ */
+void
+ravl_remove(struct ravl *ravl, struct ravl_node *n)
+{
+	if (n->slots[RAVL_LEFT] != NULL && n->slots[RAVL_RIGHT] != NULL) {
+		/* if both children are present, remove the successor instead */
+		struct ravl_node *s = ravl_node_successor(n);
+
+		memcpy(n->data, s->data, ravl->data_size);
+		ravl_remove(ravl, s);
+	} else {
+		/* swap n with the child that may exist */
+		struct ravl_node *r = n->slots[RAVL_LEFT] ?
+			n->slots[RAVL_LEFT] : n->slots[RAVL_RIGHT];
+
+		if (r != NULL)
+			r->parent = n->parent;
+
+		*ravl_node_ref(ravl, n) = r;
+		D_FREE(n);
+	}
+}
+
+/*
+ * ravl_data -- returns the data contained within the node
+ */
+void *
+ravl_data(struct ravl_node *node)
+{
+	if (node->pointer_based) {
+		void *data;
+
+		memcpy(&data, node->data, sizeof(void *));
+		return data;
+	} else {
+		return (void *)node->data;
+	}
+}
+
+/*
+ * ravl_first -- returns first (left-most) node in the tree
+ */
+struct ravl_node *
+ravl_first(struct ravl *ravl)
+{
+	if (ravl->root)
+		return ravl_node_type_most(ravl->root, RAVL_LEFT);
+
+	return NULL;
+}
+
+/*
+ * ravl_last -- returns last (right-most) node in the tree
+ */
+struct ravl_node *
+ravl_last(struct ravl *ravl)
+{
+	if (ravl->root)
+		return ravl_node_type_most(ravl->root, RAVL_RIGHT);
+
+	return NULL;
+}
diff --git a/src/common/dav_v2/ravl.h b/src/common/dav_v2/ravl.h
new file mode 100644
index 00000000000..e44f1877791
--- /dev/null
+++ b/src/common/dav_v2/ravl.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2018-2021, Intel Corporation */
+
+/*
+ * ravl.h -- internal definitions for ravl tree
+ */
+
+#ifndef __DAOS_COMMON_RAVL_H
+#define __DAOS_COMMON_RAVL_H 1
+
+#include <stddef.h>
+
+struct ravl;
+struct ravl_node;
+
+enum ravl_predicate {
+	RAVL_PREDICATE_EQUAL		=	1 << 0,
+	RAVL_PREDICATE_GREATER		=	1 << 1,
+	RAVL_PREDICATE_LESS		=	1 << 2,
+	RAVL_PREDICATE_LESS_EQUAL	=
+		RAVL_PREDICATE_EQUAL | RAVL_PREDICATE_LESS,
+	RAVL_PREDICATE_GREATER_EQUAL	=
+		RAVL_PREDICATE_EQUAL | RAVL_PREDICATE_GREATER,
+};
+
+typedef int ravl_compare(const void *lhs, const void *rhs);
+typedef void ravl_cb(void *data, void *arg);
+typedef void ravl_constr(void *data, size_t data_size, const void *arg);
+
+struct ravl *ravl_new(ravl_compare *compare);
+struct ravl *ravl_new_sized(ravl_compare *compare, size_t data_size);
+void ravl_delete(struct ravl *ravl);
+void ravl_delete_cb(struct ravl *ravl, ravl_cb cb, void *arg);
+void ravl_foreach(struct ravl *ravl, ravl_cb cb, void *arg);
+int ravl_empty(struct ravl *ravl);
+void ravl_clear(struct ravl *ravl);
+int ravl_insert(struct ravl *ravl, const void *data);
+int ravl_emplace(struct ravl *ravl, ravl_constr constr, const void *arg);
+int ravl_emplace_copy(struct ravl *ravl, const void *data);
+
+struct ravl_node *ravl_find(struct ravl *ravl, const void *data,
+	enum ravl_predicate predicate_flags);
+struct ravl_node *ravl_first(struct ravl *ravl);
+struct ravl_node *ravl_last(struct ravl *ravl);
+void *ravl_data(struct ravl_node *node);
+void ravl_remove(struct ravl *ravl, struct ravl_node *node);
+
+#endif /* __DAOS_COMMON_RAVL_H */
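
For reference, a minimal usage sketch of the ravl API declared above, assuming an
int payload stored by value; the comparator and helper names are illustrative only:

static int
int_cmp(const void *lhs, const void *rhs)
{
	int l = *(const int *)lhs;
	int r = *(const int *)rhs;

	return (l > r) - (l < r);
}

static void
ravl_usage_sketch(void)
{
	struct ravl *tree = ravl_new_sized(int_cmp, sizeof(int));
	int key = 42;
	int probe = 40;
	int found = 0;

	if (tree == NULL)
		return;

	ravl_emplace_copy(tree, &key);	/* copies 42 into a new node */

	/* smallest element >= 40, i.e. the node holding 42 */
	struct ravl_node *n = ravl_find(tree, &probe, RAVL_PREDICATE_GREATER_EQUAL);

	if (n != NULL)
		found = *(int *)ravl_data(n);
	(void)found;

	ravl_delete(tree);
}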
diff --git a/src/common/dav_v2/ravl_interval.c b/src/common/dav_v2/ravl_interval.c
new file mode 100644
index 00000000000..de37ee167a0
--- /dev/null
+++ b/src/common/dav_v2/ravl_interval.c
@@ -0,0 +1,344 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2020-2022, Intel Corporation */
+
+/*
+ * ravl_interval.c -- ravl_interval implementation
+ */
+
+#include <stdbool.h>
+
+#include "ravl_interval.h"
+#include "sys_util.h"
+#include "ravl.h"
+
+/*
+ * ravl_interval - instance of the interval tree, holding the backing ravl
+ *                 tree and the min/max accessor functions
+ */
+struct ravl_interval {
+	struct ravl *tree;
+	ravl_interval_min *get_min;
+	ravl_interval_max *get_max;
+};
+
+/*
+ * ravl_interval_node -- structure holding the address and its min/max accessors
+ */
+struct ravl_interval_node {
+	void *addr;
+	ravl_interval_min *get_min;
+	ravl_interval_max *get_max;
+	bool overlap;
+};
+
+/*
+ * ravl_interval_compare -- compare intervals by their boundaries
+ */
+static int
+ravl_interval_compare(const void *lhs, const void *rhs)
+{
+	const struct ravl_interval_node *left = lhs;
+	const struct ravl_interval_node *right = rhs;
+
+	/*
+	 * when searching, the comparison should return the
+	 * earliest overlapping record
+	 */
+	if (left->overlap) {
+		if (left->get_min(left->addr) >= right->get_max(right->addr))
+			return 1;
+		if (left->get_min(left->addr) == right->get_min(right->addr))
+			return 0;
+		return -1;
+	}
+
+	/* when inserting, comparing shouldn't allow overlapping intervals */
+	if (left->get_min(left->addr) >= right->get_max(right->addr))
+		return 1;
+	if (left->get_max(left->addr) <= right->get_min(right->addr))
+		return -1;
+	return 0;
+}
+
+/*
+ * ravl_interval_delete -- finalize the ravl interval module
+ */
+void
+ravl_interval_delete(struct ravl_interval *ri)
+{
+	ravl_delete(ri->tree);
+	ri->tree = NULL;
+	D_FREE(ri);
+}
+
+/*
+ * ravl_interval_delete_cb -- finalize the ravl interval module, executing the
+ * provided callback function for each remaining entry.
+ */
+void
+ravl_interval_delete_cb(struct ravl_interval *ri, ravl_cb cb, void *arg)
+{
+	ravl_delete_cb(ri->tree, cb, arg);
+	ri->tree = NULL;
+	D_FREE(ri);
+}
+
+/*
+ * ravl_interval_new -- initialize the ravl interval module
+ */
+struct ravl_interval *
+ravl_interval_new(ravl_interval_min *get_min, ravl_interval_max *get_max)
+{
+	struct ravl_interval *interval;
+
+	D_ALLOC_PTR_NZ(interval);
+	if (!interval)
+		return NULL;
+
+	interval->tree = ravl_new_sized(ravl_interval_compare,
+			sizeof(struct ravl_interval_node));
+	if (!(interval->tree))
+		goto free_alloc;
+
+	interval->get_min = get_min;
+	interval->get_max = get_max;
+
+	return interval;
+
+free_alloc:
+	D_FREE(interval);
+	return NULL;
+}
+
+/*
+ * ravl_interval_insert -- insert interval entry into the tree
+ */
+int
+ravl_interval_insert(struct ravl_interval *ri, void *addr)
+{
+	struct ravl_interval_node rin;
+
+	rin.addr = addr;
+	rin.get_min = ri->get_min;
+	rin.get_max = ri->get_max;
+	rin.overlap = false;
+
+	int ret = ravl_emplace_copy(ri->tree, &rin);
+
+	if (ret && errno)
+		return -errno;
+
+	return ret;
+}
+
+/*
+ * ravl_interval_remove -- remove interval entry from the tree
+ */
+int
+ravl_interval_remove(struct ravl_interval *ri, struct ravl_interval_node *rin)
+{
+	struct ravl_node *node = ravl_find(ri->tree, rin,
+			RAVL_PREDICATE_EQUAL);
+	if (!node)
+		return -ENOENT;
+
+	ravl_remove(ri->tree, node);
+
+	return 0;
+}
+
+/*
+ * ravl_interval_find_prior -- find overlapping interval starting prior to
+ *                             the current one
+ */
+static struct ravl_interval_node *
+ravl_interval_find_prior(struct ravl *tree, struct ravl_interval_node *rin)
+{
+	struct ravl_node *node;
+	struct ravl_interval_node *cur;
+
+	node = ravl_find(tree, rin, RAVL_PREDICATE_LESS);
+	if (!node)
+		return NULL;
+
+	cur = ravl_data(node);
+	/*
+	 * If the end of the found interval is below the searched boundary, then
+	 * those intervals are not overlapping.
+	 */
+	if (cur->get_max(cur->addr) <= rin->get_min(rin->addr))
+		return NULL;
+
+	return cur;
+}
+
+/*
+ * ravl_interval_find_eq -- find overlapping interval starting neither prior to
+ *                          nor later than the current one
+ */
+static struct ravl_interval_node *
+ravl_interval_find_eq(struct ravl *tree, struct ravl_interval_node *rin)
+{
+	struct ravl_node *node;
+
+	node = ravl_find(tree, rin, RAVL_PREDICATE_EQUAL);
+	if (!node)
+		return NULL;
+
+	return ravl_data(node);
+}
+
+/*
+ * ravl_interval_find_later -- find overlapping interval starting later than
+ *                             the current one
+ */
+static struct ravl_interval_node *
+ravl_interval_find_later(struct ravl *tree, struct ravl_interval_node *rin)
+{
+	struct ravl_node *node;
+	struct ravl_interval_node *cur;
+
+	node = ravl_find(tree, rin, RAVL_PREDICATE_GREATER);
+	if (!node)
+		return NULL;
+
+	cur = ravl_data(node);
+
+	/*
+	 * If the beginning of the found interval is above the end of
+	 * the searched range, then those intervals are not overlapping.
+	 */
+	if (cur->get_min(cur->addr) >= rin->get_max(rin->addr))
+		return NULL;
+
+	return cur;
+}
+
+/*
+ * ravl_interval_find_equal -- find the interval with exact (min, max) range
+ */
+struct ravl_interval_node *
+ravl_interval_find_equal(struct ravl_interval *ri, void *addr)
+{
+	struct ravl_interval_node range;
+
+	range.addr = addr;
+	range.get_min = ri->get_min;
+	range.get_max = ri->get_max;
+	range.overlap = true;
+
+	struct ravl_node *node;
+
+	node = ravl_find(ri->tree, &range, RAVL_PREDICATE_EQUAL);
+	if (!node)
+		return NULL;
+
+	return ravl_data(node);
+}
+
+/*
+ * ravl_interval_find -- find the earliest interval within (min, max) range
+ */
+struct ravl_interval_node *
+ravl_interval_find(struct ravl_interval *ri, void *addr)
+{
+	struct ravl_interval_node range;
+
+	range.addr = addr;
+	range.get_min = ri->get_min;
+	range.get_max = ri->get_max;
+	range.overlap = true;
+
+	struct ravl_interval_node *cur;
+
+	cur = ravl_interval_find_prior(ri->tree, &range);
+	if (!cur)
+		cur = ravl_interval_find_eq(ri->tree, &range);
+	if (!cur)
+		cur = ravl_interval_find_later(ri->tree, &range);
+
+	return cur;
+}
+
+/*
+ * ravl_interval_data -- returns the data contained within an interval node
+ */
+void *
+ravl_interval_data(struct ravl_interval_node *rin)
+{
+	return (void *)rin->addr;
+}
+
+/*
+ * ravl_interval_find_first -- returns first interval in the tree
+ */
+struct ravl_interval_node *
+ravl_interval_find_first(struct ravl_interval *ri)
+{
+	struct ravl_node *first;
+
+	first = ravl_first(ri->tree);
+	if (first)
+		return ravl_data(first);
+
+	return NULL;
+}
+
+/*
+ * ravl_interval_find_last -- returns last interval in the tree
+ */
+struct ravl_interval_node *
+ravl_interval_find_last(struct ravl_interval *ri)
+{
+	struct ravl_node *last;
+
+	last = ravl_last(ri->tree);
+	if (last)
+		return ravl_data(last);
+
+	return NULL;
+}
+
+/*
+ * ravl_interval_find_next -- returns interval succeeding the one provided
+ */
+struct ravl_interval_node *
+ravl_interval_find_next(struct ravl_interval *ri, void *addr)
+{
+	struct ravl_interval_node range;
+
+	range.addr = addr;
+	range.get_min = ri->get_min;
+	range.get_max = ri->get_max;
+	range.overlap = true;
+
+	struct ravl_node *next = NULL;
+
+	next = ravl_find(ri->tree, &range, RAVL_PREDICATE_GREATER);
+	if (next)
+		return ravl_data(next);
+
+	return NULL;
+}
+
+/*
+ * ravl_interval_find_prev -- returns interval preceding the one provided
+ */
+struct ravl_interval_node *
+ravl_interval_find_prev(struct ravl_interval *ri, void *addr)
+{
+	struct ravl_interval_node range;
+
+	range.addr = addr;
+	range.get_min = ri->get_min;
+	range.get_max = ri->get_max;
+	range.overlap = true;
+
+	struct ravl_node *prev = NULL;
+
+	prev = ravl_find(ri->tree, &range, RAVL_PREDICATE_LESS);
+	if (prev)
+		return ravl_data(prev);
+
+	return NULL;
+}
diff --git a/src/common/dav_v2/ravl_interval.h b/src/common/dav_v2/ravl_interval.h
new file mode 100644
index 00000000000..0f1370da713
--- /dev/null
+++ b/src/common/dav_v2/ravl_interval.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2020-2021, Intel Corporation */
+
+/*
+ * ravl_interval.h -- internal definitions for ravl_interval
+ */
+
+#ifndef RAVL_INTERVAL_H
+#define RAVL_INTERVAL_H
+
+#include "ravl.h"
+
+struct ravl_interval;
+struct ravl_interval_node;
+
+typedef size_t ravl_interval_min(void *addr);
+typedef size_t ravl_interval_max(void *addr);
+
+struct ravl_interval *ravl_interval_new(ravl_interval_min *min,
+		ravl_interval_max *max);
+void ravl_interval_delete(struct ravl_interval *ri);
+void ravl_interval_delete_cb(struct ravl_interval *ri, ravl_cb cb, void *arg);
+int ravl_interval_insert(struct ravl_interval *ri, void *addr);
+int ravl_interval_remove(struct ravl_interval *ri,
+		struct ravl_interval_node *rin);
+struct ravl_interval_node *ravl_interval_find_equal(struct ravl_interval *ri,
+		void *addr);
+struct ravl_interval_node *ravl_interval_find(struct ravl_interval *ri,
+		void *addr);
+struct ravl_interval_node *ravl_interval_find_first(struct ravl_interval *ri);
+struct ravl_interval_node *ravl_interval_find_last(struct ravl_interval *ri);
+struct ravl_interval_node *ravl_interval_find_next(struct ravl_interval *ri,
+		void *addr);
+struct ravl_interval_node *ravl_interval_find_prev(struct ravl_interval *ri,
+		void *addr);
+void *ravl_interval_data(struct ravl_interval_node *rin);
+#endif
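
For reference, a minimal sketch of driving the ravl_interval API above; struct
my_range and its accessors are hypothetical and only illustrate that the tree
stores user pointers and queries them through the min/max callbacks:

/* hypothetical [start, end) range object tracked by the interval tree */
struct my_range {
	size_t start;
	size_t end;
};

static size_t
my_range_min(void *addr)
{
	return ((struct my_range *)addr)->start;
}

static size_t
my_range_max(void *addr)
{
	return ((struct my_range *)addr)->end;
}

static void
ravl_interval_usage_sketch(struct my_range *r)
{
	struct ravl_interval *ri = ravl_interval_new(my_range_min, my_range_max);

	if (ri == NULL)
		return;

	ravl_interval_insert(ri, r);		/* returns -EEXIST on overlap */

	struct my_range probe = { .start = r->start, .end = r->start + 1 };
	struct ravl_interval_node *rin = ravl_interval_find(ri, &probe);

	if (rin != NULL)
		(void)ravl_interval_data(rin);	/* back to the struct my_range */

	ravl_interval_delete(ri);
}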
diff --git a/src/common/dav_v2/recycler.c b/src/common/dav_v2/recycler.c
new file mode 100644
index 00000000000..de948a9f9c5
--- /dev/null
+++ b/src/common/dav_v2/recycler.c
@@ -0,0 +1,323 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2023, Intel Corporation */
+
+/*
+ * recycler.c -- implementation of run recycler
+ */
+
+#include "heap.h"
+#include "recycler.h"
+#include "vec.h"
+#include "out.h"
+#include "util.h"
+#include "sys_util.h"
+#include "ravl.h"
+#include "valgrind_internal.h"
+
+#define THRESHOLD_MUL 4
+
+/*
+ * recycler_element_cmp -- compares two recycler elements
+ */
+static int
+recycler_element_cmp(const void *lhs, const void *rhs)
+{
+	const struct recycler_element *l = lhs;
+	const struct recycler_element *r = rhs;
+
+	int64_t diff = (int64_t)l->max_free_block - (int64_t)r->max_free_block;
+
+	if (diff != 0)
+		return diff > 0 ? 1 : -1;
+
+	diff = (int64_t)l->free_space - (int64_t)r->free_space;
+	if (diff != 0)
+		return diff > 0 ? 1 : -1;
+
+	diff = (int64_t)l->zone_id - (int64_t)r->zone_id;
+	if (diff != 0)
+		return diff > 0 ? 1 : -1;
+
+	diff = (int64_t)l->chunk_id - (int64_t)r->chunk_id;
+	if (diff != 0)
+		return diff > 0 ? 1 : -1;
+
+	return 0;
+}
+
+struct recycler {
+	struct ravl *runs;
+	struct palloc_heap *heap;
+	struct zoneset     *zset;
+
+	/*
+	 * How many unaccounted units there *might* be inside of the memory
+	 * blocks stored in the recycler.
+	 * The value is not meant to be accurate, but rather a rough measure of
+	 * how often the memory block scores should be recalculated.
+	 *
+	 * Per-chunk unaccounted units are shared for all zones, which might
+	 * lead to some unnecessary recalculations.
+	 */
+	size_t unaccounted_units[MAX_CHUNK];
+	size_t unaccounted_total;
+	size_t              nallocs;
+
+	VEC(, struct recycler_element) recalc;
+
+	pthread_mutex_t lock;
+};
+
+/*
+ * recycler_new -- creates new recycler instance
+ */
+struct recycler *
+recycler_new(struct palloc_heap *heap, size_t nallocs, struct zoneset *zset)
+{
+	struct recycler *r;
+
+	D_ALLOC_PTR_NZ(r);
+	if (r == NULL)
+		goto error_alloc_recycler;
+
+	r->runs = ravl_new_sized(recycler_element_cmp,
+		sizeof(struct recycler_element));
+	if (r->runs == NULL)
+		goto error_alloc_tree;
+
+	r->heap = heap;
+	r->nallocs = nallocs;
+	r->zset              = zset;
+	r->unaccounted_total = 0;
+	memset(&r->unaccounted_units, 0, sizeof(r->unaccounted_units));
+
+	VEC_INIT(&r->recalc);
+
+	util_mutex_init(&r->lock);
+
+	return r;
+
+error_alloc_tree:
+	D_FREE(r);
+error_alloc_recycler:
+	return NULL;
+}
+
+/*
+ * recycler_delete -- deletes recycler instance
+ */
+void
+recycler_delete(struct recycler *r)
+{
+	VEC_DELETE(&r->recalc);
+
+	util_mutex_destroy(&r->lock);
+	ravl_delete(r->runs);
+	D_FREE(r);
+}
+
+/*
+ * recycler_element_new -- calculates how many free bytes a run has and the
+ *	largest request that the run can handle, and returns that as a
+ *	recycler element struct
+ */
+struct recycler_element
+recycler_element_new(struct palloc_heap *heap, const struct memory_block *m)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(heap);
+
+	/*
+	 * Counting of the clear bits can race with a concurrent deallocation
+	 * that operates on the same run. This race is benign and has absolutely
+	 * no effect on the correctness of this algorithm. Ideally, we would
+	 * avoid grabbing the lock, but helgrind gets very confused if we
+	 * try to disable reporting for this function.
+	 */
+	pthread_mutex_t *lock = m->m_ops->get_lock(m);
+
+	util_mutex_lock(lock);
+
+	struct recycler_element e = {
+		.free_space = 0,
+		.max_free_block = 0,
+		.chunk_id = m->chunk_id,
+		.zone_id = m->zone_id,
+	};
+	m->m_ops->calc_free(m, &e.free_space, &e.max_free_block);
+
+	util_mutex_unlock(lock);
+
+	return e;
+}
+
+/*
+ * recycler_put -- inserts new run into the recycler
+ */
+int
+recycler_put(struct recycler *r, struct recycler_element element)
+{
+	int ret = 0;
+
+	util_mutex_lock(&r->lock);
+
+	ret = ravl_emplace_copy(r->runs, &element);
+
+	util_mutex_unlock(&r->lock);
+
+	return ret;
+}
+
+/*
+ * recycler_get -- retrieves a chunk from the recycler
+ */
+int
+recycler_get(struct recycler *r, struct memory_block *m)
+{
+	int ret = 0;
+
+	util_mutex_lock(&r->lock);
+
+	struct recycler_element e = { .max_free_block = m->size_idx, 0, 0, 0};
+	struct ravl_node *n = ravl_find(r->runs, &e,
+		RAVL_PREDICATE_GREATER_EQUAL);
+	if (n == NULL) {
+		ret = ENOMEM;
+		goto out;
+	}
+
+	struct recycler_element *ne = ravl_data(n);
+
+	m->chunk_id = ne->chunk_id;
+	m->zone_id = ne->zone_id;
+
+	ravl_remove(r->runs, n);
+
+	struct chunk_header *hdr = heap_get_chunk_hdr(r->heap, m);
+
+	m->size_idx = hdr->size_idx;
+
+	memblock_rebuild_state(r->heap, m);
+
+out:
+	util_mutex_unlock(&r->lock);
+
+	return ret;
+}
+
+/*
+ * recycler_recalc -- recalculates the scores of runs in the recycler to match
+ *	the updated persistent state
+ */
+struct empty_runs
+recycler_recalc(struct recycler *r, int force)
+{
+	struct empty_runs runs;
+
+	VEC_INIT(&runs);
+
+	uint64_t units = r->unaccounted_total;
+
+	uint64_t recalc_threshold = THRESHOLD_MUL * r->nallocs;
+
+	if (!force && units < recalc_threshold)
+		return runs;
+
+	if (util_mutex_trylock(&r->lock) != 0)
+		return runs;
+
+	/* If the search is forced, recalculate everything */
+	uint64_t search_limit = force ? UINT64_MAX : units;
+
+	uint64_t found_units = 0;
+	struct memory_block nm = MEMORY_BLOCK_NONE;
+	struct ravl_node *n;
+	struct recycler_element next = {0, 0, 0, 0};
+	enum ravl_predicate p = RAVL_PREDICATE_GREATER_EQUAL;
+
+	do {
+		n = ravl_find(r->runs, &next, p);
+		if (n == NULL)
+			break;
+
+		p = RAVL_PREDICATE_GREATER;
+
+		struct recycler_element *ne = ravl_data(n);
+
+		next = *ne;
+
+		uint64_t chunk_units = r->unaccounted_units[ne->chunk_id];
+
+		if (!force && chunk_units == 0)
+			continue;
+
+		uint32_t existing_free_space = ne->free_space;
+
+		nm.chunk_id = ne->chunk_id;
+		nm.zone_id = ne->zone_id;
+		memblock_rebuild_state(r->heap, &nm);
+
+		struct recycler_element e = recycler_element_new(r->heap, &nm);
+
+		ASSERT(e.free_space >= existing_free_space);
+		uint64_t free_space_diff = e.free_space - existing_free_space;
+
+		found_units += free_space_diff;
+
+		if (free_space_diff == 0)
+			continue;
+
+		/*
+		 * Decrease the per-chunk_id counter by the number of free units
+		 * found, plus nallocs (the blocks potentially freed in the
+		 * active memory block). Cap the subtracted value so the counter
+		 * does not wrap around.
+		 */
+		util_fetch_and_sub64(&r->unaccounted_units[nm.chunk_id],
+			MIN(chunk_units, free_space_diff + r->nallocs));
+
+		ravl_remove(r->runs, n);
+
+		if (e.free_space == r->nallocs) {
+			memblock_rebuild_state(r->heap, &nm);
+			if (VEC_PUSH_BACK(&runs, nm) != 0)
+				ASSERT(0); /* XXX: fix after refactoring */
+		} else {
+			VEC_PUSH_BACK(&r->recalc, e);
+		}
+	} while (found_units < search_limit);
+
+	struct recycler_element *e;
+
+	VEC_FOREACH_BY_PTR(e, &r->recalc) {
+		ravl_emplace_copy(r->runs, e);
+	}
+
+	VEC_CLEAR(&r->recalc);
+
+	util_mutex_unlock(&r->lock);
+
+	util_fetch_and_sub64(&r->unaccounted_total, units);
+
+	return runs;
+}
+
+/*
+ * recycler_inc_unaccounted -- increases the number of unaccounted units in the
+ *	recycler
+ */
+void
+recycler_inc_unaccounted(struct recycler *r, const struct memory_block *m)
+{
+	util_fetch_and_add64(&r->unaccounted_total, m->size_idx);
+	util_fetch_and_add64(&r->unaccounted_units[m->chunk_id],
+		m->size_idx);
+}
+
+/*
+ * Return the zoneset associated with the recycler.
+ */
+struct zoneset *
+recycler_get_zoneset(struct recycler *r)
+{
+	return r->zset;
+}
diff --git a/src/common/dav_v2/recycler.h b/src/common/dav_v2/recycler.h
new file mode 100644
index 00000000000..7904289937d
--- /dev/null
+++ b/src/common/dav_v2/recycler.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2023, Intel Corporation */
+
+/*
+ * recycler.h -- internal definitions of run recycler
+ *
+ * This is a container that stores runs that are currently not used by any of
+ * the buckets.
+ */
+
+#ifndef __DAOS_COMMON_RECYCLER_H
+#define __DAOS_COMMON_RECYCLER_H 1
+
+#include "memblock.h"
+#include "vec.h"
+
+struct recycler;
+VEC(empty_runs, struct memory_block);
+
+struct recycler_element {
+	uint32_t max_free_block;
+	uint32_t free_space;
+
+	uint32_t chunk_id;
+	uint32_t zone_id;
+};
+
+struct recycler      *
+recycler_new(struct palloc_heap *heap, size_t nallocs, struct zoneset *zset);
+void recycler_delete(struct recycler *r);
+struct recycler_element recycler_element_new(struct palloc_heap *heap,
+	const struct memory_block *m);
+
+int recycler_put(struct recycler *r, struct recycler_element element);
+
+int recycler_get(struct recycler *r, struct memory_block *m);
+
+struct empty_runs recycler_recalc(struct recycler *r, int force);
+
+void recycler_inc_unaccounted(struct recycler *r,
+	const struct memory_block *m);
+
+struct zoneset *
+recycler_get_zoneset(struct recycler *r);
+
+#endif /* __DAOS_COMMON_RECYCLER_H */
diff --git a/src/common/dav_v2/stats.c b/src/common/dav_v2/stats.c
new file mode 100644
index 00000000000..d7162a462f0
--- /dev/null
+++ b/src/common/dav_v2/stats.c
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2017-2021, Intel Corporation */
+
+/*
+ * stats.c -- implementation of statistics
+ */
+
+#include <errno.h>
+
+#include "dav_internal.h"
+#include "obj.h"
+#include "stats.h"
+
+/*
+ * stats_new -- allocates and initializes statistics instance
+ */
+struct stats *
+stats_new(dav_obj_t *pop)
+{
+	struct stats *s;
+
+	D_ALLOC_PTR_NZ(s);
+	if (s == NULL) {
+		D_CRIT("Malloc\n");
+		return NULL;
+	}
+
+	s->persistent = &pop->do_phdr->dp_stats_persistent;
+	VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(s->persistent, sizeof(*s->persistent));
+	D_ALLOC_PTR(s->transient);
+	if (s->transient == NULL)
+		goto error_transient_alloc;
+
+	return s;
+
+error_transient_alloc:
+	D_FREE(s);
+	return NULL;
+}
+
+/*
+ * stats_delete -- deletes statistics instance
+ */
+void
+stats_delete(dav_obj_t *pop, struct stats *s)
+{
+	D_FREE(s->transient);
+	D_FREE(s);
+}
+
+/*
+ * stats_persist -- saves the persistent statistics to the WAL
+ */
+void
+stats_persist(dav_obj_t *pop, struct stats *s)
+{
+	if (s->transient->heap_prev_pval !=
+	    s->persistent->heap_curr_allocated) {
+		mo_wal_persist(&pop->p_ops, s->persistent,
+			       sizeof(struct stats_persistent));
+		s->transient->heap_prev_pval =
+		    s->persistent->heap_curr_allocated;
+	}
+}
+
+DAV_FUNC_EXPORT int
+dav_get_heap_stats_v2(dav_obj_t *pop, struct dav_heap_stats *st)
+{
+	if ((pop == NULL) || (st == NULL)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	st->curr_allocated = pop->do_stats->persistent->heap_curr_allocated;
+	st->run_allocated = pop->do_stats->transient->heap_run_allocated;
+	st->run_active = pop->do_stats->transient->heap_run_active;
+	return 0;
+}
diff --git a/src/common/dav_v2/stats.h b/src/common/dav_v2/stats.h
new file mode 100644
index 00000000000..ab3a0e33ee0
--- /dev/null
+++ b/src/common/dav_v2/stats.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2017-2021, Intel Corporation */
+
+/*
+ * stats.h -- definitions of statistics
+ */
+
+#ifndef __DAOS_COMMON_STATS_H
+#define __DAOS_COMMON_STATS_H 1
+
+struct stats_transient {
+	uint64_t heap_run_allocated;
+	uint64_t heap_run_active;
+	uint64_t heap_prev_pval; /* previous persisted value of curr allocated */
+};
+
+struct stats_persistent {
+	uint64_t heap_curr_allocated;
+};
+
+struct stats {
+	struct stats_transient *transient;
+	struct stats_persistent *persistent;
+};
+
+#define STATS_INC(stats, type, name, value) \
+	STATS_INC_##type(stats, name, value)
+
+#define STATS_INC_transient(stats, name, value)\
+	util_fetch_and_add64((&(stats)->transient->name), (value))
+
+#define STATS_INC_persistent(stats, name, value)\
+	util_fetch_and_add64((&(stats)->persistent->name), (value))
+
+#define STATS_SUB(stats, type, name, value)\
+	STATS_SUB_##type(stats, name, value)
+
+#define STATS_SUB_transient(stats, name, value)\
+	util_fetch_and_sub64((&(stats)->transient->name), (value))
+
+#define STATS_SUB_persistent(stats, name, value)\
+	util_fetch_and_sub64((&(stats)->persistent->name), (value))
+
+#define STATS_SET(stats, type, name, value)\
+	STATS_SET_##type(stats, name, value)
+
+#define STATS_SET_transient(stats, name, value)\
+	util_atomic_store_explicit64((&(stats)->transient->name),\
+		(value), memory_order_release)
+
+#define STATS_SET_persistent(stats, name, value)\
+	util_atomic_store_explicit64((&(stats)->persistent->name),\
+		(value), memory_order_release)
+
+struct dav_obj;
+
+struct stats *stats_new(struct dav_obj *pop);
+void stats_delete(struct dav_obj *pop, struct stats *stats);
+void stats_persist(struct dav_obj *pop, struct stats *s);
+
+#endif /* __DAOS_COMMON_STATS_H */
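
For reference, a minimal sketch of how the STATS_* macros above are meant to be
used, assuming a valid struct stats instance; the helper name and byte count are
illustrative only:

/* account for a hypothetical allocation of `size` bytes */
static void
stats_usage_sketch(struct stats *s, size_t size)
{
	/* persistent counter; written back by stats_persist() on lw_tx_end() */
	STATS_INC(s, persistent, heap_curr_allocated, size);

	/* transient, in-memory counter */
	STATS_INC(s, transient, heap_run_allocated, size);
}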
diff --git a/src/common/dav_v2/sys_util.h b/src/common/dav_v2/sys_util.h
new file mode 100644
index 00000000000..79d1a4f12d7
--- /dev/null
+++ b/src/common/dav_v2/sys_util.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2020, Intel Corporation */
+
+/*
+ * sys_util.h -- internal utility wrappers around system functions
+ */
+
+#ifndef __DAOS_COMMON_SYS_UTIL_H
+#define __DAOS_COMMON_SYS_UTIL_H 1
+
+#include <errno.h>
+
+#include <gurt/common.h>
+#include "out.h"
+
+/*
+ * util_mutex_init -- os_mutex_init variant that never fails from
+ * caller perspective. If os_mutex_init failed, this function aborts
+ * the program.
+ */
+static inline void
+util_mutex_init(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_INIT(m, NULL);
+
+	D_ASSERTF(tmp == 0, "!os_mutex_init");
+}
+
+/*
+ * util_mutex_destroy -- os_mutex_destroy variant that never fails from
+ * caller perspective. If os_mutex_destroy failed, this function aborts
+ * the program.
+ */
+static inline void
+util_mutex_destroy(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_DESTROY(m);
+
+	D_ASSERTF(tmp == 0, "!os_mutex_destroy");
+}
+
+/*
+ * util_mutex_lock -- os_mutex_lock variant that never fails from
+ * caller perspective. If os_mutex_lock failed, this function aborts
+ * the program.
+ */
+static inline void
+util_mutex_lock(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_LOCK(m);
+
+	D_ASSERTF(tmp == 0, "!os_mutex_lock");
+}
+
+/*
+ * util_mutex_trylock -- os_mutex_trylock variant that never fails from
+ * caller perspective (other than EBUSY). Any other failure aborts the
+ * program.
+ * Returns 0 if locked successfully, otherwise returns EBUSY.
+ */
+static inline int
+util_mutex_trylock(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_TRYLOCK(m);
+
+	D_ASSERTF((!tmp || (tmp == -DER_BUSY)), "!os_mutex_trylock");
+	return tmp ? EBUSY : 0;
+}
+
+/*
+ * util_mutex_unlock -- os_mutex_unlock variant that never fails from
+ * caller perspective. If os_mutex_unlock failed, this function aborts
+ * the program.
+ */
+static inline void
+util_mutex_unlock(pthread_mutex_t *m)
+{
+	int tmp = D_MUTEX_UNLOCK(m);
+
+	D_ASSERTF(tmp == 0, "!os_mutex_unlock");
+}
+
+#endif /* __DAOS_COMMON_SYS_UTIL_H */
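
For reference, a minimal sketch of the intended usage of the wrappers above;
since every failure other than EBUSY aborts the program, callers can ignore
return codes. The helper name is hypothetical and the mutex is assumed to have
been set up with util_mutex_init():

static void
sketch_locked_increment(pthread_mutex_t *lock, uint64_t *counter)
{
	util_mutex_lock(lock);		/* aborts instead of returning an error */
	*counter += 1;
	util_mutex_unlock(lock);
}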
diff --git a/src/common/dav_v2/tx.c b/src/common/dav_v2/tx.c
new file mode 100644
index 00000000000..6d08757ea70
--- /dev/null
+++ b/src/common/dav_v2/tx.c
@@ -0,0 +1,1855 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2023, Intel Corporation */
+
+/*
+ * tx.c -- transactions implementation
+ */
+
+#include <inttypes.h>
+#include <wchar.h>
+#include <errno.h>
+
+#include "queue.h"
+#include "ravl.h"
+#include "obj.h"
+#include "out.h"
+#include "tx.h"
+#include "valgrind_internal.h"
+#include "memops.h"
+#include "dav_internal.h"
+
+struct tx_data {
+	DAV_SLIST_ENTRY(tx_data) tx_entry;
+	jmp_buf env;
+	enum dav_tx_failure_behavior failure_behavior;
+};
+
+struct tx {
+	dav_obj_t *pop;
+	enum dav_tx_stage stage;
+	int last_errnum;
+
+	DAV_SLIST_HEAD(txd, tx_data) tx_entries;
+
+	struct ravl *ranges;
+
+	VEC(, struct dav_action) actions;
+
+	dav_tx_callback stage_callback;
+	void *stage_callback_arg;
+
+	int first_snapshot;
+};
+
+/*
+ * get_tx -- returns current transaction
+ *
+ * This function should be used only in high-level functions.
+ */
+static struct tx *
+get_tx()
+{
+	static __thread struct tx tx;
+
+	return &tx;
+}
+
+struct tx_alloc_args {
+	uint64_t flags;
+	const void *copy_ptr;
+	size_t copy_size;
+};
+
+#define ALLOC_ARGS(flags)\
+(struct tx_alloc_args){flags, NULL, 0}
+
+struct tx_range_def {
+	uint64_t offset;
+	uint64_t size;
+	uint64_t flags;
+};
+
+/*
+ * tx_range_def_cmp -- compares two snapshot ranges
+ */
+static int
+tx_range_def_cmp(const void *lhs, const void *rhs)
+{
+	const struct tx_range_def *l = lhs;
+	const struct tx_range_def *r = rhs;
+
+	if (l->offset > r->offset)
+		return 1;
+	else if (l->offset < r->offset)
+		return -1;
+
+	return 0;
+}
+
+static void
+obj_tx_abort(int errnum, int user);
+
+/*
+ * obj_tx_fail_err -- (internal) dav_tx_abort variant that returns
+ * error code
+ */
+static inline int
+obj_tx_fail_err(int errnum, uint64_t flags)
+{
+	if ((flags & DAV_FLAG_TX_NO_ABORT) == 0)
+		obj_tx_abort(errnum, 0);
+	errno = errnum;
+	return errnum;
+}
+
+/*
+ * obj_tx_fail_null -- (internal) dav_tx_abort variant that returns
+ * a zero offset
+ */
+static inline uint64_t
+obj_tx_fail_null(int errnum, uint64_t flags)
+{
+	if ((flags & DAV_FLAG_TX_NO_ABORT) == 0)
+		obj_tx_abort(errnum, 0);
+	errno = errnum;
+	return 0;
+}
+
+/* ASSERT_IN_TX -- checks whether there's an open transaction */
+#define ASSERT_IN_TX(tx) do {\
+	if ((tx)->stage == DAV_TX_STAGE_NONE)\
+		FATAL("%s called outside of transaction", __func__);\
+} while (0)
+
+/* ASSERT_TX_STAGE_WORK -- checks whether current transaction stage is WORK */
+#define ASSERT_TX_STAGE_WORK(tx) do {\
+	if ((tx)->stage != DAV_TX_STAGE_WORK)\
+		FATAL("%s called in invalid stage %d", __func__, (tx)->stage);\
+} while (0)
+
+/*
+ * tx_action_reserve -- (internal) reserve space for the given number of actions
+ */
+static int
+tx_action_reserve(struct tx *tx, size_t n)
+{
+	size_t entries_size = (VEC_SIZE(&tx->actions) + n) *
+		sizeof(struct ulog_entry_val);
+
+	if (operation_reserve(tx->pop->external, entries_size) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * tx_action_add -- (internal) reserve space and add a new tx action
+ */
+static struct dav_action *
+tx_action_add(struct tx *tx)
+{
+	if (tx_action_reserve(tx, 1) != 0)
+		return NULL;
+
+	VEC_INC_BACK(&tx->actions);
+
+	return &VEC_BACK(&tx->actions);
+}
+
+/*
+ * tx_action_remove -- (internal) remove last tx action
+ */
+static void
+tx_action_remove(struct tx *tx)
+{
+	VEC_POP_BACK(&tx->actions);
+}
+
+/*
+ * constructor_tx_alloc -- (internal) constructor for normal alloc
+ */
+static int
+constructor_tx_alloc(void *ctx, void *ptr, size_t usable_size, void *arg)
+{
+	ASSERTne(ptr, NULL);
+	ASSERTne(arg, NULL);
+
+	struct tx_alloc_args *args = arg;
+
+	/* do not report changes to the new object */
+	VALGRIND_ADD_TO_TX(ptr, usable_size);
+
+	if (args->flags & DAV_FLAG_ZERO)
+		memset(ptr, 0, usable_size);
+
+	if (args->copy_ptr && args->copy_size != 0) {
+		FATAL("dav xalloc does not support copy_ptr\n");
+		memcpy(ptr, args->copy_ptr, args->copy_size);
+	}
+
+	return 0;
+}
+
+/*
+ * tx_restore_range -- (internal) restore a single range from undo log
+ */
+static void
+tx_restore_range(dav_obj_t *pop, struct ulog_entry_buf *range)
+{
+	void *begin, *end;
+	size_t size = range->size;
+	uint64_t range_offset = ulog_entry_offset(&range->base);
+
+	begin = OBJ_OFF_TO_PTR(pop, range_offset);
+	end = (char *)begin + size;
+	ASSERT((char *)end >= (char *)begin);
+
+	memcpy(begin, range->data, size);
+}
+
+/*
+ * tx_undo_entry_apply -- applies modifications of a single ulog entry
+ */
+static int
+tx_undo_entry_apply(struct ulog_entry_base *e, void *arg,
+		    const struct mo_ops *p_ops)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(arg);
+
+	struct ulog_entry_buf *eb;
+
+	switch (ulog_entry_type(e)) {
+	case ULOG_OPERATION_BUF_CPY:
+		eb = (struct ulog_entry_buf *)e;
+
+		tx_restore_range(p_ops->base, eb);
+		break;
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+	case ULOG_OPERATION_AND:
+	case ULOG_OPERATION_OR:
+#else
+	case ULOG_OPERATION_CLR_BITS:
+	case ULOG_OPERATION_SET_BITS:
+#endif
+	case ULOG_OPERATION_SET:
+	case ULOG_OPERATION_BUF_SET:
+	default:
+		ASSERT(0);
+	}
+
+	return 0;
+}
+
+/*
+ * tx_abort_set -- (internal) abort all set operations
+ */
+static void
+tx_abort_set(dav_obj_t *pop)
+{
+	ulog_foreach_entry((struct ulog *)&pop->clogs.undo,
+		tx_undo_entry_apply, NULL, &pop->p_ops);
+	operation_finish(pop->undo, ULOG_INC_FIRST_GEN_NUM);
+}
+
+/*
+ * tx_flush_range -- (internal) flush one range
+ */
+static void
+tx_flush_range(void *data, void *ctx)
+{
+	dav_obj_t *pop = ctx;
+	struct tx_range_def *range = data;
+
+	if (!(range->flags & DAV_FLAG_NO_FLUSH)) {
+		mo_wal_flush(&pop->p_ops, OBJ_OFF_TO_PTR(pop, range->offset),
+			     range->size, range->flags & DAV_XADD_WAL_CPTR);
+	}
+	VALGRIND_REMOVE_FROM_TX(OBJ_OFF_TO_PTR(pop, range->offset),
+				range->size);
+}
+
+/*
+ * tx_clean_range -- (internal) clean one range
+ */
+static void
+tx_clean_range(void *data, void *ctx)
+{
+	dav_obj_t *pop = ctx;
+	struct tx_range_def *range = data;
+
+	VALGRIND_REMOVE_FROM_TX(OBJ_OFF_TO_PTR(pop, range->offset),
+		range->size);
+	VALGRIND_SET_CLEAN(OBJ_OFF_TO_PTR(pop, range->offset), range->size);
+}
+
+/*
+ * tx_pre_commit -- (internal) do pre-commit operations
+ */
+static void
+tx_pre_commit(struct tx *tx)
+{
+	/* Flush all regions and destroy the whole tree. */
+	ravl_delete_cb(tx->ranges, tx_flush_range, tx->pop);
+	tx->ranges = NULL;
+}
+
+/*
+ * tx_abort -- (internal) abort all allocated objects
+ */
+static void
+tx_abort(dav_obj_t *pop)
+{
+	struct tx *tx = get_tx();
+
+	tx_abort_set(pop);
+
+	ravl_delete_cb(tx->ranges, tx_clean_range, pop);
+	palloc_cancel(pop->do_heap,
+		VEC_ARR(&tx->actions), VEC_SIZE(&tx->actions));
+	tx->ranges = NULL;
+}
+
+/*
+ * tx_ranges_insert_def -- (internal) allocates and inserts a new range
+ *	definition into the ranges tree
+ */
+static int
+tx_ranges_insert_def(dav_obj_t *pop, struct tx *tx,
+	const struct tx_range_def *rdef)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(pop);
+
+	DAV_DBG("(%lu,%lu) size=%zu",
+		rdef->offset / 4096, rdef->offset % 4096, rdef->size);
+
+	int ret = ravl_emplace_copy(tx->ranges, rdef);
+
+	if (ret && errno == EEXIST)
+		FATAL("invalid state of ranges tree");
+	return ret;
+}
+
+/*
+ * tx_alloc_common -- (internal) common function for alloc and zalloc
+ */
+static uint64_t
+tx_alloc_common(struct tx *tx, size_t size, type_num_t type_num,
+		palloc_constr constructor, struct tx_alloc_args args)
+{
+	const struct tx_range_def *r;
+	uint64_t off;
+
+	if (size > DAV_MAX_ALLOC_SIZE) {
+		ERR("requested size too large");
+		return obj_tx_fail_null(ENOMEM, args.flags);
+	}
+
+	dav_obj_t *pop = tx->pop;
+
+	struct dav_action *action = tx_action_add(tx);
+
+	if (action == NULL)
+		return obj_tx_fail_null(ENOMEM, args.flags);
+
+	if (palloc_reserve(pop->do_heap, size, constructor, &args, type_num, 0,
+			   CLASS_ID_FROM_FLAG(args.flags), EZONE_ID_FROM_FLAG(args.flags),
+			   action) != 0)
+		goto err_oom;
+
+	palloc_get_prange(action, &off, &size, 1);
+	r = &(struct tx_range_def){off, size, args.flags};
+	if (tx_ranges_insert_def(pop, tx, r) != 0)
+		goto err_oom;
+
+	return action->heap.offset;
+
+err_oom:
+	tx_action_remove(tx);
+	D_CRIT("out of memory\n");
+	return obj_tx_fail_null(ENOMEM, args.flags);
+}
+
+/*
+ * tx_create_wal_entry -- converts a single ulog UNDO entry into a WAL redo entry
+ */
+int
+tx_create_wal_entry(struct ulog_entry_base *e, void *arg,
+		    const struct mo_ops *p_ops)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(arg);
+
+	int			 rc = 0;
+	uint64_t		 offset = ulog_entry_offset(e);
+	daos_size_t		 dst_size = sizeof(uint64_t);
+	struct ulog_entry_val	*ev;
+	struct ulog_entry_buf	*eb;
+	uint64_t		 v;
+	uint64_t		*dst;
+
+	D_ASSERT(p_ops->base != NULL);
+	dst = (uint64_t *)((uintptr_t)((dav_obj_t *)p_ops->base)->do_base + offset);
+
+	switch (ulog_entry_type(e)) {
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+	case ULOG_OPERATION_AND:
+		ev = (struct ulog_entry_val *)e;
+		v = ev->value;
+
+		rc = dav_wal_tx_and(p_ops->base, dst, v);
+		break;
+	case ULOG_OPERATION_OR:
+		ev = (struct ulog_entry_val *)e;
+		v = ev->value;
+
+		rc = dav_wal_tx_or(p_ops->base, dst, v);
+		break;
+#else
+	case ULOG_OPERATION_CLR_BITS:
+		ev = (struct ulog_entry_val *)e;
+		v = ev->value;
+
+		rc = dav_wal_tx_clr_bits(p_ops->base, dst, ULOG_ENTRY_VAL_TO_POS(v),
+					 ULOG_ENTRY_VAL_TO_BITS(v));
+		break;
+	case ULOG_OPERATION_SET_BITS:
+		ev = (struct ulog_entry_val *)e;
+		v = ev->value;
+
+		rc = dav_wal_tx_set_bits(p_ops->base, dst, ULOG_ENTRY_VAL_TO_POS(v),
+					 ULOG_ENTRY_VAL_TO_BITS(v));
+		break;
+#endif
+	case ULOG_OPERATION_SET:
+		ev = (struct ulog_entry_val *)e;
+
+		rc = dav_wal_tx_snap(p_ops->base, dst, dst_size, (void *)&ev->value, 0);
+		break;
+	case ULOG_OPERATION_BUF_SET:
+		eb = (struct ulog_entry_buf *)e;
+
+		dst_size = eb->size;
+		rc = dav_wal_tx_set(p_ops->base, dst, 0, dst_size);
+		break;
+	case ULOG_OPERATION_BUF_CPY:
+		eb = (struct ulog_entry_buf *)e;
+
+		dst_size = eb->size;
+		/* The only undo entry from dav that needs to be
+		 * transformed into redo
+		 */
+		rc = dav_wal_tx_snap(p_ops->base, dst, dst_size, dst, 0);
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	return rc;
+}
+
+int
+lw_tx_begin(dav_obj_t *pop)
+{
+	struct umem_wal_tx	*utx = NULL;
+	int			 rc;
+	uint64_t		 wal_id;
+
+	rc = dav_wal_tx_reserve(pop, &wal_id);
+	if (rc) {
+		D_ERROR("so_wal_reserv failed, "DF_RC"\n", DP_RC(rc));
+		return rc;
+	}
+	if (pop->do_utx == NULL) {
+		utx = dav_umem_wtx_new(pop);
+		if (utx == NULL)
+			return obj_tx_fail_err(EINVAL, 0);
+	}
+	pop->do_utx->utx_id = wal_id;
+	return rc;
+}
+
+int
+lw_tx_end(dav_obj_t *pop, void *data)
+{
+	struct umem_wal_tx	*utx;
+	int			 rc;
+
+	/* Persist the frequently updated persistent globals */
+	stats_persist(pop, pop->do_stats);
+
+	utx = pop->do_utx;
+	D_ASSERT(utx != NULL);
+	pop->do_utx = NULL;
+
+	rc = dav_wal_tx_commit(pop, utx, data);
+	D_FREE(utx);
+	return rc;
+}
+
+/*
+ * dav_tx_begin -- initializes new transaction
+ */
+DAV_FUNC_EXPORT int
+dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...)
+{
+	int		 err = 0;
+	struct tx	*tx = get_tx();
+	uint64_t	 wal_id;
+
+	enum dav_tx_failure_behavior failure_behavior = DAV_TX_FAILURE_ABORT;
+
+	if (tx->stage == DAV_TX_STAGE_WORK) {
+		if (tx->pop != pop) {
+			ERR("nested transaction for different pool");
+			return obj_tx_fail_err(EINVAL, 0);
+		}
+
+		/* inherits this value from the parent transaction */
+		struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries);
+
+		failure_behavior = txd->failure_behavior;
+
+		VALGRIND_START_TX;
+	} else if (tx->stage == DAV_TX_STAGE_NONE) {
+		struct umem_wal_tx *utx = NULL;
+
+		DAV_DBG("");
+		err = dav_wal_tx_reserve(pop, &wal_id);
+		if (err) {
+			D_ERROR("so_wal_reserv failed, "DF_RC"\n", DP_RC(err));
+			goto err_abort;
+		}
+
+		if (pop->do_utx == NULL) {
+			utx = dav_umem_wtx_new(pop);
+			if (utx == NULL) {
+				err = ENOMEM;
+				goto err_abort;
+			}
+		}
+		pop->do_utx->utx_id = wal_id;
+
+		tx = get_tx();
+
+		VALGRIND_START_TX;
+
+		dav_hold_clogs(pop);
+		operation_start(pop->undo);
+
+		VEC_INIT(&tx->actions);
+		DAV_SLIST_INIT(&tx->tx_entries);
+
+		tx->ranges = ravl_new_sized(tx_range_def_cmp,
+			sizeof(struct tx_range_def));
+		tx->first_snapshot = 1;
+		tx->pop = pop;
+	} else {
+		FATAL("Invalid stage %d to begin new transaction", tx->stage);
+	}
+
+	struct tx_data *txd;
+
+	D_ALLOC_PTR_NZ(txd);
+	if (txd == NULL) {
+		err = errno;
+		D_CRIT("Malloc!\n");
+		goto err_abort;
+	}
+
+	tx->last_errnum = 0;
+	ASSERT(env == NULL);
+	if (env != NULL)
+		memcpy(txd->env, env, sizeof(jmp_buf));
+	else
+		memset(txd->env, 0, sizeof(jmp_buf));
+
+	txd->failure_behavior = failure_behavior;
+
+	DAV_SLIST_INSERT_HEAD(&tx->tx_entries, txd, tx_entry);
+
+	tx->stage = DAV_TX_STAGE_WORK;
+
+	/* handle locks */
+	va_list argp;
+
+	va_start(argp, env);
+
+	enum dav_tx_param param_type;
+
+	while ((param_type = va_arg(argp, enum dav_tx_param)) !=
+			DAV_TX_PARAM_NONE) {
+		if (param_type == DAV_TX_PARAM_CB) {
+			dav_tx_callback cb =
+					va_arg(argp, dav_tx_callback);
+			void *arg = va_arg(argp, void *);
+
+			if (tx->stage_callback &&
+					(tx->stage_callback != cb ||
+					tx->stage_callback_arg != arg)) {
+				FATAL(
+			 "transaction callback is already set, old %p new %p old_arg %p new_arg %p",
+					tx->stage_callback, cb,
+					tx->stage_callback_arg, arg);
+			}
+
+			tx->stage_callback = cb;
+			tx->stage_callback_arg = arg;
+		} else {
+			ASSERT(param_type == DAV_TX_PARAM_CB);
+		}
+	}
+	va_end(argp);
+
+	ASSERT(err == 0);
+	return 0;
+
+err_abort:
+	if (tx->stage == DAV_TX_STAGE_WORK)
+		obj_tx_abort(err, 0);
+	else
+		tx->stage = DAV_TX_STAGE_ONABORT;
+	return err;
+}
+
+/*
+ * tx_abort_on_failure_flag -- (internal) return 0 or DAV_FLAG_TX_NO_ABORT
+ * based on transaction setting
+ */
+static uint64_t
+tx_abort_on_failure_flag(struct tx *tx)
+{
+	struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries);
+
+	if (txd->failure_behavior == DAV_TX_FAILURE_RETURN)
+		return DAV_FLAG_TX_NO_ABORT;
+	return 0;
+}
+
+/*
+ * obj_tx_callback -- (internal) executes callback associated with current stage
+ */
+static void
+obj_tx_callback(struct tx *tx)
+{
+	if (!tx->stage_callback)
+		return;
+
+	struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries);
+
+	/* is this the outermost transaction? */
+	if (DAV_SLIST_NEXT(txd, tx_entry) == NULL)
+		tx->stage_callback(tx->pop, tx->stage, tx->stage_callback_arg);
+}
+
+/*
+ * dav_tx_stage -- returns current transaction stage
+ */
+DAV_FUNC_EXPORT enum dav_tx_stage
+dav_tx_stage_v2(void)
+{
+	return get_tx()->stage;
+}
+
+/*
+ * obj_tx_abort -- aborts current transaction
+ */
+static void
+obj_tx_abort(int errnum, int user)
+{
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+	ASSERT(tx->pop != NULL);
+
+	if (errnum == 0)
+		errnum = ECANCELED;
+
+	tx->stage = DAV_TX_STAGE_ONABORT;
+	struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries);
+
+	if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) {
+		/* this is the outermost transaction */
+
+		/* process the undo log */
+		tx_abort(tx->pop);
+
+		dav_release_clogs(tx->pop);
+	}
+
+	tx->last_errnum = errnum;
+	errno = errnum;
+	if (user) {
+		DAV_DBG("!explicit transaction abort");
+	}
+
+	/* ONABORT */
+	obj_tx_callback(tx);
+
+	if (!util_is_zeroed(txd->env, sizeof(jmp_buf)))
+		longjmp(txd->env, errnum);
+}
+
+/*
+ * dav_tx_abort -- aborts current transaction
+ *
+ * Note: this function should not be called from inside of dav.
+ */
+DAV_FUNC_EXPORT void
+dav_tx_abort_v2(int errnum)
+{
+	DAV_API_START();
+	DAV_DBG("");
+	obj_tx_abort(errnum, 1);
+	DAV_API_END();
+}
+
+/*
+ * dav_tx_errno -- returns last transaction error code
+ */
+DAV_FUNC_EXPORT int
+dav_tx_errno_v2(void)
+{
+	DAV_DBG("err:%d", get_tx()->last_errnum);
+
+	return get_tx()->last_errnum;
+}
+
+static void
+tx_post_commit(struct tx *tx)
+{
+	operation_finish(tx->pop->undo, 0);
+}
+
+/*
+ * dav_tx_commit -- commits current transaction
+ */
+DAV_FUNC_EXPORT void
+dav_tx_commit_v2(void)
+{
+	DAV_API_START();
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+	ASSERT(tx->pop);
+	DAV_DBG("");
+
+	/* WORK */
+	obj_tx_callback(tx);
+	dav_obj_t *pop = tx->pop;
+
+	struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries);
+
+	if (DAV_SLIST_NEXT(txd, tx_entry) == NULL) {
+		/* this is the outermost transaction */
+
+		/* pre-commit phase */
+		tx_pre_commit(tx);
+
+		mo_wal_drain(&pop->p_ops);
+
+		operation_start(pop->external);
+
+		palloc_publish(pop->do_heap, VEC_ARR(&tx->actions),
+			       VEC_SIZE(&tx->actions), pop->external);
+
+		tx_post_commit(tx);
+
+		dav_release_clogs(pop);
+	}
+
+	tx->stage = DAV_TX_STAGE_ONCOMMIT;
+
+	/* ONCOMMIT */
+	obj_tx_callback(tx);
+	DAV_API_END();
+}
+
+/*
+ * dav_tx_end -- ends current transaction
+ */
+DAV_FUNC_EXPORT int
+dav_tx_end_v2(void *data)
+{
+	struct tx *tx = get_tx();
+
+	if (tx->stage == DAV_TX_STAGE_WORK)
+		FATAL("dav_tx_end called without dav_tx_commit");
+
+	if (tx->pop == NULL)
+		FATAL("dav_tx_end called without dav_tx_begin");
+
+	if (tx->stage_callback &&
+			(tx->stage == DAV_TX_STAGE_ONCOMMIT ||
+			 tx->stage == DAV_TX_STAGE_ONABORT)) {
+		tx->stage = DAV_TX_STAGE_FINALLY;
+		obj_tx_callback(tx);
+	}
+
+	struct tx_data *txd = DAV_SLIST_FIRST(&tx->tx_entries);
+
+	DAV_SLIST_REMOVE_HEAD(&tx->tx_entries, tx_entry);
+
+	D_FREE(txd);
+
+	VALGRIND_END_TX;
+	int ret = tx->last_errnum;
+
+	if (DAV_SLIST_EMPTY(&tx->tx_entries)) {
+		dav_obj_t *pop = tx->pop;
+		dav_tx_callback cb = tx->stage_callback;
+		void *arg = tx->stage_callback_arg;
+		int rc;
+
+		DAV_DBG("");
+		ASSERT(pop);
+		tx->pop = NULL;
+		tx->stage = DAV_TX_STAGE_NONE;
+		tx->stage_callback = NULL;
+		tx->stage_callback_arg = NULL;
+
+		VEC_DELETE(&tx->actions);
+		/* tx should not be accessed after this */
+
+		/* commit to WAL */
+		rc = lw_tx_end(pop, data);
+		/* TODO: Handle WAL commit errors */
+		D_ASSERT(rc == 0);
+
+		if (cb)
+			cb(pop, DAV_TX_STAGE_NONE, arg);
+	} else {
+		/* resume the next transaction */
+		tx->stage = DAV_TX_STAGE_WORK;
+
+		/* abort called within inner transaction, waterfall the error */
+		if (tx->last_errnum)
+			obj_tx_abort(tx->last_errnum, 0);
+	}
+
+	return ret;
+}
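
For reference, a minimal sketch of the expected call sequence for the transaction
API above; the pool handle, heap offset, length and WAL commit data are
placeholders assumed to be valid:

static int
tx_usage_sketch(dav_obj_t *pop, uint64_t off, size_t len, void *wal_data)
{
	int rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE);

	if (rc != 0)
		return rc;

	if (dav_tx_add_range_v2(off, len) == 0) {
		/* ... modify the snapshotted range here ... */
		dav_tx_commit_v2();
	}
	/* on failure, dav_tx_add_range_v2 has already aborted the transaction */

	/* publishes on commit or rolls back on abort; returns the last errno */
	return dav_tx_end_v2(wal_data);
}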
+
+/*
+ * vg_verify_initialized -- when executed under Valgrind verifies that
+ *   the buffer has been initialized; explicit check at snapshotting time,
+ *   because Valgrind may find it much later when it's impossible to tell
+ *   for which snapshot it triggered
+ */
+static void
+vg_verify_initialized(dav_obj_t *pop, const struct tx_range_def *def)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(pop, def);
+#if VG_MEMCHECK_ENABLED
+	if (!On_memcheck)
+		return;
+
+	VALGRIND_DO_DISABLE_ERROR_REPORTING;
+	char *start = OBJ_OFF_TO_PTR(pop, def->offset);
+	char *uninit = (char *)VALGRIND_CHECK_MEM_IS_DEFINED(start, def->size);
+
+	if (uninit) {
+		VALGRIND_PRINTF(
+			"Snapshotting uninitialized data in range <%p,%p> (<offset:0x%lx,size:0x%lx>)\n",
+			start, start + def->size, def->offset, def->size);
+
+		if (uninit != start)
+			VALGRIND_PRINTF("Uninitialized data starts at: %p\n",
+					uninit);
+
+		VALGRIND_DO_ENABLE_ERROR_REPORTING;
+		VALGRIND_CHECK_MEM_IS_DEFINED(start, def->size);
+	} else {
+		VALGRIND_DO_ENABLE_ERROR_REPORTING;
+	}
+#endif
+}
+
+/*
+ * dav_tx_add_snapshot -- (internal) creates a variably sized snapshot
+ */
+static int
+dav_tx_add_snapshot(struct tx *tx, struct tx_range_def *snapshot)
+{
+	/*
+	 * Depending on the size of the block, either allocate an
+	 * entire new object or use cache.
+	 */
+	void *ptr = OBJ_OFF_TO_PTR(tx->pop, snapshot->offset);
+
+	VALGRIND_ADD_TO_TX(ptr, snapshot->size);
+
+	/* do nothing */
+	if (snapshot->flags & DAV_XADD_NO_SNAPSHOT)
+		return 0;
+
+	if (!(snapshot->flags & DAV_XADD_ASSUME_INITIALIZED))
+		vg_verify_initialized(tx->pop, snapshot);
+
+	/*
+	 * If we are creating the first snapshot, set up a redo log action to
+	 * increment the generation counter in the undo log, so that the log becomes
+	 * invalid once the redo log is processed.
+	 */
+	if (tx->first_snapshot) {
+		struct dav_action *action = tx_action_add(tx);
+
+		if (action == NULL)
+			return -1;
+
+		uint64_t *n = &tx->pop->clogs.undo.gen_num;
+
+		palloc_set_value(tx->pop->do_heap, action,
+			n, *n + 1);
+
+		tx->first_snapshot = 0;
+	}
+
+	return operation_add_buffer(tx->pop->undo, ptr, ptr, snapshot->size,
+		ULOG_OPERATION_BUF_CPY);
+}
+
+/*
+ * dav_tx_merge_flags -- (internal) common code for merging flags between
+ * two ranges to ensure resultant behavior is correct
+ */
+static void
+dav_tx_merge_flags(struct tx_range_def *dest, struct tx_range_def *merged)
+{
+	/*
+	 * DAV_XADD_NO_FLUSH should only be set in merged range if set in
+	 * both ranges
+	 */
+	if ((dest->flags & DAV_XADD_NO_FLUSH) &&
+				!(merged->flags & DAV_XADD_NO_FLUSH)) {
+		dest->flags = dest->flags & (~DAV_XADD_NO_FLUSH);
+	}
+
+	/*
+	 * Extend DAV_XADD_WAL_CPTR when merged.
+	 * REVISIT: Ideally merge should happen only if address ranges
+	 * overlap. Current code merges adjacent ranges even if only one
+	 * of them has this flag set. Fix this before closing DAOS-11049.
+	 */
+	if (merged->flags & DAV_XADD_WAL_CPTR)
+		dest->flags = dest->flags | DAV_XADD_WAL_CPTR;
+}
+
+/*
+ * dav_tx_add_common -- (internal) common code for adding persistent memory
+ * into the transaction
+ */
+static int
+dav_tx_add_common(struct tx *tx, struct tx_range_def *args)
+{
+	if (args->size > DAV_MAX_ALLOC_SIZE) {
+		ERR("snapshot size too large");
+		return obj_tx_fail_err(EINVAL, args->flags);
+	}
+
+	if (!OBJ_OFFRANGE_FROM_HEAP(tx->pop, args->offset, (args->offset + args->size))) {
+		ERR("object outside of heap");
+		return obj_tx_fail_err(EINVAL, args->flags);
+	}
+
+	int ret = 0;
+
+	/*
+	 * Search existing ranges backwards starting from the end of the
+	 * snapshot.
+	 */
+	struct tx_range_def r = *args;
+
+	DAV_DBG("(%lu,%lu) size=%zu", r.offset / 4096, r.offset % 4096, r.size);
+	struct tx_range_def search = {0, 0, 0};
+	/*
+	 * If the range is directly adjacent to an existing one,
+	 * they can be merged, so search for less or equal elements.
+	 */
+	enum ravl_predicate p = RAVL_PREDICATE_LESS_EQUAL;
+	struct ravl_node *nprev = NULL;
+
+	while (r.size != 0) {
+		search.offset = r.offset + r.size;
+		struct ravl_node *n = ravl_find(tx->ranges, &search, p);
+		/*
+		 * We have to skip searching for LESS_EQUAL because
+		 * the snapshot we would find is the one that was just
+		 * created.
+		 */
+		p = RAVL_PREDICATE_LESS;
+
+		struct tx_range_def *f = n ? ravl_data(n) : NULL;
+
+		size_t fend = f == NULL ? 0 : f->offset + f->size;
+		size_t rend = r.offset + r.size;
+
+		if (fend == 0 || fend < r.offset) {
+			 * If no range was found or the found range is not
+			 * If found no range or the found range is not
+			 * overlapping or adjacent on the left side, we can just
+			 * create the entire r.offset + r.size snapshot.
+			 *
+			 * Snapshot:
+			 *	--+-
+			 * Existing ranges:
+			 *	---- (no ranges)
+			 * or	+--- (no overlap)
+			 * or	---+ (adjacent on the right side)
+			 */
+			if (nprev != NULL) {
+				/*
+				 * But, if we have an existing adjacent snapshot
+				 * on the right side, we can just extend it to
+				 * include the desired range.
+				 */
+				struct tx_range_def *fprev = ravl_data(nprev);
+
+				ASSERTeq(rend, fprev->offset);
+				fprev->offset -= r.size;
+				fprev->size += r.size;
+			} else {
+				/*
+				 * If we don't have anything adjacent, create
+				 * a new range in the tree.
+				 */
+				ret = tx_ranges_insert_def(tx->pop,
+					tx, &r);
+				if (ret != 0)
+					break;
+			}
+			ret = dav_tx_add_snapshot(tx, &r);
+			break;
+		} else if (fend <= rend) {
+			/*
+			 * If found range has its end inside of the desired
+			 * snapshot range, we can extend the found range by the
+			 * size leftover on the left side.
+			 *
+			 * Snapshot:
+			 *	--+++--
+			 * Existing ranges:
+			 *	+++---- (overlap on left)
+			 * or	---+--- (found snapshot is inside)
+			 * or	---+-++ (inside, and adjacent on the right)
+			 * or	+++++-- (desired snapshot is inside)
+			 *
+			 */
+			struct tx_range_def snapshot = *args;
+
+			snapshot.offset = fend;
+			/* the side not yet covered by an existing snapshot */
+			snapshot.size = rend - fend;
+
+			/* the number of bytes intersecting in both ranges */
+			size_t intersection = fend - MAX(f->offset, r.offset);
+
+			r.size -= intersection + snapshot.size;
+			f->size += snapshot.size;
+			dav_tx_merge_flags(f, args);
+
+			if (snapshot.size != 0) {
+				ret = dav_tx_add_snapshot(tx, &snapshot);
+				if (ret != 0)
+					break;
+			}
+
+			/*
+			 * If there's a snapshot adjacent on right side, merge
+			 * the two ranges together.
+			 */
+			if (nprev != NULL) {
+				struct tx_range_def *fprev = ravl_data(nprev);
+
+				ASSERTeq(rend, fprev->offset);
+				f->size += fprev->size;
+				dav_tx_merge_flags(f, fprev);
+				ravl_remove(tx->ranges, nprev);
+			}
+		} else if (fend >= r.offset) {
+			/*
+			 * If found range has its end extending beyond the
+			 * desired snapshot.
+			 *
+			 * Snapshot:
+			 *	--+++--
+			 * Existing ranges:
+			 *	-----++ (adjacent on the right)
+			 * or	----++- (overlapping on the right)
+			 * or	----+++ (overlapping and adjacent on the right)
+			 * or	--+++++ (desired snapshot is inside)
+			 *
+			 * Notice that we cannot create a snapshot based solely
+			 * on this information without risking overwriting an
+			 * existing one. We have to continue iterating, but we
+			 * keep the information about adjacent snapshots in the
+			 * nprev variable.
+			 */
+			size_t overlap = rend - MAX(f->offset, r.offset);
+
+			r.size -= overlap;
+			dav_tx_merge_flags(f, args);
+		} else {
+			ASSERT(0);
+		}
+
+		nprev = n;
+	}
+
+	if (ret != 0) {
+		DAV_DBG("out of memory\n");
+		return obj_tx_fail_err(ENOMEM, args->flags);
+	}
+
+	return 0;
+}
+
+/*
+ * dav_tx_add_range_direct -- adds persistent memory range into the
+ *					transaction
+ */
+DAV_FUNC_EXPORT int
+dav_tx_add_range_direct_v2(const void *ptr, size_t size)
+{
+	DAV_API_START();
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+	ASSERT(tx->pop != NULL);
+
+	int ret;
+
+	uint64_t flags = tx_abort_on_failure_flag(tx);
+
+	if (!OBJ_PTR_FROM_POOL(tx->pop, ptr)) {
+		ERR("object outside of pool");
+		ret = obj_tx_fail_err(EINVAL, flags);
+		DAV_API_END();
+		return ret;
+	}
+
+	struct tx_range_def args = {
+		.offset = OBJ_PTR_TO_OFF(tx->pop, ptr),
+		.size = size,
+		.flags = flags,
+	};
+
+	ret = dav_tx_add_common(tx, &args);
+
+	DAV_API_END();
+	return ret;
+}
+
+/*
+ * dav_tx_xadd_range_direct -- adds persistent memory range into the
+ *					transaction
+ */
+DAV_FUNC_EXPORT int
+dav_tx_xadd_range_direct_v2(const void *ptr, size_t size, uint64_t flags)
+{
+
+	DAV_API_START();
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	int ret;
+	uint64_t off;
+
+	flags |= tx_abort_on_failure_flag(tx);
+
+	if (flags & ~DAV_XADD_VALID_FLAGS) {
+		ERR("unknown flags 0x%" PRIx64, flags
+			& ~DAV_XADD_VALID_FLAGS);
+		ret = obj_tx_fail_err(EINVAL, flags);
+		DAV_API_END();
+		return ret;
+	}
+
+	if (!OBJ_PTR_FROM_POOL(tx->pop, ptr)) {
+		ERR("object outside of pool");
+		ret = obj_tx_fail_err(EINVAL, flags);
+		DAV_API_END();
+		return ret;
+	}
+
+	off = OBJ_PTR_TO_OFF(tx->pop, ptr);
+	struct tx_range_def args = {
+		.offset = off,
+		.size = size,
+		.flags = flags,
+	};
+
+	ret = dav_tx_add_common(tx, &args);
+
+	DAV_API_END();
+	return ret;
+}
+
+/*
+ * dav_tx_add_range -- adds persistent memory range into the transaction
+ */
+DAV_FUNC_EXPORT int
+dav_tx_add_range_v2(uint64_t hoff, size_t size)
+{
+	DAV_API_START();
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	int ret;
+
+	uint64_t flags = tx_abort_on_failure_flag(tx);
+
+	ASSERT(OBJ_OFF_IS_VALID(tx->pop, hoff));
+
+	struct tx_range_def args = {
+		.offset = hoff,
+		.size = size,
+		.flags = flags,
+	};
+
+	ret = dav_tx_add_common(tx, &args);
+
+	DAV_API_END();
+	return ret;
+}
+
+/*
+ * dav_tx_xadd_range -- adds persistent memory range into the transaction
+ */
+DAV_FUNC_EXPORT int
+dav_tx_xadd_range_v2(uint64_t hoff, size_t size, uint64_t flags)
+{
+	DAV_API_START();
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	int ret;
+
+	flags |= tx_abort_on_failure_flag(tx);
+
+	if (flags & ~DAV_XADD_VALID_FLAGS) {
+		ERR("unknown flags 0x%" PRIx64, flags
+			& ~DAV_XADD_VALID_FLAGS);
+		ret = obj_tx_fail_err(EINVAL, flags);
+		DAV_API_END();
+		return ret;
+	}
+
+	ASSERT(OBJ_OFF_IS_VALID(tx->pop, hoff));
+
+	struct tx_range_def args = {
+		.offset = hoff,
+		.size = size,
+		.flags = flags,
+	};
+
+	ret = dav_tx_add_common(tx, &args);
+
+	DAV_API_END();
+	return ret;
+}
+
+/*
+ * dav_tx_alloc -- allocates a new object
+ */
+DAV_FUNC_EXPORT uint64_t
+dav_tx_alloc_v2(size_t size, uint64_t type_num, uint64_t flags)
+{
+	uint64_t off;
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	flags |= tx_abort_on_failure_flag(tx);
+
+	DAV_API_START();
+
+	if (size == 0) {
+		ERR("allocation with size 0");
+		off = obj_tx_fail_null(EINVAL, flags);
+		DAV_API_END();
+		return off;
+	}
+
+	if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) {
+		ERR("unknown flags 0x%" PRIx64, flags
+			& ~(DAV_TX_XALLOC_VALID_FLAGS));
+		off = obj_tx_fail_null(EINVAL, flags);
+		DAV_API_END();
+		return off;
+	}
+
+	off = tx_alloc_common(tx, size, (type_num_t)type_num,
+			constructor_tx_alloc, ALLOC_ARGS(flags));
+
+	DAV_API_END();
+	return off;
+}
+
+/*
+ * dav_tx_xfree -- frees an existing object, with no_abort option
+ */
+static int
+dav_tx_xfree(uint64_t off, uint64_t flags)
+{
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	flags |= tx_abort_on_failure_flag(tx);
+
+	if (flags & ~DAV_XFREE_VALID_FLAGS) {
+		ERR("unknown flags 0x%" PRIx64,
+				flags & ~DAV_XFREE_VALID_FLAGS);
+		return obj_tx_fail_err(EINVAL, flags);
+	}
+
+	if (off == 0)
+		return 0;
+
+	dav_obj_t *pop = tx->pop;
+
+	ASSERT(pop != NULL);
+	ASSERT(OBJ_OFF_IS_VALID(pop, off));
+
+	DAV_API_START();
+
+	struct dav_action *action;
+	uint64_t roff = palloc_get_realoffset(pop->do_heap, off);
+
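+	/*
+	 * Find the tracked snapshot range, if any, whose start is at or
+	 * before this allocation's real offset.
+	 */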
+	struct tx_range_def range = {roff, 0, 0};
+	struct ravl_node *n = ravl_find(tx->ranges, &range,
+			RAVL_PREDICATE_LESS_EQUAL);
+
+	/*
+	 * If attempting to free an object allocated within the same
+	 * transaction, simply cancel the alloc and remove it from the actions.
+	 */
+	if (n != NULL) {
+		struct tx_range_def *r = ravl_data(n);
+
+		if ((r->offset + r->size) < roff)
+			goto out;
+
+		VEC_FOREACH_BY_PTR(action, &tx->actions) {
+			if (action->type == DAV_ACTION_TYPE_HEAP &&
+			    action->heap.offset == off) {
+				void *ptr = OBJ_OFF_TO_PTR(pop, roff);
+				uint64_t toff, usize;
+
+				palloc_get_prange(action, &toff, &usize, 1);
+				D_ASSERT(usize <= r->size);
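+				/*
+				 * Drop the cancelled allocation from the
+				 * tracked range; it may match the range
+				 * exactly, sit at its start, or split it.
+				 */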
+				if ((r->offset == roff) && (r->size == usize)) {
+					/* Exact match. */
+					ravl_remove(tx->ranges, n);
+				} else if (r->offset == roff) {
+					/* Retain the right portion. */
+					r->offset += usize;
+					r->size   -= usize;
+				} else {
+					/* Retain the left portion. */
+					uint64_t osize = r->size;
+
+					r->size = roff - r->offset;
+
+					/* Still data after range remove. */
+					osize -= (r->size + usize);
+					if (osize) {
+						struct tx_range_def *r1 =
+							&(struct tx_range_def)
+							 {roff + usize, osize, r->flags};
+
+						tx_ranges_insert_def(pop, tx, r1);
+					}
+				}
+
+				VALGRIND_SET_CLEAN(ptr, usize);
+				VALGRIND_REMOVE_FROM_TX(ptr, usize);
+				palloc_cancel(pop->do_heap, action, 1);
+				VEC_ERASE_BY_PTR(&tx->actions, action);
+				DAV_API_END();
+				return 0;
+			}
+		}
+	}
+
+out:
+	action = tx_action_add(tx);
+	if (action == NULL) {
+		int ret = obj_tx_fail_err(errno, flags);
+
+		DAV_API_END();
+		return ret;
+	}
+
+	palloc_defer_free(pop->do_heap, off, action);
+
+	DAV_API_END();
+	return 0;
+}
+
+/*
+ * dav_tx_free -- frees an existing object
+ */
+DAV_FUNC_EXPORT int
+dav_tx_free_v2(uint64_t off)
+{
+	return dav_tx_xfree(off, 0);
+}
+
+DAV_FUNC_EXPORT void*
+dav_tx_off2ptr_v2(uint64_t off)
+{
+	struct tx *tx = get_tx();
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+	ASSERT(tx->pop != NULL);
+
+	ASSERT(OBJ_OFF_IS_VALID(tx->pop, off));
+	return (void *)OBJ_OFF_TO_PTR(tx->pop, off);
+}
+
+/* arguments for constructor_alloc */
+struct constr_args {
+	int zero_init;
+	dav_constr constructor;
+	void *arg;
+};
+
+/* arguments for constructor_alloc_root */
+struct carg_root {
+	size_t size;
+	dav_constr constructor;
+	void *arg;
+};
+
+/* arguments for constructor_realloc and constructor_zrealloc */
+struct carg_realloc {
+	void *ptr;
+	size_t old_size;
+	size_t new_size;
+	int zero_init;
+	type_num_t user_type;
+	dav_constr constructor;
+	void *arg;
+};
+
+/*
+ * constructor_zrealloc_root -- (internal) constructor for dav_root
+ */
+static int
+constructor_zrealloc_root(void *ctx, void *ptr, size_t usable_size, void *arg)
+{
+	dav_obj_t *pop = ctx;
+
+	DAV_DBG("pop %p ptr %p arg %p", pop, ptr, arg);
+
+	ASSERTne(ptr, NULL);
+	ASSERTne(arg, NULL);
+
+	VALGRIND_ADD_TO_TX(ptr, usable_size);
+
+	struct carg_realloc *carg = arg;
+
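+	/* zero-fill only the part by which the root object has grown */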
+	if (usable_size > carg->old_size) {
+		size_t grow_len = usable_size - carg->old_size;
+		void *new_data_ptr = (void *)((uintptr_t)ptr + carg->old_size);
+
+		mo_wal_memset(&pop->p_ops, new_data_ptr, 0, grow_len, 0);
+	}
+	int ret = 0;
+
+	if (carg->constructor)
+		ret = carg->constructor(pop, ptr, carg->arg);
+
+	VALGRIND_REMOVE_FROM_TX(ptr, usable_size);
+
+	return ret;
+}
+
+/*
+ * obj_alloc_root -- (internal) allocates or grows the root object
+ */
+static int
+obj_alloc_root(dav_obj_t *pop, size_t size)
+{
+	struct operation_context *ctx;
+	struct carg_realloc carg;
+
+	DAV_DBG("pop %p size %zu", pop, size);
+
+	carg.ptr = OBJ_OFF_TO_PTR(pop, pop->do_phdr->dp_root_offset);
+	carg.old_size = pop->do_phdr->dp_root_size;
+	carg.new_size = size;
+	carg.user_type = 0;
+	carg.constructor = NULL;
+	carg.zero_init = 1;
+	carg.arg = NULL;
+
+	lw_tx_begin(pop);
+	ctx = pop->external;
+	operation_start(ctx);
+
+	operation_add_entry(ctx, &pop->do_phdr->dp_root_size, size, ULOG_OPERATION_SET);
+
+	int ret = palloc_operation(pop->do_heap, pop->do_phdr->dp_root_offset,
+			&pop->do_phdr->dp_root_offset, size,
+			constructor_zrealloc_root, &carg,
+			0, 0, 0, 0, ctx); /* REVISIT: object_flags and type num ignored*/
+
+	lw_tx_end(pop, NULL);
+	return ret;
+}
+
+/*
+ * dav_root -- returns the offset of the root object, allocating or growing it if needed
+ */
+DAV_FUNC_EXPORT uint64_t
+dav_root_v2(dav_obj_t *pop, size_t size)
+{
+	DAV_DBG("pop %p size %zu", pop, size);
+
+	DAV_API_START();
+	if (size > DAV_MAX_ALLOC_SIZE) {
+		ERR("requested size too large");
+		errno = ENOMEM;
+		DAV_API_END();
+		return 0;
+	}
+
+	if (size == 0 && pop->do_phdr->dp_root_offset == 0) {
+		ERR("requested size cannot be zero");
+		errno = EINVAL;
+		DAV_API_END();
+		return 0;
+	}
+
+	/* REVISIT START
+	 * For thread safety the below block has to be protected by lock
+	 */
+	if (size > pop->do_phdr->dp_root_size &&
+			obj_alloc_root(pop, size)) {
+		ERR("dav_root failed");
+		DAV_API_END();
+		return 0;
+	}
+
+	/* REVISIT END */
+
+	DAV_API_END();
+	return pop->do_phdr->dp_root_offset;
+}
+
+/*
+ * constructor_alloc -- (internal) constructor for obj_alloc_construct
+ */
+static int
+constructor_alloc(void *ctx, void *ptr, size_t usable_size, void *arg)
+{
+	dav_obj_t *pop = ctx;
+
+	struct mo_ops *p_ops = &pop->p_ops;
+
+	DAV_DBG("pop %p ptr %p arg %p", pop, ptr, arg);
+
+	ASSERTne(ptr, NULL);
+	ASSERTne(arg, NULL);
+
+	struct constr_args *carg = arg;
+
+	if (carg->zero_init)
+		mo_wal_memset(p_ops, ptr, 0, usable_size, 0);
+
+	int ret = 0;
+
+	if (carg->constructor)
+		ret = carg->constructor(pop, ptr, carg->arg);
+
+	return ret;
+}
+
+/*
+ * obj_alloc_construct -- (internal) allocates a new object with constructor
+ */
+static int
+obj_alloc_construct(dav_obj_t *pop, uint64_t *offp, size_t size,
+	type_num_t type_num, uint64_t flags,
+	dav_constr constructor, void *arg)
+{
+	struct operation_context *ctx;
+	struct constr_args carg;
+
+	if (size > DAV_MAX_ALLOC_SIZE) {
+		ERR("requested size too large");
+		errno = ENOMEM;
+		return -1;
+	}
+
+	carg.zero_init = flags & DAV_FLAG_ZERO;
+	carg.constructor = constructor;
+	carg.arg = arg;
+
+	lw_tx_begin(pop);
+	ctx = pop->external;
+	operation_start(ctx);
+
+	int ret = palloc_operation(pop->do_heap, 0, offp, size, constructor_alloc, &carg, type_num,
+				   0, CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), ctx);
+
+	lw_tx_end(pop, NULL);
+	return ret;
+}
+
+/*
+ * dav_alloc -- allocates a new object
+ */
+DAV_FUNC_EXPORT int
+dav_alloc_v2(dav_obj_t *pop, uint64_t *offp, size_t size, uint64_t type_num, uint64_t flags,
+	   dav_constr constructor, void *arg)
+{
+	DAV_DBG(3, "pop %p offp %p size %zu type_num %llx flags %llx constructor %p arg %p", pop,
+		offp, size, (unsigned long long)type_num, (unsigned long long)flags, constructor,
+		arg);
+
+	if (size == 0) {
+		ERR("allocation with size 0");
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (flags & ~DAV_TX_XALLOC_VALID_FLAGS) {
+		ERR("unknown flags 0x%" PRIx64, flags & ~DAV_TX_XALLOC_VALID_FLAGS);
+		errno = EINVAL;
+		return -1;
+	}
+
+	DAV_API_START();
+	int ret = obj_alloc_construct(pop, offp, size, type_num, flags, constructor, arg);
+
+	DAV_API_END();
+	return ret;
+}
+
+/*
+ * dav_free -- frees an existing object
+ */
+DAV_FUNC_EXPORT void
+dav_free_v2(dav_obj_t *pop, uint64_t off)
+{
+	struct operation_context *ctx;
+
+	DAV_DBG("oid.off 0x%016" PRIx64, off);
+
+	if (off == 0)
+		return;
+
+	DAV_API_START();
+
+	ASSERTne(pop, NULL);
+	ASSERT(OBJ_OFF_IS_VALID(pop, off));
+	lw_tx_begin(pop);
+	ctx = pop->external;
+	operation_start(ctx);
+
+	palloc_operation(pop->do_heap, off, NULL, 0, NULL, NULL,
+			0, 0, 0, 0, ctx);
+
+	lw_tx_end(pop, NULL);
+	DAV_API_END();
+}
+
+/*
+ * dav_memcpy_persist -- dav version of memcpy
+ */
+DAV_FUNC_EXPORT void *
+dav_memcpy_persist_v2(dav_obj_t *pop, void *dest, const void *src,
+	size_t len)
+{
+	DAV_DBG("pop %p dest %p src %p len %zu", pop, dest, src, len);
+	D_ASSERT((dav_tx_stage_v2() == DAV_TX_STAGE_NONE));
+
+	DAV_API_START();
+	lw_tx_begin(pop);
+
+	void *ptr = mo_wal_memcpy(&pop->p_ops, dest, src, len, 0);
+
+	lw_tx_end(pop, NULL);
+	DAV_API_END();
+	return ptr;
+}
+
+/*
+ * dav_memcpy_persist_relaxed -- dav version of memcpy with deferred commit to blob.
+ */
+DAV_FUNC_EXPORT void *
+dav_memcpy_persist_relaxed_v2(dav_obj_t *pop, void *dest, const void *src,
+			   size_t len)
+{
+	DAV_DBG("pop %p dest %p src %p len %zu", pop, dest, src, len);
+	DAV_API_START();
+	if (pop->do_utx == NULL && dav_umem_wtx_new(pop) == NULL) {
+		DAV_API_END();
+		return NULL;
+	}
+
+	void *ptr = mo_wal_memcpy(&pop->p_ops, dest, src, len, 0);
+
+	DAV_API_END();
+	return ptr;
+}
+
+/*
+ * dav_reserve -- reserves a single object
+ */
+DAV_FUNC_EXPORT uint64_t
+dav_reserve_v2(dav_obj_t *pop, struct dav_action *act, size_t size, uint64_t type_num,
+		uint64_t flags)
+{
+	struct constr_args carg;
+
+	DAV_DBG(3, "pop %p act %p size %zu type_num %llx flags %llx", pop, act, size,
+		(unsigned long long)type_num, (unsigned long long)flags);
+
+	if (flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS) {
+		ERR("unknown flags 0x%" PRIx64, flags & ~DAV_ACTION_XRESERVE_VALID_FLAGS);
+		errno = EINVAL;
+		return 0;
+	}
+
+	DAV_API_START();
+
+	if (pop->do_utx == NULL && dav_umem_wtx_new(pop) == NULL) {
+		DAV_API_END();
+		return 0;
+	}
+
+	carg.zero_init   = flags & DAV_FLAG_ZERO;
+	carg.constructor = NULL;
+	carg.arg         = NULL;
+
+	if (palloc_reserve(pop->do_heap, size, constructor_alloc, &carg, type_num, 0,
+			   CLASS_ID_FROM_FLAG(flags), EZONE_ID_FROM_FLAG(flags), act) != 0) {
+		DAV_API_END();
+		return 0;
+	}
+
+	DAV_API_END();
+	return act->heap.offset;
+}
+
+/*
+ * dav_defer_free -- creates a deferred free action
+ */
+DAV_FUNC_EXPORT void
+dav_defer_free_v2(dav_obj_t *pop, uint64_t off, struct dav_action *act)
+{
+	ASSERT(off != 0);
+	ASSERT(OBJ_OFF_IS_VALID(pop, off));
+	palloc_defer_free(pop->do_heap, off, act);
+}
+
+#if 0
+/*
+ * dav_publish -- publishes a collection of actions
+ */
+int
+dav_publish(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt)
+{
+	DAV_API_START();
+	struct operation_context *ctx = pmalloc_operation_hold(pop);
+
+	size_t entries_size = actvcnt * sizeof(struct ulog_entry_val);
+
+	if (operation_reserve(ctx, entries_size) != 0) {
+		DAV_API_END();
+		return -1;
+	}
+
+	palloc_publish(&pop->do_heap, actv, actvcnt, ctx);
+
+	pmalloc_operation_release(pop);
+
+	DAV_API_END();
+	return 0;
+}
+#endif
+
+/*
+ * dav_cancel -- cancels collection of actions
+ */
+DAV_FUNC_EXPORT void
+dav_cancel_v2(dav_obj_t *pop, struct dav_action *actv, size_t actvcnt)
+{
+	DAV_DBG("actvcnt=%zu", actvcnt);
+	DAV_API_START();
+	palloc_cancel(pop->do_heap, actv, actvcnt);
+	DAV_API_END();
+}
+
+/*
+ * dav_tx_publish -- publishes actions inside of a transaction,
+ * with no_abort option
+ */
+DAV_FUNC_EXPORT int
+dav_tx_publish_v2(struct dav_action *actv, size_t actvcnt)
+{
+	struct tx *tx    = get_tx();
+	uint64_t   flags = 0;
+	uint64_t   off, size;
+	int        ret;
+
+	ASSERT_IN_TX(tx);
+	ASSERT_TX_STAGE_WORK(tx);
+
+	flags |= tx_abort_on_failure_flag(tx);
+
+	DAV_API_START();
+
+	if (tx_action_reserve(tx, actvcnt) != 0) {
+		ret = obj_tx_fail_err(ENOMEM, flags);
+
+		DAV_API_END();
+		return ret;
+	}
+
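+	/*
+	 * Track each published allocation as a transaction range; the
+	 * DAV_XADD_NO_SNAPSHOT flag skips the undo snapshot, as freshly
+	 * reserved memory has no prior contents worth preserving.
+	 */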
+	for (size_t i = 0; i < actvcnt; ++i) {
+		VEC_PUSH_BACK(&tx->actions, actv[i]);
+		if (palloc_action_isalloc(&actv[i])) {
+			palloc_get_prange(&actv[i], &off, &size, 1);
+			struct tx_range_def r = {off, size,
+						 DAV_XADD_NO_SNAPSHOT | DAV_XADD_WAL_CPTR};
+
+			ret = dav_tx_add_common(tx, &r);
+			D_ASSERT(ret == 0);
+		}
+	}
+
+	DAV_API_END();
+	return 0;
+}
+
+/*
+ * dav_get_zone_evictable -- Returns an evictable zone id that can be used for
+ * allocations. If there is no evictable zone with sufficient free space, then
+ * zero is returned, which maps to the non-evictable zone.
+ */
+DAV_FUNC_EXPORT uint32_t
+dav_get_zone_evictable_v2(dav_obj_t *pop, int flags)
+{
+	D_ASSERT(flags == 0);
+	/* REVISIT: TBD
+	 * Return evictable zone that is currently marked as in-use and has sufficient free space.
+	 * Else, find an evictable zone that has more than x% of free memory and mark it as in-use.
+	 */
+	return 0;
+}
diff --git a/src/common/dav_v2/tx.h b/src/common/dav_v2/tx.h
new file mode 100644
index 00000000000..ba1fca6fc93
--- /dev/null
+++ b/src/common/dav_v2/tx.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2016-2020, Intel Corporation */
+
+/*
+ * tx.h -- internal definitions for transactions
+ */
+
+#ifndef __DAOS_COMMON_INTERNAL_TX_H
+#define __DAOS_COMMON_INTERNAL_TX_H 1
+
+#include <stdint.h>
+
+#define TX_DEFAULT_RANGE_CACHE_SIZE (1 << 15)
+
+struct ulog_entry_base;
+struct mo_ops;
+/*
+ * tx_create_wal_entry -- convert to WAL a single ulog UNDO entry
+ */
+int tx_create_wal_entry(struct ulog_entry_base *e, void *arg, const struct mo_ops *p_ops);
+
+#endif
diff --git a/src/common/dav_v2/ulog.c b/src/common/dav_v2/ulog.c
new file mode 100644
index 00000000000..d04d2e6732a
--- /dev/null
+++ b/src/common/dav_v2/ulog.c
@@ -0,0 +1,695 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2022, Intel Corporation */
+
+/*
+ * ulog.c -- unified log implementation
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "dav_internal.h"
+#include "mo_wal.h"
+#include "ulog.h"
+#include "obj.h"
+#include "out.h"
+#include "valgrind_internal.h"
+
+/*
+ * Operation flag stored in the three most significant bits
+ */
+#define ULOG_OPERATION(op)		((uint64_t)(op))
+#define ULOG_OPERATION_MASK		((uint64_t)(0b111ULL << 61ULL))
+#define ULOG_OPERATION_FROM_OFFSET(off)	\
+	((ulog_operation_type) ((off) & ULOG_OPERATION_MASK))
+#define ULOG_OFFSET_MASK		(~(ULOG_OPERATION_MASK))
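+/*
+ * For example, an entry for pool offset 0x1000 with a BUF_CPY operation
+ * stores (0x1000 | ULOG_OPERATION_BUF_CPY) in its 'offset' field; the
+ * macros above split that word back into the operation and the offset.
+ */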
+
+#define CACHELINE_ALIGN(size) ALIGN_UP(size, CACHELINE_SIZE)
+#define IS_CACHELINE_ALIGNED(ptr)\
+	(((uintptr_t)(ptr) & (CACHELINE_SIZE - 1)) == 0)
+
+/*
+ * ulog_next -- retrieves the pointer to the next ulog
+ */
+struct ulog *
+ulog_next(struct ulog *ulog)
+{
+	return ulog->next;
+}
+
+/*
+ * ulog_entry_type -- returns the type of entry operation
+ */
+ulog_operation_type
+ulog_entry_type(const struct ulog_entry_base *entry)
+{
+	return ULOG_OPERATION_FROM_OFFSET(entry->offset);
+}
+
+/*
+ * ulog_entry_offset -- returns the offset stored in the entry
+ */
+uint64_t
+ulog_entry_offset(const struct ulog_entry_base *entry)
+{
+	return entry->offset & ULOG_OFFSET_MASK;
+}
+
+/*
+ * ulog_entry_size -- returns the size of a ulog entry
+ */
+size_t
+ulog_entry_size(const struct ulog_entry_base *entry)
+{
+	struct ulog_entry_buf *eb;
+
+	switch (ulog_entry_type(entry)) {
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+	case ULOG_OPERATION_AND:
+	case ULOG_OPERATION_OR:
+#else
+	case ULOG_OPERATION_CLR_BITS:
+	case ULOG_OPERATION_SET_BITS:
+#endif
+	case ULOG_OPERATION_SET:
+		return sizeof(struct ulog_entry_val);
+	case ULOG_OPERATION_BUF_SET:
+	case ULOG_OPERATION_BUF_CPY:
+		eb = (struct ulog_entry_buf *)entry;
+		return CACHELINE_ALIGN(
+			sizeof(struct ulog_entry_buf) + eb->size);
+	default:
+		ASSERT(0);
+	}
+
+	return 0;
+}
+
+/*
+ * ulog_entry_valid -- (internal) checks if a ulog entry is valid
+ * Returns 1 if the range is valid, otherwise 0 is returned.
+ */
+static int
+ulog_entry_valid(struct ulog *ulog, const struct ulog_entry_base *entry)
+{
+	if (entry->offset == 0)
+		return 0;
+
+	size_t size;
+	struct ulog_entry_buf *b;
+
+	switch (ulog_entry_type(entry)) {
+	case ULOG_OPERATION_BUF_CPY:
+	case ULOG_OPERATION_BUF_SET:
+		size = ulog_entry_size(entry);
+		b = (struct ulog_entry_buf *)entry;
+
+		uint64_t csum = util_checksum_compute(b, size,
+				&b->checksum, 0);
+		csum = util_checksum_seq(&ulog->gen_num,
+				sizeof(ulog->gen_num), csum);
+
+		if (b->checksum != csum)
+			return 0;
+		break;
+	default:
+		break;
+	}
+
+	return 1;
+}
+
+/*
+ * ulog_construct_new -- initializes the ulog structure
+ */
+void
+ulog_construct_new(struct ulog *ulog, size_t capacity, uint64_t gen_num, uint64_t flags)
+{
+	ASSERTne(ulog, NULL);
+
+	ulog->capacity = capacity;
+	ulog->checksum = 0;
+	ulog->next = 0;
+	ulog->gen_num = gen_num;
+	ulog->flags = flags;
+	memset(ulog->unused, 0, sizeof(ulog->unused));
+
+	/* we only need to zero out the header of ulog's first entry */
+	size_t zeroed_data = CACHELINE_ALIGN(sizeof(struct ulog_entry_base));
+	/*
+	 * We want to avoid replicating zeroes for every ulog of every
+	 * lane; to do that, we use plain old memset.
+	 */
+	memset(ulog->data, 0, zeroed_data);
+}
+
+/*
+ * ulog_foreach_entry -- iterates over every existing entry in the ulog
+ */
+int
+ulog_foreach_entry(struct ulog *ulog, ulog_entry_cb cb, void *arg, const struct mo_ops *ops)
+{
+	struct ulog_entry_base *e;
+	int ret = 0;
+
+	for (struct ulog *r = ulog; r != NULL; r = ulog_next(r)) {
+		for (size_t offset = 0; offset < r->capacity; ) {
+			e = (struct ulog_entry_base *)(r->data + offset);
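+			/*
+			 * A zeroed or otherwise invalid header marks the end
+			 * of the used portion of this ulog.
+			 */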
+			if (!ulog_entry_valid(ulog, e))
+				return ret;
+
+			ret = cb(e, arg, ops);
+			if (ret != 0)
+				return ret;
+
+			offset += ulog_entry_size(e);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * ulog_capacity -- (internal) returns the total capacity of the ulog
+ */
+size_t
+ulog_capacity(struct ulog *ulog, size_t ulog_base_bytes)
+{
+	size_t capacity = ulog_base_bytes;
+
+	ulog = ulog_next(ulog);
+	/* skip the first one, we count it in 'ulog_base_bytes' */
+	while (ulog != NULL) {
+		capacity += ulog->capacity;
+		ulog = ulog_next(ulog);
+	}
+
+	return capacity;
+}
+
+/*
+ * ulog_rebuild_next_vec -- rebuilds the vector of next entries
+ */
+void
+ulog_rebuild_next_vec(struct ulog *ulog, struct ulog_next *next)
+{
+	do {
+		if (ulog->next != 0)
+			VEC_PUSH_BACK(next, ulog->next);
+	} while ((ulog = ulog_next(ulog)) != NULL);
+}
+
+/*
+ * ulog_reserve -- reserves new capacity in the ulog
+ */
+int
+ulog_reserve(struct ulog *ulog,
+	size_t ulog_base_nbytes, size_t gen_num,
+	int auto_reserve, size_t *new_capacity,
+	ulog_extend_fn extend, struct ulog_next *next)
+{
+	if (!auto_reserve) {
+		D_CRIT("cannot auto reserve next ulog\n");
+		return -1;
+	}
+
+	size_t capacity = ulog_base_nbytes;
+
+	VEC_FOREACH(ulog, next) {
+		ASSERTne(ulog, NULL);
+		capacity += ulog->capacity;
+	}
+
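+	/* chain new ulog extensions until the requested capacity fits */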
+	while (capacity < *new_capacity) {
+		if (extend(&ulog->next, gen_num) != 0)
+			return -1;
+		VEC_PUSH_BACK(next, ulog->next);
+		ulog = ulog_next(ulog);
+		ASSERTne(ulog, NULL);
+
+		capacity += ulog->capacity;
+	}
+	*new_capacity = capacity;
+
+	return 0;
+}
+
+/*
+ * ulog_checksum -- (internal) calculates ulog checksum
+ */
+static int
+ulog_checksum(struct ulog *ulog, size_t ulog_base_bytes, int insert)
+{
+	return util_checksum(ulog, SIZEOF_ULOG(ulog_base_bytes),
+		&ulog->checksum, insert, 0);
+}
+
+/*
+ * ulog_entry_val_create -- creates a new log value entry in the ulog
+ *
+ * This function requires at least a cacheline of space to be available in the
+ * ulog.
+ */
+struct ulog_entry_val *
+ulog_entry_val_create(struct ulog *ulog, size_t offset, uint64_t *dest,
+		      uint64_t value, ulog_operation_type type, const struct mo_ops *p_ops)
+{
+	struct ulog_entry_val *e =
+		(struct ulog_entry_val *)(ulog->data + offset);
+
+	struct {
+		struct ulog_entry_val v;
+		struct ulog_entry_base zeroes;
+	} data;
+	COMPILE_ERROR_ON(sizeof(data) != sizeof(data.v) + sizeof(data.zeroes));
+
+	/*
+	 * Write a little bit more to the buffer so that the next entry that
+	 * resides in the log is erased. This will prevent leftovers from
+	 * a previous, clobbered, log from being incorrectly applied.
+	 */
+	data.zeroes.offset = 0;
+	data.v.base.offset = p_ops->base ? (uint64_t)(dest) -
+		(uint64_t)((dav_obj_t *)p_ops->base)->do_base :
+		(uint64_t)dest;
+	data.v.base.offset |= ULOG_OPERATION(type);
+	data.v.value = value;
+
+	memcpy(e, &data, sizeof(data));
+
+	return e;
+}
+
+/*
+ * ulog_clobber_entry -- zeroes out a single log entry header
+ */
+void
+ulog_clobber_entry(const struct ulog_entry_base *e)
+{
+	static const size_t aligned_entry_size =
+		CACHELINE_ALIGN(sizeof(struct ulog_entry_base));
+
+	memset((char *)e, 0, aligned_entry_size);
+}
+
+/*
+ * ulog_entry_buf_create -- atomically creates a buffer entry in the log
+ */
+struct ulog_entry_buf *
+ulog_entry_buf_create(struct ulog *ulog, size_t offset, uint64_t gen_num,
+		uint64_t *dest, const void *src, uint64_t size,
+		ulog_operation_type type, const struct mo_ops *p_ops)
+{
+	struct ulog_entry_buf *e =
+		(struct ulog_entry_buf *)(ulog->data + offset);
+
+	/*
+	 * Depending on the size of the source buffer, we might need to perform
+	 * up to three separate copies:
+	 *	1. The first cacheline, 24b of metadata and 40b of data
+	 * If there's still data to be logged:
+	 *	2. The entire remainder of the data aligned down to cacheline,
+	 *	for example, if there's 150b left, this step will copy only
+	 *	128b.
+	 * Now, we are left with between 0 to 63 bytes. If nonzero:
+	 *	3. Create a stack allocated cacheline-sized buffer, fill in the
+	 *	remainder of the data, and copy the entire cacheline.
+	 *
+	 * This is done so that we avoid a cache-miss on misaligned writes.
+	 */
+
+	struct ulog_entry_buf *b = alloca(CACHELINE_SIZE);
+
+	ASSERT(p_ops->base != NULL);
+	b->base.offset = (uint64_t)dest - (uint64_t)((dav_obj_t *)p_ops->base)->do_base;
+	b->base.offset |= ULOG_OPERATION(type);
+	b->size = size;
+	b->checksum = 0;
+
+	size_t bdatasize = CACHELINE_SIZE - sizeof(struct ulog_entry_buf);
+	size_t ncopy = MIN(size, bdatasize);
+
+	memcpy(b->data, src, ncopy);
+	memset(b->data + ncopy, 0, bdatasize - ncopy);
+
+	size_t remaining_size = ncopy > size ? 0 : size - ncopy;
+
+	char *srcof = (char *)src + ncopy;
+	size_t rcopy = ALIGN_DOWN(remaining_size, CACHELINE_SIZE);
+	size_t lcopy = remaining_size - rcopy;
+
+	uint8_t last_cacheline[CACHELINE_SIZE];
+
+	if (lcopy != 0) {
+		memcpy(last_cacheline, srcof + rcopy, lcopy);
+		memset(last_cacheline + lcopy, 0, CACHELINE_SIZE - lcopy);
+	}
+
+	if (rcopy != 0) {
+		void *rdest = e->data + ncopy;
+
+		ASSERT(IS_CACHELINE_ALIGNED(rdest));
+		memcpy(rdest, srcof, rcopy);
+	}
+
+	if (lcopy != 0) {
+		void *ldest = e->data + ncopy + rcopy;
+
+		ASSERT(IS_CACHELINE_ALIGNED(ldest));
+
+		memcpy(ldest, last_cacheline, CACHELINE_SIZE);
+	}
+
+	b->checksum = util_checksum_seq(b, CACHELINE_SIZE, 0);
+	if (rcopy != 0)
+		b->checksum = util_checksum_seq(srcof, rcopy, b->checksum);
+	if (lcopy != 0)
+		b->checksum = util_checksum_seq(last_cacheline,
+			CACHELINE_SIZE, b->checksum);
+
+	b->checksum = util_checksum_seq(&gen_num, sizeof(gen_num),
+			b->checksum);
+
+	ASSERT(IS_CACHELINE_ALIGNED(e));
+
+	memcpy(e, b, CACHELINE_SIZE);
+
+	/*
+	 * Allow having uninitialized data in the buffer - this requires marking
+	 * data as defined so that comparing checksums is not reported as an
+	 * error by memcheck.
+	 */
+	VALGRIND_DO_MAKE_MEM_DEFINED(e->data, ncopy + rcopy + lcopy);
+	VALGRIND_DO_MAKE_MEM_DEFINED(&e->checksum, sizeof(e->checksum));
+
+	ASSERT(ulog_entry_valid(ulog, &e->base));
+
+	return e;
+}
+
+/*
+ * ulog_entry_apply -- applies modifications of a single ulog entry
+ */
+void
+ulog_entry_apply(const struct ulog_entry_base *e, int persist,
+		 const struct mo_ops *p_ops)
+{
+	ulog_operation_type t = ulog_entry_type(e);
+	uint64_t offset = ulog_entry_offset(e);
+
+	size_t dst_size = sizeof(uint64_t);
+	uint64_t *dst = p_ops->base ?
+		(uint64_t *)((uintptr_t)((dav_obj_t *)p_ops->base)->do_base + offset) :
+		(uint64_t *)offset;
+
+	struct ulog_entry_val *ev;
+	struct ulog_entry_buf *eb;
+
+	uint16_t nbits;
+	uint32_t pos;
+	uint64_t bmask;
+
+	SUPPRESS_UNUSED(persist);
+
+	switch (t) {
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+	case ULOG_OPERATION_AND:
+		ev = (struct ulog_entry_val *)e;
+
+		VALGRIND_ADD_TO_TX(dst, dst_size);
+		*dst &= ev->value;
+		break;
+	case ULOG_OPERATION_OR:
+		ev = (struct ulog_entry_val *)e;
+
+		VALGRIND_ADD_TO_TX(dst, dst_size);
+		*dst |= ev->value;
+		break;
+#else
+	case ULOG_OPERATION_CLR_BITS:
+		ev = (struct ulog_entry_val *)e;
+		pos = ULOG_ENTRY_VAL_TO_POS(ev->value);
+		nbits = ULOG_ENTRY_VAL_TO_BITS(ev->value);
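+		/*
+		 * Build a mask of 'nbits' ones starting at bit 'pos'; the
+		 * full-width case is handled separately to avoid an
+		 * out-of-range shift.
+		 */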
+		if (nbits == RUN_BITS_PER_VALUE)
+			bmask = UINT64_MAX;
+		else
+			bmask = ((1ULL << nbits) - 1ULL) << pos;
+
+		VALGRIND_ADD_TO_TX(dst, dst_size);
+		*dst &= ~bmask;
+		break;
+	case ULOG_OPERATION_SET_BITS:
+		ev = (struct ulog_entry_val *)e;
+		pos = ULOG_ENTRY_VAL_TO_POS(ev->value);
+		nbits = ULOG_ENTRY_VAL_TO_BITS(ev->value);
+		if (nbits == RUN_BITS_PER_VALUE)
+			bmask = UINT64_MAX;
+		else
+			bmask = ((1ULL << nbits) - 1ULL) << pos;
+
+		VALGRIND_ADD_TO_TX(dst, dst_size);
+		*dst |= bmask;
+		break;
+#endif
+	case ULOG_OPERATION_SET:
+		ev = (struct ulog_entry_val *)e;
+
+		VALGRIND_ADD_TO_TX(dst, dst_size);
+		*dst = ev->value;
+		break;
+	case ULOG_OPERATION_BUF_CPY:
+		eb = (struct ulog_entry_buf *)e;
+
+		dst_size = eb->size;
+		VALGRIND_ADD_TO_TX(dst, dst_size);
+		mo_wal_memcpy(p_ops, dst, eb->data, eb->size, 0);
+		break;
+	case ULOG_OPERATION_BUF_SET:
+	default:
+		ASSERT(0);
+	}
+	VALGRIND_REMOVE_FROM_TX(dst, dst_size);
+}
+
+/*
+ * ulog_process_entry -- (internal) processes a single ulog entry
+ */
+static int
+ulog_process_entry(struct ulog_entry_base *e, void *arg,
+		   const struct mo_ops *p_ops)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(arg);
+
+	ulog_entry_apply(e, 0, p_ops);
+
+	return 0;
+}
+/*
+ * ulog_inc_gen_num -- (internal) increments gen num in the ulog
+ */
+static void
+ulog_inc_gen_num(struct ulog *ulog)
+{
+	ulog->gen_num++;
+}
+
+/*
+ * ulog_free_next -- free all ulogs starting from the indicated one.
+ * Returns 1 if any ulogs have been freed or unpinned, 0 otherwise.
+ */
+int
+ulog_free_next(struct ulog *u, ulog_free_fn ulog_free)
+{
+	int ret = 0;
+
+	if (u == NULL)
+		return ret;
+
+	VEC(, struct ulog **) ulogs_internal_except_first;
+	VEC_INIT(&ulogs_internal_except_first);
+
+	while (u->next != 0) {
+		if (VEC_PUSH_BACK(&ulogs_internal_except_first,
+			&u->next) != 0) {
+			/* this is fine, it will just use more memory */
+			DAV_DBG("unable to free transaction logs memory");
+			goto out;
+		}
+		u = u->next;
+	}
+
+	/* free non-user defined logs */
+	struct ulog **ulog_ptr;
+
+	VEC_FOREACH_REVERSE(ulog_ptr, &ulogs_internal_except_first) {
+		ulog_free(*ulog_ptr);
+		*ulog_ptr = NULL;
+		ret = 1;
+	}
+
+out:
+	VEC_DELETE(&ulogs_internal_except_first);
+	return ret;
+}
+
+/*
+ * ulog_clobber -- zeroes the metadata of the ulog
+ */
+void
+ulog_clobber(struct ulog *dest, struct ulog_next *next)
+{
+	struct ulog empty;
+
+	memset(&empty, 0, sizeof(empty));
+
+	if (next != NULL)
+		empty.next = VEC_SIZE(next) == 0 ? 0 : VEC_FRONT(next);
+	else
+		empty.next = dest->next;
+
+	memcpy(dest, &empty, sizeof(empty));
+}
+
+/*
+ * ulog_clobber_data -- invalidates logs by bumping their generation number and frees the unneeded log extensions
+ */
+int
+ulog_clobber_data(struct ulog *ulog_first,
+	struct ulog_next *next, ulog_free_fn ulog_free,
+	unsigned flags)
+{
+	ASSERTne(ulog_first, NULL);
+
+	/* In case of abort we need to increment counter in the first ulog. */
+	if (flags & ULOG_INC_FIRST_GEN_NUM)
+		ulog_inc_gen_num(ulog_first);
+
+	/*
+	 * In the case of abort or commit, we are not going to free all ulogs,
+	 * but rather increment the generation number to be consistent in the
+	 * first two ulogs.
+	 */
+	struct ulog *ulog_second = VEC_SIZE(next) == 0 ? 0 : *VEC_GET(next, 0);
+
+	if (ulog_second && !(flags & ULOG_FREE_AFTER_FIRST))
+		/*
+		 * We want to keep gen_nums consistent between ulogs.
+		 * If the transaction commits successfully, we'll reuse the
+		 * second buffer (the third and subsequent ones will be freed
+		 * anyway). If the application crashes, we'll free the 2nd ulog
+		 * on recovery, which means we'll never read the gen_num of the
+		 * second ulog in case of an ungraceful shutdown.
+		 */
+		ulog_inc_gen_num(ulog_second);
+
+	struct ulog *u;
+
+	/*
+	 * To make sure that transaction logs do not occupy too
+	 * much space, all of them, except for the first one,
+	 * are freed at the end of the operation. The reasoning for
+	 * this is that pmalloc() is a relatively cheap operation for
+	 * transactions where many hundreds of kilobytes are being
+	 * snapshotted, and so, allocating and freeing the buffer for
+	 * each transaction is an acceptable overhead for the average
+	 * case.
+	 */
+	if (flags & ULOG_FREE_AFTER_FIRST)
+		u = ulog_first;
+	else
+		u = ulog_second;
+
+	if (u == NULL)
+		return 0;
+
+	return ulog_free_next(u, ulog_free);
+}
+
+/*
+ * ulog_process -- process ulog entries
+ */
+void
+ulog_process(struct ulog *ulog, ulog_check_offset_fn check,
+	     const struct mo_ops *p_ops)
+{
+	/* suppress unused-parameter errors */
+	SUPPRESS_UNUSED(check);
+
+#ifdef DAV_EXTRA_DEBUG
+	if (check)
+		ulog_check(ulog, check, p_ops);
+#endif
+
+	ulog_foreach_entry(ulog, ulog_process_entry, NULL, p_ops);
+	mo_wal_drain(p_ops);
+}
+
+/*
+ * ulog_base_nbytes -- (internal) counts the actual number of bytes
+ *	occupied by the ulog
+ */
+size_t
+ulog_base_nbytes(struct ulog *ulog)
+{
+	size_t offset = 0;
+	struct ulog_entry_base *e;
+
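+	/* entries are packed back to back; stop at the first invalid header */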
+	for (offset = 0; offset < ulog->capacity; ) {
+		e = (struct ulog_entry_base *)(ulog->data + offset);
+		if (!ulog_entry_valid(ulog, e))
+			break;
+
+		offset += ulog_entry_size(e);
+	}
+
+	return offset;
+}
+
+/*
+ * ulog_recovery_needed -- checks if the log needs recovery
+ */
+int
+ulog_recovery_needed(struct ulog *ulog, int verify_checksum)
+{
+	size_t nbytes = MIN(ulog_base_nbytes(ulog), ulog->capacity);
+
+	if (nbytes == 0)
+		return 0;
+
+	if (verify_checksum && !ulog_checksum(ulog, nbytes, 0))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * ulog_check_entry --
+ *	(internal) checks consistency of a single ulog entry
+ */
+static int
+ulog_check_entry(struct ulog_entry_base *e, void *arg, const struct mo_ops *p_ops)
+{
+	uint64_t offset = ulog_entry_offset(e);
+	ulog_check_offset_fn check = arg;
+
+	if (!check(p_ops->base, offset)) {
+		DAV_DBG("ulog %p invalid offset %" PRIu64,
+				e, e->offset);
+		return -1;
+	}
+
+	return offset == 0 ? -1 : 0;
+}
+
+/*
+ * ulog_check -- (internal) check consistency of ulog entries
+ */
+int
+ulog_check(struct ulog *ulog, ulog_check_offset_fn check, const struct mo_ops *p_ops)
+{
+	DAV_DBG("ulog %p", ulog);
+
+	return ulog_foreach_entry(ulog,
+			ulog_check_entry, check, p_ops);
+}
diff --git a/src/common/dav_v2/ulog.h b/src/common/dav_v2/ulog.h
new file mode 100644
index 00000000000..0873dfdeb64
--- /dev/null
+++ b/src/common/dav_v2/ulog.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2021, Intel Corporation */
+
+/*
+ * ulog.h -- unified log public interface
+ */
+
+#ifndef __DAOS_COMMON_ULOG_H
+#define __DAOS_COMMON_ULOG_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "util.h"
+#include "vec.h"
+#include "mo_wal.h"
+
+struct ulog_entry_base {
+	uint64_t offset; /* offset with operation type flag */
+};
+
+/*
+ * ulog_entry_val -- log entry
+ */
+struct ulog_entry_val {
+	struct ulog_entry_base base;
+	uint64_t value; /* value to be applied */
+};
+
+/*
+ * ulog_entry_buf - ulog buffer entry
+ */
+struct ulog_entry_buf {
+	struct ulog_entry_base base; /* offset with operation type flag */
+	uint64_t checksum; /* checksum of the entire log entry */
+	uint64_t size; /* size of the buffer to be modified */
+	uint8_t data[]; /* content to fill in */
+};
+
+#define ULOG_UNUSED ((CACHELINE_SIZE - 40) / 8)
+/*
+ * This structure *must* be located at a cacheline boundary. To achieve this,
+ * the next field is always allocated with extra padding, and then the offset
+ * is additionally aligned.
+ */
+#define ULOG(capacity_bytes) {\
+	/* 64 bytes of metadata */\
+	uint64_t checksum; /* checksum of ulog header and its entries */\
+	struct ulog *next; /* offset of ulog extension */\
+	uint64_t capacity; /* capacity of this ulog in bytes */\
+	uint64_t gen_num; /* generation counter */\
+	uint64_t flags; /* ulog flags */\
+	uint64_t unused[ULOG_UNUSED]; /* must be 0 */\
+	uint8_t data[capacity_bytes]; /* N bytes of data */\
+}
+
+#define SIZEOF_ULOG(base_capacity)\
+(sizeof(struct ulog) + base_capacity)
+
+/*
+ * Ulog buffer allocated by the user must be marked by this flag.
+ * It is important to not free it at the end:
+ * what user has allocated - user should free himself.
+ */
+#define ULOG_USER_OWNED (1U << 0)
+
+/* use this for allocations of aligned ulog extensions */
+#define SIZEOF_ALIGNED_ULOG(base_capacity)\
+ALIGN_UP(SIZEOF_ULOG(base_capacity + (2 * CACHELINE_SIZE)), CACHELINE_SIZE)
+
+struct ulog ULOG(0);
+
+VEC(ulog_next, struct ulog *);
+
+typedef uint64_t ulog_operation_type;
+
+#define ULOG_OPERATION_SET		(0b000ULL << 61ULL)
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+#define ULOG_OPERATION_AND		(0b001ULL << 61ULL)
+#define ULOG_OPERATION_OR		(0b010ULL << 61ULL)
+#else
+#define ULOG_OPERATION_CLR_BITS		(0b001ULL << 61ULL)
+#define ULOG_OPERATION_SET_BITS		(0b010ULL << 61ULL)
+#endif
+#define ULOG_OPERATION_BUF_SET		(0b101ULL << 61ULL)
+#define ULOG_OPERATION_BUF_CPY		(0b110ULL << 61ULL)
+
+#ifdef	WAL_SUPPORTS_AND_OR_OPS
+#define	ULOG_ENTRY_IS_BIT_OP(opc)	((opc == ULOG_OPERATION_AND) || \
+					 (opc == ULOG_OPERATION_OR))
+#else
+#define	ULOG_ENTRY_IS_BIT_OP(opc)	((opc == ULOG_OPERATION_CLR_BITS) || \
+					 (opc == ULOG_OPERATION_SET_BITS))
+#define ULOG_ENTRY_OPS_POS		16 /* bits' pos at value:16 */
+#define ULOG_ENTRY_OPS_BITS_MASK	((1ULL << ULOG_ENTRY_OPS_POS) - 1)
+#define ULOG_ENTRY_VAL_TO_BITS(val)	((val) & ULOG_ENTRY_OPS_BITS_MASK)
+#define ULOG_ENTRY_VAL_TO_POS(val)	((val) >> ULOG_ENTRY_OPS_POS)
+#define ULOG_ENTRY_OPS_POS_MASK		(RUN_BITS_PER_VALUE - 1ULL)
+#define ULOG_ENTRY_TO_VAL(pos, nbits)	(((uint64_t)(nbits) & ULOG_ENTRY_OPS_BITS_MASK) | \
+					 ((pos) & ULOG_ENTRY_OPS_POS_MASK) << ULOG_ENTRY_OPS_POS)
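+/*
+ * For example, ULOG_ENTRY_TO_VAL(5, 3) packs "3 bits starting at bit 5" into
+ * a single value; ULOG_ENTRY_VAL_TO_POS() and ULOG_ENTRY_VAL_TO_BITS()
+ * recover the two fields when the entry is applied.
+ */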
+#endif
+
+/* immediately frees all associated ulog structures */
+#define ULOG_FREE_AFTER_FIRST (1U << 0)
+/* increments gen_num of the first, preallocated, ulog */
+#define ULOG_INC_FIRST_GEN_NUM (1U << 1)
+
+typedef int (*ulog_check_offset_fn)(void *ctx, uint64_t offset);
+typedef int (*ulog_extend_fn)(struct ulog **, uint64_t);
+typedef int (*ulog_entry_cb)(struct ulog_entry_base *e, void *arg,
+	const struct mo_ops *p_ops);
+typedef void (*ulog_free_fn)(struct ulog *ptr);
+
+struct ulog *ulog_next(struct ulog *ulog);
+
+void ulog_construct(uint64_t offset, size_t capacity, uint64_t gen_num,
+		    int flush, uint64_t flags, const struct mo_ops *p_ops);
+void ulog_construct_new(struct ulog *ulog, size_t capacity, uint64_t gen_num,
+			uint64_t flags);
+
+size_t ulog_capacity(struct ulog *ulog, size_t ulog_base_bytes);
+void ulog_rebuild_next_vec(struct ulog *ulog, struct ulog_next *next);
+
+int ulog_foreach_entry(struct ulog *ulog,
+		       ulog_entry_cb cb, void *arg, const struct mo_ops *ops);
+
+int ulog_reserve(struct ulog *ulog,
+		 size_t ulog_base_nbytes, size_t gen_num,
+		 int auto_reserve, size_t *new_capacity_bytes,
+		 ulog_extend_fn extend, struct ulog_next *next);
+
+int ulog_free_next(struct ulog *u, ulog_free_fn ulog_free);
+void ulog_clobber(struct ulog *dest, struct ulog_next *next);
+int ulog_clobber_data(struct ulog *dest,
+		      struct ulog_next *next, ulog_free_fn ulog_free, unsigned flags);
+void ulog_clobber_entry(const struct ulog_entry_base *e);
+
+void ulog_process(struct ulog *ulog, ulog_check_offset_fn check,
+		  const struct mo_ops *p_ops);
+
+size_t ulog_base_nbytes(struct ulog *ulog);
+int ulog_recovery_needed(struct ulog *ulog, int verify_checksum);
+
+uint64_t ulog_entry_offset(const struct ulog_entry_base *entry);
+ulog_operation_type ulog_entry_type(const struct ulog_entry_base *entry);
+
+struct ulog_entry_val *
+ulog_entry_val_create(struct ulog *ulog, size_t offset, uint64_t *dest, uint64_t value,
+		      ulog_operation_type type, const struct mo_ops *p_ops);
+
+struct ulog_entry_buf *
+ulog_entry_buf_create(struct ulog *ulog, size_t offset, uint64_t gen_num,
+		      uint64_t *dest, const void *src, uint64_t size,
+		      ulog_operation_type type, const struct mo_ops *p_ops);
+
+void ulog_entry_apply(const struct ulog_entry_base *e, int persist,
+		      const struct mo_ops *p_ops);
+
+size_t ulog_entry_size(const struct ulog_entry_base *entry);
+
+int ulog_check(struct ulog *ulog, ulog_check_offset_fn check,
+	       const struct mo_ops *p_ops);
+
+#endif /* __DAOS_COMMON_ULOG_H */
diff --git a/src/common/dav_v2/util.c b/src/common/dav_v2/util.c
new file mode 100644
index 00000000000..5ef73b0577d
--- /dev/null
+++ b/src/common/dav_v2/util.c
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2022, Intel Corporation */
+
+/*
+ * util.c -- very basic utilities
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <endian.h>
+
+#include "util.h"
+#include "valgrind_internal.h"
+
+
+#if ANY_VG_TOOL_ENABLED
+/* Initialized to true if the process is running inside Valgrind. */
+unsigned _On_valgrind;
+#endif
+
+#if VG_HELGRIND_ENABLED
+/* Initialized to true if the process is running inside Valgrind helgrind. */
+unsigned _On_helgrind;
+#endif
+
+#if VG_DRD_ENABLED
+/* Initialized to true if the process is running inside Valgrind drd. */
+unsigned _On_drd;
+#endif
+
+#if VG_HELGRIND_ENABLED || VG_DRD_ENABLED
+/* Initialized to true if the process is running inside Valgrind drd or hg. */
+unsigned _On_drd_or_hg;
+#endif
+
+#if VG_MEMCHECK_ENABLED
+/* Initialized to true if the process is running inside Valgrind memcheck. */
+unsigned _On_memcheck;
+#endif
+
+#if VG_TXINFO_ENABLED
+/* true if DAV API and TX-related messages have to be enabled in the Valgrind log. */
+int _Vg_txinfo_emit;
+#endif /* VG_TXINFO_ENABLED */
+
+/*
+ * util_is_zeroed -- check if given memory range is all zero
+ */
+int
+util_is_zeroed(const void *addr, size_t len)
+{
+	const char *a = addr;
+
+	if (len == 0)
+		return 1;
+
+	if (a[0] == 0 && memcmp(a, a + 1, len - 1) == 0)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * util_checksum_compute -- compute Fletcher64-like checksum
+ *
+ * csump points to where the checksum lives, so that location
+ * is treated as zeros while calculating the checksum. The
+ * checksummed data is assumed to be in little endian order.
+ */
+uint64_t
+util_checksum_compute(void *addr, size_t len, uint64_t *csump, size_t skip_off)
+{
+	if (len % 4 != 0)
+		abort();
+
+	uint32_t *p32 = addr;
+	uint32_t *p32end = (uint32_t *)((char *)addr + len);
+	uint32_t *skip;
+	uint32_t lo32 = 0;
+	uint32_t hi32 = 0;
+
+	if (skip_off)
+		skip = (uint32_t *)((char *)addr + skip_off);
+	else
+		skip = (uint32_t *)((char *)addr + len);
+
+	while (p32 < p32end)
+		if (p32 == (uint32_t *)csump || p32 >= skip) {
+			/* lo32 += 0; treat first 32-bits as zero */
+			p32++;
+			hi32 += lo32;
+			/* lo32 += 0; treat second 32-bits as zero */
+			p32++;
+			hi32 += lo32;
+		} else {
+			lo32 += le32toh(*p32);
+			++p32;
+			hi32 += lo32;
+		}
+
+	return (uint64_t)hi32 << 32 | lo32;
+}
+
+/*
+ * util_checksum -- compute Fletcher64-like checksum
+ *
+ * csump points to where the checksum lives, so that location
+ * is treated as zeros while calculating the checksum.
+ * If insert is true, the calculated checksum is inserted into
+ * the range at *csump.  Otherwise the calculated checksum is
+ * checked against *csump and the result returned (true means
+ * the range checksummed correctly).
+ */
+int
+util_checksum(void *addr, size_t len, uint64_t *csump,
+		int insert, size_t skip_off)
+{
+	uint64_t csum = util_checksum_compute(addr, len, csump, skip_off);
+
+	if (insert) {
+		*csump = htole64(csum);
+		return 1;
+	}
+
+	return *csump == htole64(csum);
+}
+
+/*
+ * util_checksum_seq -- compute sequential Fletcher64-like checksum
+ *
+ * Merges checksum from the old buffer with checksum for current buffer.
+ */
+uint64_t
+util_checksum_seq(const void *addr, size_t len, uint64_t csum)
+{
+	if (len % 4 != 0)
+		abort();
+	const uint32_t *p32 = addr;
+	const uint32_t *p32end = (const uint32_t *)((const char *)addr + len);
+	uint32_t lo32 = (uint32_t)csum;
+	uint32_t hi32 = (uint32_t)(csum >> 32);
+
+	while (p32 < p32end) {
+		lo32 += le32toh(*p32);
+		++p32;
+		hi32 += lo32;
+	}
+	return (uint64_t)hi32 << 32 | lo32;
+}
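+
+/*
+ * For example, ulog_entry_buf_create() chains util_checksum_seq() calls over
+ * the discontiguous pieces of a buffer entry (first cacheline, aligned
+ * middle, trailing cacheline, then the generation number) to produce a
+ * single checksum.
+ */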
+
+/*
+ * util_init -- initialize the utils
+ *
+ * This is called from the library initialization code.
+ */
+#if ANY_VG_TOOL_ENABLED
+__attribute__((constructor))
+static void
+_util_init(void)
+{
+	util_init();
+}
+#endif
+
+void
+util_init(void)
+{
+#if ANY_VG_TOOL_ENABLED
+	_On_valgrind = RUNNING_ON_VALGRIND;
+#endif
+
+#if VG_MEMCHECK_ENABLED
+	if (_On_valgrind) {
+		unsigned tmp;
+		unsigned result;
+		unsigned res = VALGRIND_GET_VBITS(&tmp, &result, sizeof(tmp));
+
+		_On_memcheck = res ? 1 : 0;
+	} else {
+		_On_memcheck = 0;
+	}
+#endif
+
+#if VG_DRD_ENABLED
+	if (_On_valgrind)
+		_On_drd = DRD_GET_DRD_THREADID ? 1 : 0;
+	else
+		_On_drd = 0;
+#endif
+
+#if VG_HELGRIND_ENABLED
+	if (_On_valgrind) {
+		unsigned tmp;
+		unsigned result;
+		/*
+		 * As of now (pmem-3.15) VALGRIND_HG_GET_ABITS is broken on
+		 * the upstream version of Helgrind headers. It generates
+		 * a sign-conversion error and actually returns UINT32_MAX-1
+		 * when not running under Helgrind.
+		 */
+		long res = VALGRIND_HG_GET_ABITS(&tmp, &result, sizeof(tmp));
+
+		_On_helgrind = res != -2 ? 1 : 0;
+	} else {
+		_On_helgrind = 0;
+	}
+#endif
+
+#if VG_DRD_ENABLED || VG_HELGRIND_ENABLED
+	_On_drd_or_hg = (unsigned)(On_helgrind + On_drd);
+#endif
+
+#if VG_TXINFO_ENABLED
+	if (_On_valgrind) {
+		char *txinfo_env = secure_getenv("D_DAV_VG_TXINFO");
+
+		if (txinfo_env)
+			_Vg_txinfo_emit = atoi(txinfo_env);
+	} else {
+		_Vg_txinfo_emit = 0;
+	}
+#endif
+}
diff --git a/src/common/dav_v2/util.h b/src/common/dav_v2/util.h
new file mode 100644
index 00000000000..f1e12321918
--- /dev/null
+++ b/src/common/dav_v2/util.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2021, Intel Corporation */
+/*
+ * Copyright (c) 2016-2020, Microsoft Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *
+ *     * Neither the name of the copyright holder nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * util.h -- internal definitions for util module
+ */
+
+#ifndef __DAOS_COMMON_UTIL_H
+#define __DAOS_COMMON_UTIL_H 1
+
+#include <string.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdatomic.h>
+#include <sys/param.h>
+
+#if defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || \
+	defined(__riscv)
+#define PAGESIZE 4096
+#elif defined(__PPC64__)
+#define PAGESIZE 65536
+#else
+#error unable to recognize ISA at compile time
+#endif
+
+#if defined(__x86_64) || defined(_M_X64) || defined(__aarch64__) || \
+	defined(__riscv)
+#define CACHELINE_SIZE 64ULL
+#elif defined(__PPC64__)
+#define CACHELINE_SIZE 128ULL
+#else
+#error unable to recognize architecture at compile time
+#endif
+
+#define ALIGN_UP(size, align) (((size) + (align) - 1) & ~((align) - 1))
+#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1))
+
+void util_init(void);
+int util_is_zeroed(const void *addr, size_t len);
+uint64_t util_checksum_compute(void *addr, size_t len, uint64_t *csump,
+		size_t skip_off);
+int util_checksum(void *addr, size_t len, uint64_t *csump,
+		int insert, size_t skip_off);
+uint64_t util_checksum_seq(const void *addr, size_t len, uint64_t csum);
+
+#define force_inline __attribute__((always_inline)) inline
+
+typedef uint64_t ua_uint64_t __attribute__((aligned(1)));
+typedef uint32_t ua_uint32_t __attribute__((aligned(1)));
+typedef uint16_t ua_uint16_t __attribute__((aligned(1)));
+
+/*
+ * util_div_ceil -- divides a by b and rounds up the result
+ */
+static force_inline unsigned
+util_div_ceil(unsigned a, unsigned b)
+{
+	return (unsigned)(((unsigned long)a + b - 1) / b);
+}
+
+/*
+ * util_bool_compare_and_swap -- perform an atomic compare and swap
+ * util_fetch_and_* -- perform an operation atomically, return old value
+ * util_popcount -- count number of set bits
+ * util_lssb_index -- return index of least significant set bit,
+ *			undefined on zero
+ * util_mssb_index -- return index of most significant set bit
+ *			undefined on zero
+ *
+ * XXX assertions needed on (value != 0) in both versions of bitscans
+ *
+ */
+
+/*
+ * ISO C11 -- 7.17.7.2 The atomic_load generic functions
+ * Integer width specific versions as supplement for:
+ *
+ *
+ * #include <stdatomic.h>
+ * C atomic_load(volatile A *object);
+ * C atomic_load_explicit(volatile A *object, memory_order order);
+ *
+ * The atomic_load interface doesn't return the loaded value, but instead
+ * copies it to a specified address.
+ *
+ * void util_atomic_load64(volatile A *object, A *destination);
+ * void util_atomic_load_explicit32(volatile A *object, A *destination,
+ *                                  memory_order order);
+ * void util_atomic_load_explicit64(volatile A *object, A *destination,
+ *                                  memory_order order);
+ * Also, instead of generic functions, two versions are available:
+ * for 32 bit fundamental integers, and for 64 bit ones.
+ */
+
+#define util_atomic_load_explicit32 __atomic_load
+#define util_atomic_load_explicit64 __atomic_load
+
+/*
+ * ISO C11 -- 7.17.7.1 The atomic_store generic functions
+ * Integer width specific versions as supplement for:
+ *
+ * #include <stdatomic.h>
+ * void atomic_store(volatile A *object, C desired);
+ * void atomic_store_explicit(volatile A *object, C desired,
+ *                            memory_order order);
+ */
+#define util_atomic_store_explicit32 __atomic_store_n
+#define util_atomic_store_explicit64 __atomic_store_n
+
+/*
+ * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html
+ * https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+ * https://clang.llvm.org/docs/LanguageExtensions.html#builtin-functions
+ */
+#define util_bool_compare_and_swap64 __sync_bool_compare_and_swap
+#define util_fetch_and_add64 __sync_fetch_and_add
+#define util_fetch_and_sub64 __sync_fetch_and_sub
+#define util_popcount64(value) ((unsigned char)__builtin_popcountll(value))
+
+#define util_lssb_index64(value) ((unsigned char)__builtin_ctzll(value))
+#define util_mssb_index64(value) ((unsigned char)(63 - __builtin_clzll(value)))
+
+/* ISO C11 -- 7.17.7 Operations on atomic types */
+#define util_atomic_load64(object, dest)\
+	util_atomic_load_explicit64(object, dest, memory_order_seq_cst)
+
+#define COMPILE_ERROR_ON(cond) ((void)sizeof(char[(cond) ? -1 : 1]))
+
+/* macro for counting the number of varargs (up to 9) */
+#define COUNT(...)\
+	COUNT_11TH(_, ##__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define COUNT_11TH(_11, _10, _9, _8, _7, _6, _5, _4, _3, _2,  X, ...) X
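+/* e.g. COUNT() expands to 0, COUNT(a) to 1, and COUNT(a, b, c) to 3 */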
+
+/* concatenation macro */
+#define GLUE(A, B) GLUE_I(A, B)
+#define GLUE_I(A, B) A##B
+
+/* macro for suppressing errors from unused variables (zero to 9) */
+#define SUPPRESS_UNUSED(...)\
+	GLUE(SUPPRESS_ARG_, COUNT(__VA_ARGS__))(__VA_ARGS__)
+#define SUPPRESS_ARG_0(X)
+#define SUPPRESS_ARG_1(X) ((void)(X))
+#define SUPPRESS_ARG_2(X, ...) do {\
+	SUPPRESS_ARG_1(X); SUPPRESS_ARG_1(__VA_ARGS__);\
+} while (0)
+#define SUPPRESS_ARG_3(X, ...) do {\
+	SUPPRESS_ARG_1(X); SUPPRESS_ARG_2(__VA_ARGS__);\
+} while (0)
+#define SUPPRESS_ARG_4(X, ...) do {\
+	SUPPRESS_ARG_1(X); SUPPRESS_ARG_3(__VA_ARGS__);\
+} while (0)
+#define SUPPRESS_ARG_5(X, ...) do {\
+	SUPPRESS_ARG_1(X); SUPPRESS_ARG_4(__VA_ARGS__);\
+} while (0)
+#define SUPPRESS_ARG_6(X, ...) do {\
+	SUPPRESS_ARG_1(X); SUPPRESS_ARG_5(__VA_ARGS__);\
+} while (0)
+#define SUPPRESS_ARG_7(X, ...) do {\
+	SUPPRESS_ARG_1(X); SUPPRESS_ARG_6(__VA_ARGS__);\
+} while (0)
+#define SUPPRESS_ARG_8(X, ...) do {\
+	SUPPRESS_ARG_1(X); SUPPRESS_ARG_7(__VA_ARGS__);\
+} while (0)
+#define SUPPRESS_ARG_9(X, ...) do {\
+	SUPPRESS_ARG_1(X); SUPPRESS_ARG_8(__VA_ARGS__);\
+} while (0)
+
+#endif /* __DAOS_COMMON_UTIL_H */
diff --git a/src/common/dav_v2/valgrind_internal.h b/src/common/dav_v2/valgrind_internal.h
new file mode 100644
index 00000000000..57253b9bac0
--- /dev/null
+++ b/src/common/dav_v2/valgrind_internal.h
@@ -0,0 +1,293 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2015-2021, Intel Corporation */
+
+/*
+ * valgrind_internal.h -- internal definitions for valgrind macros
+ */
+
+#ifndef __DAOS_COMMON_VALGRIND_INTERNAL_H
+#define __DAOS_COMMON_VALGRIND_INTERNAL_H 1
+
+#ifdef D_HAS_VALGRIND
+#if !defined(_WIN32) && !defined(__FreeBSD__) && !defined(__riscv)
+#define VG_TXINFO_ENABLED 1
+#define VG_HELGRIND_ENABLED 1
+#define VG_MEMCHECK_ENABLED 1
+#define VG_DRD_ENABLED 1
+#endif
+#endif
+
+#if VG_TXINFO_ENABLED || VG_HELGRIND_ENABLED || VG_MEMCHECK_ENABLED || \
+	VG_DRD_ENABLED
+#define ANY_VG_TOOL_ENABLED 1
+#else
+#define ANY_VG_TOOL_ENABLED 0
+#endif
+
+#if ANY_VG_TOOL_ENABLED
+extern unsigned _On_valgrind;
+#define On_valgrind __builtin_expect(_On_valgrind, 0)
+#include "valgrind/valgrind.h"
+#else
+#define On_valgrind (0)
+#endif
+
+#if VG_HELGRIND_ENABLED
+extern unsigned _On_helgrind;
+#define On_helgrind __builtin_expect(_On_helgrind, 0)
+#include "valgrind/helgrind.h"
+#else
+#define On_helgrind (0)
+#endif
+
+#if VG_DRD_ENABLED
+extern unsigned _On_drd;
+#define On_drd __builtin_expect(_On_drd, 0)
+#include "valgrind/drd.h"
+#else
+#define On_drd (0)
+#endif
+
+#if VG_HELGRIND_ENABLED || VG_DRD_ENABLED
+
+extern unsigned _On_drd_or_hg;
+#define On_drd_or_hg __builtin_expect(_On_drd_or_hg, 0)
+
+#define VALGRIND_ANNOTATE_HAPPENS_BEFORE(obj) do {\
+	if (On_drd_or_hg) \
+		ANNOTATE_HAPPENS_BEFORE((obj));\
+} while (0)
+
+#define VALGRIND_ANNOTATE_HAPPENS_AFTER(obj) do {\
+	if (On_drd_or_hg) \
+		ANNOTATE_HAPPENS_AFTER((obj));\
+} while (0)
+
+#define VALGRIND_ANNOTATE_NEW_MEMORY(addr, size) do {\
+	if (On_drd_or_hg) \
+		ANNOTATE_NEW_MEMORY((addr), (size));\
+} while (0)
+
+#define VALGRIND_ANNOTATE_IGNORE_READS_BEGIN() do {\
+	if (On_drd_or_hg) \
+	ANNOTATE_IGNORE_READS_BEGIN();\
+} while (0)
+
+#define VALGRIND_ANNOTATE_IGNORE_READS_END() do {\
+	if (On_drd_or_hg) \
+	ANNOTATE_IGNORE_READS_END();\
+} while (0)
+
+#define VALGRIND_ANNOTATE_IGNORE_WRITES_BEGIN() do {\
+	if (On_drd_or_hg) \
+	ANNOTATE_IGNORE_WRITES_BEGIN();\
+} while (0)
+
+#define VALGRIND_ANNOTATE_IGNORE_WRITES_END() do {\
+	if (On_drd_or_hg) \
+	ANNOTATE_IGNORE_WRITES_END();\
+} while (0)
+
+/* Supported by both helgrind and drd. */
+#define VALGRIND_HG_DRD_DISABLE_CHECKING(addr, size) do {\
+	if (On_drd_or_hg) \
+		VALGRIND_HG_DISABLE_CHECKING((addr), (size));\
+} while (0)
+
+#else
+
+#define On_drd_or_hg (0)
+
+#define VALGRIND_ANNOTATE_HAPPENS_BEFORE(obj) { (void)(obj); }
+
+#define VALGRIND_ANNOTATE_HAPPENS_AFTER(obj) { (void)(obj); }
+
+#define VALGRIND_ANNOTATE_NEW_MEMORY(addr, size) do {\
+	(void) (addr);\
+	(void) (size);\
+} while (0)
+
+#define VALGRIND_ANNOTATE_IGNORE_READS_BEGIN() do {} while (0)
+
+#define VALGRIND_ANNOTATE_IGNORE_READS_END() do {} while (0)
+
+#define VALGRIND_ANNOTATE_IGNORE_WRITES_BEGIN() do {} while (0)
+
+#define VALGRIND_ANNOTATE_IGNORE_WRITES_END() do {} while (0)
+
+#define VALGRIND_HG_DRD_DISABLE_CHECKING(addr, size) do {\
+	(void) (addr);\
+	(void) (size);\
+} while (0)
+
+#endif
+
+#if VG_TXINFO_ENABLED
+
+extern int _Vg_txinfo_emit;
+#define VG_txinfo_emit __builtin_expect(_Vg_txinfo_emit, 0)
+
+void util_emit_log(const char *func, int order);
+
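+/*
+ * Note: the transaction-range macros below are no-ops even when TXINFO is
+ * enabled; only the DAV_API_START/END logging is active in this branch.
+ */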
+#define VALGRIND_SET_CLEAN(addr, len) do {\
+	(void)(addr);\
+	(void)(len);\
+} while (0)
+
+#define VALGRIND_START_TX do {} while (0)
+
+#define VALGRIND_END_TX do {} while (0)
+
+#define VALGRIND_ADD_TO_TX(addr, len) do {\
+	(void) (addr);\
+	(void) (len);\
+} while (0)
+
+#define VALGRIND_REMOVE_FROM_TX(addr, len) do {\
+	(void) (addr);\
+	(void) (len);\
+} while (0)
+
+#define VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(addr, len) do {\
+	(void) (addr);\
+	(void) (len);\
+} while (0)
+
+/*
+ * Log the calling function name, with a BEGIN/END suffix, to the
+ * Valgrind log file.
+ */
+#define DAV_API_START() do {\
+	if (VG_txinfo_emit)\
+		VALGRIND_PRINTF("%s BEGIN\n", __func__);\
+} while (0)
+#define DAV_API_END() do {\
+	if (VG_txinfo_emit)\
+		VALGRIND_PRINTF("%s END\n", __func__);\
+} while (0)
+
+#else /* VG_TXINFO_ENABLED */
+
+#define VG_txinfo_emit (0)
+
+#define VALGRIND_SET_CLEAN(addr, len) do {\
+	(void) (addr);\
+	(void) (len);\
+} while (0)
+
+#define VALGRIND_START_TX do {} while (0)
+
+#define VALGRIND_END_TX do {} while (0)
+
+#define VALGRIND_ADD_TO_TX(addr, len) do {\
+	(void) (addr);\
+	(void) (len);\
+} while (0)
+
+#define VALGRIND_REMOVE_FROM_TX(addr, len) do {\
+	(void) (addr);\
+	(void) (len);\
+} while (0)
+
+#define VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(addr, len) do {\
+	(void) (addr);\
+	(void) (len);\
+} while (0)
+
+#define DAV_API_START() do {} while (0)
+
+#define DAV_API_END() do {} while (0)
+
+#endif /* VG_TXINFO_ENABLED */
+
+#if VG_MEMCHECK_ENABLED
+
+extern unsigned _On_memcheck;
+#define On_memcheck __builtin_expect(_On_memcheck, 0)
+
+#include "valgrind/memcheck.h"
+
+#define VALGRIND_DO_DISABLE_ERROR_REPORTING do {\
+	if (On_valgrind)\
+		VALGRIND_DISABLE_ERROR_REPORTING;\
+} while (0)
+
+#define VALGRIND_DO_ENABLE_ERROR_REPORTING do {\
+	if (On_valgrind)\
+		VALGRIND_ENABLE_ERROR_REPORTING;\
+} while (0)
+
+#define VALGRIND_DO_CREATE_MEMPOOL(heap, rzB, is_zeroed) do {\
+	if (On_memcheck)\
+		VALGRIND_CREATE_MEMPOOL(heap, rzB, is_zeroed);\
+} while (0)
+
+#define VALGRIND_DO_DESTROY_MEMPOOL(heap) do {\
+	if (On_memcheck)\
+		VALGRIND_DESTROY_MEMPOOL(heap);\
+} while (0)
+
+#define VALGRIND_DO_MEMPOOL_ALLOC(heap, addr, size) do {\
+	if (On_memcheck)\
+		VALGRIND_MEMPOOL_ALLOC(heap, addr, size);\
+} while (0)
+
+#define VALGRIND_DO_MEMPOOL_FREE(heap, addr) do {\
+	if (On_memcheck)\
+		VALGRIND_MEMPOOL_FREE(heap, addr);\
+} while (0)
+
+#define VALGRIND_DO_MAKE_MEM_DEFINED(addr, len) do {\
+	if (On_memcheck)\
+		VALGRIND_MAKE_MEM_DEFINED(addr, len);\
+} while (0)
+
+#define VALGRIND_DO_MAKE_MEM_UNDEFINED(addr, len) do {\
+	if (On_memcheck)\
+		VALGRIND_MAKE_MEM_UNDEFINED(addr, len);\
+} while (0)
+
+#define VALGRIND_DO_MAKE_MEM_NOACCESS(addr, len) do {\
+	if (On_memcheck)\
+		VALGRIND_MAKE_MEM_NOACCESS(addr, len);\
+} while (0)
+
+#define VALGRIND_DO_CHECK_MEM_IS_ADDRESSABLE(addr, len) do {\
+	if (On_memcheck)\
+		VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, len);\
+} while (0)
+
+#else /* VG_MEMCHECK_ENABLED */
+
+#define On_memcheck (0)
+
+#define VALGRIND_DO_DISABLE_ERROR_REPORTING do {} while (0)
+
+#define VALGRIND_DO_ENABLE_ERROR_REPORTING do {} while (0)
+
+#define VALGRIND_DO_CREATE_MEMPOOL(heap, rzB, is_zeroed)\
+	do { (void) (heap); (void) (rzB); (void) (is_zeroed); } while (0)
+
+#define VALGRIND_DO_DESTROY_MEMPOOL(heap) { (void) (heap); }
+
+#define VALGRIND_DO_MEMPOOL_ALLOC(heap, addr, size)\
+	do { (void) (heap); (void) (addr); (void) (size); } while (0)
+
+#define VALGRIND_DO_MEMPOOL_FREE(heap, addr)\
+	do { (void) (heap); (void) (addr); } while (0)
+
+#define VALGRIND_DO_MAKE_MEM_DEFINED(addr, len)\
+	do { (void) (addr); (void) (len); } while (0)
+
+#define VALGRIND_DO_MAKE_MEM_UNDEFINED(addr, len)\
+	do { (void) (addr); (void) (len); } while (0)
+
+#define VALGRIND_DO_MAKE_MEM_NOACCESS(addr, len)\
+	do { (void) (addr); (void) (len); } while (0)
+
+#define VALGRIND_DO_CHECK_MEM_IS_ADDRESSABLE(addr, len)\
+	do { (void) (addr); (void) (len); } while (0)
+
+#endif /* VG_MEMCHECK_ENABLED */
+
+#endif /* __DAOS_COMMON_VALGRIND_INTERNAL_H */
diff --git a/src/common/dav_v2/vec.h b/src/common/dav_v2/vec.h
new file mode 100644
index 00000000000..14bbe667687
--- /dev/null
+++ b/src/common/dav_v2/vec.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2017-2020, Intel Corporation */
+
+/*
+ * vec.h -- vector interface
+ */
+
+#ifndef __DAOS_COMMON_VEC_H
+#define __DAOS_COMMON_VEC_H 1
+
+#include <stddef.h>
+#include "valgrind_internal.h"
+#include "util.h"
+#include "out.h"
+
+#define VEC_INIT_SIZE (64)
+
+#define VEC(name, type)\
+struct name {\
+	type *buffer;\
+	size_t size;\
+	size_t capacity;\
+}
+
+#define VEC_INITIALIZER {NULL, 0, 0}
+
+#define VEC_INIT(vec) do {\
+	(vec)->buffer = NULL;\
+	(vec)->size = 0;\
+	(vec)->capacity = 0;\
+} while (0)
+
+#define VEC_MOVE(vecl, vecr) do {\
+	D_FREE((vecl)->buffer);\
+	(vecl)->buffer = (vecr)->buffer;\
+	(vecl)->size = (vecr)->size;\
+	(vecl)->capacity = (vecr)->capacity;\
+	(vecr)->buffer = NULL;\
+	(vecr)->size = 0;\
+	(vecr)->capacity = 0;\
+} while (0)
+
+#define VEC_REINIT(vec) do {\
+	VALGRIND_ANNOTATE_NEW_MEMORY((vec), sizeof(*vec));\
+	VALGRIND_ANNOTATE_NEW_MEMORY((vec)->buffer,\
+		(sizeof(*(vec)->buffer) * ((vec)->capacity)));\
+	(vec)->size = 0;\
+} while (0)
+
+static inline int
+vec_reserve(void *vec, size_t ncapacity, size_t s)
+{
+	void *tbuf;
+	size_t ncap = ncapacity == 0 ? VEC_INIT_SIZE : ncapacity;
+
+	VEC(vvec, void) *vecp = (struct vvec *)vec;
+
+	D_REALLOC_NZ(tbuf, vecp->buffer, s * ncap);
+	if (tbuf == NULL) {
+		D_CRIT("Realloc!\n");
+		return -1;
+	}
+	vecp->buffer = tbuf;
+	vecp->capacity = ncap;
+	return 0;
+}
+
+#define VEC_RESERVE(vec, ncapacity)\
+(((vec)->size == 0 || (ncapacity) > (vec)->size) ?\
+	vec_reserve((void *)vec, ncapacity, sizeof(*(vec)->buffer)) :\
+	0)
+
+#define VEC_POP_BACK(vec) ((vec)->size -= 1)
+
+#define VEC_FRONT(vec) ((vec)->buffer[0])
+
+#define VEC_BACK(vec) ((vec)->buffer[(vec)->size - 1])
+
+#define VEC_ERASE_BY_POS(vec, pos) do {\
+	if ((pos) != ((vec)->size - 1))\
+		(vec)->buffer[(pos)] = VEC_BACK(vec);\
+	VEC_POP_BACK(vec);\
+} while (0)
+
+#define VEC_ERASE_BY_PTR(vec, element) do {\
+	if ((element) != &VEC_BACK(vec))\
+		*(element) = VEC_BACK(vec);\
+	VEC_POP_BACK(vec);\
+} while (0)
+
+#define VEC_INSERT(vec, element)\
+((vec)->buffer[(vec)->size - 1] = (element), 0)
+
+#define VEC_INC_SIZE(vec)\
+(((vec)->size++), 0)
+
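+/* Bump the size by one, doubling the capacity first if the vector is full. */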
+#define VEC_INC_BACK(vec)\
+((vec)->capacity == (vec)->size ? \
+	(VEC_RESERVE((vec), ((vec)->capacity * 2)) == 0 ? \
+		VEC_INC_SIZE(vec) : -1) : \
+	VEC_INC_SIZE(vec))
+
+#define VEC_PUSH_BACK(vec, element)\
+(VEC_INC_BACK(vec) == 0 ? VEC_INSERT(vec, element) : -1)
+
+#define VEC_FOREACH(el, vec)\
+for (size_t _vec_i = 0;\
+	_vec_i < (vec)->size && (((el) = (vec)->buffer[_vec_i]), 1);\
+	++_vec_i)
+
+#define VEC_FOREACH_REVERSE(el, vec)\
+for (size_t _vec_i = ((vec)->size);\
+	_vec_i != 0 && (((el) = (vec)->buffer[_vec_i - 1]), 1);\
+	--_vec_i)
+
+#define VEC_FOREACH_BY_POS(elpos, vec)\
+for ((elpos) = 0; (elpos) < (vec)->size; ++(elpos))
+
+#define VEC_FOREACH_BY_PTR(el, vec)\
+for (size_t _vec_i = 0;\
+	_vec_i < (vec)->size && (((el) = &(vec)->buffer[_vec_i]), 1);\
+	++_vec_i)
+
+#define VEC_SIZE(vec)\
+((vec)->size)
+
+#define VEC_CAPACITY(vec)\
+((vec)->capacity)
+
+#define VEC_ARR(vec)\
+((vec)->buffer)
+
+#define VEC_GET(vec, id)\
+(&(vec)->buffer[id])
+
+#define VEC_CLEAR(vec) ((vec)->size = 0)
+
+#define VEC_DELETE(vec) do {\
+	D_FREE((vec)->buffer);\
+	(vec)->buffer = NULL;\
+	(vec)->size = 0;\
+	(vec)->capacity = 0;\
+} while (0)
+
+#endif /* __DAOS_COMMON_VEC_H */
diff --git a/src/common/dav_v2/vecq.h b/src/common/dav_v2/vecq.h
new file mode 100644
index 00000000000..8af909439e0
--- /dev/null
+++ b/src/common/dav_v2/vecq.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2018-2020, Intel Corporation */
+
+/*
+ * vecq.h -- vector queue (FIFO) interface
+ */
+
+#ifndef __DAOS_COMMON_VECQ_H
+#define __DAOS_COMMON_VECQ_H 1
+
+#include <stddef.h>
+
+#include "util.h"
+#include "out.h"
+
+#define VECQ_INIT_SIZE (64)
+
+#define VECQ(name, type)\
+struct name {\
+	type *buffer;\
+	size_t capacity;\
+	size_t front;\
+	size_t back;\
+}
+
+#define VECQ_INIT(vec) do {\
+	(vec)->buffer = NULL;\
+	(vec)->capacity = 0;\
+	(vec)->front = 0;\
+	(vec)->back = 0;\
+} while (0)
+
+#define VECQ_REINIT(vec) do {\
+	VALGRIND_ANNOTATE_NEW_MEMORY((vec), sizeof(*vec));\
+	VALGRIND_ANNOTATE_NEW_MEMORY((vec)->buffer,\
+		(sizeof(*(vec)->buffer) * ((vec)->capacity)));\
+	(vec)->front = 0;\
+	(vec)->back = 0;\
+} while (0)
+
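+/* Capacity is kept at a power of two, so masking with (capacity - 1) wraps the ring index. */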
+#define VECQ_FRONT_POS(vec)\
+((vec)->front & ((vec)->capacity - 1))
+
+#define VECQ_BACK_POS(vec)\
+((vec)->back & ((vec)->capacity - 1))
+
+#define VECQ_FRONT(vec)\
+((vec)->buffer[VECQ_FRONT_POS(vec)])
+
+#define VECQ_BACK(vec) ((vec)->buffer[VECQ_BACK_POS(vec)])
+
+#define VECQ_DEQUEUE(vec)\
+((vec)->buffer[(((vec)->front++) & ((vec)->capacity - 1))])
+
+#define VECQ_SIZE(vec)\
+((vec)->back - (vec)->front)
+
+static inline int
+realloc_set(void **buf, size_t s)
+{
+	void *tbuf;
+
+	D_REALLOC_NZ(tbuf, *buf, s);
+	if (tbuf == NULL) {
+		D_CRIT("Realloc!\n");
+		return -1;
+	}
+	*buf = tbuf;
+	return 0;
+}
+
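+/*
+ * VECQ_GROW doubles the capacity (VECQ_INIT_SIZE on first use) and copies the
+ * wrapped-around front portion past the old end so the live elements remain
+ * contiguous.
+ */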
+#define VECQ_NCAPACITY(vec)\
+((vec)->capacity == 0 ? VECQ_INIT_SIZE : (vec)->capacity * 2)
+#define VECQ_GROW(vec)\
+(realloc_set((void **)&(vec)->buffer,\
+		VECQ_NCAPACITY(vec) * sizeof(*(vec)->buffer)) ? -1 :\
+	(memcpy((vec)->buffer + (vec)->capacity, (vec)->buffer,\
+		VECQ_FRONT_POS(vec) * sizeof(*(vec)->buffer)),\
+	(vec)->front = VECQ_FRONT_POS(vec),\
+	(vec)->back = (vec)->front + (vec)->capacity,\
+	(vec)->capacity = VECQ_NCAPACITY(vec),\
+	0\
+))
+
+#define VECQ_INSERT(vec, element)\
+(VECQ_BACK(vec) = element, (vec)->back += 1, 0)
+
+#define VECQ_ENQUEUE(vec, element)\
+((vec)->capacity == VECQ_SIZE(vec) ?\
+	(VECQ_GROW(vec) == 0 ? VECQ_INSERT(vec, element) : -1) :\
+VECQ_INSERT(vec, element))
+
+#define VECQ_CAPACITY(vec)\
+((vec)->capacity)
+
+#define VECQ_FOREACH(el, vec)\
+for (size_t _vec_i = 0;\
+	_vec_i < VECQ_SIZE(vec) &&\
+	(((el) = (vec)->buffer[_vec_i & ((vec)->capacity - 1)]), 1);\
+	++_vec_i)
+
+#define VECQ_FOREACH_REVERSE(el, vec)\
+for (size_t _vec_i = VECQ_SIZE(vec);\
+	_vec_i > 0 &&\
+	(((el) = (vec)->buffer[(_vec_i - 1) & ((vec)->capacity - 1)]), 1);\
+	--_vec_i)
+
+#define VECQ_CLEAR(vec) do {\
+	(vec)->front = 0;\
+	(vec)->back = 0;\
+} while (0)
+
+#define VECQ_DELETE(vec) do {\
+	D_FREE((vec)->buffer);\
+	(vec)->buffer = NULL;\
+	(vec)->capacity = 0;\
+	(vec)->front = 0;\
+	(vec)->back = 0;\
+} while (0)
+
+#endif /* __DAOS_COMMON_VECQ_H */
diff --git a/src/common/dav_v2/wal_tx.c b/src/common/dav_v2/wal_tx.c
new file mode 100644
index 00000000000..8776127a1f1
--- /dev/null
+++ b/src/common/dav_v2/wal_tx.c
@@ -0,0 +1,509 @@
+/**
+ * (C) Copyright 2022-2023 Intel Corporation.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+
+#include <daos/mem.h>
+#include "dav_internal.h"
+#include "wal_tx.h"
+#include "util.h"
+
+struct umem_wal_tx_ops dav_wal_tx_ops;
+
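+/* Convert an address within the mapped md-blob into its offset from the heap base. */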
+static inline uint64_t
+mdblob_addr2offset(struct dav_obj *hdl, void *addr)
+{
+	D_ASSERT(((uintptr_t)addr >= (uintptr_t)hdl->do_base) &&
+		 ((uintptr_t)addr <= ((uintptr_t)hdl->do_base + hdl->do_size)));
+	return (uintptr_t)addr - (uintptr_t)hdl->do_base;
+}
+
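+/* Append a wal_action to the transaction's redo list and account for its payload size. */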
+#define AD_TX_ACT_ADD(tx, wa)							\
+	do {									\
+		d_list_add_tail(&(wa)->wa_link, &(tx)->wt_redo);		\
+		(tx)->wt_redo_cnt++;						\
+		if ((wa)->wa_act.ac_opc == UMEM_ACT_COPY ||			\
+		    (wa)->wa_act.ac_opc == UMEM_ACT_COPY_PTR) {			\
+			(tx)->wt_redo_payload_len += (wa)->wa_act.ac_copy.size;	\
+		} else if ((wa)->wa_act.ac_opc == UMEM_ACT_MOVE) {		\
+			/* ac_move src addr is payload after wal_trans_entry */\
+			(tx)->wt_redo_payload_len += sizeof(uint64_t);		\
+		}								\
+	} while (0)
+
+/** Allocate a wal_action; on success, wa_link and wa_act.ac_opc are initialized. */
+#define D_ALLOC_ACT(wa, opc, size)							\
+	do {										\
+		if (opc == UMEM_ACT_COPY)						\
+			D_ALLOC(wa, offsetof(struct wal_action,				\
+					     wa_act.ac_copy.payload[size]));		\
+		else									\
+			D_ALLOC_PTR(wa);						\
+		if (likely(wa != NULL)) {						\
+			D_INIT_LIST_HEAD(&wa->wa_link);					\
+			wa->wa_act.ac_opc = opc;					\
+		}									\
+	} while (0)
+
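+/* Copy the snapshotted data into the action's inline payload. */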
+static inline void
+act_copy_payload(struct umem_action *act, void *addr, daos_size_t size)
+{
+	char	*dst = (char *)&act->ac_copy.payload[0];
+
+	if (size > 0)
+		memcpy(dst, addr, size);
+}
+
+static void
+dav_wal_tx_init(struct umem_wal_tx *utx, struct dav_obj *dav_hdl)
+{
+	struct dav_tx	*tx = utx2wtx(utx);
+
+	D_INIT_LIST_HEAD(&tx->wt_redo);
+	tx->wt_redo_cnt = 0;
+	tx->wt_redo_payload_len = 0;
+	tx->wt_redo_act_pos = NULL;
+	tx->wt_dav_hdl = dav_hdl;
+}
+
+struct umem_wal_tx *
+dav_umem_wtx_new(struct dav_obj *dav_hdl)
+{
+	struct umem_wal_tx *umem_wtx;
+
+	D_ASSERT(dav_hdl->do_utx == NULL);
+	D_ALLOC_PTR(umem_wtx);
+	if (umem_wtx == NULL)
+		return NULL;
+
+	umem_wtx->utx_ops = &dav_wal_tx_ops;
+	umem_wtx->utx_id = ULLONG_MAX;
+	dav_wal_tx_init(umem_wtx, dav_hdl);
+	dav_hdl->do_utx = umem_wtx;
+	return umem_wtx;
+}
+
+void
+dav_umem_wtx_cleanup(struct umem_wal_tx *utx)
+{
+	struct dav_tx		*tx = utx2wtx(utx);
+	d_list_t		*list = &tx->wt_redo;
+	struct wal_action	*wa, *next;
+
+	d_list_for_each_entry_safe(wa, next, list, wa_link) {
+		d_list_del(&wa->wa_link);
+		D_FREE(wa);
+	}
+}
+
+static int
+dav_wal_tx_submit(struct dav_obj *dav_hdl, struct umem_wal_tx *utx, void *data)
+{
+	struct wal_action	*wa, *next;
+	struct umem_action	*ua;
+	struct umem_store	*store = dav_hdl->do_store;
+	struct dav_tx		*tx = utx2wtx(utx);
+	d_list_t		*redo_list = &tx->wt_redo;
+
+	char	*pathname = basename(dav_hdl->do_path);
+	uint64_t id = utx->utx_id;
+	int	 rc;
+
+	if (wal_tx_act_nr(utx) == 0)
+		return 0;
+
+	d_list_for_each_entry_safe(wa, next, redo_list, wa_link) {
+		ua = &wa->wa_act;
+		switch (ua->ac_opc) {
+		case UMEM_ACT_COPY:
+			D_DEBUG(DB_TRACE,
+				"%s: ACT_COPY txid=%lu, (p,o)=%lu,%lu size=%lu\n",
+				pathname, id,
+				ua->ac_copy.addr / PAGESIZE, ua->ac_copy.addr % PAGESIZE,
+				ua->ac_copy.size);
+			break;
+		case UMEM_ACT_COPY_PTR:
+			D_DEBUG(DB_TRACE,
+				"%s: ACT_COPY_PTR txid=%lu, (p,o)=%lu,%lu size=%lu ptr=0x%lx\n",
+				pathname, id,
+				ua->ac_copy_ptr.addr / PAGESIZE, ua->ac_copy_ptr.addr % PAGESIZE,
+				ua->ac_copy_ptr.size, ua->ac_copy_ptr.ptr);
+			break;
+		case UMEM_ACT_ASSIGN:
+			D_DEBUG(DB_TRACE,
+				"%s: ACT_ASSIGN txid=%lu, (p,o)=%lu,%lu size=%u\n",
+				pathname, id,
+				ua->ac_assign.addr / PAGESIZE, ua->ac_assign.addr % PAGESIZE,
+				ua->ac_assign.size);
+			break;
+		case UMEM_ACT_SET:
+			D_DEBUG(DB_TRACE,
+				"%s: ACT_SET txid=%lu, (p,o)=%lu,%lu size=%u val=%u\n",
+				pathname, id,
+				ua->ac_set.addr / PAGESIZE, ua->ac_set.addr % PAGESIZE,
+				ua->ac_set.size, ua->ac_set.val);
+			break;
+		case UMEM_ACT_SET_BITS:
+			D_DEBUG(DB_TRACE,
+				"%s: ACT_SET_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n",
+				pathname, id,
+				ua->ac_op_bits.addr / PAGESIZE, ua->ac_op_bits.addr % PAGESIZE,
+				ua->ac_op_bits.pos, ua->ac_op_bits.num);
+			break;
+		case UMEM_ACT_CLR_BITS:
+			D_DEBUG(DB_TRACE,
+				"%s: ACT_CLR_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n",
+				pathname, id,
+				ua->ac_op_bits.addr / PAGESIZE, ua->ac_op_bits.addr % PAGESIZE,
+				ua->ac_op_bits.pos, ua->ac_op_bits.num);
+			break;
+		default:
+			D_ERROR("%s: unknown opc %d\n", dav_hdl->do_path, ua->ac_opc);
+			ASSERT(0);
+		}
+	}
+	DAV_DBG("tx_id:%lu submitting to WAL: %u bytes in %u actions",
+		id, tx->wt_redo_payload_len, tx->wt_redo_cnt);
+	rc = store->stor_ops->so_wal_submit(store, utx, data);
+	return rc;
+}
+
+/** complete the WAL transaction */
+int
+dav_wal_tx_commit(struct dav_obj *hdl, struct umem_wal_tx *utx, void *data)
+{
+	int rc;
+
+	/* write actions in redo list to WAL */
+	rc = dav_wal_tx_submit(hdl, utx, data);
+
+	/* FAIL the engine if commit fails */
+	D_ASSERT(rc == 0);
+	dav_umem_wtx_cleanup(utx);
+	return 0;
+}
+
+int
+dav_wal_tx_reserve(struct dav_obj *hdl, uint64_t *id)
+{
+	int rc;
+
+	rc = hdl->do_store->stor_ops->so_wal_reserv(hdl->do_store, id);
+	/* REVISIT:
+	 * Remove this assert once callers of dav_free_v2() and dav_memcpy_persist_v2()
+	 * are modified to handle failures.
+	 */
+	D_ASSERT(rc == 0);
+	return rc;
+}
+
+/**
+ * Snapshot data from src into the WAL redo log.
+ */
+int
+dav_wal_tx_snap(void *hdl, void *addr, daos_size_t size, void *src, uint32_t flags)
+{
+	struct dav_obj		*dav_hdl = (struct dav_obj *)hdl;
+	struct dav_tx		*tx = utx2wtx(dav_hdl->do_utx);
+	struct wal_action	*wa_redo;
+	int                      rc;
+
+	D_ASSERT(hdl != NULL);
+
+	if (addr == NULL || size == 0 || size > UMEM_ACT_PAYLOAD_MAX_LEN)
+		return -DER_INVAL;
+
+	rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id,
+			      mdblob_addr2offset(tx->wt_dav_hdl, addr), size);
+	if (rc != 0)
+		return rc;
+
+	if (flags & DAV_XADD_WAL_CPTR) {
+		D_ALLOC_ACT(wa_redo, UMEM_ACT_COPY_PTR, size);
+		if (wa_redo == NULL)
+			return -DER_NOMEM;
+		wa_redo->wa_act.ac_copy_ptr.ptr = (uintptr_t)src;
+		wa_redo->wa_act.ac_copy_ptr.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr);
+		wa_redo->wa_act.ac_copy_ptr.size = size;
+	} else {
+		D_ALLOC_ACT(wa_redo, UMEM_ACT_COPY, size);
+		if (wa_redo == NULL)
+			return -DER_NOMEM;
+		act_copy_payload(&wa_redo->wa_act, src, size);
+		wa_redo->wa_act.ac_copy.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr);
+		wa_redo->wa_act.ac_copy.size = size;
+	}
+	AD_TX_ACT_ADD(tx, wa_redo);
+	return 0;
+}
+
+/** assign uint64_t value to @addr */
+int
+dav_wal_tx_assign(void *hdl, void *addr, uint64_t val)
+{
+	struct dav_obj		*dav_hdl = (struct dav_obj *)hdl;
+	struct dav_tx		*tx = utx2wtx(dav_hdl->do_utx);
+	struct wal_action	*wa_redo;
+	int                      rc;
+
+	D_ASSERT(hdl != NULL);
+	if (addr == NULL)
+		return -DER_INVAL;
+
+	rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id,
+			      mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t));
+	if (rc != 0)
+		return rc;
+
+	D_ALLOC_ACT(wa_redo, UMEM_ACT_ASSIGN, sizeof(uint64_t));
+	if (wa_redo == NULL)
+		return -DER_NOMEM;
+	wa_redo->wa_act.ac_assign.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr);
+	wa_redo->wa_act.ac_assign.size = 8;
+	wa_redo->wa_act.ac_assign.val = val;
+	AD_TX_ACT_ADD(tx, wa_redo);
+
+	return 0;
+}
+
+/** Set bits starting from pos */
+int
+dav_wal_tx_set_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits)
+{
+	struct dav_obj		*dav_hdl = (struct dav_obj *)hdl;
+	struct dav_tx		*tx = utx2wtx(dav_hdl->do_utx);
+	struct wal_action	*wa_redo;
+	int                      rc;
+
+	D_ASSERT(hdl != NULL);
+	if (addr == NULL)
+		return -DER_INVAL;
+
+	rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id,
+			      mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t));
+	if (rc != 0)
+		return rc;
+
+	D_ALLOC_ACT(wa_redo, UMEM_ACT_SET_BITS, sizeof(uint64_t));
+	if (wa_redo == NULL)
+		return -DER_NOMEM;
+	wa_redo->wa_act.ac_op_bits.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr);
+	wa_redo->wa_act.ac_op_bits.num = num_bits;
+	wa_redo->wa_act.ac_op_bits.pos = pos;
+	AD_TX_ACT_ADD(tx, wa_redo);
+
+	return 0;
+}
+
+/** Clear bits starting from pos */
+int
+dav_wal_tx_clr_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits)
+{
+	struct dav_obj		*dav_hdl = (struct dav_obj *)hdl;
+	struct dav_tx		*tx = utx2wtx(dav_hdl->do_utx);
+	struct wal_action	*wa_redo;
+	int                      rc;
+
+	D_ASSERT(hdl != NULL);
+	if (addr == NULL)
+		return -DER_INVAL;
+
+	rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id,
+			      mdblob_addr2offset(tx->wt_dav_hdl, addr), sizeof(uint64_t));
+	if (rc != 0)
+		return rc;
+
+	D_ALLOC_ACT(wa_redo, UMEM_ACT_CLR_BITS, sizeof(uint64_t));
+	if (wa_redo == NULL)
+		return -DER_NOMEM;
+	wa_redo->wa_act.ac_op_bits.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr);
+	wa_redo->wa_act.ac_op_bits.num = num_bits;
+	wa_redo->wa_act.ac_op_bits.pos = pos;
+	AD_TX_ACT_ADD(tx, wa_redo);
+
+	return 0;
+}
+
+/**
+ * memset a storage region, save the operation for redo
+ */
+int
+dav_wal_tx_set(void *hdl, void *addr, char c, daos_size_t size)
+{
+	struct dav_obj		*dav_hdl = (struct dav_obj *)hdl;
+	struct dav_tx		*tx = utx2wtx(dav_hdl->do_utx);
+	struct wal_action	*wa_redo;
+	int                      rc;
+
+	D_ASSERT(hdl != NULL);
+
+	if (addr == NULL || size == 0 || size > UMEM_ACT_PAYLOAD_MAX_LEN)
+		return -DER_INVAL;
+
+	rc = umem_cache_touch(dav_hdl->do_store, dav_hdl->do_utx->utx_id,
+			      mdblob_addr2offset(tx->wt_dav_hdl, addr), size);
+	if (rc != 0)
+		return rc;
+
+	D_ALLOC_ACT(wa_redo, UMEM_ACT_SET, size);
+	if (wa_redo == NULL)
+		return -DER_NOMEM;
+
+	wa_redo->wa_act.ac_set.addr = mdblob_addr2offset(tx->wt_dav_hdl, addr);
+	wa_redo->wa_act.ac_set.size = size;
+	wa_redo->wa_act.ac_set.val = c;
+	AD_TX_ACT_ADD(tx, wa_redo);
+	return 0;
+}
+
+/**
+ * Query the number of actions in the redo list.
+ */
+uint32_t
+wal_tx_act_nr(struct umem_wal_tx *utx)
+{
+	struct dav_tx *tx = utx2wtx(utx);
+
+	return tx->wt_redo_cnt;
+}
+
+/**
+ * Query the total payload length of actions in the redo list.
+ */
+uint32_t
+wal_tx_payload_len(struct umem_wal_tx *utx)
+{
+	struct dav_tx *tx = utx2wtx(utx);
+
+	return tx->wt_redo_payload_len;
+}
+
+/**
+ * Get the first action; returns NULL if the list is empty.
+ */
+struct umem_action *
+wal_tx_act_first(struct umem_wal_tx *utx)
+{
+	struct dav_tx *tx = utx2wtx(utx);
+
+	if (d_list_empty(&tx->wt_redo)) {
+		tx->wt_redo_act_pos = NULL;
+		return NULL;
+	}
+
+	tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo);
+	return &tx->wt_redo_act_pos->wa_act;
+}
+
+/**
+ * Get the next action; returns NULL when iteration is done or the list is empty.
+ */
+struct umem_action *
+wal_tx_act_next(struct umem_wal_tx *utx)
+{
+	struct dav_tx *tx = utx2wtx(utx);
+
+	if (tx->wt_redo_act_pos == NULL) {
+		if (d_list_empty(&tx->wt_redo))
+			return NULL;
+		tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo);
+		return &tx->wt_redo_act_pos->wa_act;
+	}
+
+	D_ASSERT(!d_list_empty(&tx->wt_redo));
+	tx->wt_redo_act_pos = dav_action_get_next(tx->wt_redo_act_pos->wa_link);
+	if (&tx->wt_redo_act_pos->wa_link == &tx->wt_redo) {
+		tx->wt_redo_act_pos = NULL;
+		return NULL;
+	}
+	return &tx->wt_redo_act_pos->wa_act;
+}
+
+struct umem_wal_tx_ops dav_wal_tx_ops = {
+	.wtx_act_nr = wal_tx_act_nr,
+	.wtx_payload_sz = wal_tx_payload_len,
+	.wtx_act_first = wal_tx_act_first,
+	.wtx_act_next = wal_tx_act_next,
+};
+
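+/*
+ * Replay a single WAL action onto the heap image and mark the touched
+ * region in the umem cache.
+ */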
+int
+dav_wal_replay_cb(uint64_t tx_id, struct umem_action *act, void *arg)
+{
+	void *src, *dst;
+	ptrdiff_t off;
+	uint64_t *p, mask;
+	daos_size_t size;
+	int pos, num, val;
+	int rc = 0;
+	dav_obj_t         *dav_hdl = arg;
+	void              *base    = dav_hdl->do_base;
+	struct umem_store *store   = dav_hdl->do_store;
+
+	switch (act->ac_opc) {
+	case UMEM_ACT_COPY:
+		D_DEBUG(DB_TRACE,
+			"ACT_COPY txid=%lu, (p,o)=%lu,%lu size=%lu\n",
+			tx_id,
+			act->ac_copy.addr / PAGESIZE, act->ac_copy.addr % PAGESIZE,
+			act->ac_copy.size);
+		off = act->ac_copy.addr;
+		dst = base + off;
+		src = (void *)&act->ac_copy.payload;
+		size = act->ac_copy.size;
+		memcpy(dst, src, size);
+		break;
+	case UMEM_ACT_ASSIGN:
+		D_DEBUG(DB_TRACE,
+			"ACT_ASSIGN txid=%lu, (p,o)=%lu,%lu size=%u\n",
+			tx_id,
+			act->ac_assign.addr / PAGESIZE, act->ac_assign.addr % PAGESIZE,
+			act->ac_assign.size);
+		off = act->ac_assign.addr;
+		dst = base + off;
+		size = act->ac_assign.size;
+		ASSERT_rt(size == 1 || size == 2 || size == 4 || size == 8);
+		src = &act->ac_assign.val;
+		memcpy(dst, src, size);
+		break;
+	case UMEM_ACT_SET:
+		D_DEBUG(DB_TRACE,
+			"ACT_SET txid=%lu, (p,o)=%lu,%lu size=%u val=%u\n",
+			tx_id,
+			act->ac_set.addr / PAGESIZE, act->ac_set.addr % PAGESIZE,
+			act->ac_set.size, act->ac_set.val);
+		off = act->ac_set.addr;
+		dst = base + off;
+		size = act->ac_set.size;
+		val = act->ac_set.val;
+		memset(dst, val, size);
+		break;
+	case UMEM_ACT_SET_BITS:
+	case UMEM_ACT_CLR_BITS:
+		D_DEBUG(DB_TRACE,
+			"ACT_SET/CLR_BITS txid=%lu, (p,o)=%lu,%lu bit_pos=%u num_bits=%u\n",
+			tx_id,
+			act->ac_op_bits.addr / PAGESIZE, act->ac_op_bits.addr % PAGESIZE,
+			act->ac_op_bits.pos, act->ac_op_bits.num);
+		off = act->ac_op_bits.addr;
+		size = sizeof(uint64_t);
+		p = (uint64_t *)(base + off);
+		num = act->ac_op_bits.num;
+		pos = act->ac_op_bits.pos;
+		ASSERT_rt((pos >= 0) && (pos + num) <= 64);
+		mask = ((1ULL << num) - 1) << pos;
+		if (act->ac_opc == UMEM_ACT_SET_BITS)
+			*p |= mask;
+		else
+			*p &= ~mask;
+		break;
+	default:
+		D_ASSERT(0);
+		break;
+	}
+
+	if (rc == 0)
+		rc = umem_cache_touch(store, tx_id, off, size);
+
+	return rc;
+}
diff --git a/src/common/dav_v2/wal_tx.h b/src/common/dav_v2/wal_tx.h
new file mode 100644
index 00000000000..e02759b9b3f
--- /dev/null
+++ b/src/common/dav_v2/wal_tx.h
@@ -0,0 +1,44 @@
+/**
+ * (C) Copyright 2021-2022 Intel Corporation.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+
+#ifndef __DAOS_COMMON_DAV_WAL_TX_
+#define __DAOS_COMMON_DAV_WAL_TX_
+
+#include <gurt/list.h>
+#include <daos_types.h>
+#include <daos/mem.h>
+
+struct dav_obj;
+
+struct wal_action {
+	d_list_t                wa_link;
+	struct umem_action      wa_act;
+};
+
+struct dav_tx {
+	struct dav_obj		*wt_dav_hdl;
+	d_list_t		 wt_redo;
+	uint32_t		 wt_redo_cnt;
+	uint32_t		 wt_redo_payload_len;
+	struct wal_action	*wt_redo_act_pos;
+};
+D_CASSERT(sizeof(struct dav_tx) <= UTX_PRIV_SIZE,
+	  "Size of struct dav_tx is too big!");
+
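+/* Return the wal_action whose wa_link follows @it on the list. */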
+#define dav_action_get_next(it) d_list_entry(it.next, struct wal_action, wa_link)
+
+struct umem_wal_tx *dav_umem_wtx_new(struct dav_obj *dav_hdl);
+void dav_umem_wtx_cleanup(struct umem_wal_tx *utx);
+int dav_wal_tx_reserve(struct dav_obj *hdl, uint64_t *id);
+int dav_wal_tx_commit(struct dav_obj *hdl, struct umem_wal_tx *utx, void *data);
+int dav_wal_tx_snap(void *hdl, void *addr, daos_size_t size, void *src, uint32_t flags);
+int dav_wal_tx_assign(void *hdl, void *addr, uint64_t val);
+int dav_wal_tx_clr_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits);
+int dav_wal_tx_set_bits(void *hdl, void *addr, uint32_t pos, uint16_t num_bits);
+int dav_wal_tx_set(void *hdl, void *addr, char c, daos_size_t size);
+int dav_wal_replay_cb(uint64_t tx_id, struct umem_action *act, void *base);
+
+#endif	/*__DAOS_COMMON_DAV_WAL_TX_*/
diff --git a/src/common/mem.c b/src/common/mem.c
index afc7f70eb66..03f870e51ab 100644
--- a/src/common/mem.c
+++ b/src/common/mem.c
@@ -17,7 +17,9 @@
 #ifdef DAOS_PMEM_BUILD
 #include <libpmemobj.h>
 #include <daos_srv/ad_mem.h>
+#define DAV_V2_BUILD
 #include "dav/dav.h"
+#include "dav_v2/dav_v2.h"
 #endif
 
 #define UMEM_TX_DATA_MAGIC	(0xc01df00d)
@@ -44,7 +46,7 @@ umem_get_mb_evictable(struct umem_instance *umm, int flags)
 
 	if (umm->umm_pool->up_store.store_type == DAOS_MD_BMEM) {
 		pop = (dav_obj_t *)umm->umm_pool->up_priv;
-		return dav_get_zone_evictable(pop, flags);
+		return dav_get_zone_evictable_v2(pop, flags);
 	}
 	return 0;
 }
@@ -82,6 +84,9 @@ umempobj_settings_init(bool md_on_ssd)
 	case DAOS_MD_ADMEM:
 		D_INFO("UMEM will use AD-hoc Memory as the metadata backend interface\n");
 		break;
+	case DAOS_MD_BMEM_V2:
+		D_INFO("UMEM will use Blob Backed Memory v2 as the metadata backend interface\n");
+		break;
 	default:
 		D_ERROR("DAOS_MD_ON_SSD_MODE=%d envar invalid, use %d for BMEM or %d for ADMEM\n",
 			md_mode, DAOS_MD_BMEM, DAOS_MD_ADMEM);
@@ -106,6 +111,8 @@ int umempobj_backend_type2class_id(int backend)
 		return UMEM_CLASS_BMEM;
 	case DAOS_MD_ADMEM:
 		return UMEM_CLASS_ADMEM;
+	case DAOS_MD_BMEM_V2:
+		return UMEM_CLASS_BMEM_V2;
 	default:
 		D_ASSERTF(0,
 			  "bad daos_md_backend %d\n", backend);
@@ -173,6 +180,16 @@ set_slab_desc(struct umem_pool *ph_p, struct umem_slab_desc *slab)
 		/* update with the new slab id */
 		slab->class_id = davslab.class_id;
 		break;
+	case DAOS_MD_BMEM_V2:
+		davslab.unit_size = slab->unit_size;
+		davslab.alignment = 0;
+		davslab.units_per_block = 1000;
+		davslab.header_type = DAV_HEADER_NONE;
+		davslab.class_id = slab->class_id;
+		rc = dav_class_register_v2((dav_obj_t *)ph_p->up_priv, &davslab);
+		/* update with the new slab id */
+		slab->class_id = davslab.class_id;
+		break;
 	case DAOS_MD_ADMEM:
 		/* NOOP for ADMEM now */
 		slab->class_id = class_id++;
@@ -337,6 +354,15 @@ umempobj_create(const char *path, const char *layout_name, int flags,
 		}
 		umm_pool->up_priv = dav_hdl;
 		break;
+	case DAOS_MD_BMEM_V2:
+		dav_hdl = dav_obj_create_v2(path, 0, poolsize, mode, &umm_pool->up_store);
+		if (!dav_hdl) {
+			D_ERROR("Failed to create pool %s, size="DF_U64": errno = %d\n",
+				path, poolsize, errno);
+			goto error;
+		}
+		umm_pool->up_priv = dav_hdl;
+		break;
 	case DAOS_MD_ADMEM:
 		rc = ad_blob_create(path, 0, store, &bh);
 		if (rc) {
@@ -420,6 +446,16 @@ umempobj_open(const char *path, const char *layout_name, int flags, struct umem_
 			goto error;
 		}
 
+		umm_pool->up_priv = dav_hdl;
+		break;
+	case DAOS_MD_BMEM_V2:
+		dav_hdl = dav_obj_open_v2(path, 0, &umm_pool->up_store);
+		if (!dav_hdl) {
+			D_ERROR("Error in opening the pool %s: errno = %d\n",
+				path, errno);
+			goto error;
+		}
+
 		umm_pool->up_priv = dav_hdl;
 		break;
 	case DAOS_MD_ADMEM:
@@ -464,6 +500,9 @@ umempobj_close(struct umem_pool *ph_p)
 	case DAOS_MD_BMEM:
 		dav_obj_close((dav_obj_t *)ph_p->up_priv);
 		break;
+	case DAOS_MD_BMEM_V2:
+		dav_obj_close_v2((dav_obj_t *)ph_p->up_priv);
+		break;
 	case DAOS_MD_ADMEM:
 		bh.bh_blob = (struct ad_blob *)ph_p->up_priv;
 		ad_blob_close(bh);
@@ -503,6 +542,9 @@ umempobj_get_rootptr(struct umem_pool *ph_p, size_t size)
 	case DAOS_MD_BMEM:
 		off = dav_root((dav_obj_t *)ph_p->up_priv, size);
 		return (char *)dav_get_base_ptr((dav_obj_t *)ph_p->up_priv) + off;
+	case DAOS_MD_BMEM_V2:
+		off = dav_root_v2((dav_obj_t *)ph_p->up_priv, size);
+		return (char *)dav_get_base_ptr((dav_obj_t *)ph_p->up_priv) + off;
 	case DAOS_MD_ADMEM:
 		bh.bh_blob = (struct ad_blob *)ph_p->up_priv;
 		return ad_root(bh, size);
@@ -540,6 +582,11 @@ umempobj_get_heapusage(struct umem_pool *ph_p, daos_size_t *curr_allocated)
 		if (rc == 0)
 			*curr_allocated = st.curr_allocated;
 		break;
+	case DAOS_MD_BMEM_V2:
+		rc = dav_get_heap_stats_v2((dav_obj_t *)ph_p->up_priv, &st);
+		if (rc == 0)
+			*curr_allocated = st.curr_allocated;
+		break;
 	case DAOS_MD_ADMEM:
 		*curr_allocated = 40960; /* TODO */
 		break;
@@ -579,6 +626,12 @@ umempobj_log_fraginfo(struct umem_pool *ph_p)
 		  DF_U64", run_active: "DF_U64"\n",
 		  st.run_allocated, st.run_active);
 		break;
+	case DAOS_MD_BMEM_V2:
+		dav_get_heap_stats_v2((dav_obj_t *)ph_p->up_priv, &st);
+		D_ERROR("Fragmentation info, run_allocated: "
+		  DF_U64", run_active: "DF_U64"\n",
+		  st.run_allocated, st.run_active);
+		break;
 	case DAOS_MD_ADMEM:
 		/* TODO */
 		D_ERROR("Fragmentation info, not implemented in ADMEM yet.\n");
@@ -1074,9 +1127,7 @@ bmem_tx_alloc(struct umem_instance *umm, size_t size, uint64_t flags, unsigned i
 		pflags |= DAV_FLAG_ZERO;
 	if (flags & UMEM_FLAG_NO_FLUSH)
 		pflags |= DAV_FLAG_NO_FLUSH;
-	if (mbkt_id != 0)
-		pflags |= DAV_EZONE_ID(mbkt_id);
-	return dav_tx_alloc(size, type_num, pflags);
+	return dav_tx_xalloc(size, type_num, pflags);
 }
 
 static int
@@ -1183,9 +1234,8 @@ bmem_reserve(struct umem_instance *umm, void *act, size_t size, unsigned int typ
 	     unsigned int mbkt_id)
 {
 	dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
-	uint64_t   flags = DAV_EZONE_ID(mbkt_id);
 
-	return dav_reserve(pop, (struct dav_action *)act, size, type_num, flags);
+	return dav_reserve(pop, (struct dav_action *)act, size, type_num);
 }
 
 static void
@@ -1228,9 +1278,8 @@ bmem_atomic_alloc(struct umem_instance *umm, size_t size, unsigned int type_num,
 	uint64_t off;
 	dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
 	int rc;
-	uint64_t   flags = DAV_EZONE_ID(mbkt_id);
 
-	rc = dav_alloc(pop, &off, size, type_num, flags, NULL, NULL);
+	rc = dav_alloc(pop, &off, size, type_num, NULL, NULL);
 	if (rc)
 		return UMOFF_NULL;
 	return off;
@@ -1277,6 +1326,251 @@ static umem_ops_t	bmem_ops = {
 	.mo_tx_add_callback	= umem_tx_add_cb,
 };
 
+/** BMEM v2 operations (depends on dav_v2) */
+
+static int
+bmem_tx_free_v2(struct umem_instance *umm, umem_off_t umoff)
+{
+	/*
+	 * This free call could be on error cleanup code path where
+	 * the transaction is already aborted due to a previously failed
+	 * dav_tx call. Let's just skip it in this case.
+	 *
+	 * The reason we don't fix caller to avoid calling tx_free()
+	 * in an aborted transaction is that the caller code could be
+	 * shared by both transactional and non-transactional (where
+	 * UMEM_CLASS_VMEM is used, see btree code) interfaces, and
+	 * the explicit umem_free() on error cleanup is necessary for
+	 * non-transactional case.
+	 */
+	if (dav_tx_stage_v2() == DAV_TX_STAGE_ONABORT)
+		return 0;
+
+	if (!UMOFF_IS_NULL(umoff)) {
+		int	rc;
+
+		rc = dav_tx_free_v2(umem_off2offset(umoff));
+		return rc ? umem_tx_errno(rc) : 0;
+	}
+
+	return 0;
+}
+
+static umem_off_t
+bmem_tx_alloc_v2(struct umem_instance *umm, size_t size, uint64_t flags, unsigned int type_num,
+	      unsigned int mbkt_id)
+{
+	uint64_t pflags = 0;
+
+	get_slab(umm, &pflags, &size);
+
+	if (flags & UMEM_FLAG_ZERO)
+		pflags |= DAV_FLAG_ZERO;
+	if (flags & UMEM_FLAG_NO_FLUSH)
+		pflags |= DAV_FLAG_NO_FLUSH;
+	if (mbkt_id != 0)
+		pflags |= DAV_EZONE_ID(mbkt_id);
+	return dav_tx_alloc_v2(size, type_num, pflags);
+}
+
+static int
+bmem_tx_add_v2(struct umem_instance *umm, umem_off_t umoff,
+	    uint64_t offset, size_t size)
+{
+	int	rc;
+
+	rc = dav_tx_add_range_v2(umem_off2offset(umoff), size);
+	return rc ? umem_tx_errno(rc) : 0;
+}
+
+static int
+bmem_tx_xadd_v2(struct umem_instance *umm, umem_off_t umoff, uint64_t offset,
+	     size_t size, uint64_t flags)
+{
+	int	rc;
+	uint64_t pflags = 0;
+
+	if (flags & UMEM_XADD_NO_SNAPSHOT)
+		pflags |= DAV_XADD_NO_SNAPSHOT;
+
+	rc = dav_tx_xadd_range_v2(umem_off2offset(umoff), size, pflags);
+	return rc ? umem_tx_errno(rc) : 0;
+}
+
+
+static int
+bmem_tx_add_ptr_v2(struct umem_instance *umm, void *ptr, size_t size)
+{
+	int	rc;
+
+	rc = dav_tx_add_range_direct_v2(ptr, size);
+	return rc ? umem_tx_errno(rc) : 0;
+}
+
+static int
+bmem_tx_abort_v2(struct umem_instance *umm, int err)
+{
+	/*
+	 * dav_tx_abort_v2() may have already been called in the error
+	 * handling code of the DAV APIs.
+	 */
+	if (dav_tx_stage_v2() != DAV_TX_STAGE_ONABORT)
+		dav_tx_abort_v2(err);
+
+	err = dav_tx_end_v2(NULL);
+	return err ? umem_tx_errno(err) : 0;
+}
+
+static int
+bmem_tx_begin_v2(struct umem_instance *umm, struct umem_tx_stage_data *txd)
+{
+	int rc;
+	dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
+
+	if (txd != NULL) {
+		D_ASSERT(txd->txd_magic == UMEM_TX_DATA_MAGIC);
+		rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_CB, pmem_stage_callback,
+				      txd, DAV_TX_PARAM_NONE);
+	} else {
+		rc = dav_tx_begin_v2(pop, NULL, DAV_TX_PARAM_NONE);
+	}
+
+	if (rc != 0) {
+		/*
+		 * dav_tx_end_v2() needs to be called to re-initialize the
+		 * tx state when dav_tx_begin_v2() fails.
+		 */
+		rc = dav_tx_end_v2(NULL);
+		return rc ? umem_tx_errno(rc) : 0;
+	}
+	return 0;
+}
+
+static int
+bmem_tx_commit_v2(struct umem_instance *umm, void *data)
+{
+	int rc;
+
+	dav_tx_commit_v2();
+	rc = dav_tx_end_v2(data);
+
+	return rc ? umem_tx_errno(rc) : 0;
+}
+
+static int
+bmem_tx_stage_v2(void)
+{
+	return dav_tx_stage_v2();
+}
+
+static void
+bmem_defer_free_v2(struct umem_instance *umm, umem_off_t off, void *act)
+{
+	dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
+
+	dav_defer_free_v2(pop, umem_off2offset(off),
+			(struct dav_action *)act);
+}
+
+static umem_off_t
+bmem_reserve_v2(struct umem_instance *umm, void *act, size_t size, unsigned int type_num,
+	     unsigned int mbkt_id)
+{
+	dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
+	uint64_t   flags = DAV_EZONE_ID(mbkt_id);
+
+	return dav_reserve_v2(pop, (struct dav_action *)act, size, type_num, flags);
+}
+
+static void
+bmem_cancel_v2(struct umem_instance *umm, void *actv, int actv_cnt)
+{
+	dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
+
+	dav_cancel_v2(pop, (struct dav_action *)actv, actv_cnt);
+}
+
+static int
+bmem_tx_publish_v2(struct umem_instance *umm, void *actv, int actv_cnt)
+{
+	int	rc;
+
+	rc = dav_tx_publish_v2((struct dav_action *)actv, actv_cnt);
+	return rc ? umem_tx_errno(rc) : 0;
+}
+
+static void *
+bmem_atomic_copy_v2(struct umem_instance *umm, void *dest, const void *src,
+		 size_t len, enum acopy_hint hint)
+{
+	dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
+
+	if (hint == UMEM_RESERVED_MEM) {
+		memcpy(dest, src, len);
+		return dest;
+	} else if (hint == UMEM_COMMIT_IMMEDIATE) {
+		return dav_memcpy_persist_v2(pop, dest, src, len);
+	} else { /* UMEM_COMMIT_DEFER */
+		return dav_memcpy_persist_relaxed_v2(pop, dest, src, len);
+	}
+}
+
+static umem_off_t
+bmem_atomic_alloc_v2(struct umem_instance *umm, size_t size, unsigned int type_num,
+		  unsigned int mbkt_id)
+{
+	uint64_t off;
+	dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
+	int rc;
+	uint64_t   flags = DAV_EZONE_ID(mbkt_id);
+
+	rc = dav_alloc_v2(pop, &off, size, type_num, flags, NULL, NULL);
+	if (rc)
+		return UMOFF_NULL;
+	return off;
+}
+
+static int
+bmem_atomic_free_v2(struct umem_instance *umm, umem_off_t umoff)
+{
+	if (!UMOFF_IS_NULL(umoff)) {
+		uint64_t off = umem_off2offset(umoff);
+
+		dav_free_v2((dav_obj_t *)umm->umm_pool->up_priv, off);
+	}
+	return 0;
+}
+
+static void
+bmem_atomic_flush_v2(struct umem_instance *umm, void *addr, size_t len)
+{
+	/* REVISIT: We need to update the WAL with this info
+	 * dav_obj_t *pop = (dav_obj_t *)umm->umm_pool->up_priv;
+	 * dav_flush(pop, addr, len);
+	 */
+}
+
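+/*
+ * V2 counterpart of bmem_ops: calls are routed through the dav_*_v2
+ * interfaces, and reserve/alloc carry the memory-bucket id via DAV_EZONE_ID.
+ */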
+static umem_ops_t	bmem_v2_ops = {
+	.mo_tx_free		= bmem_tx_free_v2,
+	.mo_tx_alloc		= bmem_tx_alloc_v2,
+	.mo_tx_add		= bmem_tx_add_v2,
+	.mo_tx_xadd		= bmem_tx_xadd_v2,
+	.mo_tx_add_ptr		= bmem_tx_add_ptr_v2,
+	.mo_tx_abort		= bmem_tx_abort_v2,
+	.mo_tx_begin		= bmem_tx_begin_v2,
+	.mo_tx_commit		= bmem_tx_commit_v2,
+	.mo_tx_stage		= bmem_tx_stage_v2,
+	.mo_reserve		= bmem_reserve_v2,
+	.mo_defer_free		= bmem_defer_free_v2,
+	.mo_cancel		= bmem_cancel_v2,
+	.mo_tx_publish		= bmem_tx_publish_v2,
+	.mo_atomic_copy		= bmem_atomic_copy_v2,
+	.mo_atomic_alloc	= bmem_atomic_alloc_v2,
+	.mo_atomic_free		= bmem_atomic_free_v2,
+	.mo_atomic_flush	= bmem_atomic_flush_v2,
+	.mo_tx_add_callback	= umem_tx_add_cb,
+};
+
 int
 umem_tx_errno(int err)
 {
@@ -1366,6 +1660,11 @@ static struct umem_class umem_class_defined[] = {
 		.umc_ops	= &bmem_ops,
 		.umc_name	= "bmem",
 	},
+	{
+		.umc_id		= UMEM_CLASS_BMEM_V2,
+		.umc_ops	= &bmem_v2_ops,
+		.umc_name	= "bmem_v2",
+	},
 	{
 		.umc_id		= UMEM_CLASS_ADMEM,
 		.umc_ops	= &ad_mem_ops,
@@ -1415,6 +1714,11 @@ set_offsets(struct umem_instance *umm)
 
 		umm->umm_base = (uint64_t)dav_get_base_ptr(dav_pop);
 		break;
+	case UMEM_CLASS_BMEM_V2:
+		dav_pop = (dav_obj_t *)umm->umm_pool->up_priv;
+
+		umm->umm_base = (uint64_t)dav_get_base_ptr_v2(dav_pop);
+		break;
 	case UMEM_CLASS_ADMEM:
 		bh.bh_blob = (struct ad_blob *)umm->umm_pool->up_priv;
 		umm->umm_base = (uint64_t)ad_base(bh);
@@ -1560,6 +1864,7 @@ umem_rsrvd_item_size(struct umem_instance *umm)
 	case UMEM_CLASS_ADMEM:
 		return sizeof(struct ad_reserv_act);
 	case UMEM_CLASS_BMEM:
+	case UMEM_CLASS_BMEM_V2:
 		return sizeof(struct dav_action);
 	default:
 		D_ERROR("bad umm_id %d\n", umm->umm_id);
diff --git a/src/common/tests/umem_test_bmem.c b/src/common/tests/umem_test_bmem.c
index 99cf620fe5d..fb963f9b21a 100644
--- a/src/common/tests/umem_test_bmem.c
+++ b/src/common/tests/umem_test_bmem.c
@@ -132,6 +132,7 @@ global_setup(void **state)
 		print_message("Failed to set the md_on_ssd tunable\n");
 		return 1;
 	}
+	ustore.store_type = umempobj_get_backend_type();
 
 	D_ALLOC_PTR(arg);
 	if (arg == NULL) {
diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h
index a06a9785270..db4af896c95 100644
--- a/src/include/daos/mem.h
+++ b/src/include/daos/mem.h
@@ -38,6 +38,7 @@ enum {
 	DAOS_MD_PMEM	= 0,
 	DAOS_MD_BMEM	= 1,
 	DAOS_MD_ADMEM	= 2,
+	DAOS_MD_BMEM_V2	= 3,
 };
 
 /* return umem backend type */
@@ -271,6 +272,8 @@ typedef enum {
 	UMEM_CLASS_BMEM,
 	/** ad-hoc memory */
 	UMEM_CLASS_ADMEM,
+	/** blob backed memory v2 */
+	UMEM_CLASS_BMEM_V2,
 	/** unknown */
 	UMEM_CLASS_UNKNOWN,
 } umem_class_id_t;
diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h
index cfce1a490ec..068e6d4c9b7 100644
--- a/src/include/gurt/common.h
+++ b/src/include/gurt/common.h
@@ -360,6 +360,7 @@ d_realpath(const char *path, char *resolved_path) _dalloc_;
 #define D_SPIN_LOCK(x)		__D_PTHREAD(pthread_spin_lock, x)
 #define D_SPIN_UNLOCK(x)	__D_PTHREAD(pthread_spin_unlock, x)
 #define D_MUTEX_LOCK(x)		__D_PTHREAD(pthread_mutex_lock, x)
+#define D_MUTEX_TRYLOCK(x)	__D_PTHREAD_TRYLOCK(pthread_mutex_trylock, x)
 #define D_MUTEX_UNLOCK(x)	__D_PTHREAD(pthread_mutex_unlock, x)
 #define D_RWLOCK_RDLOCK(x)	__D_PTHREAD(pthread_rwlock_rdlock, x)
 #define D_RWLOCK_WRLOCK(x)	__D_PTHREAD(pthread_rwlock_wrlock, x)
diff --git a/utils/rpms/daos.rpmlintrc b/utils/rpms/daos.rpmlintrc
index 2c905deda8e..889bb3b53f1 100644
--- a/utils/rpms/daos.rpmlintrc
+++ b/utils/rpms/daos.rpmlintrc
@@ -44,7 +44,7 @@ addFilter("E: static-library-without-debuginfo \/usr\/lib64\/lib(dfuse|ioil)\.a"
 
 # these need to be fixed:
 # https://daosio.atlassian.net/browse/DAOS-11539
-addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)).so")
+addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)|dav_v2).so")
 
 # Tests rpm needs to be able to build daos from source so pulls in build deps and is expected.
 addFilter("daos-client-tests.x86_64: E: devel-dependency protobuf-c-devel")
diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec
index e77db2c49f8..71c2c2902a6 100644
--- a/utils/rpms/daos.spec
+++ b/utils/rpms/daos.spec
@@ -15,7 +15,7 @@
 
 Name:          daos
 Version:       2.5.100
-Release:       9%{?relval}%{?dist}
+Release:       10%{?relval}%{?dist}
 Summary:       DAOS Storage Engine
 
 License:       BSD-2-Clause-Patent
@@ -451,6 +451,7 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent
 %{_libdir}/daos_srv/libplacement.so
 %{_libdir}/daos_srv/libpipeline.so
 %{_libdir}/libdaos_common_pmem.so
+%{_libdir}/libdav_v2.so
 %config(noreplace) %{conf_dir}/vos_size_input.yaml
 %{_bindir}/daos_storage_estimator.py
 %{python3_sitearch}/storage_estimator/*.py
@@ -585,6 +586,10 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent
 # No files in a shim package
 
 %changelog
+* Wed Oct 16 2023 Sherin T George <sherin-t.george@hpe.com> 2.5.100-10
+- The modified DAV allocator with memory bucket support for md_on_ssd
+  phase-2 is delivered as libdav_v2.so.
+
 * Wed Aug 23 2023 Brian J. Murrell <brian.murrell@intel.com> 2.5.100-9
 - Update fuse3 requirement to R: /usr/bin/fusermount3 by path
   rather than by package name, for portability and future-proofing
diff --git a/utils/utest.yaml b/utils/utest.yaml
index fd7580be142..211346090d4 100644
--- a/utils/utest.yaml
+++ b/utils/utest.yaml
@@ -114,6 +114,16 @@
   sudo: True
   required_src: ["src/vos/tests/bio_ut.c"]
   tests:
+    - cmd: ["bin/vos_tests", "-A", "50"]
+      env_vars:
+        DAOS_MD_ON_SSD_MODE: "3"
+      aio: "AIO_7"
+      size: 13
+    - cmd: ["bin/bio_ut"]
+      env_vars:
+        DAOS_MD_ON_SSD_MODE: "3"
+      aio: "AIO_7"
+      size: 4
     - cmd: ["bin/vos_tests", "-A", "50"]
       aio: "AIO_7"
       size: 13