From d393c99856ba79dff2da0b83157f43eb4e9cb6e2 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 3 Nov 2022 13:17:14 +0000 Subject: [PATCH 01/37] Refactor penciller memory In high-volume tests on large key-count clusters, some significant variation in the P0031 time has been seen: TimeBucket PatchA a.0ms_to_1ms 18554 b.1ms_to_2ms 51778 c.2ms_to_3ms 696 d.3ms_to_5ms 220 e.5ms_to_8ms 59 f.8ms_to_13ms 40 g.13ms_to_21ms 364 h.21ms_to_34ms 277 i.34ms_to_55ms 34 j.55ms_to_89ms 17 k.89ms_to_144ms 21 l.144ms_to_233ms 31 m.233ms_to_377ms 45 n.377ms_to_610ms 52 o.610ms_to_987ms 59 p.987ms_to_1597ms 55 q.1597ms_to_2684ms 54 r.2684ms_to_4281ms 29 s.4281ms_to_6965ms 7 t.6295ms_to_11246ms 1 It is unclear why this varies so much. The time to add to the cache appears to be minimal (but perhaps there is an issue with timing points in the code), whereas the time to add to the index is much more significant and variable. There is also variable time when the memory is rolled (although the actual activity here appears to be minimal). The refactoring here is two-fold: - tidy and simplify by keeping LoopState managed within handle_call, and add more helpful dialyzer specs; - change the update to the index to be a simple extension of a list, rather than any conversion. In unit tests, this alternative version of the pmem index is orders of magnitude faster to add to - and is the same order of magnitude to check. The anticipation is that it may be more efficient in terms of memory changes. --- src/leveled_bookie.erl | 4 +- src/leveled_log.erl | 4 +- src/leveled_penciller.erl | 375 +++++++++++++++++++------------------- src/leveled_pmem.erl | 153 +++++++++------- 4 files changed, 286 insertions(+), 250 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 96df523f..497c103f 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -162,7 +162,7 @@ loader = leveled_tree:empty(?CACHE_TYPE) :: tuple()|empty_cache, load_queue = [] :: list(), - index = leveled_pmem:new_index(), % array or empty_index + index = leveled_pmem:new_index(), min_sqn = infinity :: integer()|infinity, max_sqn = 0 :: integer()}). 
@@ -2384,7 +2384,7 @@ addto_ledgercache({H, SQN, KeyChanges}, Cache, loader) -> %% Check the ledger cache for a Key, when the ledger cache is in loader mode %% and so is populating a queue not an ETS table check_in_ledgercache(PK, Hash, Cache, loader) -> - case leveled_pmem:check_index(Hash, Cache#ledger_cache.index) of + case leveled_pmem:check_index(Hash, [Cache#ledger_cache.index]) of [] -> false; _ -> diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 88d0058c..9f4009c8 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -148,7 +148,9 @@ {"P0031", {info, "Completion of update to levelzero" ++ " with cache_size=~w level0_due=~w" - ++ " and change_pending=~w"}}, + ++ " change_pending=~w" + ++ " MinSQN=~w MaxSQN=~w" + ++ " CacheTime_us=~w RollTime_us=~w"}}, {"P0032", {info, "Fetch head timing with sample_count=~w and level timings of" ++ " foundmem_time=~w found0_time=~w found1_time=~w" diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 99c3a703..24e3cef8 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -246,7 +246,8 @@ levelzero_size = 0 :: integer(), levelzero_maxcachesize :: integer() | undefined, levelzero_cointoss = false :: boolean(), - levelzero_index :: array:array() | undefined | redacted, + levelzero_index :: + leveled_pmem:index_array() | undefined | redacted, is_snapshot = false :: boolean(), snapshot_fully_loaded = false :: boolean(), @@ -267,7 +268,7 @@ snaptimeout_short :: pos_integer()|undefined, snaptimeout_long :: pos_integer()|undefined, - sst_options = #sst_options{} :: #sst_options{}}). + sst_options = #sst_options{} :: sst_options()}). -record(pcl_timings, {sample_count = 0 :: integer(), @@ -288,8 +289,7 @@ -type penciller_options() :: #penciller_options{}. -type bookies_memory() :: {tuple()|empty_cache, - % array:array()|empty_array, - any()|empty_array, % Issue of type compatability with OTP16 + array:array()|empty_array, integer()|infinity, integer()}. -type pcl_state() :: #state{}. @@ -313,7 +313,7 @@ fun((leveled_codec:ledger_key(), leveled_codec:ledger_value(), any()) -> any()). - +-type sst_options() :: #sst_options{}. -export_type([levelzero_cacheentry/0, levelzero_returnfun/0, sqn_check/0]). @@ -669,59 +669,106 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, State=#state{is_snapshot=Snap}) when Snap == false -> % The push_mem process is as follows: % - % 1 - Receive a cache. The cache has four parts: a tree of keys and - % values, an array of 256 binaries listing the hashes present in the - % tree, a min SQN and a max SQN - % - % 2 - Check to see if there is a levelzero file pending. If so, the - % update must be returned. If not the update can be accepted - % - % 3 - The Penciller can now reply to the Bookie to show if the push has - % been accepted - % - % 4 - Update the cache: - % a) Append the cache to the list - % b) Add each of the 256 hash-listing binaries to the master L0 index array - % - % Check the approximate size of the cache. If it is over the maximum size, - % trigger a background L0 file write and update state of levelzero_pending. - CacheUpdateBlockedByPendingWork - = State#state.levelzero_pending or State#state.work_backlog, - CacheFull = leveled_pmem:cache_full(State#state.levelzero_cache), - case {CacheUpdateBlockedByPendingWork, CacheFull} of - {true, _} -> - leveled_log:log("P0018", [returned, - State#state.levelzero_pending, - State#state.work_backlog]), + % 1. 
If either the penciller is still waiting on the last L0 file to be + % written, or there is a work backlog - the cache is returned with the + % expectation that PUTs should be slowed. Also if the cache has reached + % the maximum number of lines (by default after 31 pushes from the bookie) + % + % 2. If (1) doe snot apply, the bookie's cache will be added to the + % penciller's cache. + % + % 3. If the cache is over the size requirement (in terms of approximate + % number of keys), the writing of the cache to L0 will be prompted - unless + % there is already compaction work ongoing, or the last L0 file has not yet + % been merged to L1. Unlike (1) the penciller's cache is still updated in + % this case - so the bookie can empty its ledger cache. + SW0 = os:timestamp(), + L0Pending = State#state.levelzero_pending, + WorkBacklog = State#state.work_backlog, + WorkOngoing = State#state.work_ongoing, + CacheAlreadyFull = leveled_pmem:cache_full(State#state.levelzero_cache), + case L0Pending or WorkBacklog of + true -> + % Cannot update the cache, or roll the memory so reply as such + % immediately + leveled_log:log( + "P0018", + [returned, L0Pending, WorkBacklog]), {reply, returned, State}; - {false, true} -> - leveled_log:log("P0042", [State#state.levelzero_size]), - % The cache is full (the maximum line items have been reached), so - % can't accept any more. However, we need to try and roll memory - % otherwise cache may be permanently full. - gen_server:reply(From, returned), - {UpdState, none} = maybe_roll_memory(State, true, false), - {noreply, UpdState}; - {false, false} -> - % leveled_log:log("P0018", [ok, false, false]), - PushedTree = - case is_tuple(LedgerTable) of + false -> + {UpdState, Response} = + case CacheAlreadyFull of true -> - LedgerTable; + % Don't update the cache on State if cache has reached + % the maximum number of lines, otherwise we can still + % add to the cache (but it may be over-sized and + % require rolling to file) + leveled_log:log("P0042", [State#state.levelzero_size]), + {State, returned}; false -> - leveled_tree:from_orderedset(LedgerTable, - ?CACHE_TYPE) + % Return ok as cache has been updated on State and + % the Bookie should clear its cache which is now + % received + {UpdL0Cache, NewL0Size, UpdL0Index, UpdMaxSQN} = + update_levelzero_cache( + State#state.levelzero_size, + {LedgerTable, PushedIdx, MinSQN, MaxSQN}, + State#state.ledger_sqn, + State#state.levelzero_cache, + State#state.levelzero_index), + {State#state{ + levelzero_cache = UpdL0Cache, + levelzero_size = NewL0Size, + levelzero_index = UpdL0Index, + ledger_sqn = UpdMaxSQN}, ok} end, - % Reply must happen after the table has been converted - gen_server:reply(From, ok), - % Update LevelZero will add to the cache and maybe roll the - % cache from memory to L0 disk if the cache is too big - {noreply, - update_levelzero(State#state.levelzero_size, - {PushedTree, PushedIdx, MinSQN, MaxSQN}, - State#state.ledger_sqn, - State#state.levelzero_cache, - State)} + SW1 = os:timestamp(), + % Reply now as the Bookie need not wait for the roll decision, it + % just needs to know if the Cache update is accpeted (ok) or if the + % entry has not been added (returned) + gen_server:reply(From, Response), + Man0 = UpdState#state.manifest, + CacheOverSize = + maybe_cache_too_big( + UpdState#state.levelzero_size, + UpdState#state.levelzero_maxcachesize, + UpdState#state.levelzero_cointoss), + ToRoll = + not (leveled_pmanifest:levelzero_present(Man0) or WorkOngoing) + and (CacheAlreadyFull or CacheOverSize), + case 
ToRoll of + true -> + % Rolling the memory is to create a new Level Zero file + {Constructor, none} = + roll_memory( + leveled_pmanifest:get_manifest_sqn(Man0) + 1, + UpdState#state.ledger_sqn, + UpdState#state.root_path, + none, + length(UpdState#state.levelzero_cache), + UpdState#state.sst_options, + false), + % Log timings if we've accepted a cache, and are rolling a + % file + case Response of + ok -> + CacheTime = timer:now_diff(SW1, SW0), + RollTime = timer:now_diff(os:timestamp(), SW1), + leveled_log:log_timer( + "P0031", + [UpdState#state.levelzero_size, true, true, + MinSQN, MaxSQN, CacheTime, RollTime], + SW0); + returned -> + ok + end, + {noreply, + UpdState#state{ + levelzero_pending=true, + levelzero_constructor=Constructor}}; + false -> + {noreply, UpdState} + end end; handle_call({fetch, Key, Hash, UseL0Index}, _From, State) -> L0Idx = @@ -892,15 +939,16 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning}, {LM1Cache, MinSQN, MaxSQN}, State#state.ledger_sqn, State#state.levelzero_cache), - L0Index = + LM1Idx = case BookieIdx of empty_index -> - State#state.levelzero_index; + leveled_pmem:new_index(); _ -> - leveled_pmem:add_to_index(BookieIdx, - State#state.levelzero_index, - length(L0Cache)) + BookieIdx end, + L0Index = + leveled_pmem:add_to_index( + LM1Idx, State#state.levelzero_index, length(L0Cache)), {#state{levelzero_cache = L0Cache, levelzero_index = L0Index, levelzero_size = UpdSize, @@ -931,18 +979,20 @@ handle_call(close, _From, State) -> % on the clerk. ok = leveled_pclerk:clerk_close(State#state.clerk), leveled_log:log("P0008", [close]), - L0Empty = State#state.levelzero_size == 0, - case (not State#state.levelzero_pending and not L0Empty) of + L0Left = State#state.levelzero_size > 0, + case (not State#state.levelzero_pending and L0Left) of true -> - L0_Left = State#state.levelzero_size > 0, - {UpdState, _L0Bloom} = maybe_roll_memory(State, L0_Left, true), - L0Pid = UpdState#state.levelzero_constructor, - case is_pid(L0Pid) of - true -> - ok = leveled_sst:sst_close(L0Pid); - false -> - leveled_log:log("P0010", [State#state.levelzero_size]) - end; + Man0 = State#state.manifest, + {Constructor, _} = + roll_memory( + leveled_pmanifest:get_manifest_sqn(Man0) + 1, + State#state.ledger_sqn, + State#state.root_path, + State#state.levelzero_cache, + length(State#state.levelzero_cache), + State#state.sst_options, + true), + ok = leveled_sst:sst_close(Constructor); false -> leveled_log:log("P0010", [State#state.levelzero_size]) end, @@ -1072,10 +1122,9 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) -> 0, ManEntry), % Prompt clerk to ask about work - do this for every L0 roll - UpdIndex = leveled_pmem:clear_index(State#state.levelzero_index), ok = leveled_pclerk:clerk_prompt(State#state.clerk), {noreply, State#state{levelzero_cache=[], - levelzero_index=UpdIndex, + levelzero_index=[], levelzero_pending=false, levelzero_constructor=undefined, levelzero_size=0, @@ -1215,7 +1264,7 @@ start_from_file(PCLopts) -> root_path = RootPath, levelzero_maxcachesize = MaxTableSize, levelzero_cointoss = CoinToss, - levelzero_index = leveled_pmem:new_index(), + levelzero_index = [], snaptimeout_short = SnapTimeoutShort, snaptimeout_long = SnapTimeoutLong, sst_options = OptsSST}, @@ -1350,136 +1399,96 @@ archive_files(RootPath, UsedFileList) -> ok. --spec update_levelzero(integer(), tuple(), integer(), list(), pcl_state()) - -> pcl_state(). 
+-spec update_levelzero_cache( + non_neg_integer(), bookies_memory(), non_neg_integer(), + levelzero_cache(), leveled_pmem:index_array()) + -> + {levelzero_cache(), pos_integer(), + leveled_pmem:index_array(), pos_integer()}. %% @doc %% Update the in-memory cache of recent changes for the penciller. This is %% the level zero at the top of the tree. -%% Once the update is made, there needs to be a decision to potentially roll -%% the level-zero memory to an on-disk level zero sst file. This can only -%% happen when the cache has exeeded the size threshold (with some jitter -%% to prevent coordination across multiple leveled instances), and when there -%% is no level zero file already present, and when there is no manifest change -%% pending. -update_levelzero(L0Size, {PushedTree, PushedIdx, MinSQN, MaxSQN}, - LedgerSQN, L0Cache, State) -> - SW = os:timestamp(), % Time this for logging purposes - Update = leveled_pmem:add_to_cache(L0Size, - {PushedTree, MinSQN, MaxSQN}, - LedgerSQN, - L0Cache), - UpdL0Index = leveled_pmem:add_to_index(PushedIdx, - State#state.levelzero_index, - length(L0Cache) + 1), - - {UpdMaxSQN, NewL0Size, UpdL0Cache} = Update, - if - UpdMaxSQN >= LedgerSQN -> - UpdState = State#state{levelzero_cache=UpdL0Cache, - levelzero_size=NewL0Size, - levelzero_index=UpdL0Index, - ledger_sqn=UpdMaxSQN}, - CacheTooBig = - NewL0Size > State#state.levelzero_maxcachesize, - CacheMuchTooBig = - NewL0Size > min(?SUPER_MAX_TABLE_SIZE, - 2 * State#state.levelzero_maxcachesize), - RandomFactor = - case State#state.levelzero_cointoss of - true -> - case leveled_rand:uniform(?COIN_SIDECOUNT) of - 1 -> - true; - _ -> - false - end; - false -> - true - end, - JitterCheck = RandomFactor or CacheMuchTooBig, - Due = CacheTooBig and JitterCheck, - {UpdState0, _L0Bloom} = maybe_roll_memory(UpdState, Due, false), - LogSubs = [NewL0Size, Due, State#state.work_ongoing], - case Due of - true -> - leveled_log:log_timer("P0031", LogSubs, SW); - _ -> - ok - end, - UpdState0 - end. +update_levelzero_cache( + L0Size, + {LedgerTable, PushedIdx, MinSQN, MaxSQN}, + LedgerSQN, L0Cache, L0Index) -> + PushedTree = + case is_tuple(LedgerTable) of + true -> + LedgerTable; + false -> + leveled_tree:from_orderedset(LedgerTable, ?CACHE_TYPE) + end, + {UpdMaxSQN, NewL0Size, UpdL0Cache} = + leveled_pmem:add_to_cache( + L0Size, {PushedTree, MinSQN, MaxSQN}, LedgerSQN, L0Cache), + UpdL0Index = + leveled_pmem:add_to_index( + PushedIdx, L0Index, length(L0Cache) + 1), + {UpdL0Cache, NewL0Size, UpdL0Index, UpdMaxSQN}. --spec maybe_roll_memory(pcl_state(), boolean(), boolean()) - -> {pcl_state(), leveled_ebloom:bloom()|none}. +-spec maybe_cache_too_big( + pos_integer(), pos_integer(), boolean()) -> boolean(). %% @doc -%% Check that no L0 file is present before rolling memory. Returns a boolean -%% to indicate if memory has been rolled, the Pid of the L0 constructor and -%% The bloom of the L0 file (or none) -maybe_roll_memory(State, false, _SyncRoll) -> - {State, none}; -maybe_roll_memory(State, true, SyncRoll) -> - BlockedByL0 = leveled_pmanifest:levelzero_present(State#state.manifest), - PendingManifestChange = State#state.work_ongoing, - % It is critical that memory is not rolled if the manifest is due to be - % updated by a change by the clerk. When that manifest change is made it - % will override the addition of L0 and data will be lost. 
- case (BlockedByL0 or PendingManifestChange) of - true -> - {State, none}; - false -> - {L0Constructor, Bloom} = roll_memory(State, SyncRoll), - {State#state{levelzero_pending=true, - levelzero_constructor=L0Constructor}, - Bloom} - end. +%% Is the cache too big - should it be flushed to on-disk Level 0 +%% There exists some jitter to prevent all caches from flushing concurrently +%% where there are multiple leveled instances on one machine. +maybe_cache_too_big(NewL0Size, L0MaxSize, CoinToss) -> + CacheTooBig = NewL0Size > L0MaxSize, + CacheMuchTooBig = + NewL0Size > min(?SUPER_MAX_TABLE_SIZE, 2 * L0MaxSize), + RandomFactor = + case CoinToss of + true -> + case leveled_rand:uniform(?COIN_SIDECOUNT) of + 1 -> + true; + _ -> + false + end; + false -> + true + end, + CacheTooBig and (RandomFactor or CacheMuchTooBig). --spec roll_memory(pcl_state(), boolean()) - -> {pid(), leveled_ebloom:bloom()|none}. +-spec roll_memory( + pos_integer(), non_neg_integer(), string(), + levelzero_cache()|none, pos_integer(), + sst_options(), boolean()) + -> {pid(), leveled_ebloom:bloom()|none}. %% @doc %% Roll the in-memory cache into a L0 file. If this is done synchronously, %% will return a bloom representing the contents of the file. %% -%% Casting a large object (the levelzero cache) to the gen_server did not lead -%% to an immediate return as expected. With 32K keys in the TreeList it could -%% take around 35-40ms. +%% Casting a large object (the levelzero cache) to the SST file does not lead +%% to an immediate return. With 32K keys in the TreeList it could take around +%% 35-40ms due to the overheads of copying. %% -%% To avoid blocking this gen_server, the SST file can request each item of the +%% To avoid blocking the penciller, the SST file can request each item of the %% cache one at a time. 
%% %% The Wait is set to false to use a cast when calling this in normal operation %% where as the Wait of true is used at shutdown -roll_memory(State, false) -> - ManSQN = leveled_pmanifest:get_manifest_sqn(State#state.manifest) + 1, - RootPath = sst_rootpath(State#state.root_path), - FileName = sst_filename(ManSQN, 0, 0), - leveled_log:log("P0019", [FileName, State#state.ledger_sqn]), +roll_memory(NextManSQN, LedgerSQN, RootPath, none, CL, SSTOpts, false) -> + L0Path = sst_rootpath(RootPath), + L0FN = sst_filename(NextManSQN, 0, 0), + leveled_log:log("P0019", [L0FN, LedgerSQN]), PCL = self(), FetchFun = fun(Slot, ReturnFun) -> pcl_fetchlevelzero(PCL, Slot, ReturnFun) end, - R = leveled_sst:sst_newlevelzero(RootPath, - FileName, - length(State#state.levelzero_cache), - FetchFun, - PCL, - State#state.ledger_sqn, - State#state.sst_options), - {ok, Constructor, _} = R, + {ok, Constructor, _} = + leveled_sst:sst_newlevelzero( + L0Path, L0FN, CL, FetchFun, PCL, LedgerSQN, SSTOpts), {Constructor, none}; -roll_memory(State, true) -> - ManSQN = leveled_pmanifest:get_manifest_sqn(State#state.manifest) + 1, - RootPath = sst_rootpath(State#state.root_path), - FileName = sst_filename(ManSQN, 0, 0), - LZC = State#state.levelzero_cache, - FetchFun = fun(Slot) -> lists:nth(Slot, LZC) end, - KVList = leveled_pmem:to_list(length(LZC), FetchFun), - R = leveled_sst:sst_new(RootPath, - FileName, - 0, - KVList, - State#state.ledger_sqn, - State#state.sst_options), - {ok, Constructor, _, Bloom} = R, +roll_memory(NextManSQN, LedgerSQN, RootPath, L0Cache, CL, SSTOpts, true) -> + L0Path = sst_rootpath(RootPath), + L0FN = sst_filename(NextManSQN, 0, 0), + FetchFun = fun(Slot) -> lists:nth(Slot, L0Cache) end, + KVList = leveled_pmem:to_list(CL, FetchFun), + {ok, Constructor, _, Bloom} = + leveled_sst:sst_new( + L0Path, L0FN, 0, KVList, LedgerSQN, SSTOpts), {Constructor, Bloom}. diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 6f4c8736..cbbdff87 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -37,7 +37,6 @@ merge_trees/4, add_to_index/3, new_index/0, - clear_index/1, check_index/2, cache_full/1 ]). @@ -46,8 +45,7 @@ -define(MAX_CACHE_LINES, 31). % Must be less than 128 -% -type index_array() :: array:array(). --type index_array() :: any()|none. % To live with OTP16 +-type index_array() :: list(array:array())|[]|none. -export_type([index_array/0]). @@ -61,8 +59,8 @@ cache_full(L0Cache) -> length(L0Cache) == ?MAX_CACHE_LINES. --spec prepare_for_index(index_array(), leveled_codec:segment_hash()) - -> index_array(). +-spec prepare_for_index( + array:array(), leveled_codec:segment_hash()) -> array:array(). %% @doc %% Add the hash of a key to the index. This is 'prepared' in the sense that %% this index is not use until it is loaded into the main index. @@ -77,45 +75,39 @@ prepare_for_index(IndexArray, Hash) -> Bin = array:get(Slot, IndexArray), array:set(Slot, <>, IndexArray). --spec add_to_index(index_array(), index_array(), integer()) -> index_array(). +-spec add_to_index(array:array(), index_array(), integer()) -> index_array(). %% @doc %% Expand the penciller's current index array with the details from a new %% ledger cache tree sent from the Bookie. 
The tree will have a cache slot %% which is the index of this ledger_cache in the list of the ledger_caches add_to_index(LM1Array, L0Index, CacheSlot) when CacheSlot < 128 -> - IndexAddFun = - fun(Slot, Acc) -> - Bin0 = array:get(Slot, Acc), - BinLM1 = array:get(Slot, LM1Array), - array:set(Slot, - <>, - Acc) - end, - lists:foldl(IndexAddFun, L0Index, lists:seq(0, 255)). + [LM1Array|L0Index]. --spec new_index() -> index_array(). +-spec new_index() -> array:array(). %% @doc %% Create a new index array new_index() -> array:new([{size, 256}, {default, <<>>}]). --spec clear_index(index_array()) -> index_array(). -%% @doc -%% Create a new index array -clear_index(_L0Index) -> - new_index(). - --spec check_index({integer(), integer()}, index_array()) -> list(integer()). +-spec check_index(leveled_codec:segment_hash(), index_array()) + -> list(non_neg_integer()). %% @doc %% return a list of positions in the list of cache arrays that may contain the %% key associated with the hash being checked check_index(Hash, L0Index) -> {Slot, H0} = split_hash(Hash), - Bin = array:get(Slot, L0Index), - find_pos(Bin, H0, [], 0). - + {_L, Positions} = + lists:foldl( + fun(A, {SlotC, PosList}) -> + B = array:get(Slot, A), + case find_pos(B, H0) of + true -> {SlotC + 1, [SlotC|PosList]}; + false -> {SlotC + 1, PosList} + end + end, + {1, []}, + L0Index), + lists:reverse(Positions). -spec add_to_cache(integer(), {tuple(), integer(), integer()}, @@ -128,16 +120,11 @@ check_index(Hash, L0Index) -> %% the Ledger's SQN. add_to_cache(L0Size, {LevelMinus1, MinSQN, MaxSQN}, LedgerSQN, TreeList) -> LM1Size = leveled_tree:tsize(LevelMinus1), - case LM1Size of - 0 -> - {LedgerSQN, L0Size, TreeList}; - _ -> - if - MinSQN >= LedgerSQN -> - {MaxSQN, - L0Size + LM1Size, - lists:append(TreeList, [LevelMinus1])} - end + if + MinSQN >= LedgerSQN -> + {MaxSQN, + L0Size + LM1Size, + [LevelMinus1|TreeList]} end. -spec to_list( @@ -151,7 +138,7 @@ add_to_cache(L0Size, {LevelMinus1, MinSQN, MaxSQN}, LedgerSQN, TreeList) -> %% does a large object copy of the whole cache. to_list(Slots, FetchFun) -> SW = os:timestamp(), - SlotList = lists:reverse(lists:seq(1, Slots)), + SlotList = lists:seq(1, Slots), FullList = lists:foldl(fun(Slot, Acc) -> Tree = FetchFun(Slot), L = leveled_tree:to_list(Tree), @@ -194,32 +181,24 @@ check_levelzero(Key, Hash, PosList, TreeList) -> %% currently unmerged bookie's ledger cache) that are between StartKey %% and EndKey (inclusive). merge_trees(StartKey, EndKey, TreeList, LevelMinus1) -> - lists:foldl(fun(Tree, Acc) -> - R = leveled_tree:match_range(StartKey, - EndKey, - Tree), - lists:ukeymerge(1, Acc, R) end, - [], - [LevelMinus1|lists:reverse(TreeList)]). + lists:foldl( + fun(Tree, Acc) -> + R = leveled_tree:match_range(StartKey, EndKey, Tree), + lists:ukeymerge(1, Acc, R) end, + [], + [LevelMinus1|TreeList]). 
%%%============================================================================ %%% Internal Functions %%%============================================================================ -find_pos(<<>>, _Hash, PosList, _SlotID) -> - PosList; -find_pos(<<1:1/integer, Hash:23/integer, T/binary>>, Hash, PosList, SlotID) -> - case lists:member(SlotID, PosList) of - true -> - find_pos(T, Hash, PosList, SlotID); - false -> - find_pos(T, Hash, PosList ++ [SlotID], SlotID) - end; -find_pos(<<1:1/integer, _Miss:23/integer, T/binary>>, Hash, PosList, SlotID) -> - find_pos(T, Hash, PosList, SlotID); -find_pos(<<0:1/integer, NxtSlot:7/integer, T/binary>>, Hash, PosList, _SlotID) -> - find_pos(T, Hash, PosList, NxtSlot). +find_pos(<<>>, _Hash) -> + false; +find_pos(<<1:1/integer, Hash:23/integer, _T/binary>>, Hash) -> + true; +find_pos(<<1:1/integer, _Miss:23/integer, T/binary>>, Hash) -> + find_pos(T, Hash). split_hash({SegmentID, ExtraHash}) -> @@ -243,9 +222,7 @@ check_slotlist(Key, _Hash, CheckList, TreeList) -> end end end, - lists:foldl(SlotCheckFun, - {false, not_found}, - lists:reverse(CheckList)). + lists:foldl(SlotCheckFun, {false, not_found}, CheckList). %%%============================================================================ %%% Test @@ -326,7 +303,7 @@ compare_method_test() -> end, S0 = lists:foldl(fun({Key, _V}, Acc) -> - R0 = lists:foldr(FindKeyFun(Key), + R0 = lists:foldl(FindKeyFun(Key), {false, not_found}, TreeList), [R0|Acc] end, @@ -395,7 +372,7 @@ with_index_test2() -> {R, UpdL0Index, lists:ukeymerge(1, LM1, SrcList)} end, - R0 = lists:foldl(LoadFun, {{0, 0, []}, new_index(), []}, lists:seq(1, 16)), + R0 = lists:foldl(LoadFun, {{0, 0, []}, [], []}, lists:seq(1, 16)), {{SQN, Size, TreeList}, L0Index, SrcKVL} = R0, ?assertMatch(32000, SQN), @@ -413,4 +390,52 @@ with_index_test2() -> _R1 = lists:foldl(CheckFun, {L0Index, TreeList}, SrcKVL). +index_performance_test() -> + LM1 = generate_randomkeys_aslist(1, 2000, 1, 500), + LM2 = generate_randomkeys_aslist(2001, 2000, 1, 500), + HL1 = lists:map(fun({K, _V}) -> leveled_codec:segment_hash(K) end, LM1), + HL2 = lists:map(fun({K, _V}) -> leveled_codec:segment_hash(K) end, LM2), + + SWP = os:timestamp(), + A1 = + lists:foldl( + fun(H, A) -> prepare_for_index(A, H) end, + new_index(), + HL1), + io:format( + user, + "~nPrepare single index takes ~w microsec~n", + [timer:now_diff(os:timestamp(), SWP)]), + + SWL = os:timestamp(), + PMI1 = + lists:foldl( + fun(I, Idx) -> add_to_index(A1, Idx, I) end, [], lists:seq(1, 8)), + io:format( + user, + "Appending to array takes ~w microsec~n", + [timer:now_diff(os:timestamp(), SWL)]), + + SWC1 = os:timestamp(), + R0 = lists:seq(1, 8), + lists:foreach(fun(H) -> ?assertMatch(R0, check_index(H, PMI1)) end, HL1), + io:format( + user, + "Checking 2000 matches in array at each level takes ~w microsec~n", + [timer:now_diff(os:timestamp(), SWC1)]), + + SWC2 = os:timestamp(), + FPT = + lists:foldl( + fun(H, FPC) -> FPC + length(check_index(H, PMI1)) end, + 0, + HL2), + io:format( + user, + "Checking 2000 misses in array at each level takes ~w microsec " ++ + "with ~w false positives~n", + [timer:now_diff(os:timestamp(), SWC2), FPT]). + + + -endif. From 415e682c6d8d2b9c9354a05f487abd2984e4dc44 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 9 Nov 2022 09:21:35 +0000 Subject: [PATCH 02/37] Compress SST index Reduces the size of the leveled_sst index with two changes: 1 - Where there is a common prefix of tuple elements (e.g. 
Bucket) across the whole leveled_sst file - only the non-common part is indexed, and a function is used to compare. 2 - There is less "indexing" of the index i.e. only 1 in 16 keys are passed into the gb_trees part instead of 1 in 4 --- src/leveled_codec.erl | 50 +++++++-- src/leveled_sst.erl | 241 ++++++++++++++++++++++++++++++++++++++---- src/leveled_tree.erl | 132 ++++++++++++++++------- 3 files changed, 356 insertions(+), 67 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index e8be2448..3faec9cd 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -80,6 +80,8 @@ tomb|{active, non_neg_integer()|infinity}. -type ledger_key() :: {tag(), any(), any(), any()}|all. +-type slimmed_key() :: + {binary(), binary()|null}|binary()|null|all. -type ledger_value() :: ledger_value_v1()|ledger_value_v2(). -type ledger_value_v1() :: @@ -342,20 +344,46 @@ isvalid_ledgerkey({Tag, _B, _K, _SK}) -> isvalid_ledgerkey(_LK) -> false. --spec endkey_passed(ledger_key(), ledger_key()) -> boolean(). -%% @oc +-spec endkey_passed( + ledger_key()|slimmed_key(), + ledger_key()|slimmed_key()) -> boolean(). +%% @doc %% Compare a key against a query key, only comparing elements that are non-null -%% in the Query key. This is used for comparing against end keys in queries. +%% in the Query key. +%% +%% Query key of `all` matches all keys +%% Query key element of `null` matches all keys less than or equal in previous +%% elements +%% +%% This function is required to make sense of this with erlang term order, +%% where otherwise atom() < binary() +%% +%% endkey_passed means "Query End Key has been passed when scanning this range" +%% +%% If the Query End Key is within the range ending in RangeEndkey then +%% endkey_passed is true. This range extends beyond the end of the Query +%% range, and so no further ranges need to be added to the Query results. +%% If the Query End Key is beyond the Range End Key, then endkey_passed is +%% false and further results may be required from further ranges. endkey_passed(all, _) -> false; -endkey_passed({EK1, null, null, null}, {CK1, _, _, _}) -> - EK1 < CK1; -endkey_passed({EK1, EK2, null, null}, {CK1, CK2, _, _}) -> - {EK1, EK2} < {CK1, CK2}; -endkey_passed({EK1, EK2, EK3, null}, {CK1, CK2, CK3, _}) -> - {EK1, EK2, EK3} < {CK1, CK2, CK3}; -endkey_passed(EndKey, CheckingKey) -> - EndKey < CheckingKey. +endkey_passed({K1, null, null, null}, {K1, _, _, _}) -> + false; +endkey_passed({K1, K2, null, null}, {K1, K2, _, _}) -> + false; +endkey_passed({K1, K2, K3, null}, {K1, K2, K3, _}) -> + false; +endkey_passed({K1, null}, {K1, _}) -> + % See leveled_sst SlotIndex implementation. Here keys may be slimmed to + % single binaries or two element tuples before forming the index. + false; +endkey_passed(null, _) -> + false; +endkey_passed(QueryEndKey, RangeEndKey) -> + % i.e. false = Keep searching not yet beyond query range + % true = this range extends byeond the end of the query, no further results + % required + QueryEndKey < RangeEndKey. %%%============================================================================ diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index be100c8a..b9ecdf27 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -83,7 +83,7 @@ -define(DISCARD_EXT, ".discarded"). -define(DELETE_TIMEOUT, 10000). -define(TREE_TYPE, idxt). --define(TREE_SIZE, 4). +-define(TREE_SIZE, 16). -define(TIMING_SAMPLECOUNTDOWN, 20000). -define(TIMING_SAMPLESIZE, 100). -define(BLOCK_LENGTHS_LENGTH, 20). 
@@ -148,11 +148,14 @@ start_position :: integer(), length :: integer()}). --record(summary, {first_key :: tuple(), - last_key :: tuple(), - index :: tuple() | undefined, - size :: integer(), - max_sqn :: integer()}). +-record(summary, + {first_key :: tuple(), + last_key :: tuple(), + index :: tuple() | undefined, + size :: integer(), + max_sqn :: integer(), + filter_fun :: + fun((leveled_codec:ledger_key()) -> any()) | undefined}). -type press_method() :: lz4|native|none. @@ -1300,7 +1303,9 @@ fetch(LedgerKey, Hash, State, Timings0) -> Summary = State#state.summary, PressMethod = State#state.compression_method, IdxModDate = State#state.index_moddate, - Slot = lookup_slot(LedgerKey, Summary#summary.index), + Slot = + lookup_slot( + LedgerKey, Summary#summary.index, Summary#summary.filter_fun), {SW1, Timings1} = update_timings(SW0, Timings0, index_query, true), @@ -1386,7 +1391,12 @@ fetch(LedgerKey, Hash, State, Timings0) -> fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State) -> Summary = State#state.summary, Handle = State#state.handle, - {Slots, RTrim} = lookup_slots(StartKey, EndKey, Summary#summary.index), + {Slots, RTrim} = + lookup_slots( + StartKey, + EndKey, + Summary#summary.index, + Summary#summary.filter_fun), Self = self(), SL = length(Slots), @@ -1492,8 +1502,10 @@ read_file(Filename, State, LoadPageCache) -> read_table_summary(SummaryBin, UpdState0#state.tomb_count), BlockIndexCache = new_blockindex_cache(Summary#summary.size), UpdState1 = UpdState0#state{blockindex_cache = BlockIndexCache}, - SlotIndex = from_list(SlotList), - UpdSummary = Summary#summary{index = SlotIndex}, + {SlotIndex, FilterFun} = + from_list( + SlotList, Summary#summary.first_key, Summary#summary.last_key), + UpdSummary = Summary#summary{index = SlotIndex, filter_fun = FilterFun}, leveled_log:log("SST03", [Filename, Summary#summary.size, Summary#summary.max_sqn]), @@ -1725,20 +1737,66 @@ hmac(Int) when is_integer(Int) -> %% %% This implementation of the SlotIndex uses leveled_tree -from_list(SlotList) -> - leveled_tree:from_orderedlist(SlotList, ?TREE_TYPE, ?TREE_SIZE). +from_list(SlotList, FirstKey, LastKey) -> + FilterFun = get_filterfun(FirstKey, LastKey), + FilteredList = + lists:map(fun({K, S}) -> {FilterFun(K), S} end, SlotList), + {leveled_tree:from_orderedlist(FilteredList, ?TREE_TYPE, ?TREE_SIZE), + FilterFun}. + +-spec get_filterfun( + leveled_codec:ledger_key(), leveled_codec:ledger_key()) -> + fun((leveled_codec:ledger_key()) -> any()). +get_filterfun( + {Tag, Bucket, {Field, FT}, FK}, {Tag, Bucket, {Field, LT}, LK}) + when is_binary(FT), is_binary(FK), is_binary(LT), is_binary(LK) -> + case binary:longest_common_prefix([FT, LT]) of + 0 -> + fun({_Tag, _Bucket, {_Field, Term}, Key}) -> + {Term, Key} + end; + N -> + fun({_Tag, _Bucket, {_Field, Term}, Key}) -> + case Term of + T when byte_size(T) == N -> + {<<>>, Key}; + <<_:N/binary, Suffix/binary>> -> + {Suffix, Key} + end + end + end; +get_filterfun( + {Tag, Bucket, FK, null}, {Tag, Bucket, LK, null}) + when is_binary(FK), is_binary(LK), FK < LK -> + case binary:longest_common_prefix([FK, LK]) of + 0 -> + fun({_Tag, _Bucket, Key, null}) -> Key end; + N -> + fun({_Tag, _Bucket, Key, null}) -> + case Key of + null -> + null; + K when byte_size(K) == N -> + <<>>; + <<_:N/binary, Suffix/binary>> -> + Suffix + end + end + end; +get_filterfun(_FirstKey, _LastKey) -> + fun(K) -> K end. 
-lookup_slot(Key, Tree) -> +lookup_slot(Key, Tree, FilterFun) -> StartKeyFun = fun(_V) -> all end, % The penciller should never ask for presence out of range - so will - % always return a slot (As we don't compare to StartKey) - {_LK, Slot} = leveled_tree:search(Key, Tree, StartKeyFun), + % always return a slot (as we don't compare to StartKey) + {_LK, Slot} = leveled_tree:search(FilterFun(Key), Tree, StartKeyFun), Slot. -lookup_slots(StartKey, EndKey, Tree) -> +lookup_slots(StartKey, EndKey, Tree, FilterFun) -> StartKeyFun = fun(_V) -> all @@ -1747,9 +1805,25 @@ lookup_slots(StartKey, EndKey, Tree) -> fun({_LK, Slot}) -> Slot end, - SlotList = leveled_tree:search_range(StartKey, EndKey, Tree, StartKeyFun), + FilteredStartKey = + case StartKey of + all -> all; + _ -> FilterFun(StartKey) + end, + FilteredEndKey = + case EndKey of + all -> all; + _ -> FilterFun(EndKey) + end, + SlotList = + leveled_tree:search_range( + FilteredStartKey, + FilteredEndKey, + Tree, + StartKeyFun), {EK, _EndSlot} = lists:last(SlotList), - {lists:map(MapFun, SlotList), not leveled_codec:endkey_passed(EK, EndKey)}. + {lists:map(MapFun, SlotList), + leveled_codec:endkey_passed(FilteredEndKey, EK)}. %%%============================================================================ @@ -3651,7 +3725,7 @@ simple_persisted_rangesegfilter_tester(SSTNewFun) -> additional_range_test() -> % Test fetching ranges that fall into odd situations with regards to the - % summayr index + % summary index % - ranges which fall between entries in summary % - ranges which go beyond the end of the range of the sst % - ranges which match to an end key in the summary index @@ -3689,6 +3763,7 @@ additional_range_test() -> % Testing the gap [GapSKV] = generate_indexkey(?NOLOOK_SLOTSIZE + 1, ?NOLOOK_SLOTSIZE + 1), [GapEKV] = generate_indexkey(?NOLOOK_SLOTSIZE + 2, ?NOLOOK_SLOTSIZE + 2), + io:format("Gap test between ~p and ~p", [GapSKV, GapEKV]), R3 = sst_getkvrange(P1, element(1, GapSKV), element(1, GapEKV), 1), ?assertMatch([], R3), @@ -4281,7 +4356,135 @@ block_index_cache_test() -> ?assertMatch(HeaderTS, array:get(0, BIC3)), ?assertMatch(Now, LMD3). 
+single_key_test() -> + FileName = "single_key_test", + LK = leveled_codec:to_ledgerkey(<<"Bucket0">>, <<"Key0">>, ?STD_TAG), + Chunk = leveled_rand:rand_bytes(16), + {_B, _K, MV, _H, _LMs} = + leveled_codec:generate_ledgerkv(LK, 1, Chunk, 16, infinity), + OptsSST = + #sst_options{press_method=native, + log_options=leveled_log:get_opts()}, + {ok, P1, {LK, LK}, _Bloom1} = + sst_new(?TEST_AREA, FileName, 1, [{LK, MV}], 6000, OptsSST), + ?assertMatch({LK, MV}, sst_get(P1, LK)), + ok = sst_close(P1), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), + + IndexSpecs = [{add, <<"t1_bin">>, <<"20220101">>}], + [{IdxK, IdxV}] = + leveled_codec:idx_indexspecs(IndexSpecs, + <<"Bucket">>, + <<"Key">>, + 1, + infinity), + {ok, P2, {IdxK, IdxK}, _Bloom2} = + sst_new(?TEST_AREA, FileName, 1, [{IdxK, IdxV}], 6000, OptsSST), + ?assertMatch( + [{IdxK, IdxV}], + sst_getkvrange( + P2, + {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220100">>}, null}, + all, + 16)), + ?assertMatch( + [{IdxK, IdxV}], + sst_getkvrange( + P2, + {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220100">>}, null}, + {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220101">>}, null}, + 16)), + ?assertMatch( + [{IdxK, IdxV}], + sst_getkvrange( + P2, + {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220101">>}, null}, + {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220101">>}, null}, + 16)), + ok = sst_close(P2), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). + +strange_range_test() -> + FileName = "strange_range_test", + Chunk = leveled_rand:rand_bytes(16), + OptsSST = + #sst_options{press_method=native, + log_options=leveled_log:get_opts()}, + + FK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K0">>, ?STD_TAG), + LK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K02">>, ?STD_TAG), + EK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K0299">>, ?STD_TAG), + + KL1 = + lists:map( + fun(I) -> + leveled_codec:to_ledgerkey( + {<<"T0">>, <<"B0">>}, + list_to_binary("K00" ++ integer_to_list(I)), + ?STD_TAG) + end, + lists:seq(1, 300)), + KL2 = + lists:map( + fun(I) -> + leveled_codec:to_ledgerkey( + {<<"T0">>, <<"B0">>}, + list_to_binary("K02" ++ integer_to_list(I)), + ?STD_TAG) + end, + lists:seq(1, 300)), + + GenerateValue = + fun(K) -> + element( + 3, leveled_codec:generate_ledgerkv(K, 1, Chunk, 16, infinity)) + end, + KVL = + lists:ukeysort( + 1, + lists:map( + fun(K) -> {K, GenerateValue(K)} end, + [FK] ++ KL1 ++ [LK] ++ KL2)), + + {ok, P1, {FK, EK}, _Bloom1} = + sst_new(?TEST_AREA, FileName, 1, KVL, 6000, OptsSST), + + ?assertMatch(LK, element(1, sst_get(P1, LK))), + ?assertMatch(FK, element(1, sst_get(P1, FK))), + ok = sst_close(P1), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), + + IndexSpecs = + lists:map( + fun(I) -> {add, <<"t1_bin">>, integer_to_binary(I)} end, + lists:seq(1, 500)), + IdxKVL = + leveled_codec:idx_indexspecs(IndexSpecs, + <<"Bucket">>, + <<"Key">>, + 1, + infinity), + {ok, P2, {_FIdxK, _EIdxK}, _Bloom2} = + sst_new( + ?TEST_AREA, FileName, 1, lists:ukeysort(1, IdxKVL), 6000, OptsSST), + [{IdxK1, _IdxV1}, {IdxK2, _IdxV2}] = + sst_getkvrange( + P2, + {?IDX_TAG, <<"Bucket">>, {<<"t1_bin">>, <<"1">>}, null}, + {?IDX_TAG, <<"Bucket">>, {<<"t1_bin">>, <<"10">>}, null}, + 16), + ?assertMatch( + {?IDX_TAG, <<"Bucket">>, {<<"t1_bin">>, <<"1">>}, <<"Key">>}, + IdxK1 + ), + ?assertMatch( + {?IDX_TAG, <<"Bucket">>, {<<"t1_bin">>, <<"10">>}, <<"Key">>}, + IdxK2 + ), + ok = sst_close(P2), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). 
+ receive_fun() -> receive diff --git a/src/leveled_tree.erl b/src/leveled_tree.erl index d38aaa63..33f7b98c 100644 --- a/src/leveled_tree.erl +++ b/src/leveled_tree.erl @@ -587,14 +587,30 @@ generate_randomkeys(Seqn, Count, Acc, BucketLow, BRange) -> KNumber = lists:flatten( io_lib:format("K~8..0B", [leveled_rand:uniform(1000)])), - {K, V} = {{o, "Bucket" ++ BNumber, "Key" ++ KNumber, null}, - {Seqn, {active, infinity}, null}}, + {K, V} = + {{o_kv, + {<<"btype">>, list_to_binary("Bucket" ++ BNumber)}, + list_to_binary("Key" ++ KNumber), + null}, + Seqn}, generate_randomkeys(Seqn + 1, Count - 1, [{K, V}|Acc], BucketLow, BRange). +generate_simplekeys(Seqn, Count) -> + generate_simplekeys(Seqn, Count, []). + +generate_simplekeys(_Seqn, 0, Acc) -> + Acc; +generate_simplekeys(Seqn, Count, Acc) -> + KNumber = + list_to_binary( + lists:flatten( + io_lib:format("K~8..0B", [leveled_rand:uniform(100000)]))), + generate_simplekeys(Seqn + 1, Count - 1, [{KNumber, Seqn}|Acc]). + tree_search_test() -> search_test_by_type(tree), @@ -685,34 +701,47 @@ tolist_test_by_type(Type) -> ?assertMatch(KL, T_Reverse). tree_timing_test() -> + log_tree_test_by_(16, tree, 8000), log_tree_test_by_(16, tree, 4000), - tree_test_by_(8, tree, 1000), - tree_test_by_(4, tree, 256). + log_tree_test_by_(4, tree, 256). idxt_timing_test() -> + log_tree_test_by_(16, idxt, 8000), log_tree_test_by_(16, idxt, 4000), - tree_test_by_(8, idxt, 1000), - tree_test_by_(4, idxt, 256). + log_tree_test_by_(4, idxt, 256). skpl_timing_test() -> - tree_test_by_(auto, skpl, 6000), + log_tree_test_by_(auto, skpl, 8000), log_tree_test_by_(auto, skpl, 4000), - tree_test_by_(auto, skpl, 1000), - tree_test_by_(auto, skpl, 256). + log_tree_test_by_simplekey_(auto, skpl, 4000), + log_tree_test_by_(auto, skpl, 512), + log_tree_test_by_simplekey_(auto, skpl, 512), + log_tree_test_by_(auto, skpl, 256), + log_tree_test_by_simplekey_(auto, skpl, 256). log_tree_test_by_(Width, Type, N) -> - erlang:statistics(runtime), - G0 = erlang:statistics(garbage_collection), - tree_test_by_(Width, Type, N), - {_, T1} = erlang:statistics(runtime), - G1 = erlang:statistics(garbage_collection), - io:format(user, "Test took ~w ms and GC transitioned from ~w to ~w~n", - [T1, G0, G1]). - -tree_test_by_(Width, Type, N) -> - io:format(user, "~nTree test for type and width: ~w ~w~n", [Type, Width]), KL = lists:ukeysort(1, generate_randomkeys(1, N, 1, N div 5)), - + SW = os:timestamp(), + tree_test_by_(Width, Type, KL), + io:format(user, "Test took ~w ms", + [timer:now_diff(os:timestamp(), SW) div 1000]). + +log_tree_test_by_simplekey_(Width, Type, N) -> + KL = lists:ukeysort(1, generate_simplekeys(1, N)), + SW = os:timestamp(), + tree_test_by_(Width, Type, KL, false), + io:format(user, "Test with simple key took ~w ms", + [timer:now_diff(os:timestamp(), SW) div 1000]). + +tree_test_by_(Width, Type, KL) -> + tree_test_by_(Width, Type, KL, true). 
+ +tree_test_by_(Width, Type, KL, ComplexKey) -> + io:format( + user, + "~n~nTree test with complexkey=~w for type and width: ~w ~w~n", + [ComplexKey, Type, Width]), + OS = ets:new(test, [ordered_set, private]), ets:insert(OS, KL), SWaETS = os:timestamp(), @@ -721,6 +750,9 @@ tree_test_by_(Width, Type, N) -> " of size ~w~n", [timer:now_diff(os:timestamp(), SWaETS), tsize(Tree0)]), + io:format(user, + "Tree has footprint size ~w flat_size ~w~n", + [erts_debug:size(Tree0), erts_debug:flat_size(Tree0)]), SWaGSL = os:timestamp(), Tree1 = from_orderedlist(KL, Type, Width), @@ -728,6 +760,10 @@ tree_test_by_(Width, Type, N) -> " of size ~w~n", [timer:now_diff(os:timestamp(), SWaGSL), tsize(Tree1)]), + io:format(user, + "Tree has footprint size ~w flat_size ~w~n", + [erts_debug:size(Tree1), erts_debug:flat_size(Tree1)]), + SWaLUP = os:timestamp(), lists:foreach(match_fun(Tree0), KL), lists:foreach(match_fun(Tree1), KL), @@ -743,11 +779,25 @@ tree_test_by_(Width, Type, N) -> [timer:now_diff(os:timestamp(), SWaSRCH1)]), BitBiggerKeyFun = - fun(Idx) -> - {K, _V} = lists:nth(Idx, KL), - {o, B, FullKey, null} = K, - {{o, B, FullKey ++ "0", null}, lists:nth(Idx + 1, KL)} - end, + case ComplexKey of + true -> + fun(Idx) -> + {K, _V} = lists:nth(Idx, KL), + {o_kv, B, FullKey, null} = K, + {{o_kv, + B, + list_to_binary(binary_to_list(FullKey) ++ "0"), + null}, + lists:nth(Idx + 1, KL)} + end; + false -> + fun(Idx) -> + {K, _V} = lists:nth(Idx, KL), + {list_to_binary(binary_to_list(K) ++ "0"), + lists:nth(Idx + 1, KL)} + end + end, + SrchKL = lists:map(BitBiggerKeyFun, lists:seq(1, length(KL) - 1)), SWaSRCH2 = os:timestamp(), @@ -778,10 +828,14 @@ matchrange_test_by_type(Type) -> FirstKey = element(1, lists:nth(1, KL)), FinalKey = element(1, lists:last(KL)), PenultimateKey = element(1, lists:nth(length(KL) - 1, KL)), - AfterFirstKey = setelement(3, FirstKey, element(3, FirstKey) ++ "0"), - AfterPenultimateKey = setelement(3, - PenultimateKey, - element(3, PenultimateKey) ++ "0"), + AfterFirstKey = + setelement(3, + FirstKey, + list_to_binary(binary_to_list(element(3, FirstKey)) ++ "0")), + AfterPenultimateKey = + setelement(3, + PenultimateKey, + list_to_binary(binary_to_list(element(3, PenultimateKey)) ++ "0")), LengthR = fun(SK, EK, T) -> @@ -812,10 +866,12 @@ extra_matchrange_test_by_type(Type) -> fun(RangeL) -> SKeyV = lists:nth(1, RangeL), EKeyV = lists:nth(50, RangeL), - {{o, SB, SK, null}, _SV} = SKeyV, - {{o, EB, EK, null}, _EV} = EKeyV, - SRangeK = {o, SB, SK ++ "0", null}, - ERangeK = {o, EB, EK ++ "0", null}, + {{o_kv, SB, SK, null}, _SV} = SKeyV, + {{o_kv, EB, EK, null}, _EV} = EKeyV, + SRangeK = + {o_kv, SB, list_to_binary(binary_to_list(SK) ++ "0"), null}, + ERangeK = + {o_kv, EB, list_to_binary(binary_to_list(EK) ++ "0"), null}, ?assertMatch(49, length(match_range(SRangeK, ERangeK, Tree0))) end, lists:foreach(TestRangeLFun, RangeLists). @@ -840,10 +896,12 @@ extra_searchrange_test_by_type(Type) -> % start key SKeyV = lists:nth(1, RangeL), EKeyV = lists:nth(50, RangeL), - {{o, SB, SK, null}, _SV} = SKeyV, - {{o, EB, EK, null}, _EV} = EKeyV, - FRangeK = {o, SB, SK ++ "0", null}, - BRangeK = {o, EB, EK ++ "0", null}, + {{o_kv, SB, SK, null}, _SV} = SKeyV, + {{o_kv, EB, EK, null}, _EV} = EKeyV, + FRangeK = + {o_kv, SB, list_to_binary(binary_to_list(SK) ++ "0"), null}, + BRangeK = + {o_kv, EB, list_to_binary(binary_to_list(EK) ++ "0"), null}, ?assertMatch(25, length(search_range(FRangeK, BRangeK, Tree0, SKFun))) end, lists:foreach(TestRangeLFun, lists:seq(1, 50)). 
From e139877ff49555ffee61cd566a1b874bbe35cc9b Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 9 Nov 2022 15:05:29 +0000 Subject: [PATCH 03/37] Immediate hibernate The reasons for delaying hibernate were not clear. Straight after creation the process will not be in receipt of messages (it must wait for the manifest to be updated), so it is better to hibernate now. This also means the log PC023 provides more accurate information. --- src/leveled_sst.erl | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index b9ecdf27..75d5cce9 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -595,9 +595,6 @@ starting({sst_new, leveled_log:log_timer("SST08", [ActualFilename, Level, Summary#summary.max_sqn], SW), - erlang:send_after(?STARTUP_TIMEOUT, self(), tidyup_after_startup), - % always want to have an opportunity to GC - so force the timeout to - % occur whether or not there is an intervening message {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}, Bloom}, reader, @@ -605,7 +602,8 @@ starting({sst_new, high_modified_date = HighModDate, starting_pid = StartingPID, level = Level, - fetch_cache = new_cache(Level)}}; + fetch_cache = new_cache(Level)}, + hibernate}; starting({sst_newlevelzero, RootPath, Filename, Penciller, MaxSQN, OptsSST, IdxModDate}, _From, State) -> @@ -922,17 +920,8 @@ handle_event({update_blockindex_cache, BIC}, StateName, State) -> State#state{blockindex_cache = BlockIndexCache, high_modified_date = HighModDate}}. -handle_info(tidyup_after_startup, delete_pending, State) -> - % No need to GC, this file is to be shutdown. This message may have - % interrupted the delete timeout, so timeout straight away - {next_state, delete_pending, State, 0}; -handle_info(tidyup_after_startup, StateName, State) -> - case is_process_alive(State#state.starting_pid) of - true -> - {next_state, StateName, State, hibernate}; - false -> - {stop, normal, State} - end. +handle_info(_Msg, StateName, State) -> + {next_state, StateName, State}. terminate(normal, delete_pending, _State) -> ok; @@ -1179,7 +1168,7 @@ cache_size(N) when N < 3 -> cache_size(3) -> 32; cache_size(4) -> - 32; + 16; cache_size(5) -> 4; cache_size(6) -> @@ -4122,12 +4111,15 @@ key_dominates_test() -> key_dominates([KV7|KL2], [KV2], {true, 1})). nonsense_coverage_test() -> - ?assertMatch({ok, reader, #state{}}, code_change(nonsense, - reader, - #state{}, - nonsense)), - ?assertMatch({reply, undefined, reader, #state{}}, - handle_sync_event("hello", self(), reader, #state{})), + ?assertMatch( + {ok, reader, #state{}}, + code_change(nonsense, reader, #state{}, nonsense)), + ?assertMatch( + {next_state, reader, #state{}}, + handle_info(nonsense, reader, #state{})), + ?assertMatch( + {reply, undefined, reader, #state{}}, + handle_sync_event("hello", self(), reader, #state{})), SampleBin = <<0:128/integer>>, FlippedBin = flip_byte(SampleBin, 0, 16), From 56c5e2565ddde15edebd7c0097cb57882b91d3ed Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 10 Nov 2022 11:42:01 +0000 Subject: [PATCH 04/37] Refactor BIC This patch avoids the following: - repeated replacement of the same element in the BIC (via get_kvrange), by checking presence via GET before using SET - re-reading of all elements to discover the high modified date Also there appears to have been a bug whereby a missing HMD for the file was required in order to add to the cache. However, now the cache may be erased without erasing the HMD. 
This means that the cache can never be rebuilt --- src/leveled_sst.erl | 124 +++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 52 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 75d5cce9..5461a008 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -192,7 +192,7 @@ -type sst_summary() :: #summary{}. -type blockindex_cache() - :: any(). % An array but OTP 16 types + :: {non_neg_integer(), array:array(), non_neg_integer()}. -type fetch_cache() :: any()|no_cache. % An array but OTP 16 types -type cache_size() @@ -1224,48 +1224,60 @@ tune_seglist(SegList) -> -spec new_blockindex_cache(pos_integer()) -> blockindex_cache(). new_blockindex_cache(Size) -> - array:new([{size, Size}, {default, none}]). - --spec update_blockindex_cache(boolean(), - list({integer(), binary()}), - blockindex_cache(), - non_neg_integer()|undefined, - boolean()) -> - {blockindex_cache(), - non_neg_integer()|undefined}. -update_blockindex_cache(Needed, Entries, BIC, HighModDate, IdxModDate) - when Needed, - HighModDate == undefined -> - FoldFun = - fun(CacheEntry, Cache) -> - case CacheEntry of - {ID, Header} when is_binary(Header) -> - array:set(ID - 1, binary:copy(Header), Cache); + {0, array:new([{size, Size}, {default, none}]), 0}. + +-spec updatebic_foldfun(boolean()) -> + fun(({integer(), binary()}, blockindex_cache()) -> blockindex_cache()). +updatebic_foldfun(HMDRequired) -> + fun(CacheEntry, {AccCount, Cache, AccHMD}) -> + case CacheEntry of + {ID, Header} when is_binary(Header) -> + case array:get(ID - 1, Cache) of + none -> + H0 = binary:copy(Header), + AccHMD0 = + case HMDRequired of + true -> + max(AccHMD, + element(2, extract_header(H0, true))); + false -> + AccHMD + end, + {AccCount + 1, array:set(ID - 1, H0, Cache), AccHMD0}; + _ -> + {AccCount, Cache, AccHMD} + end; + _ -> + {AccCount, Cache, AccHMD} + end + end. + +-spec update_blockindex_cache( + boolean(), list({integer(), binary()}), + blockindex_cache(), non_neg_integer()|undefined, + boolean()) -> {blockindex_cache(), non_neg_integer()|undefined}. +update_blockindex_cache(true, Entries, BIC, HighModDate, IdxModDate) -> + case {element(1, BIC), array:size(element(2, BIC))} of + {N, N} -> + {BIC, HighModDate}; + {N, S} when N < S -> + FoldFun = + case {HighModDate, IdxModDate} of + {undefined, true} -> + updatebic_foldfun(true); + _ -> + updatebic_foldfun(false) + end, + BIC0 = lists:foldl(FoldFun, BIC, Entries), + case {element(1, BIC0), IdxModDate} of + {N, _} -> + {BIC, HighModDate}; + {S, true} -> + {BIC0, element(3, BIC0)}; _ -> - Cache + {BIC0, undefined} end - end, - BlockIdxC0 = lists:foldl(FoldFun, BIC, Entries), - Size = array:size(BlockIdxC0), - BestModDates = - case IdxModDate of - true -> - ModDateFold = - fun(_ID, Header, Acc) when is_binary(Header) -> - [element(2, extract_header(Header, IdxModDate))|Acc] - end, - array:sparse_foldl(ModDateFold, [], BlockIdxC0); - false -> - [] - end, - BestModDate = - case length(BestModDates) of - Size -> - lists:max(BestModDates); - _ -> - undefined - end, - {BlockIdxC0, BestModDate}; + end; update_blockindex_cache(_Needed, _Entries, BIC, HighModDate, _IdxModDate) -> {BIC, HighModDate}. 
@@ -1300,7 +1312,7 @@ fetch(LedgerKey, Hash, State, Timings0) -> SlotID = Slot#slot_index_value.slot_id, CachedBlockIdx = - array:get(SlotID - 1, State#state.blockindex_cache), + array:get(SlotID - 1, element(2, State#state.blockindex_cache)), {SW2, Timings2} = update_timings(SW1, Timings1, lookup_cache, true), case extract_header(CachedBlockIdx, IdxModDate) of @@ -2140,7 +2152,7 @@ binarysplit_mapfun(MultiSlotBin, StartPos) -> -spec read_slots(file:io_device(), list(), - {false|list(), non_neg_integer(), binary()}, + {false|list(), non_neg_integer(), blockindex_cache()}, press_method(), boolean()) -> {boolean(), list(binaryslot_element())}. %% @doc @@ -2171,7 +2183,7 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, BinMapFun = fun(Pointer, {NeededBlockIdx, Acc}) -> {SP, _L, ID, SK, EK} = pointer_mapfun(Pointer), - CachedHeader = array:get(ID - 1, BlockIndexCache), + CachedHeader = array:get(ID - 1, element(2, BlockIndexCache)), case extract_header(CachedHeader, IdxModDate) of none -> % If there is an attempt to use the seg list query and the @@ -4332,21 +4344,29 @@ block_index_cache_test() -> lists:seq(1, 8)), HeaderTS = <<0:160/integer, Now:32/integer, 0:32/integer>>, HeaderNoTS = <<0:192>>, - BIC = array:new([{size, 8}, {default, none}]), + BIC = new_blockindex_cache(8), {BIC0, undefined} = update_blockindex_cache(false, EntriesNoTS, BIC, undefined, false), {BIC1, undefined} = update_blockindex_cache(false, EntriesTS, BIC, undefined, true), {BIC2, undefined} = update_blockindex_cache(true, EntriesNoTS, BIC, undefined, false), - {BIC3, LMD3} = - update_blockindex_cache(true, EntriesTS, BIC, undefined, true), + {ETSP1, ETSP2} = lists:split(6, EntriesTS), + {BIC3, undefined} = + update_blockindex_cache(true, ETSP1, BIC, undefined, true), + {BIC3, undefined} = + update_blockindex_cache(true, ETSP1, BIC3, undefined, true), + {BIC4, LMD4} = + update_blockindex_cache(true, ETSP2, BIC3, undefined, true), + {BIC4, LMD4} = + update_blockindex_cache(true, ETSP2, BIC4, LMD4, true), - ?assertMatch(none, array:get(0, BIC0)), - ?assertMatch(none, array:get(0, BIC1)), - ?assertMatch(HeaderNoTS, array:get(0, BIC2)), - ?assertMatch(HeaderTS, array:get(0, BIC3)), - ?assertMatch(Now, LMD3). + ?assertMatch(none, array:get(0, element(2, BIC0))), + ?assertMatch(none, array:get(0, element(2, BIC1))), + ?assertMatch(HeaderNoTS, array:get(0, element(2, BIC2))), + ?assertMatch(HeaderTS, array:get(0, element(2, BIC3))), + ?assertMatch(HeaderTS, array:get(0, element(2, BIC4))), + ?assertMatch(Now, LMD4). single_key_test() -> FileName = "single_key_test", From 8370accad02ffea4eac440affa4291e8a72e50a0 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 10 Nov 2022 22:54:17 +0000 Subject: [PATCH 05/37] Use correct size in test results erts_debug:flat_size/1 returns size in words (i.e. 8 bytes on 64-bit CPU) not bytes --- src/leveled_tree.erl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/leveled_tree.erl b/src/leveled_tree.erl index 33f7b98c..e3b8982e 100644 --- a/src/leveled_tree.erl +++ b/src/leveled_tree.erl @@ -708,7 +708,9 @@ tree_timing_test() -> idxt_timing_test() -> log_tree_test_by_(16, idxt, 8000), log_tree_test_by_(16, idxt, 4000), - log_tree_test_by_(4, idxt, 256). + log_tree_test_by_(4, idxt, 256), + log_tree_test_by_(16, idxt, 256), + log_tree_test_by_simplekey_(16, idxt, 256). 
 skpl_timing_test() -> log_tree_test_by_(auto, skpl, 8000), @@ -751,8 +753,8 @@ tree_test_by_(Width, Type, KL, ComplexKey) -> [timer:now_diff(os:timestamp(), SWaETS), tsize(Tree0)]), io:format(user, - "Tree has footprint size ~w flat_size ~w~n", - [erts_debug:size(Tree0), erts_debug:flat_size(Tree0)]), + "Tree has footprint size ~w bytes flat_size ~w bytes~n", + [erts_debug:size(Tree0) * 8, erts_debug:flat_size(Tree0) * 8]), SWaGSL = os:timestamp(), Tree1 = from_orderedlist(KL, Type, Width), @@ -761,8 +763,8 @@ tree_test_by_(Width, Type, KL, ComplexKey) -> [timer:now_diff(os:timestamp(), SWaGSL), tsize(Tree1)]), io:format(user, - "Tree has footprint size ~w flat_size ~w~n", - [erts_debug:size(Tree1), erts_debug:flat_size(Tree1)]), + "Tree has footprint size ~w bytes flat_size ~w bytes~n", + [erts_debug:size(Tree1) * 8, erts_debug:flat_size(Tree1) * 8]), SWaLUP = os:timestamp(), lists:foreach(match_fun(Tree0), KL), From 3332cb78cd2ab47146866b8bf109133f2b111027 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Sat, 12 Nov 2022 11:37:32 +0000 Subject: [PATCH 06/37] Don't change summary record As it is persisted as part of the file write, any change to the summary record cannot be rolled back --- src/leveled_sst.erl | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 5461a008..427d755b 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -153,9 +153,10 @@ last_key :: tuple(), index :: tuple() | undefined, size :: integer(), - max_sqn :: integer(), - filter_fun :: - fun((leveled_codec:ledger_key()) -> any()) | undefined}). + max_sqn :: integer()}). + %% DO NOT CHANGE + %% The summary record is persisted as part of the file format + %% Any change to this record will mean the change cannot be rolled back -type press_method() :: lz4|native|none. @@ -227,8 +228,10 @@ deferred_startup_tuple :: tuple()|undefined, level :: level()|undefined, tomb_count = not_counted - :: non_neg_integer()|not_counted, - high_modified_date :: non_neg_integer()|undefined}). + :: non_neg_integer()|not_counted, + high_modified_date :: non_neg_integer()|undefined, + filter_fun + :: fun((leveled_codec:ledger_key()) -> any()) | undefined}). -record(sst_timings, {sample_count = 0 :: integer(), @@ -1306,7 +1309,7 @@ fetch(LedgerKey, Hash, State, Timings0) -> IdxModDate = State#state.index_moddate, Slot = lookup_slot( - LedgerKey, Summary#summary.index, Summary#summary.filter_fun), + LedgerKey, Summary#summary.index, State#state.filter_fun), {SW1, Timings1} = update_timings(SW0, Timings0, index_query, true), @@ -1397,7 +1400,7 @@ fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State) -> StartKey, EndKey, Summary#summary.index, - Summary#summary.filter_fun), + State#state.filter_fun), Self = self(), SL = length(Slots), @@ -1506,14 +1509,15 @@ read_file(Filename, State, LoadPageCache) -> {SlotIndex, FilterFun} = from_list( SlotList, Summary#summary.first_key, Summary#summary.last_key), - UpdSummary = Summary#summary{index = SlotIndex, filter_fun = FilterFun}, + UpdSummary = Summary#summary{index = SlotIndex}, leveled_log:log("SST03", [Filename, Summary#summary.size, Summary#summary.max_sqn]), {UpdState1#state{summary = UpdSummary, handle = Handle, filename = Filename, - tomb_count = TombCount}, + tomb_count = TombCount, + filter_fun = FilterFun}, Bloom}. 
gen_fileversion(PressMethod, IdxModDate, CountOfTombs) -> From a968ea525197a73c0992b4a6a9ddebfebdd4cce9 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Sun, 13 Nov 2022 14:31:56 +0000 Subject: [PATCH 07/37] Clerk to prompt L0 write Simplifies the logic if the clerk request work for the penciller prompts L0 writes as well as Manifest changes. The advantage now is that if the penciller memory is full, and PUT load stops, the clerk should still be able to prompt persistence. the penciller can therefore make use of dead time this way --- src/leveled_log.erl | 3 +- src/leveled_penciller.erl | 199 ++++++++++++++++---------------------- 2 files changed, 86 insertions(+), 116 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 9f4009c8..438cbf7d 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -149,8 +149,7 @@ {info, "Completion of update to levelzero" ++ " with cache_size=~w level0_due=~w" ++ " change_pending=~w" - ++ " MinSQN=~w MaxSQN=~w" - ++ " CacheTime_us=~w RollTime_us=~w"}}, + ++ " MinSQN=~w MaxSQN=~w"}}, {"P0032", {info, "Fetch head timing with sample_count=~w and level timings of" ++ " foundmem_time=~w found0_time=~w found1_time=~w" diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 24e3cef8..ed4061bf 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -665,7 +665,7 @@ init([LogOpts, PCLopts]) -> handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, - From, + _From, State=#state{is_snapshot=Snap}) when Snap == false -> % The push_mem process is as follows: % @@ -676,17 +676,10 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, % % 2. If (1) doe snot apply, the bookie's cache will be added to the % penciller's cache. - % - % 3. If the cache is over the size requirement (in terms of approximate - % number of keys), the writing of the cache to L0 will be prompted - unless - % there is already compaction work ongoing, or the last L0 file has not yet - % been merged to L1. Unlike (1) the penciller's cache is still updated in - % this case - so the bookie can empty its ledger cache. 
- SW0 = os:timestamp(), + SW = os:timestamp(), L0Pending = State#state.levelzero_pending, WorkBacklog = State#state.work_backlog, - WorkOngoing = State#state.work_ongoing, - CacheAlreadyFull = leveled_pmem:cache_full(State#state.levelzero_cache), + ok = leveled_pclerk:clerk_prompt(State#state.clerk), case L0Pending or WorkBacklog of true -> % Cannot update the cache, or roll the memory so reply as such @@ -696,78 +689,38 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, [returned, L0Pending, WorkBacklog]), {reply, returned, State}; false -> - {UpdState, Response} = - case CacheAlreadyFull of - true -> - % Don't update the cache on State if cache has reached - % the maximum number of lines, otherwise we can still - % add to the cache (but it may be over-sized and - % require rolling to file) - leveled_log:log("P0042", [State#state.levelzero_size]), - {State, returned}; - false -> - % Return ok as cache has been updated on State and - % the Bookie should clear its cache which is now - % received - {UpdL0Cache, NewL0Size, UpdL0Index, UpdMaxSQN} = - update_levelzero_cache( - State#state.levelzero_size, - {LedgerTable, PushedIdx, MinSQN, MaxSQN}, - State#state.ledger_sqn, - State#state.levelzero_cache, - State#state.levelzero_index), - {State#state{ + CacheAlreadyFull = + leveled_pmem:cache_full(State#state.levelzero_cache), + case CacheAlreadyFull of + true -> + % Don't update the cache on State if cache has reached + % the maximum number of lines, otherwise we can still + % add to the cache (but it may be over-sized and + % require rolling to file) + leveled_log:log("P0042", [State#state.levelzero_size]), + {reply, returned, State}; + false -> + % Return ok as cache has been updated on State and + % the Bookie should clear its cache which is now + % received + {UpdL0Cache, NewL0Size, UpdL0Index, UpdMaxSQN} = + update_levelzero_cache( + State#state.levelzero_size, + {LedgerTable, PushedIdx, MinSQN, MaxSQN}, + State#state.ledger_sqn, + State#state.levelzero_cache, + State#state.levelzero_index), + leveled_log:log_timer( + "P0031", + [NewL0Size, true, true, MinSQN, MaxSQN], + SW), + {reply, + ok, + State#state{ levelzero_cache = UpdL0Cache, levelzero_size = NewL0Size, levelzero_index = UpdL0Index, - ledger_sqn = UpdMaxSQN}, ok} - end, - SW1 = os:timestamp(), - % Reply now as the Bookie need not wait for the roll decision, it - % just needs to know if the Cache update is accpeted (ok) or if the - % entry has not been added (returned) - gen_server:reply(From, Response), - Man0 = UpdState#state.manifest, - CacheOverSize = - maybe_cache_too_big( - UpdState#state.levelzero_size, - UpdState#state.levelzero_maxcachesize, - UpdState#state.levelzero_cointoss), - ToRoll = - not (leveled_pmanifest:levelzero_present(Man0) or WorkOngoing) - and (CacheAlreadyFull or CacheOverSize), - case ToRoll of - true -> - % Rolling the memory is to create a new Level Zero file - {Constructor, none} = - roll_memory( - leveled_pmanifest:get_manifest_sqn(Man0) + 1, - UpdState#state.ledger_sqn, - UpdState#state.root_path, - none, - length(UpdState#state.levelzero_cache), - UpdState#state.sst_options, - false), - % Log timings if we've accepted a cache, and are rolling a - % file - case Response of - ok -> - CacheTime = timer:now_diff(SW1, SW0), - RollTime = timer:now_diff(os:timestamp(), SW1), - leveled_log:log_timer( - "P0031", - [UpdState#state.levelzero_size, true, true, - MinSQN, MaxSQN, CacheTime, RollTime], - SW0); - returned -> - ok - end, - {noreply, - UpdState#state{ - levelzero_pending=true, - 
levelzero_constructor=Constructor}}; - false -> - {noreply, UpdState} + ledger_sqn = UpdMaxSQN}} end end; handle_call({fetch, Key, Hash, UseL0Index}, _From, State) -> @@ -1131,43 +1084,61 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) -> manifest=UpdMan, persisted_sqn=State#state.ledger_sqn}}; handle_cast(work_for_clerk, State) -> - case {State#state.levelzero_pending, State#state.work_ongoing} of - {false, false} -> - % TODO - as part of supervision tree and retry work: - % Need to check for work_ongoing as well as levelzero_pending as - % there may be a race that could lead to the clerk doing the same - % thing twice. - % - % This has implications though if we auto-restart the pclerk in the - % future, without altering this state - it may never be able to - % request work due to ongoing work that crashed the previous clerk - % - % Perhaps the pclerk should not be restarted because of this, and - % the failure should ripple up - {WL, WC} = leveled_pmanifest:check_for_work(State#state.manifest), - case WC of - 0 -> - {noreply, State#state{work_backlog=false}}; - N -> - Backlog = N > ?WORKQUEUE_BACKLOG_TOLERANCE, - leveled_log:log("P0024", [N, Backlog]), - [TL|_Tail] = WL, - ok = - leveled_pclerk:clerk_push( - State#state.clerk, {TL, State#state.manifest}), - case TL of - 0 -> - % Just written a L0 so as LoopState now rewritten, - % garbage collect to free as much as possible as - % soon as possible - garbage_collect(); - _ -> - ok - end, - + case {State#state.levelzero_pending, + State#state.work_ongoing, + leveled_pmanifest:levelzero_present(State#state.manifest)} of + {false, false, false} -> + % If the penciller memory needs rolling, prompt this now + CacheOverSize = + maybe_cache_too_big( + State#state.levelzero_size, + State#state.levelzero_maxcachesize, + State#state.levelzero_cointoss), + CacheAlreadyFull = + leveled_pmem:cache_full(State#state.levelzero_cache), + case (CacheAlreadyFull or CacheOverSize) of + true -> + % Rolling the memory to create a new Level Zero file + NextSQN = + leveled_pmanifest:get_manifest_sqn( + State#state.manifest) + 1, + {Constructor, none} = + roll_memory( + NextSQN, + State#state.ledger_sqn, + State#state.root_path, + none, + length(State#state.levelzero_cache), + State#state.sst_options, + false), {noreply, - State#state{work_backlog=Backlog, work_ongoing=true}} + State#state{ + levelzero_pending=true, + levelzero_constructor=Constructor}}; + false -> + {WL, WC} = + leveled_pmanifest:check_for_work(State#state.manifest), + case WC of + 0 -> + % Should do some tidy-up work here? + {noreply, State#state{work_backlog=false}}; + N -> + Backlog = N > ?WORKQUEUE_BACKLOG_TOLERANCE, + leveled_log:log("P0024", [N, Backlog]), + [TL|_Tail] = WL, + ok = + leveled_pclerk:clerk_push( + State#state.clerk, + {TL, State#state.manifest}), + {noreply, + State#state{ + work_backlog=Backlog, work_ongoing=true}} + end end; + {false, false, true} -> + ok = leveled_pclerk:clerk_push( + State#state.clerk, {0, State#state.manifest}), + {noreply, State#state{work_ongoing=true}}; _ -> {noreply, State} end; From 3315dc16f67b5b88a0614ae37199bb6898664355 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 14 Nov 2022 08:36:15 +0000 Subject: [PATCH 08/37] Add push on journal compact If there has been a backlog, followed by a quiet period - there may be a large ledger cache left unpushed. 
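The push can be piggy-backed on the compaction call by answering the caller first and only then attempting the push, so the caller is not delayed. A minimal sketch of the pattern, with placeholder helper names rather than the bookie's actual functions:

    handle_call({compact, Timeout}, From, State) ->
        Result = start_compaction(State, Timeout),  % placeholder helper
        gen_server:reply(From, Result),             % answer the caller now
        UpdState = maybe_push_cache(State),         % then the false push
        {noreply, UpdState}.                        % caller already replied to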
Journal compaction events are about once per hour, so the performance overhead of a false push should be minimal, with the advantage of clearing any backlog before load starts again. This is only relevant to riak users with very off/full batch type workloads. --- src/leveled_bookie.erl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 497c103f..2c4f328e 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -1493,7 +1493,7 @@ handle_call(log_settings, _From, State) -> handle_call({return_runner, QueryType}, _From, State) -> Runner = get_runner(State, QueryType), {reply, Runner, State}; -handle_call({compact_journal, Timeout}, _From, State) +handle_call({compact_journal, Timeout}, From, State) when State#state.head_only == false -> case leveled_inker:ink_compactionpending(State#state.inker) of true -> @@ -1504,7 +1504,14 @@ handle_call({compact_journal, Timeout}, _From, State) R = leveled_inker:ink_compactjournal(State#state.inker, PclSnap, Timeout), - {reply, R, State} + gen_server:reply(From, R), + {_, NewCache} = + maybepush_ledgercache( + State#state.cache_size, + State#state.cache_multiple, + State#state.ledger_cache, + State#state.penciller), + {noreply, State#state{ledger_cache = NewCache}} end; handle_call(confirm_compact, _From, State) when State#state.head_only == false -> From 6aea43471b99d40702d043577efa54ffcdb78c13 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 15 Nov 2022 09:52:21 +0000 Subject: [PATCH 09/37] Extend tests To more consistently trigger all overload scenarios --- src/leveled_penciller.erl | 5 +++-- test/end_to_end/basic_SUITE.erl | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index ed4061bf..48db5e95 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -710,10 +710,11 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, State#state.ledger_sqn, State#state.levelzero_cache, State#state.levelzero_index), - leveled_log:log_timer( + leveled_log:log_randomtimer( "P0031", [NewL0Size, true, true, MinSQN, MaxSQN], - SW), + SW, + 0.1), {reply, ok, State#state{ diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl index ac1a5eae..d657d079 100644 --- a/test/end_to_end/basic_SUITE.erl +++ b/test/end_to_end/basic_SUITE.erl @@ -530,7 +530,8 @@ load_and_count(_Config) -> % Use artificially small files, and the load keys, counting they're all % present load_and_count(50000000, 2500, 28000), - load_and_count(200000000, 100, 300000). + load_and_count(200000000, 50, 200000), + load_and_count(50000000, 1000, 5000). 
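The P0031 logging in this patch is also switched to log_randomtimer with a 0.1 sample rate, so only a fraction of pushes pay for the timing log. Presumably the sampling amounts to something like the following sketch (not leveled_log's actual implementation):

    maybe_log_timer(LogRef, Args, StartTime, SampleRate) ->
        case leveled_rand:uniform() < SampleRate of
            true -> leveled_log:log_timer(LogRef, Args, StartTime);
            false -> ok
        end.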
load_and_count(JournalSize, BookiesMemSize, PencillerMemSize) -> @@ -598,6 +599,9 @@ load_and_count(JournalSize, BookiesMemSize, PencillerMemSize) -> lists:seq(1, 20)), testutil:check_forobject(Bookie1, TestObject), io:format("Loading more small objects~n"), + io:format("Now with unused snapshot so deletions are blocked~n"), + {ok, PclClone, null} = + leveled_bookie:book_snapshot(Bookie1, ledger, undefined, true), lists:foldl(fun(_X, Acc) -> testutil:load_objects(5000, [Acc + 2], @@ -614,6 +618,8 @@ load_and_count(JournalSize, BookiesMemSize, PencillerMemSize) -> 200000, lists:seq(1, 20)), testutil:check_forobject(Bookie1, TestObject), + ok = leveled_penciller:pcl_close(PclClone), + {_S, 300000} = testutil:check_bucket_stats(Bookie1, "Bucket"), ok = leveled_bookie:book_close(Bookie1), {ok, Bookie2} = leveled_bookie:book_start(StartOpts1), {_, 300000} = testutil:check_bucket_stats(Bookie2, "Bucket"), From 1cf687cf4b570dcedfbd027d8dc1a2655f73dda1 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 15 Nov 2022 17:21:23 +0000 Subject: [PATCH 10/37] Fix range keys smaller than prefix Can't make end key an empty binary in this case, as it may be bigger than any keys within the range, but will appear to be smaller. Unit tests and ct tests added to expose the potential issue --- src/leveled_codec.erl | 4 +- src/leveled_sst.erl | 188 ++++++++++++++++++++++++++++++++- test/end_to_end/riak_SUITE.erl | 81 +++++++++++++- 3 files changed, 266 insertions(+), 7 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 3faec9cd..17c4c9ea 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -81,7 +81,7 @@ -type ledger_key() :: {tag(), any(), any(), any()}|all. -type slimmed_key() :: - {binary(), binary()|null}|binary()|null|all. + {binary()|null, binary()|null}|binary()|null|all. -type ledger_value() :: ledger_value_v1()|ledger_value_v2(). -type ledger_value_v1() :: @@ -377,6 +377,8 @@ endkey_passed({K1, null}, {K1, _}) -> % See leveled_sst SlotIndex implementation. Here keys may be slimmed to % single binaries or two element tuples before forming the index. false; +endkey_passed({null, _QK1}, {_RK1, _RK2}) -> + false; endkey_passed(null, _) -> false; endkey_passed(QueryEndKey, RangeEndKey) -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 427d755b..d78923b3 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1763,8 +1763,8 @@ get_filterfun( N -> fun({_Tag, _Bucket, {_Field, Term}, Key}) -> case Term of - T when byte_size(T) == N -> - {<<>>, Key}; + T when byte_size(T) =< N -> + {null, Key}; <<_:N/binary, Suffix/binary>> -> {Suffix, Key} end @@ -1781,8 +1781,8 @@ get_filterfun( case Key of null -> null; - K when byte_size(K) == N -> - <<>>; + K when byte_size(K) =< N -> + null; <<_:N/binary, Suffix/binary>> -> Suffix end @@ -4372,6 +4372,186 @@ block_index_cache_test() -> ?assertMatch(HeaderTS, array:get(0, element(2, BIC4))), ?assertMatch(Now, LMD4). 
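The tests added below cover the case the fix addresses: the slot index stores keys with their longest common prefix stripped, and a query end key such as <<"1961">> can be shorter than that prefix. Such a key cannot be stripped, and mapping it to <<>> would make it sort below every stored suffix even though it bounds the range from above; mapping it to null instead lets the codec treat it as an open upper bound. A standalone sketch of the stripping rule (not the module's slot-index code):

    strip_for_index(PrefixLen, Key) when byte_size(Key) > PrefixLen ->
        <<_Prefix:PrefixLen/binary, Suffix/binary>> = Key,
        Suffix;
    strip_for_index(_PrefixLen, _ShorterKey) ->
        null.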
+range_key_lestthanprefix_test() -> + FileName = "lessthanprefix_test", + IndexKeyFun = + fun(I) -> + {{?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, + list_to_binary("19601301|" + ++ io_lib:format("~6..0w", [I]))}, + list_to_binary(io_lib:format("~6..0w", [I]))}, + {1, {active, infinity}, no_lookup, null}} + end, + IndexEntries = lists:map(IndexKeyFun, lists:seq(1, 500)), + OptsSST = + #sst_options{press_method=native, + log_options=leveled_log:get_opts()}, + {ok, P1, {_FK1, _LK1}, _Bloom1} = + sst_new(?TEST_AREA, FileName, 1, IndexEntries, 6000, OptsSST), + + IdxRange1 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, {<<"dob_bin">>, <<"1959">>}, null}, + all, + 16), + IdxRange2 = + sst_getkvrange( + P1, + {?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1960">>}, null}, + {?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1961">>}, null}, + 16), + IdxRange3 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1960">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|000250">>}, null}, + 16), + IdxRange4 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|000251">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1961">>}, null}, + 16), + IdxRange5 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|000250">>}, <<"000251">>}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1961">>}, null}, + 16), + IdxRange6 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|000">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19611301|0002">>}, null}, + 16), + IdxRange7 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|000">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19611301|0001">>}, null}, + 16), + IdxRange8 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|000000">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19611301|000100">>}, null}, + 16), + ?assertMatch(500, length(IdxRange1)), + ?assertMatch(500, length(IdxRange2)), + ?assertMatch(250, length(IdxRange3)), + ?assertMatch(250, length(IdxRange4)), + ?assertMatch(250, length(IdxRange5)), + % No right trim - result count rounds up to slot size + ?assertMatch(256, length(IdxRange6)), + ?assertMatch(128, length(IdxRange7)), + ?assertMatch(128, length(IdxRange8)), + ok = sst_close(P1), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), + + ObjectKeyFun = + fun(I) -> + {{?RIAK_TAG, + {<<"btype">>, <<"bucket">>}, + list_to_binary("19601301|" + ++ io_lib:format("~6..0w", [I])), + null}, + {1, {active, infinity}, {0, 0}, null}} + end, + ObjectEntries = lists:map(ObjectKeyFun, lists:seq(1, 500)), + OptsSST = + #sst_options{press_method=native, + log_options=leveled_log:get_opts()}, + {ok, P2, {_FK2, _LK2}, _Bloom2} = + sst_new(?TEST_AREA, FileName, 1, ObjectEntries, 6000, OptsSST), + + ObjRange1 = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, <<"1959">>, null}, + all, + 16), + ObjRange2 = + sst_getkvrange( + P2, + {?RIAK_TAG, + {<<"btype">>, <<"bucket">>}, + <<"1960">>, null}, + {?RIAK_TAG, + {<<"btype">>, <<"bucket">>}, + <<"1961">>, null}, + 16), + ObjRange3 = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"1960">>, null}, 
+ {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000250">>, null}, + 16), + ObjRange4 = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000251">>, null}, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"1961">>, null}, + 16), + ObjRange6 = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000">>, null}, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19611301|0002">>, null}, + 16), + ObjRange7 = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000">>, null}, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19611301|0001">>, null}, + 16), + ObjRange8 = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000000">>, null}, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19611301|000100">>, null}, + 16), + + ?assertMatch(500, length(ObjRange1)), + ?assertMatch(500, length(ObjRange2)), + ?assertMatch(250, length(ObjRange3)), + ?assertMatch(250, length(ObjRange4)), + % No right trim - result count rounds up to slot size + ?assertMatch(256, length(ObjRange6)), + ?assertMatch(128, length(ObjRange7)), + ?assertMatch(128, length(ObjRange8)), + ok = sst_close(P2), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). + + single_key_test() -> FileName = "single_key_test", LK = leveled_codec:to_ledgerkey(<<"Bucket0">>, <<"Key0">>, ?STD_TAG), diff --git a/test/end_to_end/riak_SUITE.erl b/test/end_to_end/riak_SUITE.erl index c4881fd1..67abd2f9 100644 --- a/test/end_to_end/riak_SUITE.erl +++ b/test/end_to_end/riak_SUITE.erl @@ -9,7 +9,8 @@ handoff/1, dollar_bucket_index/1, dollar_key_index/1, - bigobject_memorycheck/1 + bigobject_memorycheck/1, + summarisable_sstindex/1 ]). all() -> [ @@ -19,7 +20,8 @@ all() -> [ handoff, dollar_bucket_index, dollar_key_index, - bigobject_memorycheck + bigobject_memorycheck, + summarisable_sstindex ]. -define(MAGIC, 53). % riak_kv -> riak_object @@ -243,6 +245,81 @@ basic_riak_tester(Bucket, KeyCount) -> ok = leveled_bookie:book_destroy(Bookie2). 
+summarisable_sstindex(_Config) -> + RootPathA = testutil:reset_filestructure("SummarisableSSTIndex"), + KeyCount = 200000, + IndexGen = fun() -> [] end, + Bucket = {<<"test_type">>, <<"test_bucket">>}, + KeyGen = fun(I) -> list_to_binary(io_lib:format("~10..0w", [I])) end, + ObjListToSort = + lists:map( + fun(I) -> + {leveled_rand:uniform(KeyCount * 10), + testutil:set_object( + Bucket, KeyGen(I), integer_to_binary(I), IndexGen, [])} + end, + lists:seq(1, KeyCount)), + UnsortedList = + lists:map( + fun({I, {O, S}}) -> {I, O, S} end, + lists:keysort(1, ObjListToSort)), + true = KeyCount == length(UnsortedList), + StartOpts1 = [{root_path, RootPathA}, + {max_journalsize, 500000000}, + {max_pencillercachesize, 8000}, + {sync_strategy, testutil:sync_strategy()}], + {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), + testutil:riakload(Bookie1, UnsortedList), + FoldAccT = {fun(_B, K, Acc) -> [K|Acc] end, []}, + KeyRangeCheckFun = + fun(SK, EK) -> + {async, FoldFun} = + leveled_bookie:book_keylist( + Bookie1, + ?RIAK_TAG, + Bucket, + {SK, EK}, + FoldAccT, + undefined), + QueryCount = FoldFun(), + io:format( + "QueryCount ~w against total ~w for range ~p ~p~n", + [length(QueryCount), KeyCount, SK, EK]), + QueryCount + end, + true = KeyCount == length(KeyRangeCheckFun(<<"00">>, <<"02">>)), + true = KeyCount == length(KeyRangeCheckFun(<<"000">>, <<"002">>)), + true = KeyCount == length(KeyRangeCheckFun(<<"0000">>, <<"0002">>)), + true = + (KeyCount - 1) == + length(KeyRangeCheckFun(<<"00000">>, <<"00002">>)), + true = + (KeyCount div 2) == + length(KeyRangeCheckFun(<<"00001">>, <<"00002">>)), + true = + 1 == length(KeyRangeCheckFun(<<"00002">>, <<"000021">>)), + true = + ((KeyCount div 10) - 1) == + length(KeyRangeCheckFun(<<"000000">>, <<"000002">>)), + true = + (KeyCount div 20) == + length(KeyRangeCheckFun(<<"000001">>, <<"000002">>)), + true = + ((KeyCount div 100) - 1) == + length(KeyRangeCheckFun(<<"0000000">>, <<"0000002">>)), + + lists:foreach( + fun(I) -> + StartKey = KeyGen(I), + EndKey = KeyGen(I + 200 - 1), + true = 200 == length(KeyRangeCheckFun(StartKey, EndKey)) + end, + lists:map( + fun(_I) -> leveled_rand:uniform(KeyCount - 200) end, + lists:seq(1, 100))), + + ok = leveled_bookie:book_destroy(Bookie1). 
+ fetchclocks_modifiedbetween(_Config) -> RootPathA = testutil:reset_filestructure("fetchClockA"), From a725fec205470cc682702ff0377ed1223f7657fc Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 16 Nov 2022 01:11:22 +0000 Subject: [PATCH 11/37] Tidy-up - Remove penciller logs which are no longer called - Get pclerk to only wait MIN_TIMEOUT after doing work, in case there is a backlog - Remove update_levelzero_cache function as it is unique to handle_call of push_mem, and simple enough to be inline - Alight testutil slow offer with standard slow offer used --- src/leveled_log.erl | 29 +-------- src/leveled_pclerk.erl | 2 +- src/leveled_penciller.erl | 112 ++++++++++++++--------------------- test/end_to_end/testutil.erl | 2 +- 4 files changed, 49 insertions(+), 96 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 438cbf7d..da47f7c5 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -95,8 +95,6 @@ {debug, "Remaining ledger snapshots are ~w"}}, {"P0005", {debug, "Delete confirmed as file ~s is removed from Manifest"}}, - {"P0006", - {info, "Orphaned reply after timeout on L0 file write ~s"}}, {"P0007", {debug, "Sent release message for cloned Penciller following close for " ++ "reason ~w"}}, @@ -119,28 +117,14 @@ {"P0017", {info, "No L0 file found"}}, {"P0018", - {info, "Response to push_mem of ~w with " - ++ "L0 pending ~w and merge backlog ~w"}}, + {info, + "Response to push_mem of returned with cache_size=~w " + ++ "L0_pending=~w merge_backlog=~w cachelines_full=~w"}}, {"P0019", {info, "Rolling level zero to filename ~s at ledger sqn ~w"}}, - {"P0021", - {info, "Allocation of work blocked as L0 pending"}}, - {"P0022", - {info, "Manifest at Level ~w"}}, - {"P0023", - {info, "Manifest entry of startkey ~s ~s ~s endkey ~s ~s ~s " - ++ "filename=~s~n"}}, {"P0024", {info, "Outstanding compaction work items of ~w with backlog status " ++ "of ~w"}}, - {"P0025", - {info, "Merge to sqn ~w from Level ~w completed"}}, - {"P0026", - {info, "Merge has been commmitted at sequence number ~w"}}, - {"P0027", - {info, "Rename of manifest from ~s ~w to ~s ~w"}}, - {"P0028", - {debug, "Adding cleared file ~s to deletion list"}}, {"P0029", {info, "L0 completion confirmed and will transition to not pending"}}, {"P0030", @@ -162,13 +146,8 @@ {"P0033", {error, "Corrupted manifest file at path ~s to be ignored " ++ "due to error ~w"}}, - {"P0034", - {warn, "Snapshot with pid ~w timed out and so deletion will " - ++ "continue regardless"}}, {"P0035", {info, "Startup with Manifest SQN of ~w"}}, - {"P0036", - {info, "Garbage collection on manifest removes key for filename ~s"}}, {"P0037", {debug, "Merging of penciller L0 tree from size ~w complete"}}, {"P0038", @@ -181,8 +160,6 @@ {info, "Archiving filename ~s as unused at startup"}}, {"P0041", {info, "Penciller manifest switched from SQN ~w to ~w"}}, - {"P0042", - {warn, "Cache full so attempting roll memory with l0_size=~w"}}, {"PC001", {info, "Penciller's clerk ~w started with owner ~w"}}, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index c8f494d8..b26041c4 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -122,7 +122,7 @@ handle_cast({push_work, Work}, State) -> {ManifestSQN, Deletions} = handle_work(Work, State), PDs = dict:store(ManifestSQN, Deletions, State#state.pending_deletions), leveled_log:log("PC022", [ManifestSQN]), - {noreply, State#state{pending_deletions = PDs}, ?MAX_TIMEOUT}; + {noreply, State#state{pending_deletions = PDs}, ?MIN_TIMEOUT}; handle_cast({prompt_deletions, 
ManifestSQN}, State) -> {Deletions, UpdD} = return_deletions(ManifestSQN, State#state.pending_deletions), diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 48db5e95..6bcb7133 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -674,55 +674,60 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, % expectation that PUTs should be slowed. Also if the cache has reached % the maximum number of lines (by default after 31 pushes from the bookie) % - % 2. If (1) doe snot apply, the bookie's cache will be added to the + % 2. If (1) does not apply, the bookie's cache will be added to the % penciller's cache. SW = os:timestamp(), + L0Pending = State#state.levelzero_pending, WorkBacklog = State#state.work_backlog, + CacheAlreadyFull = leveled_pmem:cache_full(State#state.levelzero_cache), + L0Size = State#state.levelzero_size, + + % The clerk is prompted into action as there may be a L0 write required ok = leveled_pclerk:clerk_prompt(State#state.clerk), - case L0Pending or WorkBacklog of + + case L0Pending or WorkBacklog or CacheAlreadyFull of true -> - % Cannot update the cache, or roll the memory so reply as such - % immediately + % Cannot update the cache, or roll the memory so reply `returned` + % The Bookie must now retain the lesger cache and try to push the + % updated cache at a later time leveled_log:log( "P0018", - [returned, L0Pending, WorkBacklog]), + [L0Size, L0Pending, WorkBacklog, CacheAlreadyFull]), {reply, returned, State}; false -> - CacheAlreadyFull = - leveled_pmem:cache_full(State#state.levelzero_cache), - case CacheAlreadyFull of - true -> - % Don't update the cache on State if cache has reached - % the maximum number of lines, otherwise we can still - % add to the cache (but it may be over-sized and - % require rolling to file) - leveled_log:log("P0042", [State#state.levelzero_size]), - {reply, returned, State}; - false -> - % Return ok as cache has been updated on State and - % the Bookie should clear its cache which is now - % received - {UpdL0Cache, NewL0Size, UpdL0Index, UpdMaxSQN} = - update_levelzero_cache( - State#state.levelzero_size, - {LedgerTable, PushedIdx, MinSQN, MaxSQN}, - State#state.ledger_sqn, - State#state.levelzero_cache, - State#state.levelzero_index), - leveled_log:log_randomtimer( - "P0031", - [NewL0Size, true, true, MinSQN, MaxSQN], - SW, - 0.1), - {reply, - ok, - State#state{ - levelzero_cache = UpdL0Cache, - levelzero_size = NewL0Size, - levelzero_index = UpdL0Index, - ledger_sqn = UpdMaxSQN}} - end + % Return ok as cache has been updated on State and the Bookie + % should clear its ledger cache which is now with the Penciller + PushedTree = + case is_tuple(LedgerTable) of + true -> + LedgerTable; + false -> + leveled_tree:from_orderedset(LedgerTable, ?CACHE_TYPE) + end, + {UpdMaxSQN, NewL0Size, UpdL0Cache} = + leveled_pmem:add_to_cache( + L0Size, + {PushedTree, MinSQN, MaxSQN}, + State#state.ledger_sqn, + State#state.levelzero_cache), + UpdL0Index = + leveled_pmem:add_to_index( + PushedIdx, + State#state.levelzero_index, + length(State#state.levelzero_cache) + 1), + leveled_log:log_randomtimer( + "P0031", + [NewL0Size, true, true, MinSQN, MaxSQN], + SW, + 0.1), + {reply, + ok, + State#state{ + levelzero_cache = UpdL0Cache, + levelzero_size = NewL0Size, + levelzero_index = UpdL0Index, + ledger_sqn = UpdMaxSQN}} end; handle_call({fetch, Key, Hash, UseL0Index}, _From, State) -> L0Idx = @@ -1371,35 +1376,6 @@ archive_files(RootPath, UsedFileList) -> ok. 
--spec update_levelzero_cache( - non_neg_integer(), bookies_memory(), non_neg_integer(), - levelzero_cache(), leveled_pmem:index_array()) - -> - {levelzero_cache(), pos_integer(), - leveled_pmem:index_array(), pos_integer()}. -%% @doc -%% Update the in-memory cache of recent changes for the penciller. This is -%% the level zero at the top of the tree. -update_levelzero_cache( - L0Size, - {LedgerTable, PushedIdx, MinSQN, MaxSQN}, - LedgerSQN, L0Cache, L0Index) -> - PushedTree = - case is_tuple(LedgerTable) of - true -> - LedgerTable; - false -> - leveled_tree:from_orderedset(LedgerTable, ?CACHE_TYPE) - end, - {UpdMaxSQN, NewL0Size, UpdL0Cache} = - leveled_pmem:add_to_cache( - L0Size, {PushedTree, MinSQN, MaxSQN}, LedgerSQN, L0Cache), - UpdL0Index = - leveled_pmem:add_to_index( - PushedIdx, L0Index, length(L0Cache) + 1), - {UpdL0Cache, NewL0Size, UpdL0Index, UpdMaxSQN}. - - -spec maybe_cache_too_big( pos_integer(), pos_integer(), boolean()) -> boolean(). %% @doc diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl index 0fe9a66f..0f57d2f9 100644 --- a/test/end_to_end/testutil.erl +++ b/test/end_to_end/testutil.erl @@ -64,7 +64,7 @@ compact_and_wait/1]). -define(RETURN_TERMS, {true, undefined}). --define(SLOWOFFER_DELAY, 5). +-define(SLOWOFFER_DELAY, 10). -define(V1_VERS, 1). -define(MAGIC, 53). % riak_kv -> riak_object -define(MD_VTAG, <<"X-Riak-VTag">>). From ee93761c07890a306ed4469489668bc158a6bcc1 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 16 Nov 2022 19:29:54 +0000 Subject: [PATCH 12/37] Tidy-up Remove pre-otp20 references. Reinstate the check that the starting pid is still active, this was added to tidy up shutdown. Resolve failure to run on otp20 due to `-if` sttaement --- rebar.config | 9 +++------ src/leveled_cdb.erl | 12 +----------- src/leveled_math.erl | 38 -------------------------------------- src/leveled_pmanifest.erl | 2 ++ src/leveled_rand.erl | 30 ------------------------------ src/leveled_sst.erl | 24 +++++++++++++++++++----- src/leveled_tictac.erl | 2 +- 7 files changed, 26 insertions(+), 91 deletions(-) delete mode 100644 src/leveled_math.erl diff --git a/rebar.config b/rebar.config index 4361e273..b56e6b1c 100644 --- a/rebar.config +++ b/rebar.config @@ -1,11 +1,8 @@ +{minimum_otp_vsn, "20.0"}. + {erl_opts, [warnings_as_errors, {platform_define, "^2[0-4]{1}", fsm_deprecated}, - {platform_define, "^1[7-8]{1}", old_rand}, - {platform_define, "^17", no_log2}, - {platform_define, "^R", no_sync}, - {platform_define, "^R", old_rand}, - {platform_define, "^R", no_log2}, - {platform_define, "^R", slow_test}]}. + {platform_define, "^2[2-4]{1}", if_check}]}. {xref_checks, [undefined_function_calls,undefined_functions]}. diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 19bfbe79..f7ff61f1 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -62,16 +62,6 @@ {gen_fsm, reply, 2}]}). -endif. --ifdef(slow_test). --define(SPECIAL_DELFUN, fun(_F) -> ok end). - % There are problems with the pendingdelete_test/0 in riak make test - % The deletion of the file causes the process to crash and the test to - % fail, but thisis not an issue tetsing outside of riak make test. - % Workaround this problem by not performing the delete when running unit - % tests in R16 --else. --define(SPECIAL_DELFUN, fun(F) -> file:delete(F) end). --endif. 
-export([init/1, handle_sync_event/4, @@ -2814,7 +2804,7 @@ pendingdelete_test() -> {ok, P2} = cdb_open_reader(F2, #cdb_options{binary_mode=false}), ?assertMatch({"Key1", "Value1"}, cdb_get(P2, "Key1")), ?assertMatch({"Key100", "Value100"}, cdb_get(P2, "Key100")), - ?SPECIAL_DELFUN(F2), + ok = file:delete(F2), ok = cdb_deletepending(P2), % No issues destroying even though the file has already been removed ok = cdb_destroy(P2). diff --git a/src/leveled_math.erl b/src/leveled_math.erl deleted file mode 100644 index f88ed4f5..00000000 --- a/src/leveled_math.erl +++ /dev/null @@ -1,38 +0,0 @@ -%% Handle missing log2 prior to OTP18 - --module(leveled_math). - -%% API --export([ - log2/1 - ]). - - --include_lib("eunit/include/eunit.hrl"). - -%%%=================================================================== -%%% Use log2 -%%%=================================================================== --ifndef(no_log2). - -log2(X) -> - math:log2(X). - --else. -%%%=================================================================== -%%% Old (r18) random style functions -%%%=================================================================== - -log2(X) -> - math:log(X) / 0.6931471805599453. - --endif. - - --ifdef(TEST). - -log2_test() -> - ?assertMatch(8, round(log2(256))), - ?assertMatch(16, round(log2(65536))). - --endif. diff --git a/src/leveled_pmanifest.erl b/src/leveled_pmanifest.erl index 6c6336e2..d8703b23 100644 --- a/src/leveled_pmanifest.erl +++ b/src/leveled_pmanifest.erl @@ -74,9 +74,11 @@ % At o(10) trillion keys behaviour may become increasingly % difficult to predict. +-ifdef(if_check). -if(length(?LEVEL_SCALEFACTOR) /= ?MAX_LEVELS). -error("length ?LEVEL_SCALEFACTOR differs from ?MAX_LEVELS"). -endif. +-endif. -define(TREE_TYPE, idxt). -define(TREE_WIDTH, 8). diff --git a/src/leveled_rand.erl b/src/leveled_rand.erl index 4aabe5ee..fedaf20b 100644 --- a/src/leveled_rand.erl +++ b/src/leveled_rand.erl @@ -11,13 +11,9 @@ rand_bytes/1 ]). - --include_lib("eunit/include/eunit.hrl"). - %%%=================================================================== %%% New (r19+) rand style functions %%%=================================================================== --ifndef(old_rand). uniform() -> rand:uniform(). @@ -30,29 +26,3 @@ seed() -> rand_bytes(Size) -> crypto:strong_rand_bytes(Size). --else. -%%%=================================================================== -%%% Old (r18) random style functions -%%%=================================================================== -uniform() -> - random:uniform(). - -uniform(N) -> - random:uniform(N). - -seed() -> - SW = os:timestamp(), - random:seed(erlang:phash2(self()), element(2, SW), element(3, SW)). - -rand_bytes(Size) -> - crypto:rand_bytes(Size). - --endif. - - --ifdef(TEST). - -rand_test() -> - ?assertMatch(true, uniform() < 1). - --endif. diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index d78923b3..1e40c5f0 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -598,6 +598,7 @@ starting({sst_new, leveled_log:log_timer("SST08", [ActualFilename, Level, Summary#summary.max_sqn], SW), + erlang:send_after(?STARTUP_TIMEOUT, self(), orphan_status), {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}, Bloom}, reader, @@ -923,8 +924,24 @@ handle_event({update_blockindex_cache, BIC}, StateName, State) -> State#state{blockindex_cache = BlockIndexCache, high_modified_date = HighModDate}}. -handle_info(_Msg, StateName, State) -> - {next_state, StateName, State}. 
+handle_info(orphan_status, delete_pending, State) -> + % This message may have interrupted the delete timeout, so timeout straight + % away + {next_state, delete_pending, State, 0}; +handle_info(orphan_status, StateName, State) -> + % The SST file will be started by a clerk, but the clerk may be shut down + % prior to the manifest being updated about the existence of this SST file. + % If there is no activity after startup, check the clerk is still alive and + % otherwise assume this file is part of a closed store and shut down. + % If the clerk has crashed, the penciller will restart at the latest + % manifest, and so this file sill be restarted if and only if it is still + % part of the store + case is_process_alive(State#state.starting_pid) of + true -> + {next_state, StateName, State}; + false -> + {stop, normal, State} + end. terminate(normal, delete_pending, _State) -> ok; @@ -4130,9 +4147,6 @@ nonsense_coverage_test() -> ?assertMatch( {ok, reader, #state{}}, code_change(nonsense, reader, #state{}, nonsense)), - ?assertMatch( - {next_state, reader, #state{}}, - handle_info(nonsense, reader, #state{})), ?assertMatch( {reply, undefined, reader, #state{}}, handle_sync_event("hello", self(), reader, #state{})), diff --git a/src/leveled_tictac.erl b/src/leveled_tictac.erl index 4d3c2565..43674c89 100644 --- a/src/leveled_tictac.erl +++ b/src/leveled_tictac.erl @@ -430,7 +430,7 @@ adjust_segmentmatch_list(SegmentList, CompareSize, StoreSize) -> StoreSizeI = get_size(StoreSize), if CompareSizeI =< StoreSizeI -> ExpItems = StoreSizeI div CompareSizeI - 1, - ShiftFactor = round(leveled_math:log2(CompareSizeI * ?L2_CHUNKSIZE)), + ShiftFactor = round(math:log2(CompareSizeI * ?L2_CHUNKSIZE)), ExpList = lists:map(fun(X) -> X bsl ShiftFactor end, lists:seq(1, ExpItems)), UpdSegmentList = From aa353fac42cd214bfa373ed7d598cfbc836a12c8 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 16 Nov 2022 23:25:03 +0000 Subject: [PATCH 13/37] Tidy up Using null rather then {null, Key} is potentially clearer as it is not a concern what they Key is in this case, and removes a comparison step from the leveled_codec:endkey_passed/2 function. There were issues with coverage in eunit tests as the leveled_pclerk shut down. This prompted a general tidy of leveled_pclerk (remove passing of LoopState into internal functions, and add dialyzer specs. --- src/leveled_codec.erl | 4 +-- src/leveled_pclerk.erl | 58 +++++++++++++++++++++++++-------------- src/leveled_pmanifest.erl | 3 +- src/leveled_sst.erl | 2 +- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 17c4c9ea..3faec9cd 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -81,7 +81,7 @@ -type ledger_key() :: {tag(), any(), any(), any()}|all. -type slimmed_key() :: - {binary()|null, binary()|null}|binary()|null|all. + {binary(), binary()|null}|binary()|null|all. -type ledger_value() :: ledger_value_v1()|ledger_value_v2(). -type ledger_value_v1() :: @@ -377,8 +377,6 @@ endkey_passed({K1, null}, {K1, _}) -> % See leveled_sst SlotIndex implementation. Here keys may be slimmed to % single binaries or two element tuples before forming the index. 
false; -endkey_passed({null, _QK1}, {_RK1, _RK2}) -> - false; endkey_passed(null, _) -> false; endkey_passed(QueryEndKey, RangeEndKey) -> diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index b26041c4..2eca1f2a 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -53,32 +53,39 @@ -record(state, {owner :: pid() | undefined, root_path :: string() | undefined, - pending_deletions = dict:new(), % OTP 16 does not like type - sst_options :: #sst_options{} + pending_deletions = dict:new() :: dict:dict(), + sst_options :: sst_options() }). --type manifest_entry() :: #manifest_entry{}. +-type sst_options() :: #sst_options{}. %%%============================================================================ %%% API %%%============================================================================ -clerk_new(Owner, Manifest, OptsSST) -> +-spec clerk_new( + pid(), string(), sst_options()) -> {ok, pid()}. +clerk_new(Owner, RootPath, OptsSST) -> {ok, Pid} = gen_server:start_link(?MODULE, [leveled_log:get_opts(), {sst_options, OptsSST}], []), - ok = gen_server:call(Pid, {load, Owner, Manifest}, infinity), + ok = gen_server:call(Pid, {load, Owner, RootPath}, infinity), leveled_log:log("PC001", [Pid, Owner]), {ok, Pid}. +-spec clerk_prompt(pid()) -> ok. clerk_prompt(Pid) -> gen_server:cast(Pid, prompt). +-spec clerk_promptdeletions(pid(), pos_integer()) -> ok. clerk_promptdeletions(Pid, ManifestSQN) -> gen_server:cast(Pid, {prompt_deletions, ManifestSQN}). +-spec clerk_push( + pid(), {leveled_pmanifest:lsm_level(), leveled_pmanifest:manifest()}) -> + ok. clerk_push(Pid, Work) -> gen_server:cast(Pid, {push_work, Work}). @@ -100,6 +107,7 @@ clerk_addlogs(Pid, ForcedLogs) -> clerk_removelogs(Pid, ForcedLogs) -> gen_server:cast(Pid, {remove_logs, ForcedLogs}). +-spec clerk_close(pid()) -> ok. clerk_close(Pid) -> gen_server:call(Pid, close, 20000). @@ -119,7 +127,10 @@ handle_call(close, _From, State) -> handle_cast(prompt, State) -> handle_info(timeout, State); handle_cast({push_work, Work}, State) -> - {ManifestSQN, Deletions} = handle_work(Work, State), + {ManifestSQN, Deletions} = + handle_work( + Work, + State#state.root_path, State#state.sst_options, State#state.owner), PDs = dict:store(ManifestSQN, Deletions, State#state.pending_deletions), leveled_log:log("PC022", [ManifestSQN]), {noreply, State#state{pending_deletions = PDs}, ?MIN_TIMEOUT}; @@ -145,7 +156,7 @@ handle_cast({remove_logs, ForcedLogs}, State) -> {noreply, State#state{sst_options = SSTopts0}}. handle_info(timeout, State) -> - request_work(State), + ok = leveled_penciller:pcl_workforclerk(State#state.owner), % When handling work, the clerk can collect a large number of binary % references, so proactively GC this process before receiving any future % work. In under pressure clusters, clerks with large binary memory @@ -164,25 +175,29 @@ code_change(_OldVsn, State, _Extra) -> %%% Internal functions %%%============================================================================ -request_work(State) -> - ok = leveled_penciller:pcl_workforclerk(State#state.owner). - -handle_work({SrcLevel, Manifest}, State) -> - {UpdManifest, EntriesToDelete} = merge(SrcLevel, - Manifest, - State#state.root_path, - State#state.sst_options), +-spec handle_work( + {leveled_pmanifest:lsm_level(), leveled_pmanifest:manifest()}, + string(), sst_options(), pid()) -> + {leveled_pmanifest:pos_integer(), + list(leveled_pmanifest:manifest_entry())}. 
+handle_work( + {SrcLevel, Manifest}, RootPath, SSTOpts, Owner) -> + {UpdManifest, EntriesToDelete} = + merge(SrcLevel, Manifest, RootPath, SSTOpts), leveled_log:log("PC007", []), SWMC = os:timestamp(), - ok = leveled_penciller:pcl_manifestchange(State#state.owner, - UpdManifest), + ok = leveled_penciller:pcl_manifestchange(Owner, UpdManifest), leveled_log:log_timer("PC017", [], SWMC), SWSM = os:timestamp(), - ok = leveled_pmanifest:save_manifest(UpdManifest, - State#state.root_path), + ok = leveled_pmanifest:save_manifest(UpdManifest, RootPath), leveled_log:log_timer("PC018", [], SWSM), {leveled_pmanifest:get_manifest_sqn(UpdManifest), EntriesToDelete}. +-spec merge( + leveled_pmanifes:lsm_level(), leveled_pmanifest:manifest(), + string(), sst_options()) -> + {leveled_pmanifest:manifest(), + list(leveled_pmanifest:manifest_entry())}. merge(SrcLevel, Manifest, RootPath, OptsSST) -> case leveled_pmanifest:report_manifest_level(Manifest, SrcLevel + 1) of {0, 0, undefined} -> @@ -224,6 +239,7 @@ merge(SrcLevel, Manifest, RootPath, OptsSST) -> SST_RP, NewSQN, OptsSST) end. +-spec notify_deletions(list(leveled_pmanifest:manifest_entry()), pid()) -> ok. notify_deletions([], _Penciller) -> ok; notify_deletions([Head|Tail], Penciller) -> @@ -301,7 +317,9 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, OptsSST, Additions) -> Additions ++ [Entry]) end. --spec grooming_scorer(list(manifest_entry())) -> manifest_entry(). +-spec grooming_scorer( + list(leveled_pmanifest:manifest_entry())) + -> leveled_pmanifest:manifest_entry(). grooming_scorer([ME | MEs]) -> InitTombCount = leveled_sst:sst_gettombcount(ME#manifest_entry.owner), {HighestTC, BestME} = grooming_scorer(InitTombCount, ME, MEs), diff --git a/src/leveled_pmanifest.erl b/src/leveled_pmanifest.erl index d8703b23..b3827622 100644 --- a/src/leveled_pmanifest.erl +++ b/src/leveled_pmanifest.erl @@ -107,10 +107,11 @@ -type manifest() :: #manifest{}. -type manifest_entry() :: #manifest_entry{}. -type manifest_owner() :: pid()|list(). +-type lsm_level() :: 0..7. -type selector_strategy() :: random|{grooming, fun((list(manifest_entry())) -> manifest_entry())}. --export_type([manifest/0, manifest_entry/0, manifest_owner/0]). +-export_type([manifest/0, manifest_entry/0, manifest_owner/0, lsm_level/0]). %%%============================================================================ %%% API diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 1e40c5f0..2e59f6e8 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1781,7 +1781,7 @@ get_filterfun( fun({_Tag, _Bucket, {_Field, Term}, Key}) -> case Term of T when byte_size(T) =< N -> - {null, Key}; + null; <<_:N/binary, Suffix/binary>> -> {Suffix, Key} end From 7d97e980638e0ab1a2f180040bee9704e4d779ba Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 16 Nov 2022 23:42:09 +0000 Subject: [PATCH 14/37] Remove R16 relic --- src/leveled_cdb.erl | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index f7ff61f1..4933822b 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -947,27 +947,10 @@ close_pendingdelete(Handle, Filename, WasteFP) -> end. -spec set_writeops(sync|riak_sync|none) -> {list(), sync|riak_sync|none}. -%% Assumption is that sync should be used - it is a transaction log. -%% -%% However this flag is not supported in OTP 16. Bitcask appears to pass an -%% o_sync flag, but this isn't supported either (maybe it works with the -%% bitcask nif fileops). 
-%% -%% To get round this will try and datasync on each PUT with riak_sync --ifdef(no_sync). - -set_writeops(SyncStrategy) -> - case SyncStrategy of - sync -> - {?WRITE_OPS, riak_sync}; - riak_sync -> - {?WRITE_OPS, riak_sync}; - none -> - {?WRITE_OPS, none} - end. - --else. - +%% @doc +%% Sync should be used - it is a transaction log - in single node +%% implementations. `riak_sync` is a legacy of earlier OTP versions when +%% passing the sync option was not supported set_writeops(SyncStrategy) -> case SyncStrategy of sync -> @@ -978,8 +961,6 @@ set_writeops(SyncStrategy) -> {?WRITE_OPS, none} end. --endif. - -spec open_active_file(list()) -> {integer(), ets:tid(), any()}. %% @doc %% Open an active file - one for which it is assumed the hash tables have not From 499b0cbaff46e1923a2cc7f237044a578da4498e Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 17 Nov 2022 12:33:52 +0000 Subject: [PATCH 15/37] Further testing another issue The StartKey must always be less than or equal to the prefix when the first N characters are stripped, but this is not true of the EndKey (for the query) which does not have to be between the FirstKey and the LastKey. If the EndKey query does not match it must be greater than the Prefix (as otherwise it would not have been greater than the FirstKey - so set to null. --- src/leveled_sst.erl | 20 ++++-- test/end_to_end/riak_SUITE.erl | 122 +++++++++++++++++++++++++++++++-- 2 files changed, 134 insertions(+), 8 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 2e59f6e8..3eca95ec 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1778,12 +1778,18 @@ get_filterfun( {Term, Key} end; N -> + <> = FT, fun({_Tag, _Bucket, {_Field, Term}, Key}) -> case Term of T when byte_size(T) =< N -> null; - <<_:N/binary, Suffix/binary>> -> - {Suffix, Key} + <> -> + case QueryKey of + Prefix -> + {Suffix, Key}; + _ -> + null + end end end end; @@ -1794,14 +1800,20 @@ get_filterfun( 0 -> fun({_Tag, _Bucket, Key, null}) -> Key end; N -> + <> = FK, fun({_Tag, _Bucket, Key, null}) -> case Key of null -> null; K when byte_size(K) =< N -> null; - <<_:N/binary, Suffix/binary>> -> - Suffix + <> -> + case QueryKey of + Prefix -> + Suffix; + _ -> + null + end end end end; diff --git a/test/end_to_end/riak_SUITE.erl b/test/end_to_end/riak_SUITE.erl index 67abd2f9..6d416a57 100644 --- a/test/end_to_end/riak_SUITE.erl +++ b/test/end_to_end/riak_SUITE.erl @@ -281,18 +281,25 @@ summarisable_sstindex(_Config) -> {SK, EK}, FoldAccT, undefined), - QueryCount = FoldFun(), + QueryList = FoldFun(), io:format( "QueryCount ~w against total ~w for range ~p ~p~n", - [length(QueryCount), KeyCount, SK, EK]), - QueryCount + [length(QueryList), KeyCount, SK, EK]), + QueryList end, - true = KeyCount == length(KeyRangeCheckFun(<<"00">>, <<"02">>)), + + true = KeyCount == length(KeyRangeCheckFun(<<"00">>, <<"02">>)), true = KeyCount == length(KeyRangeCheckFun(<<"000">>, <<"002">>)), true = KeyCount == length(KeyRangeCheckFun(<<"0000">>, <<"0002">>)), true = (KeyCount - 1) == length(KeyRangeCheckFun(<<"00000">>, <<"00002">>)), + true = + (KeyCount - 1) == + length(KeyRangeCheckFun(<<"00000">>, <<"000020">>)), + true = + (KeyCount - 1) == + length(KeyRangeCheckFun(<<"00000">>, <<"0000200">>)), true = (KeyCount div 2) == length(KeyRangeCheckFun(<<"00001">>, <<"00002">>)), @@ -318,6 +325,113 @@ summarisable_sstindex(_Config) -> fun(_I) -> leveled_rand:uniform(KeyCount - 200) end, lists:seq(1, 100))), + IdxObjKeyCount = 50000, + TermGen = + fun(I, C) -> + list_to_binary( + 
lists:flatten( + io_lib:format("~10..0w", [I]) ++ integer_to_list(C))) + end, + SequentialIndexGen = + fun(I) -> + fun() -> + lists:map( + fun(C) -> + {add, <<"indexf_bin">>,TermGen(I, C)} + end, + lists:seq(1, 8)) + end + end, + IdxObjListToSort = + lists:map( + fun(I) -> + {leveled_rand:uniform(KeyCount * 10), + testutil:set_object( + Bucket, + KeyGen(I), + integer_to_binary(I - KeyCount), + SequentialIndexGen(I - KeyCount), + [])} + end, + lists:seq(KeyCount + 1, KeyCount + IdxObjKeyCount)), + UnsortedIdxObjList = + lists:map( + fun({I, {O, S}}) -> {I, O, S} end, + lists:keysort(1, IdxObjListToSort)), + testutil:riakload(Bookie1, UnsortedIdxObjList), + IdxCount = IdxObjKeyCount * 8, + + IdxQueryFun = + fun(StartTerm, EndTerm) -> + {async, FoldFun} = + leveled_bookie:book_indexfold( + Bookie1, {Bucket, <<>>}, FoldAccT, + {<<"indexf_bin">>, StartTerm, EndTerm}, + {true, undefined}), + IdxQueryList = FoldFun(), + io:format( + "IdxQueryCount ~w for range ~p ~p~n", + [length(IdxQueryList), StartTerm, EndTerm]), + IdxQueryList + end, + true = IdxCount == length(IdxQueryFun(<<"00">>, <<"05">>)), + true = IdxCount == length(IdxQueryFun(<<"000">>, <<"005">>)), + true = IdxCount == length(IdxQueryFun(<<"0000">>, <<"0005">>)), + true = IdxCount == length(IdxQueryFun(<<"00000">>, <<"00005">>)), + true = + (IdxCount - 8) == + length(IdxQueryFun(<<"000000">>, <<"000005">>)), + true = + (IdxCount - 8) == + length(IdxQueryFun(<<"000000">>, <<"0000050">>)), + true = + (IdxCount - 8) == + length(IdxQueryFun(<<"000000">>, <<"00000500">>)), + true = 8 == length(IdxQueryFun(<<"000005">>, <<"0000051">>)), + + lists:foreach( + fun(I) -> + StartTerm = TermGen(I, 0), + EndTerm = TermGen(I + 20, 9), + true = 168 == length(IdxQueryFun(StartTerm, EndTerm)) + end, + lists:map( + fun(_I) -> + leveled_rand:uniform(IdxObjKeyCount - 20) + end, + lists:seq(1, 100))), + lists:foreach( + fun(I) -> + StartTerm = TermGen(I, 0), + EndTerm = TermGen(I + 10, 9), + true = 88 == length(IdxQueryFun(StartTerm, EndTerm)) + end, + lists:map( + fun(_I) -> + leveled_rand:uniform(IdxObjKeyCount - 10) + end, + lists:seq(1, 100))), + + io:format("Redo object count checks:~n"), + NewKeyCount = KeyCount + IdxObjKeyCount, + true = NewKeyCount == length(KeyRangeCheckFun(<<"00">>, <<"025">>)), + true = NewKeyCount == length(KeyRangeCheckFun(<<"000">>, <<"0025">>)), + true = NewKeyCount == length(KeyRangeCheckFun(<<"0000">>, <<"00025">>)), + true = + (NewKeyCount - 1) == + length(KeyRangeCheckFun(<<"00000">>, <<"000025">>)), + true = 1 == length(KeyRangeCheckFun(<<"000025">>, <<"0000251">>)), + + lists:foreach( + fun(I) -> + StartKey = KeyGen(I), + EndKey = KeyGen(I + 200 - 1), + true = 200 == length(KeyRangeCheckFun(StartKey, EndKey)) + end, + lists:map( + fun(_I) -> leveled_rand:uniform(KeyCount - 200) end, + lists:seq(1, 100))), + ok = leveled_bookie:book_destroy(Bookie1). From 0f5db75bb6f5d972354c26b33b0c372cc2963070 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 17 Nov 2022 13:42:47 +0000 Subject: [PATCH 16/37] Fix unit test Unit test had a typo - and result interpretation had a misunderstanding. 
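With the end terms corrected to the <<"19601301|">> prefix, the expected counts follow directly from binary ordering rather than from any rounding up to slot size: the end term <<"19601301|0002">> admits keys 1 to 199, <<"19601301|0001">> admits keys 1 to 99, and the full-width <<"19601301|000100">> admits keys 1 to 100. The ordering assumption can be checked in isolation:

    ordering_check() ->
        true = <<"19601301|000199">> < <<"19601301|0002">>,
        true = <<"19601301|0002">> < <<"19601301|000200">>,
        ok.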
--- src/leveled_sst.erl | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 3eca95ec..560a93f8 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -4463,7 +4463,7 @@ range_key_lestthanprefix_test() -> {?IDX_TAG, {<<"btype">>, <<"bucket">>}, {<<"dob_bin">>, <<"19601301|000">>}, null}, {?IDX_TAG, {<<"btype">>, <<"bucket">>}, - {<<"dob_bin">>, <<"19611301|0002">>}, null}, + {<<"dob_bin">>, <<"19601301|0002">>}, null}, 16), IdxRange7 = sst_getkvrange( @@ -4471,7 +4471,7 @@ range_key_lestthanprefix_test() -> {?IDX_TAG, {<<"btype">>, <<"bucket">>}, {<<"dob_bin">>, <<"19601301|000">>}, null}, {?IDX_TAG, {<<"btype">>, <<"bucket">>}, - {<<"dob_bin">>, <<"19611301|0001">>}, null}, + {<<"dob_bin">>, <<"19601301|0001">>}, null}, 16), IdxRange8 = sst_getkvrange( @@ -4479,17 +4479,16 @@ range_key_lestthanprefix_test() -> {?IDX_TAG, {<<"btype">>, <<"bucket">>}, {<<"dob_bin">>, <<"19601301|000000">>}, null}, {?IDX_TAG, {<<"btype">>, <<"bucket">>}, - {<<"dob_bin">>, <<"19611301|000100">>}, null}, + {<<"dob_bin">>, <<"19601301|000100">>}, null}, 16), ?assertMatch(500, length(IdxRange1)), ?assertMatch(500, length(IdxRange2)), ?assertMatch(250, length(IdxRange3)), ?assertMatch(250, length(IdxRange4)), ?assertMatch(250, length(IdxRange5)), - % No right trim - result count rounds up to slot size - ?assertMatch(256, length(IdxRange6)), - ?assertMatch(128, length(IdxRange7)), - ?assertMatch(128, length(IdxRange8)), + ?assertMatch(199, length(IdxRange6)), + ?assertMatch(99, length(IdxRange7)), + ?assertMatch(100, length(IdxRange8)), ok = sst_close(P1), ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), @@ -4547,7 +4546,7 @@ range_key_lestthanprefix_test() -> {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, <<"19601301|000">>, null}, {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, - <<"19611301|0002">>, null}, + <<"19601301|0002">>, null}, 16), ObjRange7 = sst_getkvrange( @@ -4555,7 +4554,7 @@ range_key_lestthanprefix_test() -> {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, <<"19601301|000">>, null}, {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, - <<"19611301|0001">>, null}, + <<"19601301|0001">>, null}, 16), ObjRange8 = sst_getkvrange( @@ -4563,17 +4562,16 @@ range_key_lestthanprefix_test() -> {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, <<"19601301|000000">>, null}, {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, - <<"19611301|000100">>, null}, + <<"19601301|000100">>, null}, 16), ?assertMatch(500, length(ObjRange1)), ?assertMatch(500, length(ObjRange2)), ?assertMatch(250, length(ObjRange3)), ?assertMatch(250, length(ObjRange4)), - % No right trim - result count rounds up to slot size - ?assertMatch(256, length(ObjRange6)), - ?assertMatch(128, length(ObjRange7)), - ?assertMatch(128, length(ObjRange8)), + ?assertMatch(199, length(ObjRange6)), + ?assertMatch(99, length(ObjRange7)), + ?assertMatch(100, length(ObjRange8)), ok = sst_close(P2), ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). From 2b5f7bc49a3dfcad1be87fcc697a9c6f50332c98 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 17 Nov 2022 16:09:16 +0000 Subject: [PATCH 17/37] Code and spec tidy Also look to the cover the situation when the FirstKey is the same as the Prefix with tests. This is, in theory, not an issue as it is the EndKey for each sublist which is indexed in leveled_tree. However, guard against it mapping to null here, just in case there are dangers lurking (note that tests will still pass without `M > N` guard in place. 
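In outline, the guard compares the common-prefix length N against the byte size M of the first key, and only builds a prefix-stripping filter when the first key extends beyond the prefix (mirroring the change below):

    choose_filter(FirstKey, LastKey) ->
        case {binary:longest_common_prefix([FirstKey, LastKey]),
                byte_size(FirstKey)} of
            {N, M} when N > 0, M > N ->
                {strip_prefix, N};
            _ ->
                whole_key
        end.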
--- src/leveled_sst.erl | 231 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 191 insertions(+), 40 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 560a93f8..9c9c6dfa 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1768,57 +1768,79 @@ from_list(SlotList, FirstKey, LastKey) -> -spec get_filterfun( leveled_codec:ledger_key(), leveled_codec:ledger_key()) -> - fun((leveled_codec:ledger_key()) -> any()). + fun((leveled_codec:ledger_key()) + -> leveled_codec:ledger_key()|leveled_codec:slimmed_key()). get_filterfun( {Tag, Bucket, {Field, FT}, FK}, {Tag, Bucket, {Field, LT}, LK}) when is_binary(FT), is_binary(FK), is_binary(LT), is_binary(LK) -> - case binary:longest_common_prefix([FT, LT]) of - 0 -> - fun({_Tag, _Bucket, {_Field, Term}, Key}) -> - {Term, Key} - end; - N -> + case {binary:longest_common_prefix([FT, LT]), byte_size(FT)} of + {N, M} when N > 0, M > N -> <> = FT, - fun({_Tag, _Bucket, {_Field, Term}, Key}) -> - case Term of - T when byte_size(T) =< N -> - null; - <> -> - case QueryKey of - Prefix -> - {Suffix, Key}; - _ -> - null - end - end - end + term_prefix_filter(N, Prefix); + _ -> + fun term_filter/1 end; get_filterfun( {Tag, Bucket, FK, null}, {Tag, Bucket, LK, null}) when is_binary(FK), is_binary(LK), FK < LK -> - case binary:longest_common_prefix([FK, LK]) of - 0 -> - fun({_Tag, _Bucket, Key, null}) -> Key end; - N -> + case {binary:longest_common_prefix([FK, LK]), byte_size(FK)} of + {N, M} when N > 0, M > N -> <> = FK, - fun({_Tag, _Bucket, Key, null}) -> - case Key of - null -> - null; - K when byte_size(K) =< N -> - null; - <> -> - case QueryKey of - Prefix -> - Suffix; - _ -> - null - end - end - end + key_prefix_filter(N, Prefix); + _ -> + fun key_filter/1 + end; get_filterfun(_FirstKey, _LastKey) -> - fun(K) -> K end. + fun null_filter/1. + +-spec null_filter(leveled_codec:ledger_key()) -> leveled_codec:ledger_key(). +null_filter(Key) -> Key. + +-spec key_filter(leveled_codec:ledger_key()) -> leveled_codec:slimmed_key(). +key_filter({_Tag, _Bucket, Key, null}) -> Key. + +-spec term_filter(leveled_codec:ledger_key()) -> leveled_codec:slimmed_key(). +term_filter({_Tag, _Bucket, {_Field, Term}, Key}) -> {Term, Key}. + +-spec key_prefix_filter( + pos_integer(), binary()) -> + fun((leveled_codec:ledger_key()) -> leveled_codec:slimmed_key()). +key_prefix_filter(N, Prefix) -> + fun({_Tag, _Bucket, Key, null}) -> + case Key of + null -> + null; + K when byte_size(K) =< N -> + null; + <> -> + case QueryKey of + Prefix -> + Suffix; + _ -> + null + end + end + end. + +-spec term_prefix_filter( + pos_integer(), binary()) -> + fun((leveled_codec:ledger_key()) -> leveled_codec:slimmed_key()). +term_prefix_filter(N, Prefix) -> + fun({_Tag, _Bucket, {_Field, Term}, Key}) -> + case Term of + T when byte_size(T) =< N -> + null; + <> -> + case QueryKey of + Prefix -> + {Suffix, Key}; + _ -> + null + end + end + end. + lookup_slot(Key, Tree, FilterFun) -> StartKeyFun = @@ -4398,6 +4420,135 @@ block_index_cache_test() -> ?assertMatch(HeaderTS, array:get(0, element(2, BIC4))), ?assertMatch(Now, LMD4). 
+key_matchesprefix_test() -> + FileName = "keymatchesprefix_test", + IndexKeyFun = + fun(I) -> + {{?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, + list_to_binary("19601301|" + ++ io_lib:format("~6..0w", [I]))}, + list_to_binary(io_lib:format("~6..0w", [I]))}, + {1, {active, infinity}, no_lookup, null}} + end, + IndexEntries = lists:map(IndexKeyFun, lists:seq(1, 500)), + OddIdxKey = + {{?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, + list_to_binary(io_lib:format("~6..0w", [0]))}, + {1, {active, infinity}, no_lookup, null}}, + OptsSST = + #sst_options{press_method=native, + log_options=leveled_log:get_opts()}, + {ok, P1, {_FK1, _LK1}, _Bloom1} = + sst_new( + ?TEST_AREA, FileName, 1, [OddIdxKey|IndexEntries], 6000, OptsSST), + IdxRange2 = + sst_getkvrange( + P1, + {?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1960">>}, null}, + {?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1961">>}, null}, + 16), + IdxRange4 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|000251">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1961">>}, null}, + 16), + IdxRangeX = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1961">>}, null}, + 16), + IdxRangeY = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1961">>}, null}, + 16), + ?assertMatch(501, length(IdxRange2)), + ?assertMatch(250, length(IdxRange4)), + ?assertMatch(501, length(IdxRangeX)), + ?assertMatch(500, length(IdxRangeY)), + ok = sst_close(P1), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), + + ObjectKeyFun = + fun(I) -> + {{?RIAK_TAG, + {<<"btype">>, <<"bucket">>}, + list_to_binary("19601301|" + ++ io_lib:format("~6..0w", [I])), + null}, + {1, {active, infinity}, {0, 0}, null}} + end, + ObjectEntries = lists:map(ObjectKeyFun, lists:seq(1, 500)), + OddObjKey = + {{?RIAK_TAG, + {<<"btype">>, <<"bucket">>}, + <<"19601301">>, + null}, + {1, {active, infinity}, {100, 100}, null}}, + OptsSST = + #sst_options{press_method=native, log_options=leveled_log:get_opts()}, + {ok, P2, {_FK2, _LK2}, _Bloom2} = + sst_new( + ?TEST_AREA, FileName, 1, [OddObjKey|ObjectEntries], 6000, OptsSST), + ObjRange2 = + sst_getkvrange( + P2, + {?RIAK_TAG, + {<<"btype">>, <<"bucket">>}, + <<"1960">>, null}, + {?RIAK_TAG, + {<<"btype">>, <<"bucket">>}, + <<"1961">>, null}, + 16), + ObjRange4 = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000251">>, null}, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"1961">>, null}, + 16), + ObjRangeX = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301">>, null}, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"1961">>, null}, + 16), + ObjRangeY = + sst_getkvrange( + P2, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|">>, null}, + {?RIAK_TAG, {<<"btype">>, <<"bucket">>}, + <<"1961">>, null}, + 16), + ?assertMatch(501, length(ObjRange2)), + ?assertMatch(250, length(ObjRange4)), + ?assertMatch(501, length(ObjRangeX)), + ?assertMatch(500, length(ObjRangeY)), + ok = sst_close(P2), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). 
+ + range_key_lestthanprefix_test() -> FileName = "lessthanprefix_test", IndexKeyFun = From 7236a31b8cee75e64a4356427f27bcab932bd047 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 17 Nov 2022 21:07:16 +0000 Subject: [PATCH 18/37] Hibernate on BIC complete There are three situations when the BIC becomes complete: - In a file created as part of a merge the BIC is learned in the merge - After startup, files below L1 learn the block cache through reads that happen to read the block, eventually the whole cache will be read, unless... - Either before/after the cache is complete, it can get wiped by a timeout after a get_sqn request (e.g. as prompted by a journal compaction) ... it will then be re-filled off the back of get/get-range requests. In all these situations we want to hibernate after the BIC is full - to reflect the fact that the LoopState should now be relatively stable, so it is a good point to GC and rationalise location of data. Previously only the first case was covered. Now all three are covered through the bic_complete message. --- src/leveled_sst.erl | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 9c9c6dfa..dd2ec0b5 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -598,7 +598,7 @@ starting({sst_new, leveled_log:log_timer("SST08", [ActualFilename, Level, Summary#summary.max_sqn], SW), - erlang:send_after(?STARTUP_TIMEOUT, self(), orphan_status), + erlang:send_after(?STARTUP_TIMEOUT, self(), start_complete), {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}, Bloom}, reader, @@ -606,8 +606,7 @@ starting({sst_new, high_modified_date = HighModDate, starting_pid = StartingPID, level = Level, - fetch_cache = new_cache(Level)}, - hibernate}; + fetch_cache = new_cache(Level)}}; starting({sst_newlevelzero, RootPath, Filename, Penciller, MaxSQN, OptsSST, IdxModDate}, _From, State) -> @@ -924,11 +923,16 @@ handle_event({update_blockindex_cache, BIC}, StateName, State) -> State#state{blockindex_cache = BlockIndexCache, high_modified_date = HighModDate}}. -handle_info(orphan_status, delete_pending, State) -> - % This message may have interrupted the delete timeout, so timeout straight - % away - {next_state, delete_pending, State, 0}; -handle_info(orphan_status, StateName, State) -> +handle_info(_Msg, delete_pending, State) -> + % Ignore messages when pending delete. The message may have interrupted + % the delete timeout, so timeout straight away + {next_state, delete_pending, State, 0}; +handle_info(bic_complete, StateName, State) -> + % The block index cache is complete, so the memory footprint should be + % relatively stable from this point. Hibernate to help minimise + % fragmentation + {next_state, StateName, State, hibernate}; +handle_info(start_complete, StateName, State) -> % The SST file will be started by a clerk, but the clerk may be shut down % prior to the manifest being updated about the existence of this SST file.
% If there is no activity after startup, check the clerk is still alive and @@ -1293,6 +1297,7 @@ update_blockindex_cache(true, Entries, BIC, HighModDate, IdxModDate) -> {N, _} -> {BIC, HighModDate}; {S, true} -> + erlang:send(self(), bic_complete), {BIC0, element(3, BIC0)}; _ -> {BIC0, undefined} From 1578ee300b60f48288e5095c9910e9c9e704d2e8 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Sun, 20 Nov 2022 16:26:05 +0000 Subject: [PATCH 19/37] Test all index keys have same term This works functionally, but is not optimised (the term is replicated in the index) --- src/leveled_sst.erl | 85 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index dd2ec0b5..73f672d1 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -4554,6 +4554,91 @@ key_matchesprefix_test() -> ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). +range_key_indextermmatch_test() -> + FileName = "indextermmatch_test", + IndexKeyFun = + fun(I) -> + {{?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, + <<"19601301">>}, + list_to_binary(io_lib:format("~6..0w", [I]))}, + {1, {active, infinity}, no_lookup, null}} + end, + IndexEntries = lists:map(IndexKeyFun, lists:seq(1, 500)), + OptsSST = + #sst_options{press_method=native, + log_options=leveled_log:get_opts()}, + {ok, P1, {_FK1, _LK1}, _Bloom1} = + sst_new(?TEST_AREA, FileName, 1, IndexEntries, 6000, OptsSST), + + IdxRange1 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, {<<"dob_bin">>, <<"1959">>}, null}, + all, + 16), + IdxRange2 = + sst_getkvrange( + P1, + {?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1960">>}, null}, + {?IDX_TAG, + {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"1961">>}, null}, + 16), + IdxRange3 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, <<"000000">>}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, null}, + 16), + IdxRange4 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, <<"000100">>}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, null}, + 16), + IdxRange5 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, <<"000100">>}, + 16), + IdxRange6 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, <<"000300">>}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, null}, + 16), + IdxRange7 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, <<"000300">>}, + 16), + ?assertMatch(500, length(IdxRange1)), + ?assertMatch(500, length(IdxRange2)), + ?assertMatch(500, length(IdxRange3)), + ?assertMatch(401, length(IdxRange4)), + ?assertMatch(100, length(IdxRange5)), + ?assertMatch(201, length(IdxRange6)), + ?assertMatch(300, length(IdxRange7)), + ok = sst_close(P1), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). 
+ + range_key_lestthanprefix_test() -> FileName = "lessthanprefix_test", IndexKeyFun = From 9db486671cf5b2576ea33fb5d1dbd639c499d7aa Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 22 Nov 2022 09:57:15 +0000 Subject: [PATCH 20/37] Summaries with same index term If the summary index all have the same index term - only the object keys need to be indexes --- src/leveled_sst.erl | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 73f672d1..f09543f9 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1775,6 +1775,10 @@ from_list(SlotList, FirstKey, LastKey) -> leveled_codec:ledger_key(), leveled_codec:ledger_key()) -> fun((leveled_codec:ledger_key()) -> leveled_codec:ledger_key()|leveled_codec:slimmed_key()). +get_filterfun( + {Tag, Bucket, {Field, Term}, FK}, {Tag, Bucket, {Field, Term}, LK}) + when is_binary(Term), is_binary(FK), is_binary(LK) -> + subkey_filter(Term); get_filterfun( {Tag, Bucket, {Field, FT}, FK}, {Tag, Bucket, {Field, LT}, LK}) when is_binary(FT), is_binary(FK), is_binary(LT), is_binary(LK) -> @@ -1805,6 +1809,22 @@ null_filter(Key) -> Key. -spec key_filter(leveled_codec:ledger_key()) -> leveled_codec:slimmed_key(). key_filter({_Tag, _Bucket, Key, null}) -> Key. +-spec subkey_filter( + binary()) -> + fun((leveled_codec:ledger_key()) -> leveled_codec:slimmed_key()). +subkey_filter(Term) -> + fun({_Tag, _Bucket, {_Field, T}, ObjKey}) -> + case T of + % If the Term does not match we ignore the Key and treat as null + % As we can assume that this is a range start/end key which is + % before/after any index key in the file + Term -> + ObjKey; + _ -> + null + end + end. + -spec term_filter(leveled_codec:ledger_key()) -> leveled_codec:slimmed_key(). term_filter({_Tag, _Bucket, {_Field, Term}, Key}) -> {Term, Key}. @@ -4484,10 +4504,19 @@ key_matchesprefix_test() -> {?IDX_TAG, {<<"btype">>, <<"bucket">>}, {<<"dob_bin">>, <<"1961">>}, null}, 16), + IdxRangeZ = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301|000500">>}, null}, + 16), ?assertMatch(501, length(IdxRange2)), ?assertMatch(250, length(IdxRange4)), ?assertMatch(501, length(IdxRangeX)), ?assertMatch(500, length(IdxRangeY)), + ?assertMatch(500, length(IdxRangeZ)), ok = sst_close(P1), ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), @@ -4628,6 +4657,22 @@ range_key_indextermmatch_test() -> {?IDX_TAG, {<<"btype">>, <<"bucket">>}, {<<"dob_bin">>, <<"19601301">>}, <<"000300">>}, 16), + IdxRange8 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, null}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601302">>}, <<"000300">>}, + 16), + IdxRange9 = + sst_getkvrange( + P1, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601300">>}, <<"000100">>}, + {?IDX_TAG, {<<"btype">>, <<"bucket">>}, + {<<"dob_bin">>, <<"19601301">>}, null}, + 16), ?assertMatch(500, length(IdxRange1)), ?assertMatch(500, length(IdxRange2)), ?assertMatch(500, length(IdxRange3)), @@ -4635,6 +4680,8 @@ range_key_indextermmatch_test() -> ?assertMatch(100, length(IdxRange5)), ?assertMatch(201, length(IdxRange6)), ?assertMatch(300, length(IdxRange7)), + ?assertMatch(500, length(IdxRange8)), + ?assertMatch(500, length(IdxRange9)), ok = sst_close(P1), ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). 
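For illustration only (this sketch is not part of the patch): when every index entry in a file carries the same {Field, Term}, the summary filter only needs to keep the object key. The name same_term_filter_example/1 is hypothetical; it assumes index ledger keys of the form {Tag, Bucket, {Field, Term}, ObjKey} with a binary Term, mirroring the subkey_filter/1 added in this commit.

    %% Return a fun mapping each index ledger key to its object key when the
    %% term matches, and to null otherwise (e.g. for range start/end keys).
    same_term_filter_example(Term) when is_binary(Term) ->
        fun({_Tag, _Bucket, {_Field, T}, ObjKey}) ->
            case T of
                Term -> ObjKey;
                _ -> null
            end
        end.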
From 76ed44dea489d1fd009d9d6123d92199deb7a72a Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 23 Nov 2022 12:41:22 +0000 Subject: [PATCH 21/37] Simplify case statements We either match the pattern of <> or the answer should be null --- src/leveled_sst.erl | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index f09543f9..747b19ad 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1834,17 +1834,10 @@ term_filter({_Tag, _Bucket, {_Field, Term}, Key}) -> {Term, Key}. key_prefix_filter(N, Prefix) -> fun({_Tag, _Bucket, Key, null}) -> case Key of - null -> - null; - K when byte_size(K) =< N -> - null; - <> -> - case QueryKey of - Prefix -> - Suffix; - _ -> - null - end + <> -> + Suffix; + _ -> + null end end. @@ -1854,19 +1847,13 @@ key_prefix_filter(N, Prefix) -> term_prefix_filter(N, Prefix) -> fun({_Tag, _Bucket, {_Field, Term}, Key}) -> case Term of - T when byte_size(T) =< N -> - null; - <> -> - case QueryKey of - Prefix -> - {Suffix, Key}; - _ -> - null - end + <> -> + {Suffix, Key}; + _ -> + null end end. - lookup_slot(Key, Tree, FilterFun) -> StartKeyFun = fun(_V) -> From 0fe735fa823daf9f92e2d468783fbec64483bd49 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 23 Nov 2022 16:34:52 +0000 Subject: [PATCH 22/37] OK for M == N If M = N for the first key, it will have a suffix of <<>>. This will match (as expected) a query Start Key of the sam size, and be smaller than any query Start Key that has the same prefix. If the query Start Key does not match the prefix - it will be null - as it must be smaller than the Prefix (as other wise the query Start Key would be bigger than the Last Key). The constraint of M > N was introduced before the *_prefix_filter functions were checking the prefix, to avoid issues. Now the prefix is being checked, then M == N is ok. --- src/leveled_sst.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 747b19ad..3ec4134b 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1783,7 +1783,7 @@ get_filterfun( {Tag, Bucket, {Field, FT}, FK}, {Tag, Bucket, {Field, LT}, LK}) when is_binary(FT), is_binary(FK), is_binary(LT), is_binary(LK) -> case {binary:longest_common_prefix([FT, LT]), byte_size(FT)} of - {N, M} when N > 0, M > N -> + {N, M} when N > 0, M >= N -> <> = FT, term_prefix_filter(N, Prefix); _ -> @@ -1793,7 +1793,7 @@ get_filterfun( {Tag, Bucket, FK, null}, {Tag, Bucket, LK, null}) when is_binary(FK), is_binary(LK), FK < LK -> case {binary:longest_common_prefix([FK, LK]), byte_size(FK)} of - {N, M} when N > 0, M > N -> + {N, M} when N > 0, M >= N -> <> = FK, key_prefix_filter(N, Prefix); _ -> From 8e2dc4168b05c5c0ac9dcd1622293f3025dba3f5 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 23 Nov 2022 19:52:33 +0000 Subject: [PATCH 23/37] Simplify Correct the test to use a binary field in the range. To avoid further issue, only apply filter when everything is a binary() type. --- src/leveled_sst.erl | 56 ++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 3ec4134b..fe1cb1c4 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1776,12 +1776,9 @@ from_list(SlotList, FirstKey, LastKey) -> fun((leveled_codec:ledger_key()) -> leveled_codec:ledger_key()|leveled_codec:slimmed_key()). 
get_filterfun( - {Tag, Bucket, {Field, Term}, FK}, {Tag, Bucket, {Field, Term}, LK}) - when is_binary(Term), is_binary(FK), is_binary(LK) -> - subkey_filter(Term); -get_filterfun( - {Tag, Bucket, {Field, FT}, FK}, {Tag, Bucket, {Field, LT}, LK}) - when is_binary(FT), is_binary(FK), is_binary(LT), is_binary(LK) -> + {?IDX_TAG, Bucket, {Field, FT}, FK}, {?IDX_TAG, Bucket, {Field, LT}, LK}) + when is_binary(Bucket), is_binary(Field), + is_binary(FT), is_binary(FK), is_binary(LT), is_binary(LK) -> case {binary:longest_common_prefix([FT, LT]), byte_size(FT)} of {N, M} when N > 0, M >= N -> <> = FT, @@ -1791,7 +1788,7 @@ get_filterfun( end; get_filterfun( {Tag, Bucket, FK, null}, {Tag, Bucket, LK, null}) - when is_binary(FK), is_binary(LK), FK < LK -> + when is_binary(Bucket), is_binary(FK), is_binary(LK) -> case {binary:longest_common_prefix([FK, LK]), byte_size(FK)} of {N, M} when N > 0, M >= N -> <> = FK, @@ -1809,22 +1806,6 @@ null_filter(Key) -> Key. -spec key_filter(leveled_codec:ledger_key()) -> leveled_codec:slimmed_key(). key_filter({_Tag, _Bucket, Key, null}) -> Key. --spec subkey_filter( - binary()) -> - fun((leveled_codec:ledger_key()) -> leveled_codec:slimmed_key()). -subkey_filter(Term) -> - fun({_Tag, _Bucket, {_Field, T}, ObjKey}) -> - case T of - % If the Term does not match we ignore the Key and treat as null - % As we can assume that this is a range start/end key which is - % before/after any index key in the file - Term -> - ObjKey; - _ -> - null - end - end. - -spec term_filter(leveled_codec:ledger_key()) -> leveled_codec:slimmed_key(). term_filter({_Tag, _Bucket, {_Field, Term}, Key}) -> {Term, Key}. @@ -4853,6 +4834,7 @@ range_key_lestthanprefix_test() -> single_key_test() -> FileName = "single_key_test", + Field = <<"t1_bin">>, LK = leveled_codec:to_ledgerkey(<<"Bucket0">>, <<"Key0">>, ?STD_TAG), Chunk = leveled_rand:rand_bytes(16), {_B, _K, MV, _H, _LMs} = @@ -4866,7 +4848,7 @@ single_key_test() -> ok = sst_close(P1), ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), - IndexSpecs = [{add, <<"t1_bin">>, <<"20220101">>}], + IndexSpecs = [{add, Field, <<"20220101">>}], [{IdxK, IdxV}] = leveled_codec:idx_indexspecs(IndexSpecs, <<"Bucket">>, @@ -4879,36 +4861,38 @@ single_key_test() -> [{IdxK, IdxV}], sst_getkvrange( P2, - {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220100">>}, null}, + {?IDX_TAG, <<"Bucket">>, {Field, <<"20220100">>}, null}, all, 16)), ?assertMatch( [{IdxK, IdxV}], sst_getkvrange( P2, - {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220100">>}, null}, - {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220101">>}, null}, + {?IDX_TAG, <<"Bucket">>, {Field, <<"20220100">>}, null}, + {?IDX_TAG, <<"Bucket">>, {Field, <<"20220101">>}, null}, 16)), ?assertMatch( [{IdxK, IdxV}], sst_getkvrange( P2, - {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220101">>}, null}, - {?IDX_TAG, <<"Bucket">>, {"t1_bin", <<"20220101">>}, null}, + {?IDX_TAG, <<"Bucket">>, {Field, <<"20220101">>}, null}, + {?IDX_TAG, <<"Bucket">>, {Field, <<"20220101">>}, null}, 16)), ok = sst_close(P2), ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). 
strange_range_test() -> FileName = "strange_range_test", - Chunk = leveled_rand:rand_bytes(16), + V = leveled_head:riak_metadata_to_binary( + term_to_binary([{"actor1", 1}]), + <<1:32/integer, 0:32/integer, 0:32/integer>>), OptsSST = #sst_options{press_method=native, log_options=leveled_log:get_opts()}, - FK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K0">>, ?STD_TAG), - LK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K02">>, ?STD_TAG), - EK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K0299">>, ?STD_TAG), + FK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K0">>, ?RIAK_TAG), + LK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K02">>, ?RIAK_TAG), + EK = leveled_codec:to_ledgerkey({<<"T0">>, <<"B0">>}, <<"K0299">>, ?RIAK_TAG), KL1 = lists:map( @@ -4916,7 +4900,7 @@ strange_range_test() -> leveled_codec:to_ledgerkey( {<<"T0">>, <<"B0">>}, list_to_binary("K00" ++ integer_to_list(I)), - ?STD_TAG) + ?RIAK_TAG) end, lists:seq(1, 300)), KL2 = @@ -4925,14 +4909,14 @@ strange_range_test() -> leveled_codec:to_ledgerkey( {<<"T0">>, <<"B0">>}, list_to_binary("K02" ++ integer_to_list(I)), - ?STD_TAG) + ?RIAK_TAG) end, lists:seq(1, 300)), GenerateValue = fun(K) -> element( - 3, leveled_codec:generate_ledgerkv(K, 1, Chunk, 16, infinity)) + 3, leveled_codec:generate_ledgerkv(K, 1, V, 16, infinity)) end, KVL = From e6490d5729fe44499348c3404de2d15f480220fb Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 24 Nov 2022 10:46:15 +0000 Subject: [PATCH 24/37] Add test for head_only mode When leveled is used as a tictacaae key store (in parallel mode), the keys will be head_only entries. Double check they are handled as expected like object keys --- src/leveled_sst.erl | 82 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index fe1cb1c4..01abd745 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -4829,7 +4829,87 @@ range_key_lestthanprefix_test() -> ?assertMatch(99, length(ObjRange7)), ?assertMatch(100, length(ObjRange8)), ok = sst_close(P2), - ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). 
+ ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), + + HeadKeyFun = + fun(I) -> + {{?HEAD_TAG, + {<<"btype">>, <<"bucket">>}, + list_to_binary("19601301|" + ++ io_lib:format("~6..0w", [I])), + null}, + {1, {active, infinity}, {0, 0}, null, undefined}} + end, + HeadEntries = lists:map(HeadKeyFun, lists:seq(1, 500)), + {ok, P3, {_FK3, _LK3}, _Bloom3} = + sst_new(?TEST_AREA, FileName, 1, HeadEntries, 6000, OptsSST), + + HeadRange1 = + sst_getkvrange( + P3, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, <<"1959">>, null}, + all, + 16), + HeadRange2 = + sst_getkvrange( + P3, + {?HEAD_TAG, + {<<"btype">>, <<"abucket">>}, + <<"1962">>, null}, + {?HEAD_TAG, + {<<"btype">>, <<"zbucket">>}, + <<"1960">>, null}, + 16), + HeadRange3 = + sst_getkvrange( + P3, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"1960">>, null}, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000250">>, null}, + 16), + HeadRange4 = + sst_getkvrange( + P3, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000251">>, null}, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"1961">>, null}, + 16), + HeadRange6 = + sst_getkvrange( + P3, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000">>, null}, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|0002">>, null}, + 16), + HeadRange7 = + sst_getkvrange( + P3, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000">>, null}, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|0001">>, null}, + 16), + HeadRange8 = + sst_getkvrange( + P3, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000000">>, null}, + {?HEAD_TAG, {<<"btype">>, <<"bucket">>}, + <<"19601301|000100">>, null}, + 16), + + ?assertMatch(500, length(HeadRange1)), + ?assertMatch(500, length(HeadRange2)), + ?assertMatch(250, length(HeadRange3)), + ?assertMatch(250, length(HeadRange4)), + ?assertMatch(199, length(HeadRange6)), + ?assertMatch(99, length(HeadRange7)), + ?assertMatch(100, length(HeadRange8)), + ok = sst_close(P3), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). single_key_test() -> From df271f044ef07cda265ffe1bccfddbcd4d1fbd20 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 24 Nov 2022 15:18:13 +0000 Subject: [PATCH 25/37] Revert previous change - must support typed buckets Add assertion to confirm worthwhile optimisation --- src/leveled_sst.erl | 79 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 13 deletions(-) diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index 01abd745..a8411831 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -1776,8 +1776,8 @@ from_list(SlotList, FirstKey, LastKey) -> fun((leveled_codec:ledger_key()) -> leveled_codec:ledger_key()|leveled_codec:slimmed_key()). 
get_filterfun( - {?IDX_TAG, Bucket, {Field, FT}, FK}, {?IDX_TAG, Bucket, {Field, LT}, LK}) - when is_binary(Bucket), is_binary(Field), + {?IDX_TAG, B, {Field, FT}, FK}, {?IDX_TAG, B, {Field, LT}, LK}) + when is_binary(Field), is_binary(FT), is_binary(FK), is_binary(LT), is_binary(LK) -> case {binary:longest_common_prefix([FT, LT]), byte_size(FT)} of {N, M} when N > 0, M >= N -> @@ -1787,8 +1787,8 @@ get_filterfun( fun term_filter/1 end; get_filterfun( - {Tag, Bucket, FK, null}, {Tag, Bucket, LK, null}) - when is_binary(Bucket), is_binary(FK), is_binary(LK) -> + {Tag, B, FK, null}, {Tag, B, LK, null}) + when is_binary(FK), is_binary(LK) -> case {binary:longest_common_prefix([FK, LK]), byte_size(FK)} of {N, M} when N > 0, M >= N -> <> = FK, @@ -4673,6 +4673,8 @@ range_key_lestthanprefix_test() -> {ok, P1, {_FK1, _LK1}, _Bloom1} = sst_new(?TEST_AREA, FileName, 1, IndexEntries, 6000, OptsSST), + IndexFileStateSize = size_summary(P1), + IdxRange1 = sst_getkvrange( P1, @@ -4764,6 +4766,8 @@ range_key_lestthanprefix_test() -> {ok, P2, {_FK2, _LK2}, _Bloom2} = sst_new(?TEST_AREA, FileName, 1, ObjectEntries, 6000, OptsSST), + ObjectFileStateSize = size_summary(P2), + ObjRange1 = sst_getkvrange( P2, @@ -4844,6 +4848,8 @@ range_key_lestthanprefix_test() -> {ok, P3, {_FK3, _LK3}, _Bloom3} = sst_new(?TEST_AREA, FileName, 1, HeadEntries, 6000, OptsSST), + HeadFileStateSize = size_summary(P3), + HeadRange1 = sst_getkvrange( P3, @@ -4901,15 +4907,62 @@ range_key_lestthanprefix_test() -> <<"19601301|000100">>, null}, 16), - ?assertMatch(500, length(HeadRange1)), - ?assertMatch(500, length(HeadRange2)), - ?assertMatch(250, length(HeadRange3)), - ?assertMatch(250, length(HeadRange4)), - ?assertMatch(199, length(HeadRange6)), - ?assertMatch(99, length(HeadRange7)), - ?assertMatch(100, length(HeadRange8)), - ok = sst_close(P3), - ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). + ?assertMatch(500, length(HeadRange1)), + ?assertMatch(500, length(HeadRange2)), + ?assertMatch(250, length(HeadRange3)), + ?assertMatch(250, length(HeadRange4)), + ?assertMatch(199, length(HeadRange6)), + ?assertMatch(99, length(HeadRange7)), + ?assertMatch(100, length(HeadRange8)), + ok = sst_close(P3), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), + + [_HdO|RestObjectEntries] = ObjectEntries, + [_HdI|RestIndexEntries] = IndexEntries, + [_Hdh|RestHeadEntries] = HeadEntries, + + {ok, P4, {_FK4, _LK4}, _Bloom4} = + sst_new( + ?TEST_AREA, + FileName, 1, + [HeadKeyFun(9999)|RestIndexEntries], + 6000, OptsSST), + print_compare_size("Index", IndexFileStateSize, size_summary(P4)), + ok = sst_close(P4), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), + + {ok, P5, {_FK5, _LK5}, _Bloom5} = + sst_new( + ?TEST_AREA, + FileName, 1, + [HeadKeyFun(9999)|RestObjectEntries], + 6000, OptsSST), + print_compare_size("Object", ObjectFileStateSize, size_summary(P5)), + ok = sst_close(P5), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")), + + {ok, P6, {_FK6, _LK6}, _Bloom6} = + sst_new( + ?TEST_AREA, + FileName, 1, + RestHeadEntries ++ [IndexKeyFun(1)], + 6000, OptsSST), + print_compare_size("Head", HeadFileStateSize, size_summary(P6)), + ok = sst_close(P6), + ok = file:delete(filename:join(?TEST_AREA, FileName ++ ".sst")). + +size_summary(P) -> + Summary = element(2, element(2, sys:get_state(P))), + true = is_record(Summary, summary), + erts_debug:flat_size(Summary). 
+ +print_compare_size(Type, OptimisedSize, UnoptimisedSize) -> + io:format( + user, + "~n~s State optimised to ~w bytes unoptimised ~w bytes~n", + [Type, OptimisedSize * 8, UnoptimisedSize * 8]), + % Reduced by at least a quarter + ?assert(OptimisedSize < (UnoptimisedSize - (UnoptimisedSize div 4))). single_key_test() -> From 2f745d50380c29b0b98e72c0436a0f68377af421 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 30 Nov 2022 11:28:22 +0000 Subject: [PATCH 26/37] Minimise LoopState mutation on read Currently, primarily due to timing logs, every read causes a loop state mutation in almost every process it touches (e.g. bookie, penciller and each related sst file for HEAD requests). Loop state mutations require copies, at least of the size of a word per element (+2), assuming each element is a reference to box (and in a sense doesn't really change). This is a small cost, but repeated for each operation, may have an impact on both CPU and GC. This commit moves all timing logs related to requests to a new leveled_monitor process, so that countdowns and accumulators for timing logs aren't required on the Loopstate. Where possible, LoopState changes on read are then minimised. Outstanding potential issue is the SST fetch_cache. It might be preferable to replace the sst local fetch_cache with a ledger_readcache (that gets flushed on every push_mem). This would stop rotating SST loop state on fetch_cache hits. --- include/leveled.hrl | 12 +- src/leveled_bookie.erl | 636 +++++++++++++-------------------- src/leveled_cdb.erl | 246 +++++-------- src/leveled_log.erl | 24 +- src/leveled_monitor.erl | 545 ++++++++++++++++++++++++++++ src/leveled_penciller.erl | 229 ++++-------- src/leveled_sst.erl | 443 ++++++++++------------- test/end_to_end/riak_SUITE.erl | 34 +- 8 files changed, 1176 insertions(+), 993 deletions(-) create mode 100644 src/leveled_monitor.erl diff --git a/include/leveled.hrl b/include/leveled.hrl index e17698ea..97eaebc8 100644 --- a/include/leveled.hrl +++ b/include/leveled.hrl @@ -48,7 +48,8 @@ binary_mode = false :: boolean(), sync_strategy = sync, log_options = leveled_log:get_opts() - :: leveled_log:log_options()}). + :: leveled_log:log_options(), + monitor = {no_monitor, 0} :: leveled_monitor:monitor()}). -record(sst_options, {press_method = native @@ -56,7 +57,8 @@ log_options = leveled_log:get_opts() :: leveled_log:log_options(), max_sstslots = 256 :: pos_integer(), - pagecache_level = 1 :: pos_integer()}). + pagecache_level = 1 :: pos_integer(), + monitor = {no_monitor, 0} :: leveled_monitor:monitor()}). -record(inker_options, {cdb_max_size :: integer() | undefined, @@ -73,7 +75,8 @@ singlefile_compactionperc :: float()|undefined, maxrunlength_compactionperc :: float()|undefined, score_onein = 1 :: pos_integer(), - snaptimeout_long :: pos_integer() | undefined}). + snaptimeout_long :: pos_integer() | undefined, + monitor = {no_monitor, 0} :: leveled_monitor:monitor()}). -record(penciller_options, {root_path :: string() | undefined, @@ -88,7 +91,8 @@ compression_method = native :: lz4|native, levelzero_cointoss = false :: boolean(), snaptimeout_short :: pos_integer() | undefined, - snaptimeout_long :: pos_integer() | undefined}). + snaptimeout_long :: pos_integer() | undefined, + monitor = {no_monitor, 0} :: leveled_monitor:monitor()}). -record(iclerk_options, {inker :: pid() | undefined, diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 2c4f328e..bc0f9852 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -94,7 +94,7 @@ ]). 
-export([empty_ledgercache/0, - snapshot_store/6, + snapshot_store/7, fetch_value/2, journal_notfound/4]). @@ -131,6 +131,7 @@ -define(SST_PAGECACHELEVEL_NOLOOKUP, 1). -define(SST_PAGECACHELEVEL_LOOKUP, 4). -define(CACHE_LOGPOINT, 50000). +-define(DEFAULT_STATS_PERC, 10). -define(OPTION_DEFAULTS, [{root_path, undefined}, {snapshot_bookie, undefined}, @@ -156,7 +157,8 @@ {database_id, ?DEFAULT_DBID}, {override_functions, []}, {snapshot_timeout_short, ?SNAPTIMEOUT_SHORT}, - {snapshot_timeout_long, ?SNAPTIMEOUT_LONG}]). + {snapshot_timeout_long, ?SNAPTIMEOUT_LONG}, + {stats_frequency, ?DEFAULT_STATS_PERC}]). -record(ledger_cache, {mem :: ets:tab(), loader = leveled_tree:empty(?CACHE_TYPE) @@ -169,58 +171,19 @@ -record(state, {inker :: pid() | undefined, penciller :: pid() | undefined, cache_size :: pos_integer() | undefined, - cache_multiple :: pos_integer() |undefined, + cache_multiple :: pos_integer() | undefined, ledger_cache = #ledger_cache{} :: ledger_cache(), is_snapshot :: boolean() | undefined, slow_offer = false :: boolean(), - head_only = false :: boolean(), head_lookup = true :: boolean(), - ink_checking = ?MAX_KEYCHECK_FREQUENCY :: integer(), - - put_countdown = 0 :: integer(), - get_countdown = 0 :: integer(), - snapshot_countdown = 0 :: integer(), - head_countdown = 0 :: integer(), - cache_ratio = {0, 0, 0} :: cache_ratio(), - get_timings = no_timing :: get_timings(), - put_timings = no_timing :: put_timings(), - snapshot_timings = no_timing :: snapshot_timings(), - head_timings = no_timing :: head_timings()}). - - --record(get_timings, {sample_count = 0 :: integer(), - head_time = 0 :: integer(), - body_time = 0 :: integer(), - fetch_count = 0 :: integer()}). - --record(head_timings, {sample_count = 0 :: integer(), - pcl_time = 0 :: integer(), - buildhead_time = 0 :: integer()}). - --record(put_timings, {sample_count = 0 :: integer(), - mem_time = 0 :: integer(), - ink_time = 0 :: integer(), - total_size = 0 :: integer()}). - --record(snapshot_timings, {sample_count = 0 :: integer(), - bookie_time = 0 :: integer(), - pcl_time = 0 :: integer()}). + monitor = {no_monitor, 0} :: leveled_monitor:monitor()}). -type book_state() :: #state{}. -type sync_mode() :: sync|none|riak_sync. -type ledger_cache() :: #ledger_cache{}. --type get_timings() :: no_timing|#get_timings{}. --type put_timings() :: no_timing|#put_timings{}. --type snapshot_timings() :: no_timing|#snapshot_timings{}. --type head_timings() :: no_timing|#head_timings{}. --type timings() :: - put_timings()|get_timings()|snapshot_timings()|head_timings(). --type timing_types() :: head|get|put|snapshot. --type cache_ratio() :: - {non_neg_integer(), non_neg_integer(), non_neg_integer()}. -type open_options() :: @@ -391,11 +354,14 @@ % assumed to have failed, and so requires to be torndown. The % short timeout is applied to queries where long_running is set to % false - {snapshot_timeout_long, pos_integer()} + {snapshot_timeout_long, pos_integer()} | % Time in seconds before a snapshot that has not been shutdown is % assumed to have failed, and so requires to be torndown. The % short timeout is applied to queries where long_running is set to % true + {stats_frequency, 0..100} + % Probability that stats will be collected for an individual + % request ]. -type initial_loadfun() :: @@ -1088,7 +1054,6 @@ book_headfold(Pid, Tag, all, FoldAccT, JournalCheck, SnapPreFold, book_snapshot(Pid, SnapType, Query, LongRunning) -> gen_server:call(Pid, {snapshot, SnapType, Query, LongRunning}, infinity). 
- -spec book_compactjournal(pid(), integer()) -> ok|busy. -spec book_islastcompactionpending(pid()) -> boolean(). -spec book_trimjournal(pid()) -> ok. @@ -1203,7 +1168,11 @@ init([Opts]) -> DatabaseID = proplists:get_value(database_id, Opts), leveled_log:set_databaseid(DatabaseID), - {InkerOpts, PencillerOpts} = set_options(Opts), + {ok, Monitor} = leveled_monitor:monitor_start(), + StatLogFrequency = proplists:get_value(stats_frequency, Opts), + + {InkerOpts, PencillerOpts} = + set_options(Opts, {Monitor, StatLogFrequency}), OverrideFunctions = proplists:get_value(override_functions, Opts), SetFun = @@ -1252,45 +1221,45 @@ init([Opts]) -> SSTOpts = PencillerOpts#penciller_options.sst_options, SSTOpts0 = SSTOpts#sst_options{pagecache_level = SSTPageCacheLevel}, PencillerOpts0 = - PencillerOpts#penciller_options{sst_options = SSTOpts0}, - - State0 = - #state{ - cache_size=CacheSize, - cache_multiple = MaxCacheMultiple, - is_snapshot=false, - head_only=HeadOnly, - head_lookup = HeadLookup}, + PencillerOpts#penciller_options{sst_options = SSTOpts0}, - {Inker, Penciller} = - startup(InkerOpts, PencillerOpts0, State0), + {Inker, Penciller} = startup(InkerOpts, PencillerOpts0), NewETS = ets:new(mem, [ordered_set]), leveled_log:log("B0001", [Inker, Penciller]), - {ok, State0#state{inker=Inker, - penciller=Penciller, - ledger_cache=#ledger_cache{mem = NewETS}}}; + {ok, + #state{ + cache_size = CacheSize, + cache_multiple = MaxCacheMultiple, + is_snapshot = false, + head_only = HeadOnly, + head_lookup = HeadLookup, + inker = Inker, + penciller = Penciller, + ledger_cache = #ledger_cache{mem = NewETS}, + monitor = {Monitor, StatLogFrequency}}}; {Bookie, undefined} -> {ok, Penciller, Inker} = book_snapshot(Bookie, store, undefined, true), leveled_log:log("B0002", [Inker, Penciller]), - {ok, #state{penciller=Penciller, - inker=Inker, - is_snapshot=true}} + {ok, + #state{penciller = Penciller, + inker = Inker, + is_snapshot = true}} end. handle_call({put, Bucket, Key, Object, IndexSpecs, Tag, TTL, DataSync}, From, State) when State#state.head_only == false -> LedgerKey = leveled_codec:to_ledgerkey(Bucket, Key, Tag), - SW0 = os:timestamp(), + SWLR = os:timestamp(), + SW0 = leveled_monitor:maybe_time(State#state.monitor), {ok, SQN, ObjSize} = leveled_inker:ink_put(State#state.inker, LedgerKey, Object, {IndexSpecs, TTL}, DataSync), - {SW1, Timings1} = - update_timings(SW0, {put, {inker, ObjSize}}, State#state.put_timings), + {T0, SW1} = leveled_monitor:step_time(SW0), Changes = preparefor_ledgercache(null, LedgerKey, SQN, @@ -1298,35 +1267,25 @@ handle_call({put, Bucket, Key, Object, IndexSpecs, Tag, TTL, DataSync}, ObjSize, {IndexSpecs, TTL}), Cache0 = addto_ledgercache(Changes, State#state.ledger_cache), - {_SW2, Timings2} = update_timings(SW1, {put, mem}, Timings1), + {T1, _SW2} = leveled_monitor:step_time(SW1), - {Timings, CountDown} = - update_statetimings(put, Timings2, State#state.put_countdown), - % If the previous push to memory was returned then punish this PUT with - % a delay. 
If the back-pressure in the Penciller continues, these - % delays will beocme more frequent case State#state.slow_offer of true -> gen_server:reply(From, pause); false -> gen_server:reply(From, ok) end, - maybe_longrunning(SW0, overall_put), + maybe_longrunning(SWLR, overall_put), + maybelog_put_timing(State#state.monitor, T0, T1, ObjSize), case maybepush_ledgercache( - State#state.cache_size, - State#state.cache_multiple, - Cache0, - State#state.penciller) of - {ok, NewCache} -> - {noreply, State#state{ledger_cache = NewCache, - put_timings = Timings, - put_countdown = CountDown, - slow_offer = false}}; - {returned, NewCache} -> - {noreply, State#state{ledger_cache = NewCache, - put_timings = Timings, - put_countdown = CountDown, - slow_offer = true}} + State#state.cache_size, + State#state.cache_multiple, + Cache0, + State#state.penciller) of + {ok, Cache} -> + {noreply, State#state{slow_offer = false, ledger_cache = Cache}}; + {returned, Cache} -> + {noreply, State#state{slow_offer = true, ledger_cache = Cache}} end; handle_call({mput, ObjectSpecs, TTL}, From, State) when State#state.head_only == true -> @@ -1344,26 +1303,24 @@ handle_call({mput, ObjectSpecs, TTL}, From, State) gen_server:reply(From, ok) end, case maybepush_ledgercache( - State#state.cache_size, - State#state.cache_multiple, - Cache0, - State#state.penciller) of - {ok, NewCache} -> - {noreply, State#state{ledger_cache = NewCache, - slow_offer = false}}; - {returned, NewCache} -> - {noreply, State#state{ledger_cache = NewCache, - slow_offer = true}} + State#state.cache_size, + State#state.cache_multiple, + Cache0, + State#state.penciller) of + {ok, Cache} -> + {noreply, State#state{ledger_cache = Cache, slow_offer = false}}; + {returned, Cache} -> + {noreply, State#state{ledger_cache = Cache, slow_offer = true}} end; handle_call({get, Bucket, Key, Tag}, _From, State) when State#state.head_only == false -> LedgerKey = leveled_codec:to_ledgerkey(Bucket, Key, Tag), - SWh = os:timestamp(), - {H0, UpdCR} = + SW0 = leveled_monitor:maybe_time(State#state.monitor), + {H0, CacheHit} = fetch_head(LedgerKey, State#state.penciller, - State#state.ledger_cache, - State#state.cache_ratio), + State#state.ledger_cache), + {TS0, SW1} = leveled_monitor:step_time(SW0), HeadResult = case H0 of not_present -> @@ -1383,72 +1340,64 @@ handle_call({get, Bucket, Key, Tag}, _From, State) end end end, - {SWb, Timings1} = - update_timings(SWh, {get, head}, State#state.get_timings), - {Reply, Timings2} = + {TS1, SW2} = leveled_monitor:step_time(SW1), + maybelog_head_timing( + State#state.monitor, TS0, TS1, HeadResult == not_found, CacheHit), + GetResult = case HeadResult of not_found -> - {not_found, Timings1}; + not_found; {LK, SQN} -> Object = fetch_value(State#state.inker, {LK, SQN}), - {_SW, UpdTimingsB} = - update_timings(SWb, {get, body}, Timings1), case Object of not_present -> - {not_found, UpdTimingsB}; + not_found; _ -> - {{ok, Object}, UpdTimingsB} + {ok, Object} end end, - {Timings, CountDown} = - update_statetimings(get, Timings2, State#state.get_countdown), - {reply, - Reply, - State#state{get_timings = Timings, - get_countdown = CountDown, - cache_ratio = - maybelog_cacheratio(UpdCR, State#state.is_snapshot)}}; + {TS2, _SW3} = leveled_monitor:step_time(SW2), + maybelog_get_timing( + State#state.monitor, TS1, TS2, GetResult == not_found), + {reply, GetResult, State}; handle_call({head, Bucket, Key, Tag, SQNOnly}, _From, State) when State#state.head_lookup == true -> - SWp = os:timestamp(), + SW0 = 
leveled_monitor:maybe_time(State#state.monitor), LK = leveled_codec:to_ledgerkey(Bucket, Key, Tag), - {Head, UpdCR} = + {Head, CacheHit} = fetch_head(LK, State#state.penciller, State#state.ledger_cache, - State#state.cache_ratio, State#state.head_only), - {SWr, UpdTimingsP} = - update_timings(SWp, {head, pcl}, State#state.head_timings), - {LedgerMD, SQN, JournalCheckFrequency} = + {TS0, SW1} = leveled_monitor:step_time(SW0), + JrnalCheckFreq = + case State#state.head_only of + true -> + 0; + false -> + State#state.ink_checking + end, + {LedgerMD, SQN, UpdJrnalCheckFreq} = case Head of not_present -> - {not_found, null, State#state.ink_checking}; + {not_found, null, JrnalCheckFreq}; Head -> case leveled_codec:striphead_to_v1details(Head) of {_SeqN, tomb, _MH, _MD} -> - {not_found, null, State#state.ink_checking}; + {not_found, null, JrnalCheckFreq}; {SeqN, {active, TS}, _MH, MD} -> case TS >= leveled_util:integer_now() of true -> - CheckFrequency = - case State#state.head_only of - true -> - 0; - false -> - State#state.ink_checking - end, - case journal_notfound(CheckFrequency, - State#state.inker, - LK, - SeqN) of + I = State#state.inker, + case journal_notfound( + JrnalCheckFreq, I, LK, SeqN) of {true, UppedFrequency} -> {not_found, null, UppedFrequency}; {false, ReducedFrequency} -> {MD, SeqN, ReducedFrequency} end; false -> - {not_found, null, State#state.ink_checking} + {not_found, null, JrnalCheckFreq} end end end, @@ -1461,33 +1410,29 @@ handle_call({head, Bucket, Key, Tag, SQNOnly}, _From, State) {_, true} -> {ok, SQN} end, - {_SW, UpdTimingsR} = - update_timings(SWr, {head, rsp}, UpdTimingsP), - {UpdTimings, CountDown} = - update_statetimings(head, - UpdTimingsR, - State#state.head_countdown), - - {reply, - Reply, - State#state{head_timings = UpdTimings, - head_countdown = CountDown, - ink_checking = JournalCheckFrequency, - cache_ratio = - maybelog_cacheratio(UpdCR, State#state.is_snapshot)}}; + {TS1, _SW2} = leveled_monitor:step_time(SW1), + maybelog_head_timing( + State#state.monitor, TS0, TS1, LedgerMD == not_found, CacheHit), + case UpdJrnalCheckFreq of + JrnalCheckFreq -> + {reply, Reply, State}; + UpdJrnalCheckFreq -> + {reply, Reply, State#state{ink_checking = UpdJrnalCheckFreq}} + end; handle_call({snapshot, SnapType, Query, LongRunning}, _From, State) -> % Snapshot the store, specifying if the snapshot should be long running % (i.e. will the snapshot be queued or be required for an extended period % e.g. 
many minutes) - {ok, PclSnap, InkSnap, Timings} = - snapshot_store(State, SnapType, Query, LongRunning), - {UpdTimings, CountDown} = - update_statetimings(snapshot, Timings, State#state.snapshot_countdown), - {reply, - {ok, PclSnap, InkSnap}, - State#state{ - snapshot_timings = UpdTimings, - snapshot_countdown = CountDown}}; + {ok, PclSnap, InkSnap} = + snapshot_store( + State#state.ledger_cache, + State#state.penciller, + State#state.inker, + State#state.monitor, + SnapType, + Query, + LongRunning), + {reply, {ok, PclSnap, InkSnap},State}; handle_call(log_settings, _From, State) -> {reply, leveled_log:return_settings(), State}; handle_call({return_runner, QueryType}, _From, State) -> @@ -1499,19 +1444,27 @@ handle_call({compact_journal, Timeout}, From, State) true -> {reply, {busy, undefined}, State}; false -> - {ok, PclSnap, null, _Timings} = - snapshot_store(State, ledger, undefined, true), + {ok, PclSnap, null} = + snapshot_store( + State#state.ledger_cache, + State#state.penciller, + State#state.inker, + State#state.monitor, + ledger, + undefined, + true), R = leveled_inker:ink_compactjournal(State#state.inker, PclSnap, Timeout), gen_server:reply(From, R), - {_, NewCache} = - maybepush_ledgercache( + case maybepush_ledgercache( State#state.cache_size, State#state.cache_multiple, State#state.ledger_cache, - State#state.penciller), - {noreply, State#state{ledger_cache = NewCache}} + State#state.penciller) of + {_, NewCache} -> + {noreply, State#state{ledger_cache = NewCache}} + end end; handle_call(confirm_compact, _From, State) when State#state.head_only == false -> @@ -1537,11 +1490,13 @@ handle_call(hot_backup, _From, State) when State#state.head_only == false -> handle_call(close, _From, State) -> leveled_inker:ink_close(State#state.inker), leveled_penciller:pcl_close(State#state.penciller), + leveled_monitor:monitor_close(element(1, State#state.monitor)), {stop, normal, ok, State}; handle_call(destroy, _From, State=#state{is_snapshot=Snp}) when Snp == false -> leveled_log:log("B0011", []), {ok, InkPathList} = leveled_inker:ink_doom(State#state.inker), {ok, PCLPathList} = leveled_penciller:pcl_doom(State#state.penciller), + leveled_monitor:monitor_close(element(1, State#state.monitor)), lists:foreach(fun(DirPath) -> delete_path(DirPath) end, InkPathList), lists:foreach(fun(DirPath) -> delete_path(DirPath) end, PCLPathList), {stop, normal, ok, State}; @@ -1554,22 +1509,28 @@ handle_call(Msg, _From, State) -> handle_cast({log_level, LogLevel}, State) -> PCL = State#state.penciller, INK = State#state.inker, + Monitor = element(1, State#state.monitor), ok = leveled_penciller:pcl_loglevel(PCL, LogLevel), ok = leveled_inker:ink_loglevel(INK, LogLevel), + ok = leveled_monitor:log_level(Monitor, LogLevel), ok = leveled_log:set_loglevel(LogLevel), {noreply, State}; handle_cast({add_logs, ForcedLogs}, State) -> PCL = State#state.penciller, INK = State#state.inker, + Monitor = element(1, State#state.monitor), ok = leveled_penciller:pcl_addlogs(PCL, ForcedLogs), ok = leveled_inker:ink_addlogs(INK, ForcedLogs), + ok = leveled_monitor:log_add(Monitor, ForcedLogs), ok = leveled_log:add_forcedlogs(ForcedLogs), {noreply, State}; handle_cast({remove_logs, ForcedLogs}, State) -> PCL = State#state.penciller, INK = State#state.inker, + Monitor = element(1, State#state.monitor), ok = leveled_penciller:pcl_removelogs(PCL, ForcedLogs), ok = leveled_inker:ink_removelogs(INK, ForcedLogs), + ok = leveled_monitor:log_remove(Monitor, ForcedLogs), ok = leveled_log:remove_forcedlogs(ForcedLogs), {noreply, State}. 
@@ -1638,11 +1599,11 @@ loadqueue_ledgercache(Cache) -> -spec snapshot_store(ledger_cache(), pid(), null|pid(), - snapshot_timings(), + leveled_monitor:monitor(), store|ledger, undefined|tuple(), undefined|boolean()) -> - {ok, pid(), pid()|null, snapshot_timings()}. + {ok, pid(), pid()|null}. %% @doc %% Allow all a snapshot to be created from part of the store, preferably %% passing in a query filter so that all of the LoopState does not need to @@ -1658,8 +1619,8 @@ loadqueue_ledgercache(Cache) -> %% lookup is required but the range isn't defined then 'undefined' should be %% passed as the query snapshot_store( - LedgerCache, Penciller, Inker, Timings, SnapType, Query, LongRunning) -> - TS0 = os:timestamp(), + LedgerCache, Penciller, Inker, Monitor, SnapType, Query, LongRunning) -> + SW0 = leveled_monitor:maybe_time(Monitor), LedgerCacheReady = readycache_forsnapshot(LedgerCache, Query), BookiesMem = {LedgerCacheReady#ledger_cache.loader, LedgerCacheReady#ledger_cache.index, @@ -1672,33 +1633,21 @@ snapshot_store( snapshot_longrunning = LongRunning, bookies_pid = self(), bookies_mem = BookiesMem}, - {TS1, Timings1} = update_timings(TS0, {snapshot, bookie}, Timings), + {TS0, SW1} = leveled_monitor:step_time(SW0), {ok, LedgerSnapshot} = leveled_penciller:pcl_snapstart(PCLopts), - {_TS2, Timings2} = update_timings(TS1, {snapshot, pcl}, Timings1), + {TS1, _SW2} = leveled_monitor:step_time(SW1), + ok = maybelog_snap_timing(Monitor, TS0, TS1), case SnapType of store -> - InkerOpts = #inker_options{start_snapshot=true, + InkerOpts = #inker_options{start_snapshot = true, bookies_pid = self(), - source_inker=Inker}, + source_inker = Inker}, {ok, JournalSnapshot} = leveled_inker:ink_snapstart(InkerOpts), - {ok, LedgerSnapshot, JournalSnapshot, Timings2}; + {ok, LedgerSnapshot, JournalSnapshot}; ledger -> - {ok, LedgerSnapshot, null, Timings2} + {ok, LedgerSnapshot, null} end. -snapshot_store(LedgerCache, Penciller, Inker, SnapType, Query, LongRunning) -> - snapshot_store( - LedgerCache, Penciller, Inker, no_timing, SnapType, Query, LongRunning). - -snapshot_store(State, SnapType, Query, LongRunning) -> - snapshot_store(State#state.ledger_cache, - State#state.penciller, - State#state.inker, - State#state.snapshot_timings, - SnapType, - Query, - LongRunning). - -spec fetch_value(pid(), leveled_codec:journal_ref()) -> not_present|any(). %% @doc @@ -1718,20 +1667,19 @@ fetch_value(Inker, {Key, SQN}) -> %%% Internal functions %%%============================================================================ --spec startup(#inker_options{}, #penciller_options{}, book_state()) - -> {pid(), pid()}. +-spec startup(#inker_options{}, #penciller_options{}) -> {pid(), pid()}. %% @doc %% Startup the Inker and the Penciller, and prompt the loading of the Penciller %% from the Inker. The Penciller may be shutdown without the latest data %% having been persisted: and so the Iker must be able to update the Penciller %% on startup with anything that happened but wasn't flushed to disk. 
-startup(InkerOpts, PencillerOpts, State) -> +startup(InkerOpts, PencillerOpts) -> {ok, Inker} = leveled_inker:ink_start(InkerOpts), {ok, Penciller} = leveled_penciller:pcl_start(PencillerOpts), LedgerSQN = leveled_penciller:pcl_getstartupsequencenumber(Penciller), leveled_log:log("B0005", [LedgerSQN]), ReloadStrategy = InkerOpts#inker_options.reload_strategy, - LoadFun = get_loadfun(ReloadStrategy, Penciller, State), + LoadFun = get_loadfun(ReloadStrategy, Penciller), BatchFun = fun(BatchAcc, _Acc) -> push_to_penciller(Penciller, BatchAcc) @@ -1759,11 +1707,13 @@ set_defaults(Opts) -> lists:ukeysort(1, Opts), lists:ukeysort(1, ?OPTION_DEFAULTS)). --spec set_options(open_options()) -> {#inker_options{}, #penciller_options{}}. +-spec set_options( + open_options(), leveled_monitor:monitor()) -> + {#inker_options{}, #penciller_options{}}. %% @doc %% Take the passed in property list of operations and extract out any relevant %% options to the Inker or the Penciller -set_options(Opts) -> +set_options(Opts, Monitor) -> MaxJournalSize0 = min(?ABSOLUTEMAX_JOURNALSIZE, proplists:get_value(max_journalsize, Opts)), @@ -1821,30 +1771,36 @@ set_options(Opts) -> ScoreOneIn = proplists:get_value(journalcompaction_scoreonein, Opts), {#inker_options{root_path = JournalFP, - reload_strategy = ReloadStrategy, - max_run_length = proplists:get_value(max_run_length, Opts), - singlefile_compactionperc = SFL_CompPerc, - maxrunlength_compactionperc = MRL_CompPerc, - waste_retention_period = WRP, - snaptimeout_long = SnapTimeoutLong, - compression_method = CompressionMethod, - compress_on_receipt = CompressOnReceipt, - score_onein = ScoreOneIn, - cdb_options = - #cdb_options{max_size=MaxJournalSize, - max_count=MaxJournalCount, - binary_mode=true, - sync_strategy=SyncStrat, - log_options=leveled_log:get_opts()}}, + reload_strategy = ReloadStrategy, + max_run_length = proplists:get_value(max_run_length, Opts), + singlefile_compactionperc = SFL_CompPerc, + maxrunlength_compactionperc = MRL_CompPerc, + waste_retention_period = WRP, + snaptimeout_long = SnapTimeoutLong, + compression_method = CompressionMethod, + compress_on_receipt = CompressOnReceipt, + score_onein = ScoreOneIn, + cdb_options = + #cdb_options{ + max_size = MaxJournalSize, + max_count = MaxJournalCount, + binary_mode = true, + sync_strategy = SyncStrat, + log_options = leveled_log:get_opts(), + monitor = Monitor}, + monitor = Monitor}, #penciller_options{root_path = LedgerFP, max_inmemory_tablesize = PCLL0CacheSize, levelzero_cointoss = true, snaptimeout_short = SnapTimeoutShort, snaptimeout_long = SnapTimeoutLong, sst_options = - #sst_options{press_method=CompressionMethod, - log_options=leveled_log:get_opts(), - max_sstslots=MaxSSTSlots}} + #sst_options{ + press_method = CompressionMethod, + log_options = leveled_log:get_opts(), + max_sstslots = MaxSSTSlots, + monitor = Monitor}, + monitor = Monitor} }. @@ -1864,8 +1820,15 @@ set_options(Opts) -> return_snapfun(State, SnapType, Query, LongRunning, SnapPreFold) -> case SnapPreFold of true -> - {ok, LS, JS, _Timings} = - snapshot_store(State, SnapType, Query, LongRunning), + {ok, LS, JS} = + snapshot_store( + State#state.ledger_cache, + State#state.penciller, + State#state.inker, + State#state.monitor, + SnapType, + Query, + LongRunning), fun() -> {ok, LS, JS} end; false -> Self = self(), @@ -2190,24 +2153,20 @@ scan_table(Table, StartKey, EndKey, Acc, MinSQN, MaxSQN) -> end. 
--spec fetch_head(leveled_codec:ledger_key(), pid(), ledger_cache(), - cache_ratio()) -> - {not_present|leveled_codec:ledger_value(), - cache_ratio()}. +-spec fetch_head(leveled_codec:ledger_key(), pid(), ledger_cache()) + -> {not_present|leveled_codec:ledger_value(), boolean()}. %% @doc %% Fetch only the head of the object from the Ledger (or the bookie's recent %% ledger cache if it has just been updated). not_present is returned if the %% Key is not found -fetch_head(Key, Penciller, LedgerCache, CacheRatio) -> - fetch_head(Key, Penciller, LedgerCache, CacheRatio, false). +fetch_head(Key, Penciller, LedgerCache) -> + fetch_head(Key, Penciller, LedgerCache, false). --spec fetch_head(leveled_codec:ledger_key(), pid(), ledger_cache(), - cache_ratio(), boolean()) - -> {not_present|leveled_codec:ledger_value(), - cache_ratio()}. +-spec fetch_head(leveled_codec:ledger_key(), pid(), ledger_cache(), boolean()) + -> {not_present|leveled_codec:ledger_value(), boolean()}. %% doc %% The L0Index needs to be bypassed when running head_only -fetch_head(Key, Penciller, LedgerCache, {RC, CC, HC}, HeadOnly) -> +fetch_head(Key, Penciller, LedgerCache, HeadOnly) -> SW = os:timestamp(), CacheResult = case LedgerCache#ledger_cache.mem of @@ -2218,7 +2177,7 @@ fetch_head(Key, Penciller, LedgerCache, {RC, CC, HC}, HeadOnly) -> end, case CacheResult of [{Key, Head}] -> - {Head, {RC + 1, CC + 1, HC + 1}}; + {Head, true}; [] -> Hash = leveled_codec:segment_hash(Key), UseL0Idx = not HeadOnly, @@ -2227,10 +2186,10 @@ fetch_head(Key, Penciller, LedgerCache, {RC, CC, HC}, HeadOnly) -> case leveled_penciller:pcl_fetch(Penciller, Key, Hash, UseL0Idx) of {Key, Head} -> maybe_longrunning(SW, pcl_head), - {Head, {RC + 1, CC, HC + 1}}; + {Head, false}; not_present -> maybe_longrunning(SW, pcl_head), - {not_present, {RC + 1, CC, HC}} + {not_present, false} end end. @@ -2432,7 +2391,7 @@ maybepush_ledgercache(MaxCacheSize, MaxCacheMult, Cache, Penciller) -> {returned, Cache} end; true -> - {ok, Cache} + {ok, Cache} end. -spec maybe_withjitter( @@ -2448,12 +2407,12 @@ maybe_withjitter(_CacheSize, _MaxCacheSize, _MaxCacheMult) -> false. --spec get_loadfun(leveled_codec:compaction_strategy(), pid(), book_state()) - -> initial_loadfun(). +-spec get_loadfun( + leveled_codec:compaction_strategy(), pid()) -> initial_loadfun(). %% @doc %% The LoadFun will be used by the Inker when walking across the Journal to %% load the Penciller at startup. -get_loadfun(ReloadStrat, Penciller, _State) -> +get_loadfun(ReloadStrat, Penciller) -> fun(KeyInJournal, ValueInJournal, _Pos, Acc0, ExtractFun) -> {MinSQN, MaxSQN, LedgerCache} = Acc0, {SQN, InkTag, PK} = KeyInJournal, @@ -2497,153 +2456,54 @@ delete_path(DirPath) -> [file:delete(filename:join([DirPath, File])) || File <- Files], file:del_dir(DirPath). - +-spec maybelog_put_timing( + leveled_monitor:monitor(), + leveled_monitor:timing(), + leveled_monitor:timing(), + pos_integer()) -> ok. +maybelog_put_timing(_Monitor, no_timing, no_timing, _Size) -> + ok; +maybelog_put_timing({Pid, _StatsFreq}, MemTime, InkTime, Size) -> + leveled_monitor:add_stat(Pid, {bookie_put_update, MemTime, InkTime, Size}). + +-spec maybelog_head_timing( + leveled_monitor:monitor(), + leveled_monitor:timing(), + leveled_monitor:timing(), + boolean(), + boolean()) -> ok. 
+maybelog_head_timing(_Monitor, no_timing, no_timing, _NF, _CH) -> + ok; +maybelog_head_timing({Pid, _StatsFreq}, FetchTime, _, true, _CH) -> + leveled_monitor:add_stat( + Pid, {bookie_head_update, FetchTime, not_found, 0}); +maybelog_head_timing({Pid, _StatsFreq}, FetchTime, RspTime, _NF, CH) -> + CH0 = case CH of true -> 1; false -> 0 end, + leveled_monitor:add_stat( + Pid, {bookie_head_update, FetchTime, RspTime, CH0}). + +-spec maybelog_get_timing( + leveled_monitor:monitor(), + leveled_monitor:timing(), + leveled_monitor:timing(), + boolean()) -> ok. +maybelog_get_timing(_Monitor, no_timing, no_timing, _NF) -> + ok; +maybelog_get_timing({Pid, _StatsFreq}, HeadTime, _BodyTime, true) -> + leveled_monitor:add_stat(Pid, {bookie_get_update, HeadTime, not_found}); +maybelog_get_timing({Pid, _StatsFreq}, HeadTime, BodyTime, false) -> + leveled_monitor:add_stat(Pid, {bookie_get_update, HeadTime, BodyTime}). + + +-spec maybelog_snap_timing( + leveled_monitor:monitor(), + leveled_monitor:timing(), + leveled_monitor:timing()) -> ok. +maybelog_snap_timing(_Monitor, no_timing, no_timing) -> + ok; +maybelog_snap_timing({Pid, _StatsFreq}, BookieTime, PCLTime) -> + leveled_monitor:add_stat(Pid, {bookie_snap_update, BookieTime, PCLTime}). -%%%============================================================================ -%%% Timing Functions -%%%============================================================================ - --spec update_statetimings(timing_types(), timings(), integer()) -> - {timings(), integer()}. -%% @doc -%% -%% The timings state is either in countdown to the next set of samples of -%% we are actively collecting a sample. Active collection take place -%% when the countdown is 0. Once the sample has reached the expected count -%% then there is a log of that sample, and the countdown is restarted. -%% -%% Outside of sample windows the timings object should be set to the atom -%% no_timing. no_timing is a valid state for each timings type. -update_statetimings(head, no_timing, 0) -> - {#head_timings{}, 0}; -update_statetimings(put, no_timing, 0) -> - {#put_timings{}, 0}; -update_statetimings(get, no_timing, 0) -> - {#get_timings{}, 0}; -update_statetimings(snapshot, no_timing, 0) -> - {#snapshot_timings{}, 0}; -update_statetimings(head, Timings, 0) -> - case Timings#head_timings.sample_count of - SC when SC >= ?TIMING_SAMPLESIZE -> - log_timings(head, Timings), - {no_timing, leveled_rand:uniform(10 * ?TIMING_SAMPLECOUNTDOWN)}; - _SC -> - {Timings, 0} - end; -update_statetimings(put, Timings, 0) -> - case Timings#put_timings.sample_count of - SC when SC >= ?TIMING_SAMPLESIZE -> - log_timings(put, Timings), - {no_timing, leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)}; - _SC -> - {Timings, 0} - end; -update_statetimings(get, Timings, 0) -> - case Timings#get_timings.sample_count of - SC when SC >= ?TIMING_SAMPLESIZE -> - log_timings(get, Timings), - {no_timing, leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)}; - _SC -> - {Timings, 0} - end; -update_statetimings(snapshot, Timings, 0) -> - case Timings#snapshot_timings.sample_count of - SC when SC >= ?TIMING_SAMPLESIZE -> - log_timings(snapshot, Timings), - {no_timing, - leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)}; - _SC -> - {Timings, 0} - end; -update_statetimings(_, no_timing, N) -> - {no_timing, N - 1}. 
- -log_timings(head, Timings) -> - leveled_log:log("B0018", - [Timings#head_timings.sample_count, - Timings#head_timings.pcl_time, - Timings#head_timings.buildhead_time]); -log_timings(put, Timings) -> - leveled_log:log("B0015", [Timings#put_timings.sample_count, - Timings#put_timings.mem_time, - Timings#put_timings.ink_time, - Timings#put_timings.total_size]); -log_timings(get, Timings) -> - leveled_log:log("B0016", [Timings#get_timings.sample_count, - Timings#get_timings.head_time, - Timings#get_timings.body_time, - Timings#get_timings.fetch_count]); -log_timings(snapshot, Timings) -> - leveled_log:log("B0017", [Timings#snapshot_timings.sample_count, - Timings#snapshot_timings.bookie_time, - Timings#snapshot_timings.pcl_time]). - - -update_timings(_SW, _Stage, no_timing) -> - {no_timing, no_timing}; -update_timings(SW, {head, Stage}, Timings) -> - NextSW = os:timestamp(), - Timer = timer:now_diff(NextSW, SW), - Timings0 = - case Stage of - pcl -> - PCT = Timings#head_timings.pcl_time + Timer, - Timings#head_timings{pcl_time = PCT}; - rsp -> - BHT = Timings#head_timings.buildhead_time + Timer, - CNT = Timings#head_timings.sample_count + 1, - Timings#head_timings{buildhead_time = BHT, sample_count = CNT} - end, - {NextSW, Timings0}; -update_timings(SW, {put, Stage}, Timings) -> - NextSW = os:timestamp(), - Timer = timer:now_diff(NextSW, SW), - Timings0 = - case Stage of - {inker, ObjectSize} -> - INT = Timings#put_timings.ink_time + Timer, - TSZ = Timings#put_timings.total_size + ObjectSize, - Timings#put_timings{ink_time = INT, total_size = TSZ}; - mem -> - PCT = Timings#put_timings.mem_time + Timer, - CNT = Timings#put_timings.sample_count + 1, - Timings#put_timings{mem_time = PCT, sample_count = CNT} - end, - {NextSW, Timings0}; -update_timings(SW, {get, head}, Timings) -> - NextSW = os:timestamp(), - Timer = timer:now_diff(NextSW, SW), - GHT = Timings#get_timings.head_time + Timer, - CNT = Timings#get_timings.sample_count + 1, - Timings0 = Timings#get_timings{head_time = GHT, sample_count = CNT}, - {NextSW, Timings0}; -update_timings(SW, {get, body}, Timings) -> - Timer = timer:now_diff(os:timestamp(), SW), - GBT = Timings#get_timings.body_time + Timer, - FCNT = Timings#get_timings.fetch_count + 1, - Timings0 = Timings#get_timings{body_time = GBT, fetch_count = FCNT}, - {no_timing, Timings0}; -update_timings(SW, {snapshot, bookie}, Timings) -> - NextSW = os:timestamp(), - Timer = timer:now_diff(NextSW, SW), - BST = Timings#snapshot_timings.bookie_time + Timer, - CNT = Timings#snapshot_timings.sample_count + 1, - Timings0 = Timings#snapshot_timings{bookie_time = BST, sample_count = CNT}, - {NextSW, Timings0}; -update_timings(SW, {snapshot, pcl}, Timings) -> - NextSW = os:timestamp(), - Timer = timer:now_diff(NextSW, SW), - PST = Timings#snapshot_timings.pcl_time + Timer, - Timings0 = Timings#snapshot_timings{pcl_time = PST}, - {no_timing, Timings0}. - - --spec maybelog_cacheratio(cache_ratio(), boolean()) -> cache_ratio(). -maybelog_cacheratio({?CACHE_LOGPOINT, CC, HC}, false) -> - leveled_log:log("B0021", [?CACHE_LOGPOINT, CC, HC]), - {0, 0, 0}; -maybelog_cacheratio(CR, _IsSnap) -> - CR. 
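[Editorial aside, illustration only - not part of the patch.] The maybelog_* helpers above replace the bookie's sampled timing records: each is a no-op when the stage times are the atom no_timing, and otherwise casts a single stat tuple to the monitor. A minimal sketch of the calling pattern, with maybelog/5 standing in for the private maybelog_head_timing/5 and FetchFun standing in for fetch_head/3:

-module(head_timing_sketch).
-export([timed_head/2]).

timed_head(Monitor, FetchFun) ->
    SW0 = leveled_monitor:maybe_time(Monitor),    % os:timestamp() or no_timing
    {Head, CacheHit} = FetchFun(),                % {not_present|Value, boolean()}
    {FetchTime, SW1} = leveled_monitor:step_time(SW0),
    %% ... the response to the caller would be built here ...
    {RspTime, _SW2} = leveled_monitor:step_time(SW1),
    maybelog(Monitor, FetchTime, RspTime, Head == not_present, CacheHit),
    Head.

%% Local stand-in for maybelog_head_timing/5 shown above; unsampled requests
%% fall through the first clause without touching the monitor process.
maybelog(_Monitor, no_timing, no_timing, _NotFound, _CacheHit) ->
    ok;
maybelog({Pid, _SampleRate}, FetchTime, _RspTime, true, _CacheHit) ->
    leveled_monitor:add_stat(
        Pid, {bookie_head_update, FetchTime, not_found, 0});
maybelog({Pid, _SampleRate}, FetchTime, RspTime, false, CacheHit) ->
    CH = case CacheHit of true -> 1; false -> 0 end,
    leveled_monitor:add_stat(
        Pid, {bookie_head_update, FetchTime, RspTime, CH}).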
%%%============================================================================ %%% Test %%%============================================================================ @@ -3254,14 +3114,16 @@ erase_journal_test() -> {cache_size, 100}]), ObjL1 = generate_multiple_objects(500, 1), % Put in all the objects with a TTL in the future - lists:foreach(fun({K, V, S}) -> ok = book_put(Bookie1, - "Bucket", K, V, S, - ?STD_TAG) end, - ObjL1), - lists:foreach(fun({K, V, _S}) -> - {ok, V} = book_get(Bookie1, "Bucket", K, ?STD_TAG) - end, - ObjL1), + lists:foreach( + fun({K, V, S}) -> + ok = book_put(Bookie1, "Bucket", K, V, S, ?STD_TAG) + end, + ObjL1), + lists:foreach( + fun({K, V, _S}) -> + {ok, V} = book_get(Bookie1, "Bucket", K, ?STD_TAG) + end, + ObjL1), CheckHeadFun = fun(Book) -> diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 4933822b..9bd68c00 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -145,20 +145,12 @@ deferred_delete = false :: boolean(), waste_path :: string() | undefined, sync_strategy = none, - timings = no_timing :: cdb_timings(), - timings_countdown = 0 :: integer(), log_options = leveled_log:get_opts() :: leveled_log:log_options(), - cached_score :: {float(), erlang:timestamp()}|undefined}). - --record(cdb_timings, {sample_count = 0 :: integer(), - sample_cyclecount = 0 :: integer(), - sample_indextime = 0 :: integer(), - sample_fetchtime = 0 :: integer(), - fetchloop_starttime :: undefined|erlang:timestamp()}). + cached_score :: {float(), erlang:timestamp()}|undefined, + monitor = {no_monitor, 0} :: leveled_monitor:monitor()}). -type cdb_options() :: #cdb_options{}. --type cdb_timings() :: no_timing|#cdb_timings{}. -type hashtable_index() :: tuple(). -type file_location() :: integer()|eof. -type filter_fun() :: @@ -479,7 +471,8 @@ init([Opts]) -> binary_mode=Opts#cdb_options.binary_mode, waste_path=Opts#cdb_options.waste_path, sync_strategy=Opts#cdb_options.sync_strategy, - log_options=Opts#cdb_options.log_options}}. + log_options=Opts#cdb_options.log_options, + monitor=Opts#cdb_options.monitor}}. starting({open_writer, Filename}, _From, State) -> leveled_log:save(State#state.log_options), @@ -660,26 +653,21 @@ rolling({delete_pending, ManSQN, Inker}, State) -> State#state{delete_point=ManSQN, inker=Inker, deferred_delete=true}}. 
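[Editorial aside, illustration only - not part of the patch.] With cdb_timings gone, the only stats-related state a CDB process keeps is the shared monitor reference copied from cdb_options above. A monitor is {Pid, SampleRate} with SampleRate in 0..100, and {no_monitor, 0} disables sampling entirely; the same tuple is threaded into inker_options, penciller_options and sst_options in the later hunks, so every component casts to one collector. A hypothetical wiring (the 10% sample rate is an assumed value, not taken from this patch):

{ok, MonitorPid} = leveled_monitor:monitor_start(),
Monitor = {MonitorPid, 10},
%% Monitor then becomes the monitor field of #cdb_options{}, #inker_options{},
%% #penciller_options{} and #sst_options{} when the store is opened.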
reader({get_kv, Key}, _From, State) -> - {UpdTimings, Result} = + Result = get_withcache(State#state.handle, Key, State#state.hash_index, State#state.binary_mode, - State#state.timings), - {UpdTimings0, CountDown} = - update_statetimings(UpdTimings, State#state.timings_countdown), - {reply, - Result, - reader, - State#state{timings = UpdTimings0, timings_countdown = CountDown}}; + State#state.monitor), + {reply, Result, reader, State}; reader({key_check, Key}, _From, State) -> - {no_timing, Result} = + Result = get_withcache(State#state.handle, Key, State#state.hash_index, loose_presence, State#state.binary_mode, - no_timing), + {no_monitor, 0}), {reply, Result, reader, State}; reader({get_positions, SampleSize, Index, Acc}, _From, State) -> {Pos, Count} = element(Index + 1, State#state.hash_index), @@ -744,27 +732,21 @@ reader({delete_pending, ManSQN, Inker}, State) -> delete_pending({get_kv, Key}, _From, State) -> - {UpdTimings, Result} = + Result = get_withcache(State#state.handle, Key, State#state.hash_index, State#state.binary_mode, - State#state.timings), - {UpdTimings0, CountDown} = - update_statetimings(UpdTimings, State#state.timings_countdown), - {reply, - Result, - delete_pending, - State#state{timings = UpdTimings0, timings_countdown = CountDown}, - ?DELETE_TIMEOUT}; + State#state.monitor), + {reply, Result, delete_pending, State, ?DELETE_TIMEOUT}; delete_pending({key_check, Key}, _From, State) -> - {no_timing, Result} = + Result = get_withcache(State#state.handle, Key, State#state.hash_index, loose_presence, State#state.binary_mode, - no_timing), + {no_monitor, 0}), {reply, Result, delete_pending, State, ?DELETE_TIMEOUT}. delete_pending(timeout, State=#state{delete_point=ManSQN}) when ManSQN > 0 -> @@ -1058,36 +1040,31 @@ mput(Handle, KVList, {LastPosition, HashTree0}, BinaryMode, MaxSize) -> end. --spec get_withcache(file:io_device(), - any(), - tuple(), - boolean(), - cdb_timings()) - -> {cdb_timings(), missing|probably|tuple()}. +-spec get_withcache( + file:io_device(), any(), tuple(), boolean(), + leveled_monitor:monitor()) -> missing|probably|tuple(). %% @doc %% %% Using a cache of the Index array - get a K/V pair from the file using the %% Key. should return an updated timings object (if timings are being taken) %% along with the result (which may be missing if the no matching entry is %% found, or probably in QuickCheck scenarios) -get_withcache(Handle, Key, Cache, BinaryMode, Timings) -> - get(Handle, Key, Cache, true, BinaryMode, Timings). +get_withcache(Handle, Key, Cache, BinaryMode, Monitor) -> + get(Handle, Key, Cache, true, BinaryMode, Monitor). -get_withcache(Handle, Key, Cache, QuickCheck, BinaryMode, Timings) -> - get(Handle, Key, Cache, QuickCheck, BinaryMode, Timings). +get_withcache(Handle, Key, Cache, QuickCheck, BinaryMode, Monitor) -> + get(Handle, Key, Cache, QuickCheck, BinaryMode, Monitor). get(FileNameOrHandle, Key, BinaryMode) -> - {no_timing, R} = - get(FileNameOrHandle, Key, no_cache, true, BinaryMode, no_timing), - R. + get(FileNameOrHandle, Key, no_cache, true, BinaryMode, {no_monitor, 0}). --spec get(list()|file:io_device(), - any(), no_cache|tuple(), - loose_presence|any(), - boolean(), - cdb_timings()) - -> {cdb_timings(), tuple()|probably|missing}. +-spec get( + list()|file:io_device(), + any(), no_cache|tuple(), + loose_presence|any(), + boolean(), + leveled_monitor:monitor()) -> tuple()|probably|missing. %% @doc %% %% Get a K/V pair from the file using the Key. 
QuickCheck can be set to @@ -1096,38 +1073,37 @@ get(FileNameOrHandle, Key, BinaryMode) -> %% that Key) %% %% Timings also passed in and can be updated based on results -get(FileName, Key, Cache, QuickCheck, BinaryMode, Timings) +get(FileName, Key, Cache, QuickCheck, BinaryMode, Monitor) when is_list(FileName) -> {ok, Handle} = file:open(FileName,[binary, raw, read]), - get(Handle, Key, Cache, QuickCheck, BinaryMode, Timings); -get(Handle, Key, Cache, QuickCheck, BinaryMode, Timings) + get(Handle, Key, Cache, QuickCheck, BinaryMode, Monitor); +get(Handle, Key, Cache, QuickCheck, BinaryMode, Monitor) when is_tuple(Handle) -> - SW = os:timestamp(), - + SW0 = leveled_monitor:maybe_time(Monitor), Hash = hash(Key), Index = hash_to_index(Hash), {HashTable, Count} = get_index(Handle, Index, Cache), - + {TS0, SW1} = leveled_monitor:step_time(SW0), % If the count is 0 for that index - key must be missing case Count of 0 -> - {Timings, missing}; + missing; _ -> % Get starting slot in hashtable {ok, FirstHashPosition} = file:position(Handle, {bof, HashTable}), Slot = hash_to_slot(Hash, Count), - UpdTimings = update_indextimings(Timings, SW), - search_hash_table(Handle, - {FirstHashPosition, - Slot, - 1, - Count}, - Hash, - Key, - QuickCheck, - BinaryMode, - UpdTimings) + {CycleCount, Result} = + search_hash_table( + Handle, + {FirstHashPosition, Slot, 1, Count}, + Hash, + Key, + QuickCheck, + BinaryMode), + {TS1, _SW2} = leveled_monitor:step_time(SW1), + maybelog_get_timing(Monitor, TS0, TS1, CycleCount), + Result end. get_index(Handle, Index, no_cache) -> @@ -1566,10 +1542,10 @@ read_integerpairs(< - {cdb_timings(), missing|probably|tuple()}. +-spec search_hash_table( + file:io_device(), tuple(), integer(), any(), + loose_presence|boolean(), boolean()) + -> {pos_integer(), missing|probably|tuple()}. %% @doc %% %% Seach the hash table for the matching hash and key. Be prepared for @@ -1580,16 +1556,15 @@ read_integerpairs(< + _QuickCheck, _BinaryMode) -> % We have done the full loop - value must not be present - {Timings, missing}; + {TotalSlots, missing}; search_hash_table(Handle, {FirstHashPosition, Slot, CycleCount, TotalSlots}, Hash, Key, - QuickCheck, BinaryMode, Timings) -> - + QuickCheck, BinaryMode) -> % Read the next 2 integers at current position, see if it matches the hash % we're after Offset = @@ -1599,7 +1574,7 @@ search_hash_table(Handle, case read_next_2_integers(Handle) of {0, 0} -> - {Timings, missing}; + {CycleCount, missing}; {Hash, DataLoc} -> KV = case QuickCheck of @@ -1611,91 +1586,33 @@ search_hash_table(Handle, case KV of missing -> leveled_log:log("CDB15", [Hash]), - search_hash_table(Handle, - {FirstHashPosition, - Slot, - CycleCount + 1, - TotalSlots}, - Hash, Key, - QuickCheck, BinaryMode, - Timings); + search_hash_table( + Handle, + {FirstHashPosition, Slot, CycleCount + 1, TotalSlots}, + Hash, Key, + QuickCheck, BinaryMode); _ -> - UpdTimings = update_fetchtimings(Timings, CycleCount), - {UpdTimings, KV} + {CycleCount, KV} end; _ -> - search_hash_table(Handle, - {FirstHashPosition, - Slot, - CycleCount + 1, - TotalSlots}, - Hash, Key, - QuickCheck, BinaryMode, - Timings) + search_hash_table( + Handle, + {FirstHashPosition, Slot, CycleCount + 1, TotalSlots}, + Hash, Key, + QuickCheck, BinaryMode) end. --spec update_fetchtimings(no_timing|cdb_timings(), integer()) -> - no_timing|cdb_timings(). 
-%% @doc -%% -%% Update the timings record if sample timings currently being taken -%% (otherwise the timngs record will be set to no_timing) -update_fetchtimings(no_timing, _CycleCount) -> - no_timing; -update_fetchtimings(Timings, CycleCount) -> - FetchTime = - timer:now_diff(os:timestamp(), - Timings#cdb_timings.fetchloop_starttime), - Timings#cdb_timings{sample_fetchtime = - Timings#cdb_timings.sample_fetchtime + FetchTime, - sample_cyclecount = - Timings#cdb_timings.sample_cyclecount + CycleCount, - sample_count = - Timings#cdb_timings.sample_count + 1}. - --spec update_indextimings(no_timing|cdb_timings(), erlang:timestamp()) -> - no_timing|cdb_timings(). -%% @doc -%% -%% Update the timings record with the time spent looking up the position -%% list to check from the index -update_indextimings(no_timing, _SW) -> - no_timing; -update_indextimings(Timings, SW) -> - IdxTime = timer:now_diff(os:timestamp(), SW), - Timings#cdb_timings{sample_indextime = - Timings#cdb_timings.sample_indextime - + IdxTime, - fetchloop_starttime = - os:timestamp()}. - --spec update_statetimings(cdb_timings(), integer()) - -> {cdb_timings(), integer()}. -%% @doc -%% -%% The timings state is either in countdown to the next set of samples of -%% we are actively collecting a sample. Active collection take place -%% when the countdown is 0. Once the sample has reached the expected count -%% then there is a log of that sample, and the countdown is restarted. -%% -%% Outside of sample windows the timings object should be set to the atom -%% no_timing. no_timing is a valid state for the cdb_timings type. -update_statetimings(no_timing, 0) -> - {#cdb_timings{}, 0}; -update_statetimings(Timings, 0) -> - case Timings#cdb_timings.sample_count of - SC when SC >= ?TIMING_SAMPLESIZE -> - leveled_log:log("CDB19", [Timings#cdb_timings.sample_count, - Timings#cdb_timings.sample_cyclecount, - Timings#cdb_timings.sample_fetchtime, - Timings#cdb_timings.sample_indextime]), - {no_timing, leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)}; - _SC -> - {Timings, 0} - end; -update_statetimings(no_timing, N) -> - {no_timing, N - 1}. +-spec maybelog_get_timing( + leveled_monitor:monitor(), + leveled_monitor:timing(), + leveled_monitor:timing(), + pos_integer()) -> ok. +maybelog_get_timing(_Monitor, no_timing, no_timing, _CC) -> + ok; +maybelog_get_timing({Pid, _StatsFreq}, IndexTime, ReadTime, CycleCount) -> + leveled_monitor:add_stat( + Pid, {cdb_get_update, CycleCount, IndexTime, ReadTime}). % Write Key and Value tuples into the CDB. 
Each tuple consists of a @@ -2307,12 +2224,13 @@ search_hash_table_findinslot_test() -> io:format("Slot 2 has Hash ~w Position ~w~n", [ReadH4, ReadP4]), ?assertMatch(0, ReadH4), ?assertMatch({"key1", "value1"}, get(Handle, Key1, false)), - ?assertMatch({no_timing, probably}, - get(Handle, Key1, - no_cache, loose_presence, false, no_timing)), - ?assertMatch({no_timing, missing}, - get(Handle, "Key99", - no_cache, loose_presence, false, no_timing)), + NoMonitor = {no_monitor, 0}, + ?assertMatch( + probably, + get(Handle, Key1, no_cache, loose_presence, false, NoMonitor)), + ?assertMatch( + missing, + get(Handle, "Key99", no_cache, loose_presence, false, NoMonitor)), {ok, _} = file:position(Handle, FirstHashPosition), FlipH3 = endian_flip(ReadH3), FlipP3 = endian_flip(ReadP3), diff --git a/src/leveled_log.erl b/src/leveled_log.erl index da47f7c5..43e455e1 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -73,16 +73,13 @@ {info, "Snapshot timing with sample_count=~w and bookie_time=~w pcl_time=~w"}}, {"B0018", {info, "Positive HEAD responses timed with sample_count=~w and " - ++ " pcl_time=~w rsp_time=~w"}}, + ++ " fetch_time=~w rsp_time=~w found_count=~w cache_count=~w"}}, {"B0019", {warn, "Use of book_indexfold with constraint of Bucket ~w with " ++ "no StartKey is deprecated"}}, {"B0020", {warn, "Ratio of penciller cache size ~w to bookie's memory " ++ "cache size ~w is larger than expected"}}, - {"B0021", - {info, "Bookie fetch RequestCount=~w and CacheCount=~w and " - ++ "ObjectFoundCount=~w"}}, {"R0001", {debug, "Object fold to process batch of ~w objects"}}, @@ -241,17 +238,17 @@ ++ "build_summary=~w read_switch=~w"}}, {"SST12", {info, "SST Timings at level=~w for sample_count=~w" - ++ " at timing points index_query_time=~w" - ++ " lookup_cache_time=~w slot_index_time=~w " - ++ " fetch_cache_time=~w slot_fetch_time=~w" - ++ " noncached_block_fetch_time=~w" - ++ " exiting at points slot_index=~w" - ++ " fetch_cache=~w slot_fetch=~w noncached_block_fetch=~w"}}, + ++ " at timing points notfound_time=~w fetchcache_time=~w" + ++ " slotcached_time=~w slotnoncached_time=~w " + ++ " exiting at points notfound_count=~w fetchcache_count=~w" + ++ " slotcached_count=~w slotnoncached_count=~w"}}, {"SST13", {info, "SST merge list build timings of" ++ " fold_toslot=~w slot_hashlist=~w" ++ " slot_serialise=~w slot_finish=~w" ++ " is_basement=~w level=~w"}}, + {"SST14", + {debug, "File ~s has completed BIC"}}, {"I0001", {info, "Unexpected failure to fetch value for Key=~w SQN=~w " @@ -372,17 +369,12 @@ ++ "to_list=~w sort=~w build=~w"}}, {"CDB15", {info, "Collision in search for hash ~w"}}, - {"CDB16", - {info, "CDB scan from start ~w in file with end ~w and last_key ~w"}}, - {"CDB17", - {info, "After ~w PUTs total_write_time=~w total_sync_time=~w " - ++ "and max_write_time=~w and max_sync_time=~w"}}, {"CDB18", {info, "Handled return and write of hashtable"}}, {"CDB19", {info, "Sample timings in microseconds for sample_count=~w " ++ "with totals of cycle_count=~w " - ++ "fetch_time=~w index_time=~w"}}, + ++ "index_time=~w read_time=~w"}}, {"CDB20", {warn, "Error ~w caught when safe reading a file to length ~w"}}, {"CDB21", diff --git a/src/leveled_monitor.erl b/src/leveled_monitor.erl new file mode 100644 index 00000000..4cfa45a8 --- /dev/null +++ b/src/leveled_monitor.erl @@ -0,0 +1,545 @@ +%% -------- MONITOR --------- +%% +%% The bookie's monitor is a process dedciated to gathering and reporting +%% stats related to performance of the store. 
+%% +%% The monitor was introduced as a sedicated process to reduce the number of +%% LoopState mutations otherwise necessary to track statistics, requiring +%% State copies even on read events. + +-module(leveled_monitor). + +-behaviour(gen_server). + +-export([ + init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3]). + +-export([ + monitor_start/0, + monitor_start/2, + add_stat/2, + report_stats/2, + monitor_close/1, + maybe_time/1, + step_time/1, + log_level/2, + log_add/2, + log_remove/2]). + +-include_lib("eunit/include/eunit.hrl"). + +-define( + LOG_ORDER, + [bookie_get, bookie_put, bookie_head, bookie_snap, + pcl_fetch, sst_fetch, cdb_get]). +-define(LOG_FREQUENCY_SECONDS, 30). + + +-record(bookie_get_timings, + {sample_count = 0 :: non_neg_integer(), + head_time = 0 :: non_neg_integer(), + body_time = 0 :: non_neg_integer(), + fetch_count = 0 :: non_neg_integer()}). + +-record(bookie_head_timings, + {sample_count = 0 :: non_neg_integer(), + fetch_time = 0 :: non_neg_integer(), + rsp_time = 0 :: non_neg_integer(), + fetch_count = 0 :: non_neg_integer(), + cache_count = 0 :: non_neg_integer()}). + +-record(bookie_put_timings, + {sample_count = 0 :: non_neg_integer(), + mem_time = 0 :: non_neg_integer(), + ink_time = 0 :: non_neg_integer(), + total_size = 0 :: non_neg_integer()}). + +-record(bookie_snap_timings, + {sample_count = 0 :: non_neg_integer(), + bookie_time = 0 :: non_neg_integer(), + pcl_time = 0 :: non_neg_integer()}). + +-record(pcl_fetch_timings, + {sample_count = 0 :: non_neg_integer(), + foundmem_time = 0 :: non_neg_integer(), + found0_time = 0 :: non_neg_integer(), + found1_time = 0 :: non_neg_integer(), + found2_time = 0 :: non_neg_integer(), + found3_time = 0 :: non_neg_integer(), + foundlower_time = 0 :: non_neg_integer(), + notfound_time = 0 :: non_neg_integer(), + foundmem_count = 0 :: non_neg_integer(), + found0_count = 0 :: non_neg_integer(), + found1_count = 0 :: non_neg_integer(), + found2_count = 0 :: non_neg_integer(), + found3_count = 0 :: non_neg_integer(), + foundlower_count = 0 :: non_neg_integer(), + notfound_count = 0 :: non_neg_integer()}). + +-record(sst_fetch_timings, + {sample_count = 0 :: non_neg_integer(), + fetchcache_time = 0 :: non_neg_integer(), + slotcached_time = 0 :: non_neg_integer(), + slotnoncached_time = 0 :: non_neg_integer(), + notfound_time = 0 :: non_neg_integer(), + fetchcache_count = 0 :: non_neg_integer(), + slotcached_count = 0 :: non_neg_integer(), + slotnoncached_count = 0 :: non_neg_integer(), + notfound_count = 0 :: non_neg_integer()}). + +-record(cdb_get_timings, + {sample_count = 0 :: non_neg_integer(), + cycle_count = 0 :: non_neg_integer(), + index_time = 0 :: non_neg_integer(), + read_time = 0 :: non_neg_integer()}). + +-record(state, + {bookie_get_timings = #bookie_get_timings{} :: bookie_get_timings(), + bookie_head_timings = #bookie_head_timings{} :: bookie_head_timings(), + bookie_put_timings = #bookie_put_timings{} :: bookie_put_timings(), + bookie_snap_timings = #bookie_snap_timings{} :: bookie_snap_timings(), + pcl_fetch_timings = #pcl_fetch_timings{} :: pcl_fetch_timings(), + sst_fetch_timings = [] :: list(sst_fetch_timings()), + cdb_get_timings = #cdb_get_timings{} :: cdb_get_timings(), + log_frequency = ?LOG_FREQUENCY_SECONDS :: pos_integer(), + log_order = [] :: list(log_types())}). + + +-type bookie_get_timings() :: #bookie_get_timings{}. +-type bookie_head_timings() :: #bookie_head_timings{}. +-type bookie_put_timings() :: #bookie_put_timings{}. 
+-type bookie_snap_timings() :: #bookie_snap_timings{}. +-type pcl_fetch_timings() :: #pcl_fetch_timings{}. +-type cdb_get_timings() :: #cdb_get_timings{}. +-type sst_fetch_timings() :: + {leveled_pmanifest:lsm_level(), #sst_fetch_timings{}}. +-type log_types() :: + bookie_head|bookie_get|bookie_put|bookie_snap|pcl_fetch|sst_fetch|cdb_get. +-type pcl_level() :: mem|leveled_pmanifest:lsm_level(). +-type sst_fetch_type() :: + fetch_cache|slot_cachedblock|slot_noncachedblock|not_found. +-type microsecs() :: pos_integer(). +-type byte_size() :: pos_integer(). +-type monitor() :: {no_monitor, 0}|{pid(), 0..100}. +-type timing() :: no_timing|pos_integer(). + + +-type bookie_get_update() :: + {bookie_get_update, microsecs(), microsecs()|not_found}. +-type bookie_head_update() :: + {bookie_head_update, microsecs(), microsecs()|not_found, 0..1}. +-type bookie_put_update() :: + {bookie_put_update, microsecs(), microsecs(), byte_size()}. +-type bookie_snap_update() :: + {bookie_snap_update, microsecs(), microsecs()}. +-type pcl_fetch_update() :: + {pcl_fetch_update, not_found|pcl_level(), microsecs()}. +-type sst_fetch_update() :: + {sst_fetch_update, + leveled_pmanifest:lsm_level(), sst_fetch_type(), microsecs()}. +-type cdb_get_update() :: + {cdb_get_update, pos_integer(), microsecs(), microsecs()}. +-type statistic() :: + bookie_get_update()|bookie_head_update()|bookie_put_update()| + bookie_snap_update()| + pcl_fetch_update()|sst_fetch_update()|cdb_get_update(). + +-export_type([monitor/0, timing/0, sst_fetch_type/0]). + +%%%============================================================================ +%%% API +%%%============================================================================ + +-spec monitor_start() -> {ok, pid()}. +monitor_start() -> + monitor_start(?LOG_FREQUENCY_SECONDS, ?LOG_ORDER). + +-spec monitor_start(pos_integer(), list(log_types())) -> {ok, pid()}. +monitor_start(LogFreq, LogOrder) -> + gen_server:start_link( + ?MODULE, [leveled_log:get_opts(), LogFreq, LogOrder], []). + +-spec add_stat(pid(), statistic()) -> ok. +add_stat(Watcher, Statistic) -> + gen_server:cast(Watcher, Statistic). + +-spec report_stats(pid(), log_types()) -> ok. +report_stats(Watcher, StatsType) -> + gen_server:cast(Watcher, {report_stats, StatsType}). + +-spec monitor_close(pid()|no_monitor) -> ok. +monitor_close(no_monitor) -> + ok; +monitor_close(Watcher) -> + gen_server:call(Watcher, close, 60000). + +-spec log_level(pid(), leveled_log:log_level()) -> ok. +log_level(Pid, LogLevel) -> + gen_server:cast(Pid, {log_level, LogLevel}). + +-spec log_add(pid(), list(string())) -> ok. +log_add(Pid, ForcedLogs) -> + gen_server:cast(Pid, {log_add, ForcedLogs}). + +-spec log_remove(pid(), list(string())) -> ok. +log_remove(Pid, ForcedLogs) -> + gen_server:cast(Pid, {log_remove, ForcedLogs}). + +-spec maybe_time(monitor()) -> os:timestamp()|no_timing. +maybe_time({_Pid, TimingProbability}) -> + case leveled_rand:uniform(100) of + N when N =< TimingProbability -> + os:timestamp(); + _ -> + no_timing + end. + +-spec step_time( + os:timestamp()|no_timing) -> + {pos_integer(), os:timestamp()}|{no_timing, no_timing}. +step_time(no_timing) -> + {no_timing, no_timing}; +step_time(TS) -> + Now = os:timestamp(), + {timer:now_diff(Now, TS), Now}. 
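[Editorial aside, illustration only - not part of the patch.] A usage example for the sampling API above: maybe_time/1 yields a start timestamp for roughly SampleRate per cent of calls and no_timing otherwise; step_time/1 then either measures the elapsed microseconds or propagates no_timing, so unsampled requests carry almost no cost:

sampling_example() ->
    %% A disabled monitor never yields a timestamp ...
    no_timing = leveled_monitor:maybe_time({no_monitor, 0}),
    {no_timing, no_timing} = leveled_monitor:step_time(no_timing),
    %% ... while a 100% sample rate always does. self() stands in for a real
    %% monitor pid; maybe_time/1 only makes the sampling decision locally.
    SW0 = leveled_monitor:maybe_time({self(), 100}),
    {Micros, _SW1} = leveled_monitor:step_time(SW0),
    true = is_integer(Micros) andalso Micros >= 0.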
+ + +%%%============================================================================ +%%% gen_server callbacks +%%%============================================================================ + +init([LogOpts, LogFrequency, LogOrder]) -> + leveled_log:save(LogOpts), + leveled_rand:seed(), + RandomLogOrder = + lists:map( + fun({_R, SL}) -> SL end, + lists:keysort( + 1, + lists:map( + fun(L) -> {leveled_rand:uniform(), L} end, + LogOrder))), + InitialJitter = leveled_rand:uniform(2 * 1000 * LogFrequency), + erlang:send_after(InitialJitter, self(), report_next_stats), + {ok, #state{log_frequency = LogFrequency, log_order = RandomLogOrder}}. + +handle_call(close, _From, State) -> + {stop, normal, ok, State}. + +handle_cast({bookie_head_update, FetchTime, RspTime, CacheHit}, State) -> + Timings = State#state.bookie_head_timings, + SC0 = Timings#bookie_head_timings.sample_count + 1, + CC0 = Timings#bookie_head_timings.cache_count + CacheHit, + {FC0, PT0, RT0} = + case RspTime of + not_found -> + {Timings#bookie_head_timings.fetch_count, + Timings#bookie_head_timings.fetch_time + FetchTime, + Timings#bookie_head_timings.rsp_time}; + RspTime -> + {Timings#bookie_head_timings.fetch_count + 1, + Timings#bookie_head_timings.fetch_time + FetchTime, + Timings#bookie_head_timings.rsp_time + RspTime} + end, + UpdTimings = + #bookie_head_timings{ + sample_count = SC0, + fetch_time = PT0, + rsp_time = RT0, + fetch_count = FC0, + cache_count = CC0 + }, + {noreply, State#state{bookie_head_timings = UpdTimings}}; +handle_cast({bookie_get_update, HeadTime, BodyTime}, State) -> + Timings = State#state.bookie_get_timings, + SC0 = Timings#bookie_get_timings.sample_count + 1, + {FC0, HT0, BT0} = + case BodyTime of + not_found -> + {Timings#bookie_get_timings.fetch_count, + Timings#bookie_get_timings.head_time + HeadTime, + Timings#bookie_get_timings.body_time}; + BodyTime -> + {Timings#bookie_get_timings.fetch_count + 1, + Timings#bookie_get_timings.head_time + HeadTime, + Timings#bookie_get_timings.body_time + BodyTime} + end, + UpdTimings = + #bookie_get_timings{ + sample_count = SC0, + head_time = HT0, + body_time = BT0, + fetch_count = FC0 + }, + {noreply, State#state{bookie_get_timings = UpdTimings}}; +handle_cast({bookie_put_update, MemTime, InkTime, Size}, State) -> + Timings = State#state.bookie_put_timings, + SC0 = Timings#bookie_put_timings.sample_count + 1, + SZ0 = Timings#bookie_put_timings.total_size + Size, + MT0 = Timings#bookie_put_timings.mem_time + MemTime, + IT0 = Timings#bookie_put_timings.ink_time + InkTime, + UpdTimings = + #bookie_put_timings{ + sample_count = SC0, + mem_time = MT0, + ink_time = IT0, + total_size = SZ0 + }, + {noreply, State#state{bookie_put_timings = UpdTimings}}; +handle_cast({bookie_snap_update, BookieTime, PCLTime}, State) -> + Timings = State#state.bookie_snap_timings, + SC0 = Timings#bookie_snap_timings.sample_count + 1, + BT0 = Timings#bookie_snap_timings.bookie_time + BookieTime, + PT0 = Timings#bookie_snap_timings.pcl_time + PCLTime, + UpdTimings = + #bookie_snap_timings{ + sample_count = SC0, + bookie_time = BT0, + pcl_time = PT0 + }, + {noreply, State#state{bookie_snap_timings = UpdTimings}}; +handle_cast({pcl_fetch_update, Level, FetchTime}, State) -> + Timings = State#state.pcl_fetch_timings, + SC0 = Timings#pcl_fetch_timings.sample_count + 1, + UpdTimings = + case Level of + not_found -> + Timings#pcl_fetch_timings{ + notfound_count = + Timings#pcl_fetch_timings.notfound_count + 1, + notfound_time = + Timings#pcl_fetch_timings.notfound_time + FetchTime + }; + 
memory -> + Timings#pcl_fetch_timings{ + foundmem_count = + Timings#pcl_fetch_timings.foundmem_count + 1, + foundmem_time = + Timings#pcl_fetch_timings.foundmem_time + FetchTime + }; + 0 -> + Timings#pcl_fetch_timings{ + found0_count = + Timings#pcl_fetch_timings.found0_count + 1, + found0_time = + Timings#pcl_fetch_timings.found0_time + FetchTime + }; + 1 -> + Timings#pcl_fetch_timings{ + found1_count = + Timings#pcl_fetch_timings.found1_count + 1, + found1_time = + Timings#pcl_fetch_timings.found1_time + FetchTime + }; + 2 -> + Timings#pcl_fetch_timings{ + found2_count = + Timings#pcl_fetch_timings.found2_count + 1, + found2_time = + Timings#pcl_fetch_timings.found2_time + FetchTime + }; + 3 -> + Timings#pcl_fetch_timings{ + found3_count = + Timings#pcl_fetch_timings.found3_count + 1, + found3_time = + Timings#pcl_fetch_timings.found3_time + FetchTime + }; + N when N > 3 -> + Timings#pcl_fetch_timings{ + foundlower_count = + Timings#pcl_fetch_timings.foundlower_count + 1, + foundlower_time = + Timings#pcl_fetch_timings.foundlower_time + FetchTime + } + end, + UpdTimings0 = UpdTimings#pcl_fetch_timings{sample_count = SC0}, + {noreply, State#state{pcl_fetch_timings = UpdTimings0}}; +handle_cast({sst_fetch_update, Level, FetchPoint, FetchTime}, State) -> + Timings = + case lists:keyfind(Level, 1, State#state.sst_fetch_timings) of + {Level, PrvTimings} -> + PrvTimings; + false -> + #sst_fetch_timings{} + end, + SC0 = Timings#sst_fetch_timings.sample_count + 1, + UpdTimings = + case FetchPoint of + not_found -> + Timings#sst_fetch_timings{ + notfound_count = + Timings#sst_fetch_timings.notfound_count + 1, + notfound_time = + Timings#sst_fetch_timings.notfound_time + FetchTime + }; + fetch_cache -> + Timings#sst_fetch_timings{ + fetchcache_count = + Timings#sst_fetch_timings.fetchcache_count + 1, + fetchcache_time = + Timings#sst_fetch_timings.fetchcache_time + FetchTime + }; + slot_cachedblock -> + Timings#sst_fetch_timings{ + slotcached_count = + Timings#sst_fetch_timings.slotcached_count + 1, + slotcached_time = + Timings#sst_fetch_timings.slotcached_time + FetchTime + }; + slot_noncachedblock -> + Timings#sst_fetch_timings{ + slotnoncached_count = + Timings#sst_fetch_timings.slotnoncached_count + 1, + slotnoncached_time = + Timings#sst_fetch_timings.slotnoncached_time + FetchTime + } + end, + UpdLevel = {Level, UpdTimings#sst_fetch_timings{sample_count = SC0}}, + UpdLevels = + lists:ukeysort(1, [UpdLevel|State#state.sst_fetch_timings]), + {noreply, State#state{sst_fetch_timings = UpdLevels}}; +handle_cast({cdb_get_update, CycleCount, IndexTime, ReadTime}, State) -> + Timings = State#state.cdb_get_timings, + SC0 = Timings#cdb_get_timings.sample_count + 1, + CC0 = Timings#cdb_get_timings.cycle_count + CycleCount, + IT0 = Timings#cdb_get_timings.index_time + IndexTime, + RT0 = Timings#cdb_get_timings.read_time + ReadTime, + UpdTimings = + #cdb_get_timings{ + sample_count = SC0, + cycle_count = CC0, + index_time = IT0, + read_time = RT0 + }, + {noreply, State#state{cdb_get_timings = UpdTimings}}; +handle_cast({report_stats, bookie_get}, State) -> + Timings = State#state.bookie_get_timings, + leveled_log:log( + "B0016", + [Timings#bookie_get_timings.sample_count, + Timings#bookie_get_timings.head_time, + Timings#bookie_get_timings.body_time, + Timings#bookie_get_timings.fetch_count]), + {noreply, State#state{bookie_get_timings = #bookie_get_timings{}}}; +handle_cast({report_stats, bookie_head}, State) -> + Timings = State#state.bookie_head_timings, + leveled_log:log( + "B0018", + 
[Timings#bookie_head_timings.sample_count, + Timings#bookie_head_timings.fetch_time, + Timings#bookie_head_timings.rsp_time, + Timings#bookie_head_timings.fetch_count, + Timings#bookie_head_timings.cache_count]), + {noreply, State#state{bookie_head_timings = #bookie_head_timings{}}}; +handle_cast({report_stats, bookie_put}, State) -> + Timings = State#state.bookie_put_timings, + leveled_log:log( + "B0015", + [Timings#bookie_put_timings.sample_count, + Timings#bookie_put_timings.mem_time, + Timings#bookie_put_timings.ink_time, + Timings#bookie_put_timings.total_size]), + {noreply, State#state{bookie_put_timings = #bookie_put_timings{}}}; +handle_cast({report_stats, bookie_snap}, State) -> + Timings = State#state.bookie_snap_timings, + leveled_log:log( + "B0017", + [Timings#bookie_snap_timings.sample_count, + Timings#bookie_snap_timings.bookie_time, + Timings#bookie_snap_timings.pcl_time]), + {noreply, State#state{bookie_snap_timings = #bookie_snap_timings{}}}; +handle_cast({report_stats, pcl_fetch}, State) -> + Timings = State#state.pcl_fetch_timings, + leveled_log:log( + "P0032", + [Timings#pcl_fetch_timings.sample_count, + Timings#pcl_fetch_timings.foundmem_time, + Timings#pcl_fetch_timings.found0_time, + Timings#pcl_fetch_timings.found1_time, + Timings#pcl_fetch_timings.found2_time, + Timings#pcl_fetch_timings.found3_time, + Timings#pcl_fetch_timings.foundlower_time, + Timings#pcl_fetch_timings.notfound_time, + Timings#pcl_fetch_timings.foundmem_count, + Timings#pcl_fetch_timings.found0_count, + Timings#pcl_fetch_timings.found1_count, + Timings#pcl_fetch_timings.found2_count, + Timings#pcl_fetch_timings.found3_count, + Timings#pcl_fetch_timings.foundlower_count, + Timings#pcl_fetch_timings.notfound_count]), + {noreply, State#state{pcl_fetch_timings = #pcl_fetch_timings{}}}; +handle_cast({report_stats, sst_fetch}, State) -> + LogFun = + fun({Level, Timings}) -> + leveled_log:log( + "SST12", + [Level, + Timings#sst_fetch_timings.sample_count, + Timings#sst_fetch_timings.notfound_time, + Timings#sst_fetch_timings.fetchcache_time, + Timings#sst_fetch_timings.slotcached_time, + Timings#sst_fetch_timings.slotnoncached_time, + Timings#sst_fetch_timings.notfound_count, + Timings#sst_fetch_timings.fetchcache_count, + Timings#sst_fetch_timings.slotcached_count, + Timings#sst_fetch_timings.slotnoncached_count]) + end, + lists:foreach(LogFun, State#state.sst_fetch_timings), + {noreply, State#state{sst_fetch_timings = []}}; +handle_cast({report_stats, cdb_get}, State) -> + Timings = State#state.cdb_get_timings, + leveled_log:log( + "CDB19", + [Timings#cdb_get_timings.sample_count, + Timings#cdb_get_timings.cycle_count, + Timings#cdb_get_timings.index_time, + Timings#cdb_get_timings.read_time]), + {noreply, State#state{cdb_get_timings = #cdb_get_timings{}}}; +handle_cast({log_level, LogLevel}, State) -> + ok = leveled_log:set_loglevel(LogLevel), + {noreply, State}; +handle_cast({log_add, ForcedLogs}, State) -> + ok = leveled_log:add_forcedlogs(ForcedLogs), + {noreply, State}; +handle_cast({log_remove, ForcedLogs}, State) -> + ok = leveled_log:remove_forcedlogs(ForcedLogs), + {noreply, State}. + +handle_info(report_next_stats, State) -> + erlang:send_after( + State#state.log_frequency * 1000, self(), report_next_stats), + case State#state.log_order of + [] -> + {noreply, State}; + [NextStat|TailLogOrder] -> + ok = report_stats(self(), NextStat), + {noreply, State#state{log_order = TailLogOrder ++ [NextStat]}} + end. + +terminate(_Reason, _State) -> + ok. 
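[Editorial aside, illustration only - not part of the patch.] The monitor reports one stat family per tick rather than all of them at once: handle_info(report_next_stats, ...) above rotates through log_order, so with the default 30 second frequency and seven stat types each family is logged roughly every three and a half minutes, and its accumulator is reset after each report. A sketch of a monitor restricted to penciller and CDB fetch stats on a one minute tick, with one forced report:

{ok, Monitor} = leveled_monitor:monitor_start(60, [pcl_fetch, cdb_get]),
%% ... run load with {Monitor, SampleRate} passed into the store options ...
ok = leveled_monitor:report_stats(Monitor, pcl_fetch),  % emit P0032 immediately
ok = leveled_monitor:monitor_close(Monitor).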
+ +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +coverage_cheat_test() -> + {ok, M} = monitor_start(1, []), + timer:sleep(2000), + {ok, _State1} = code_change(null, #state{}, null), + % Can close, so empty log_order hasn't crashed + ok = monitor_close(M). + +-endif. \ No newline at end of file diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 6bcb7133..3ce6a32f 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -233,12 +233,8 @@ % related to specific query, and the StartKey/EndKey % used to extract this part - persisted_sqn = 0 :: integer(), % The highest SQN persisted - + persisted_sqn = 0 :: integer(), % The highest SQN persisted ledger_sqn = 0 :: integer(), % The highest SQN added to L0 - root_path = "test" :: string(), - - clerk :: pid() | undefined, levelzero_pending = false :: boolean(), levelzero_constructor :: pid() | undefined, @@ -248,13 +244,16 @@ levelzero_cointoss = false :: boolean(), levelzero_index :: leveled_pmem:index_array() | undefined | redacted, + levelzero_astree :: list() | undefined | redacted, + + root_path = "test" :: string(), + clerk :: pid() | undefined, is_snapshot = false :: boolean(), snapshot_fully_loaded = false :: boolean(), snapshot_time :: pos_integer() | undefined, source_penciller :: pid() | undefined, bookie_monref :: reference() | undefined, - levelzero_astree :: list() | undefined | redacted, work_ongoing = false :: boolean(), % i.e. compaction work work_backlog = false :: boolean(), % i.e. compaction work @@ -262,30 +261,13 @@ pending_removals = [] :: list(string()), maybe_release = false :: boolean(), - timings = no_timing :: pcl_timings(), - timings_countdown = 0 :: integer(), - snaptimeout_short :: pos_integer()|undefined, snaptimeout_long :: pos_integer()|undefined, + monitor = {no_monitor, 0} :: leveled_monitor:monitor(), + sst_options = #sst_options{} :: sst_options()}). --record(pcl_timings, - {sample_count = 0 :: integer(), - foundmem_time = 0 :: integer(), - found0_time = 0 :: integer(), - found1_time = 0 :: integer(), - found2_time = 0 :: integer(), - found3_time = 0 :: integer(), - foundlower_time = 0 :: integer(), - missed_time = 0 :: integer(), - foundmem_count = 0 :: integer(), - found0_count = 0 :: integer(), - found1_count = 0 :: integer(), - found2_count = 0 :: integer(), - found3_count = 0 :: integer(), - foundlower_count = 0 :: integer(), - missed_count = 0 :: integer()}). -type penciller_options() :: #penciller_options{}. -type bookies_memory() :: {tuple()|empty_cache, @@ -293,7 +275,6 @@ integer()|infinity, integer()}. -type pcl_state() :: #state{}. --type pcl_timings() :: no_timing|#pcl_timings{}. -type levelzero_cacheentry() :: {pos_integer(), leveled_tree:leveled_tree()}. -type levelzero_cache() :: list(levelzero_cacheentry()). -type iterator_entry() @@ -656,9 +637,9 @@ init([LogOpts, PCLopts]) -> BookiesMem, LongRunning), leveled_log:log("P0001", [self()]), - {ok, State#state{is_snapshot=true, + {ok, State#state{is_snapshot = true, bookie_monref = BookieMonitor, - source_penciller=SrcPenciller}}; + source_penciller = SrcPenciller}}; {_RootPath, _Snapshot=false, _Q, _BM} -> start_from_file(PCLopts) end. 
@@ -737,15 +718,12 @@ handle_call({fetch, Key, Hash, UseL0Index}, _From, State) -> false -> none end, - {R, UpdTimings} = timed_fetch_mem(Key, - Hash, - State#state.manifest, - State#state.levelzero_cache, - L0Idx, - State#state.timings), - {UpdTimings0, CountDown} = - update_statetimings(UpdTimings, State#state.timings_countdown), - {reply, R, State#state{timings=UpdTimings0, timings_countdown=CountDown}}; + R = + timed_fetch_mem( + Key, Hash, State#state.manifest, + State#state.levelzero_cache, L0Idx, + State#state.monitor), + {reply, R, State}; handle_call({check_sqn, Key, Hash, SQN}, _From, State) -> {reply, compare_to_sqn( @@ -1226,6 +1204,7 @@ start_from_file(PCLopts) -> RootPath = PCLopts#penciller_options.root_path, MaxTableSize = PCLopts#penciller_options.max_inmemory_tablesize, OptsSST = PCLopts#penciller_options.sst_options, + Monitor = PCLopts#penciller_options.monitor, SnapTimeoutShort = PCLopts#penciller_options.snaptimeout_short, SnapTimeoutLong = PCLopts#penciller_options.snaptimeout_long, @@ -1244,7 +1223,8 @@ start_from_file(PCLopts) -> levelzero_index = [], snaptimeout_short = SnapTimeoutShort, snaptimeout_long = SnapTimeoutLong, - sst_options = OptsSST}, + sst_options = OptsSST, + monitor = Monitor}, %% Open manifest Manifest0 = leveled_pmanifest:open_manifest(RootPath), @@ -1440,10 +1420,12 @@ roll_memory(NextManSQN, LedgerSQN, RootPath, L0Cache, CL, SSTOpts, true) -> {Constructor, Bloom}. --spec timed_fetch_mem(tuple(), {integer(), integer()}, - leveled_pmanifest:manifest(), list(), - leveled_pmem:index_array(), pcl_timings()) - -> {tuple(), pcl_timings()}. +-spec timed_fetch_mem( + tuple(), + {integer(), integer()}, + leveled_pmanifest:manifest(), list(), + leveled_pmem:index_array(), + leveled_monitor:monitor()) -> leveled_codec:ledger_kv()|not_found. %% @doc %% Fetch the result from the penciller, starting by looking in the memory, %% and if it is not found looking down level by level through the LSM tree. @@ -1453,12 +1435,13 @@ roll_memory(NextManSQN, LedgerSQN, RootPath, L0Cache, CL, SSTOpts, true) -> %% the cost of requests dropping levels can be monitored. %% %% the result tuple includes the level at which the result was found. -timed_fetch_mem(Key, Hash, Manifest, L0Cache, L0Index, Timings) -> - SW = os:timestamp(), +timed_fetch_mem(Key, Hash, Manifest, L0Cache, L0Index, Monitor) -> + SW0 = leveled_monitor:maybe_time(Monitor), {R, Level} = fetch_mem(Key, Hash, Manifest, L0Cache, L0Index, fun timed_sst_get/4), - UpdTimings = update_timings(SW, Timings, R, Level), - {R, UpdTimings}. + {TS0, _SW1} = leveled_monitor:step_time(SW0), + maybelog_fetch_timing(Monitor, Level, TS0, R == not_present), + R. -spec fetch_sqn( @@ -1886,95 +1869,17 @@ find_nextkey(QueryArray, LCnt, end. - -%%%============================================================================ -%%% Timing Functions -%%%============================================================================ - --spec update_statetimings(pcl_timings(), integer()) - -> {pcl_timings(), integer()}. -%% @doc -%% -%% The timings state is either in countdown to the next set of samples of -%% we are actively collecting a sample. Active collection take place -%% when the countdown is 0. Once the sample has reached the expected count -%% then there is a log of that sample, and the countdown is restarted. -%% -%% Outside of sample windows the timings object should be set to the atom -%% no_timing. no_timing is a valid state for the pcl_timings type. 
-update_statetimings(no_timing, 0) -> - {#pcl_timings{}, 0}; -update_statetimings(Timings, 0) -> - case Timings#pcl_timings.sample_count of - SC when SC >= ?TIMING_SAMPLESIZE -> - log_timings(Timings), - {no_timing, leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)}; - _SC -> - {Timings, 0} - end; -update_statetimings(no_timing, N) -> - {no_timing, N - 1}. - -log_timings(Timings) -> - leveled_log:log("P0032", [Timings#pcl_timings.sample_count, - Timings#pcl_timings.foundmem_time, - Timings#pcl_timings.found0_time, - Timings#pcl_timings.found1_time, - Timings#pcl_timings.found2_time, - Timings#pcl_timings.found3_time, - Timings#pcl_timings.foundlower_time, - Timings#pcl_timings.missed_time, - Timings#pcl_timings.foundmem_count, - Timings#pcl_timings.found0_count, - Timings#pcl_timings.found1_count, - Timings#pcl_timings.found2_count, - Timings#pcl_timings.found3_count, - Timings#pcl_timings.foundlower_count, - Timings#pcl_timings.missed_count]). - --spec update_timings(erlang:timestamp(), pcl_timings(), - not_found|tuple(), integer()|basement) - -> pcl_timings(). -%% @doc -%% -%% update the timings record unless the current record object is the atom -%% no_timing. -update_timings(_SW, no_timing, _Result, _Stage) -> - no_timing; -update_timings(SW, Timings, Result, Stage) -> - Timer = timer:now_diff(os:timestamp(), SW), - SC = Timings#pcl_timings.sample_count + 1, - Timings0 = Timings#pcl_timings{sample_count = SC}, - case {Result, Stage} of - {not_present, _} -> - NFT = Timings#pcl_timings.missed_time + Timer, - NFC = Timings#pcl_timings.missed_count + 1, - Timings0#pcl_timings{missed_time = NFT, missed_count = NFC}; - {_, memory} -> - PMT = Timings#pcl_timings.foundmem_time + Timer, - PMC = Timings#pcl_timings.foundmem_count + 1, - Timings0#pcl_timings{foundmem_time = PMT, foundmem_count = PMC}; - {_, 0} -> - L0T = Timings#pcl_timings.found0_time + Timer, - L0C = Timings#pcl_timings.found0_count + 1, - Timings0#pcl_timings{found0_time = L0T, found0_count = L0C}; - {_, 1} -> - L1T = Timings#pcl_timings.found1_time + Timer, - L1C = Timings#pcl_timings.found1_count + 1, - Timings0#pcl_timings{found1_time = L1T, found1_count = L1C}; - {_, 2} -> - L2T = Timings#pcl_timings.found2_time + Timer, - L2C = Timings#pcl_timings.found2_count + 1, - Timings0#pcl_timings{found2_time = L2T, found2_count = L2C}; - {_, 3} -> - L3T = Timings#pcl_timings.found3_time + Timer, - L3C = Timings#pcl_timings.found3_count + 1, - Timings0#pcl_timings{found3_time = L3T, found3_count = L3C}; - _ -> - LLT = Timings#pcl_timings.foundlower_time + Timer, - LLC = Timings#pcl_timings.foundlower_count + 1, - Timings0#pcl_timings{foundlower_time = LLT, foundlower_count = LLC} - end. +-spec maybelog_fetch_timing( + leveled_monitor:monitor(), + memory|leveled_pmanifest:lsm_level(), + leveled_monitor:timing(), + boolean()) -> ok. +maybelog_fetch_timing(_Monitor, _Level, no_timing, _NF) -> + ok; +maybelog_fetch_timing({Pid, _StatsFreq}, _Level, FetchTime, true) -> + leveled_monitor:add_stat(Pid, {pcl_fetch_update, not_found, FetchTime}); +maybelog_fetch_timing({Pid, _StatsFreq}, Level, FetchTime, _NF) -> + leveled_monitor:add_stat(Pid, {pcl_fetch_update, Level, FetchTime}). 
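[Editorial aside, illustration only - not part of the patch.] maybelog_fetch_timing/4 above replaces the whole pcl_timings record; the per-level accounting now lives in the monitor's pcl_fetch_timings, keyed by the level at which fetch_mem answered. The stat one sampled fetch contributes, as a sketch:

example_fetch_stat(MonitorPid) ->
    %% A key found at level 2 after 85 microseconds is added to found2_count
    %% and found2_time; a miss is cast as {pcl_fetch_update, not_found, Time},
    %% and any level deeper than 3 is pooled under foundlower.
    leveled_monitor:add_stat(MonitorPid, {pcl_fetch_update, 2, 85}).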
%%%============================================================================ @@ -2166,13 +2071,15 @@ simple_server_test() -> ?assertMatch(Key3, pcl_fetch(PCLr, {o,"Bucket0003", "Key0003", null})), ?assertMatch(Key4, pcl_fetch(PCLr, {o,"Bucket0004", "Key0004", null})), - {ok, PclSnap, null, _} = - leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(), - PCLr, - null, - ledger, - undefined, - false), + {ok, PclSnap, null} = + leveled_bookie:snapshot_store( + leveled_bookie:empty_ledgercache(), + PCLr, + null, + {no_monitor, 0}, + ledger, + undefined, + false), ?assertMatch(Key1, pcl_fetch(PclSnap, {o,"Bucket0001", "Key0001", null})), ?assertMatch(Key2, pcl_fetch(PclSnap, {o,"Bucket0002", "Key0002", null})), @@ -2221,13 +2128,15 @@ simple_server_test() -> 1)), ok = pcl_close(PclSnap), - {ok, PclSnap2, null, _} = - leveled_bookie:snapshot_store(leveled_bookie:empty_ledgercache(), - PCLr, - null, - ledger, - undefined, - false), + {ok, PclSnap2, null} = + leveled_bookie:snapshot_store( + leveled_bookie:empty_ledgercache(), + PCLr, + null, + {no_monitor, 0}, + ledger, + undefined, + false), ?assertMatch(replaced, pcl_checksequencenumber(PclSnap2, {o, @@ -2433,19 +2342,6 @@ slow_fetch_test() -> ?assertMatch(not_present, log_slowfetch(2, not_present, "fake", 0, 1)), ?assertMatch("value", log_slowfetch(2, "value", "fake", 0, 1)). -timings_test() -> - SW = os:timestamp(), - timer:sleep(1), - T0 = update_timings(SW, #pcl_timings{}, {"K", "V"}, 2), - timer:sleep(1), - T1 = update_timings(SW, T0, {"K", "V"}, 3), - T2 = update_timings(SW, T1, {"K", "V"}, basement), - ?assertMatch(3, T2#pcl_timings.sample_count), - ?assertMatch(true, T2#pcl_timings.foundlower_time > T2#pcl_timings.found2_time), - ?assertMatch(1, T2#pcl_timings.found2_count), - ?assertMatch(1, T2#pcl_timings.found3_count), - ?assertMatch(1, T2#pcl_timings.foundlower_count). - coverage_cheat_test() -> {noreply, _State0} = handle_info(timeout, #state{}), @@ -2511,10 +2407,15 @@ handle_down_test() -> loop() -> receive {snap, PCLr, TestPid} -> - {ok, Snap, null, _Timings} = + {ok, Snap, null} = leveled_bookie:snapshot_store( leveled_bookie:empty_ledgercache(), - PCLr, null, ledger, undefined, false), + PCLr, + null, + {no_monitor, 0}, + ledger, + undefined, + false), TestPid ! {self(), {ok, Snap, null}}, loop(); stop -> diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index a8411831..af657878 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -202,6 +202,8 @@ :: no_cache|non_neg_integer(). -type level() :: non_neg_integer(). +-type summary_filter() + :: fun((leveled_codec:ledger_key()) -> any()). %% yield_blockquery is used to determine if the work necessary to process a %% range query beyond the fetching the slot should be managed from within @@ -220,8 +222,6 @@ blockindex_cache :: blockindex_cache()|undefined, compression_method = native :: press_method(), index_moddate = ?INDEX_MODDATE :: boolean(), - timings = no_timing :: sst_timings(), - timings_countdown = 0 :: integer(), starting_pid :: pid()|undefined, fetch_cache = no_cache :: fetch_cache(), new_slots :: list()|undefined, @@ -230,22 +230,8 @@ tomb_count = not_counted :: non_neg_integer()|not_counted, high_modified_date :: non_neg_integer()|undefined, - filter_fun - :: fun((leveled_codec:ledger_key()) -> any()) | undefined}). 
- --record(sst_timings, - {sample_count = 0 :: integer(), - index_query_time = 0 :: integer(), - lookup_cache_time = 0 :: integer(), - slot_index_time = 0 :: integer(), - fetch_cache_time = 0 :: integer(), - slot_fetch_time = 0 :: integer(), - noncached_block_time = 0 :: integer(), - lookup_cache_count = 0 :: integer(), - slot_index_count = 0 :: integer(), - fetch_cache_count = 0 :: integer(), - slot_fetch_count = 0 :: integer(), - noncached_block_count = 0 :: integer()}). + filter_fun :: summary_filter() | undefined, + monitor = {no_monitor, 0} :: leveled_monitor:monitor()}). -record(build_timings, {slot_hashlist = 0 :: integer(), @@ -254,7 +240,6 @@ fold_toslot = 0 :: integer()}). -type sst_state() :: #state{}. --type sst_timings() :: no_timing|#sst_timings{}. -type build_timings() :: no_timing|#build_timings{}. -export_type([expandable_pointer/0, press_method/0]). @@ -541,13 +526,6 @@ sst_switchlevels(Pid, NewLevel) -> sst_close(Pid) -> gen_fsm:sync_send_event(Pid, close). --spec sst_printtimings(pid()) -> ok. -%% @doc -%% The state of the FSM keeps track of timings of operations, and this can -%% forced to be printed. -%% Used in unit tests to force the printing of timings -sst_printtimings(Pid) -> - gen_fsm:sync_send_event(Pid, print_timings). %%%============================================================================ @@ -559,6 +537,7 @@ init([]) -> starting({sst_open, RootPath, Filename, OptsSST, Level}, _From, State) -> leveled_log:save(OptsSST#sst_options.log_options), + Monitor = OptsSST#sst_options.monitor, {UpdState, Bloom} = read_file(Filename, State#state{root_path=RootPath}, @@ -567,17 +546,19 @@ starting({sst_open, RootPath, Filename, OptsSST, Level}, _From, State) -> {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}, Bloom}, reader, - UpdState#state{level = Level, fetch_cache = new_cache(Level)}}; + UpdState#state{ + level = Level, fetch_cache = new_cache(Level), monitor = Monitor}}; starting({sst_new, RootPath, Filename, Level, {SlotList, FirstKey}, MaxSQN, OptsSST, IdxModDate, CountOfTombs, StartingPID}, _From, State) -> SW = os:timestamp(), leveled_log:save(OptsSST#sst_options.log_options), + Monitor = OptsSST#sst_options.monitor, PressMethod = OptsSST#sst_options.press_method, {Length, SlotIndex, BlockEntries, SlotsBin, Bloom} = build_all_slots(SlotList), - {BlockIndex, HighModDate} = + {_, BlockIndex, HighModDate} = update_blockindex_cache(true, BlockEntries, new_blockindex_cache(Length), @@ -602,11 +583,13 @@ starting({sst_new, {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}, Bloom}, reader, - UpdState#state{blockindex_cache = BlockIndex, - high_modified_date = HighModDate, - starting_pid = StartingPID, - level = Level, - fetch_cache = new_cache(Level)}}; + UpdState#state{ + blockindex_cache = BlockIndex, + high_modified_date = HighModDate, + starting_pid = StartingPID, + level = Level, + fetch_cache = new_cache(Level), + monitor = Monitor}}; starting({sst_newlevelzero, RootPath, Filename, Penciller, MaxSQN, OptsSST, IdxModDate}, _From, State) -> @@ -630,6 +613,7 @@ starting(complete_l0startup, State) -> SW0 = os:timestamp(), FetchedSlots = State#state.new_slots, leveled_log:save(OptsSST#sst_options.log_options), + Monitor = OptsSST#sst_options.monitor, PressMethod = OptsSST#sst_options.press_method, FetchFun = fun(Slot) -> lists:nth(Slot, FetchedSlots) end, KVList = leveled_pmem:to_list(length(FetchedSlots), FetchFun), @@ -643,7 +627,7 @@ starting(complete_l0startup, State) -> SW2 = os:timestamp(), {SlotCount, SlotIndex, 
BlockEntries, SlotsBin,Bloom} = build_all_slots(SlotList), - {BlockIndex, HighModDate} = + {_, BlockIndex, HighModDate} = update_blockindex_cache(true, BlockEntries, new_blockindex_cache(SlotCount), @@ -682,17 +666,20 @@ starting(complete_l0startup, State) -> undefined -> ok; _ -> - leveled_penciller:pcl_confirml0complete(Penciller, - UpdState#state.filename, - Summary#summary.first_key, - Summary#summary.last_key, - Bloom), + leveled_penciller:pcl_confirml0complete( + Penciller, + UpdState#state.filename, + Summary#summary.first_key, + Summary#summary.last_key, + Bloom), ok end, {next_state, reader, - UpdState#state{blockindex_cache = BlockIndex, - high_modified_date = HighModDate}}; + UpdState#state{ + blockindex_cache = BlockIndex, + high_modified_date = HighModDate, + monitor = Monitor}}; starting({sst_returnslot, FetchedSlot, FetchFun, SlotCount}, State) -> Self = self(), FetchedSlots = @@ -725,21 +712,47 @@ starting({sst_returnslot, FetchedSlot, FetchFun, SlotCount}, State) -> reader({get_sqn, LedgerKey, Hash}, _From, State) -> % Get a KV value and potentially take sample timings - {Result, UpdState, _UpdTimings} = - fetch(LedgerKey, Hash, State, no_timing), - {reply, sqn_only(Result), reader, UpdState, ?HIBERNATE_TIMEOUT}; + {Result, _BIC, _HMD, _FC} = + fetch( + LedgerKey, Hash, + State#state.summary, + State#state.compression_method, + State#state.high_modified_date, + State#state.index_moddate, + State#state.filter_fun, + State#state.blockindex_cache, + State#state.fetch_cache, + State#state.handle, + State#state.level, + {no_monitor, 0}), + {reply, sqn_only(Result), reader, State, ?HIBERNATE_TIMEOUT}; reader({get_kv, LedgerKey, Hash}, _From, State) -> % Get a KV value and potentially take sample timings - {Result, UpdState, UpdTimings} = - fetch(LedgerKey, Hash, State, State#state.timings), - - {UpdTimings0, CountDown} = - update_statetimings(UpdTimings, - State#state.timings_countdown, - State#state.level), - - {reply, Result, reader, UpdState#state{timings = UpdTimings0, - timings_countdown = CountDown}}; + {Result, BIC, HMD, FC} = + fetch( + LedgerKey, Hash, + State#state.summary, + State#state.compression_method, + State#state.high_modified_date, + State#state.index_moddate, + State#state.filter_fun, + State#state.blockindex_cache, + State#state.fetch_cache, + State#state.handle, + State#state.level, + State#state.monitor), + case {BIC, HMD, FC} of + {no_update, no_update, no_update} -> + {reply, Result, reader, State}; + {no_update, no_update, FC} -> + {reply, Result, reader, State#state{fetch_cache = FC}}; + {BIC, HMD, no_update} -> + {reply, + Result, + reader, + State#state{ + blockindex_cache = BIC, high_modified_date = HMD}} + end; reader({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod}, _From, State) -> ReadNeeded = @@ -774,17 +787,23 @@ reader({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod}, PressMethod, IdxModDate, SegList), - {BlockIdxC0, HighModDate} = + {UpdateCache, BlockIdxC0, HighModDate} = update_blockindex_cache(NeedBlockIdx, FoundBIC, State#state.blockindex_cache, State#state.high_modified_date, State#state.index_moddate), - {reply, - L ++ SlotsToPoint, - reader, - State#state{blockindex_cache = BlockIdxC0, - high_modified_date = HighModDate}} + case UpdateCache of + true -> + {reply, + L ++ SlotsToPoint, + reader, + State#state{ + blockindex_cache = BlockIdxC0, + high_modified_date = HighModDate}}; + false -> + {reply, L ++ SlotsToPoint, reader, State} + end end; reader({get_slots, SlotList, SegList, LowLastMod}, _From, 
State) -> PressMethod = State#state.compression_method, @@ -804,9 +823,6 @@ reader({get_slots, SlotList, SegList, LowLastMod}, _From, State) -> reader(get_maxsequencenumber, _From, State) -> Summary = State#state.summary, {reply, Summary#summary.max_sqn, reader, State}; -reader(print_timings, _From, State) -> - log_timings(State#state.timings, State#state.level), - {reply, ok, reader, State}; reader({set_for_delete, Penciller}, _From, State) -> leveled_log:log("SST06", [State#state.filename]), {reply, @@ -851,12 +867,35 @@ reader({switch_levels, NewLevel}, State) -> delete_pending({get_sqn, LedgerKey, Hash}, _From, State) -> % Get a KV value and potentially take sample timings - {Result, UpdState, _UpdTimings} = - fetch(LedgerKey, Hash, State, no_timing), - {reply, sqn_only(Result), delete_pending, UpdState, ?DELETE_TIMEOUT}; + {Result, _BIC, _HMD, _FC} = + fetch( + LedgerKey, Hash, + State#state.summary, + State#state.compression_method, + State#state.high_modified_date, + State#state.index_moddate, + State#state.filter_fun, + State#state.blockindex_cache, + State#state.fetch_cache, + State#state.handle, + State#state.level, + {no_monitor, 0}), + {reply, sqn_only(Result), delete_pending, State, ?DELETE_TIMEOUT}; delete_pending({get_kv, LedgerKey, Hash}, _From, State) -> - {Result, UpdState, _Ts} = fetch(LedgerKey, Hash, State, no_timing), - {reply, Result, delete_pending, UpdState, ?DELETE_TIMEOUT}; + {Result, _BIC, _HMD, _FC} = + fetch( + LedgerKey, Hash, + State#state.summary, + State#state.compression_method, + State#state.high_modified_date, + State#state.index_moddate, + State#state.filter_fun, + State#state.blockindex_cache, + State#state.fetch_cache, + State#state.handle, + State#state.level, + {no_monitor, 0}), + {reply, Result, delete_pending, State, ?DELETE_TIMEOUT}; delete_pending({get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod}, _From, State) -> {_NeedBlockIdx, SlotsToFetchBinList, SlotsToPoint} = @@ -912,7 +951,7 @@ handle_sync_event(_Msg, _From, StateName, State) -> {reply, undefined, StateName, State}. handle_event({update_blockindex_cache, BIC}, StateName, State) -> - {BlockIndexCache, HighModDate} = + {_, BlockIndexCache, HighModDate} = update_blockindex_cache(true, BIC, State#state.blockindex_cache, @@ -931,6 +970,7 @@ handle_info(bic_complete, StateName, State) -> % The block index cache is complete, so the memory footprint should be % relatively stable from this point. Hibernate to help minimise % fragmentation + leveled_log:log("SST14", [State#state.filename]), {next_state, StateName, State, hibernate}; handle_info(start_complete, StateName, State) -> % The SST file will be started by a clerk, but the clerk may be shut down @@ -1279,11 +1319,12 @@ updatebic_foldfun(HMDRequired) -> -spec update_blockindex_cache( boolean(), list({integer(), binary()}), blockindex_cache(), non_neg_integer()|undefined, - boolean()) -> {blockindex_cache(), non_neg_integer()|undefined}. + boolean()) -> + {boolean(), blockindex_cache(), non_neg_integer()|undefined}. 
update_blockindex_cache(true, Entries, BIC, HighModDate, IdxModDate) -> case {element(1, BIC), array:size(element(2, BIC))} of {N, N} -> - {BIC, HighModDate}; + {false, BIC, HighModDate}; {N, S} when N < S -> FoldFun = case {HighModDate, IdxModDate} of @@ -1295,16 +1336,16 @@ update_blockindex_cache(true, Entries, BIC, HighModDate, IdxModDate) -> BIC0 = lists:foldl(FoldFun, BIC, Entries), case {element(1, BIC0), IdxModDate} of {N, _} -> - {BIC, HighModDate}; + {false, BIC, HighModDate}; {S, true} -> erlang:send(self(), bic_complete), - {BIC0, element(3, BIC0)}; + {true, BIC0, element(3, BIC0)}; _ -> - {BIC0, undefined} + {true, BIC0, undefined} end end; update_blockindex_cache(_Needed, _Entries, BIC, HighModDate, _IdxModDate) -> - {BIC, HighModDate}. + {false, BIC, HighModDate}. -spec check_modified(non_neg_integer()|undefined, non_neg_integer(), @@ -1315,89 +1356,97 @@ check_modified(HighLastModifiedInSST, LowModDate, true) check_modified(_, _, _) -> true. --spec fetch(tuple(), - {integer(), integer()}|integer(), - sst_state(), sst_timings()) - -> {not_present|tuple(), sst_state(), sst_timings()}. +-spec fetch( + leveled_codec:ledger_key(), + leveled_codec:segment_hash(), + sst_summary(), + press_method(), + non_neg_integer()|undefined, + boolean(), + summary_filter(), + blockindex_cache(), + fetch_cache(), + file:fd(), + leveled_pmanifest:lsm_level(), + leveled_monitor:monitor()) + -> {not_present|leveled_codec:ledger_kv(), + blockindex_cache()|no_update, + non_neg_integer()|undefined|no_update, + fetch_cache()|no_update}. %% @doc %% %% Fetch a key from the store, potentially taking timings. Result should be %% not_present if the key is not in the store. -fetch(LedgerKey, Hash, State, Timings0) -> - SW0 = os:timestamp(), - - Summary = State#state.summary, - PressMethod = State#state.compression_method, - IdxModDate = State#state.index_moddate, +fetch(LedgerKey, Hash, + Summary, + PressMethod, HighModDate, IndexModDate, FilterFun, BIC, FetchCache, + Handle, Level, Monitor) -> + SW0 = leveled_monitor:maybe_time(Monitor), Slot = - lookup_slot( - LedgerKey, Summary#summary.index, State#state.filter_fun), - - {SW1, Timings1} = update_timings(SW0, Timings0, index_query, true), - + lookup_slot(LedgerKey, Summary#summary.index, FilterFun), SlotID = Slot#slot_index_value.slot_id, - CachedBlockIdx = - array:get(SlotID - 1, element(2, State#state.blockindex_cache)), - {SW2, Timings2} = update_timings(SW1, Timings1, lookup_cache, true), - - case extract_header(CachedBlockIdx, IdxModDate) of + CachedBlockIdx = array:get(SlotID - 1, element(2, BIC)), + + case extract_header(CachedBlockIdx, IndexModDate) of none -> - SlotBin = read_slot(State#state.handle, Slot), + SlotBin = read_slot(Handle, Slot), {Result, Header} = - binaryslot_get(SlotBin, LedgerKey, Hash, PressMethod, IdxModDate), - {BlockIndexCache, HighModDate} = + binaryslot_get( + SlotBin, LedgerKey, Hash, PressMethod, IndexModDate), + {_UpdateState, BIC0, HMD0} = update_blockindex_cache(true, [{SlotID, Header}], - State#state.blockindex_cache, - State#state.high_modified_date, - State#state.index_moddate), - {_SW3, Timings3} = - update_timings(SW2, Timings2, noncached_block, false), - {Result, - State#state{blockindex_cache = BlockIndexCache, - high_modified_date = HighModDate}, - Timings3}; + BIC, + HighModDate, + IndexModDate), + case Result of + not_present -> + maybelog_fetch_timing( + Monitor, Level, not_found, SW0); + _ -> + maybelog_fetch_timing( + Monitor, Level, slot_noncachedblock, SW0) + end, + {Result, BIC0, HMD0, no_update}; 
{BlockLengths, _LMD, PosBin} -> PosList = find_pos(PosBin, extract_hash(Hash), [], 0), case PosList of [] -> - {_SW3, Timings3} = - update_timings(SW2, Timings2, slot_index, false), - {not_present, State, Timings3}; + maybelog_fetch_timing(Monitor, Level, not_found, SW0), + {not_present, no_update, no_update, no_update}; _ -> - {SW3, Timings3} = - update_timings(SW2, Timings2, slot_index, true), - FetchCache = State#state.fetch_cache, - CacheHash = cache_hash(Hash, State#state.level), + CacheHash = cache_hash(Hash, Level), case fetch_from_cache(CacheHash, FetchCache) of {LedgerKey, V} -> - {_SW4, Timings4} = - update_timings(SW3, - Timings3, - fetch_cache, - false), - {{LedgerKey, V}, State, Timings4}; + maybelog_fetch_timing( + Monitor, Level, fetch_cache, SW0), + {{LedgerKey, V}, no_update, no_update, no_update}; _ -> StartPos = Slot#slot_index_value.start_position, Result = check_blocks(PosList, - {State#state.handle, StartPos}, + {Handle, StartPos}, BlockLengths, byte_size(PosBin), LedgerKey, PressMethod, - IdxModDate, + IndexModDate, not_present), - FetchCache0 = - add_to_cache(CacheHash, Result, FetchCache), - {_SW4, Timings4} = - update_timings(SW3, - Timings3, - slot_fetch, - false), - {Result, - State#state{fetch_cache = FetchCache0}, - Timings4} + case Result of + not_present -> + maybelog_fetch_timing( + Monitor, Level, not_found, SW0), + {not_present, + no_update, no_update, no_update}; + _ -> + FetchCache0 = + add_to_cache( + CacheHash, Result, FetchCache), + maybelog_fetch_timing( + Monitor, Level, slot_cachedblock, SW0), + {Result, + no_update, no_update, FetchCache0} + end end end end. @@ -3019,99 +3068,16 @@ log_buildtimings(Timings, LI) -> element(1, LI), element(2, LI)]). - --spec update_statetimings(sst_timings(), integer(), non_neg_integer()) - -> {sst_timings(), integer()}. -%% @doc -%% -%% The timings state is either in countdown to the next set of samples of -%% we are actively collecting a sample. Active collection take place -%% when the countdown is 0. Once the sample has reached the expected count -%% then there is a log of that sample, and the countdown is restarted. -%% -%% Outside of sample windows the timings object should be set to the atom -%% no_timing. no_timing is a valid state for the cdb_timings type. -update_statetimings(no_timing, 0, _Level) -> - {#sst_timings{}, 0}; -update_statetimings(Timings, 0, Level) -> - case Timings#sst_timings.sample_count of - SC when SC >= ?TIMING_SAMPLESIZE -> - log_timings(Timings, Level), - % If file at lower level wait longer before tsking another - % sample - {no_timing, - leveled_rand:uniform(2 * ?TIMING_SAMPLECOUNTDOWN)}; - _SC -> - {Timings, 0} - end; -update_statetimings(no_timing, N, _Level) -> - {no_timing, N - 1}. - -log_timings(no_timing, _Level) -> +-spec maybelog_fetch_timing( + leveled_monitor:monitor(), + leveled_pmanifest:lsm_level(), + leveled_monitor:sst_fetch_type(), + erlang:timestamp()|no_timing) -> ok. +maybelog_fetch_timing(_Monitor, _Level, _Type, no_timing) -> ok; -log_timings(Timings, Level) -> - leveled_log:log("SST12", [Level, - Timings#sst_timings.sample_count, - Timings#sst_timings.index_query_time, - Timings#sst_timings.lookup_cache_time, - Timings#sst_timings.slot_index_time, - Timings#sst_timings.fetch_cache_time, - Timings#sst_timings.slot_fetch_time, - Timings#sst_timings.noncached_block_time, - Timings#sst_timings.slot_index_count, - Timings#sst_timings.fetch_cache_count, - Timings#sst_timings.slot_fetch_count, - Timings#sst_timings.noncached_block_count]). 
- - -update_timings(_SW, no_timing, _Stage, _Continue) -> - {no_timing, no_timing}; -update_timings(SW, Timings, Stage, Continue) -> - Timer = timer:now_diff(os:timestamp(), SW), - Timings0 = - case Stage of - index_query -> - IQT = Timings#sst_timings.index_query_time, - Timings#sst_timings{index_query_time = IQT + Timer}; - lookup_cache -> - TBT = Timings#sst_timings.lookup_cache_time, - Timings#sst_timings{lookup_cache_time = TBT + Timer}; - slot_index -> - SIT = Timings#sst_timings.slot_index_time, - Timings#sst_timings{slot_index_time = SIT + Timer}; - fetch_cache -> - FCT = Timings#sst_timings.fetch_cache_time, - Timings#sst_timings{fetch_cache_time = FCT + Timer}; - slot_fetch -> - SFT = Timings#sst_timings.slot_fetch_time, - Timings#sst_timings{slot_fetch_time = SFT + Timer}; - noncached_block -> - NCT = Timings#sst_timings.noncached_block_time, - Timings#sst_timings{noncached_block_time = NCT + Timer} - end, - case Continue of - true -> - {os:timestamp(), Timings0}; - false -> - Timings1 = - case Stage of - slot_index -> - SIC = Timings#sst_timings.slot_index_count, - Timings0#sst_timings{slot_index_count = SIC + 1}; - fetch_cache -> - FCC = Timings#sst_timings.fetch_cache_count, - Timings0#sst_timings{fetch_cache_count = FCC + 1}; - slot_fetch -> - SFC = Timings#sst_timings.slot_fetch_count, - Timings0#sst_timings{slot_fetch_count = SFC + 1}; - noncached_block -> - NCC = Timings#sst_timings.noncached_block_count, - Timings0#sst_timings{noncached_block_count = NCC + 1} - end, - SC = Timings1#sst_timings.sample_count, - {no_timing, Timings1#sst_timings{sample_count = SC + 1}} - end. - +maybelog_fetch_timing({Pid, _SlotFreq}, Level, Type, SW) -> + {TS1, _} = leveled_monitor:step_time(SW), + leveled_monitor:add_stat(Pid, {sst_fetch_update, Level, Type, TS1}). %%%============================================================================ %%% Test @@ -4003,7 +3969,6 @@ simple_persisted_tester(SSTNewFun) -> "Checking for ~w keys (twice) in file with cache hit took ~w " ++ "microseconds~n", [length(KVList1), timer:now_diff(os:timestamp(), SW1)]), - ok = sst_printtimings(Pid), KVList2 = generate_randomkeys(1, ?LOOK_SLOTSIZE * 32, 1, 20), MapFun = fun({K, V}, Acc) -> @@ -4025,7 +3990,6 @@ simple_persisted_tester(SSTNewFun) -> io:format(user, "Checking for ~w missing keys took ~w microseconds~n", [length(KVList3), timer:now_diff(os:timestamp(), SW2)]), - ok = sst_printtimings(Pid), FetchList1 = sst_getkvrange(Pid, all, all, 2), FoldFun = fun(X, Acc) -> case X of @@ -4223,19 +4187,6 @@ check_segment_match(PosBinIndex1, KVL, TreeSize) -> end, lists:foreach(CheckFun, KVL). -timings_test() -> - SW = os:timestamp(), - timer:sleep(1), - {no_timing, T1} = update_timings(SW, #sst_timings{}, slot_index, false), - {no_timing, T2} = update_timings(SW, T1, slot_fetch, false), - {no_timing, T3} = update_timings(SW, T2, noncached_block, false), - timer:sleep(1), - {_, T4} = update_timings(SW, T3, slot_fetch, true), - ?assertMatch(3, T4#sst_timings.sample_count), - ?assertMatch(1, T4#sst_timings.slot_fetch_count), - ?assertMatch(true, T4#sst_timings.slot_fetch_time > - T3#sst_timings.slot_fetch_time). 
- take_max_lastmoddate_test() -> % TODO: Remove this test % Temporarily added to make dialyzer happy (until we've made use of last @@ -4390,20 +4341,20 @@ block_index_cache_test() -> HeaderTS = <<0:160/integer, Now:32/integer, 0:32/integer>>, HeaderNoTS = <<0:192>>, BIC = new_blockindex_cache(8), - {BIC0, undefined} = + {_, BIC0, undefined} = update_blockindex_cache(false, EntriesNoTS, BIC, undefined, false), - {BIC1, undefined} = + {_, BIC1, undefined} = update_blockindex_cache(false, EntriesTS, BIC, undefined, true), - {BIC2, undefined} = + {_, BIC2, undefined} = update_blockindex_cache(true, EntriesNoTS, BIC, undefined, false), {ETSP1, ETSP2} = lists:split(6, EntriesTS), - {BIC3, undefined} = + {_, BIC3, undefined} = update_blockindex_cache(true, ETSP1, BIC, undefined, true), - {BIC3, undefined} = + {_, BIC3, undefined} = update_blockindex_cache(true, ETSP1, BIC3, undefined, true), - {BIC4, LMD4} = + {_, BIC4, LMD4} = update_blockindex_cache(true, ETSP2, BIC3, undefined, true), - {BIC4, LMD4} = + {_, BIC4, LMD4} = update_blockindex_cache(true, ETSP2, BIC4, LMD4, true), ?assertMatch(none, array:get(0, element(2, BIC0))), diff --git a/test/end_to_end/riak_SUITE.erl b/test/end_to_end/riak_SUITE.erl index 6d416a57..cb37dc86 100644 --- a/test/end_to_end/riak_SUITE.erl +++ b/test/end_to_end/riak_SUITE.erl @@ -627,16 +627,15 @@ fetchclocks_modifiedbetween(_Config) -> io:format("Comparing queries for Obj1 TS range ~w ~w~n", [ObjL1StartTS, ObjL1EndTS]), - PlusFilterStart = os:timestamp(), - R3A_PlusFilter = lists:foldl(FoldRangesFun(Bookie1A, - {ObjL1StartTS, ObjL1EndTS}, - 100000, - 100000), - {0, 0}, lists:seq(1, 1)), - PlusFilterTime = timer:now_diff(os:timestamp(), PlusFilterStart)/1000, - io:format("R3A_PlusFilter ~w~n", [R3A_PlusFilter]), - true = {20000, 20000} == R3A_PlusFilter, - + PlusFilterTimes = + lists:map( + fun(_I) -> + time_filtered_query( + FoldRangesFun, Bookie1A, ObjL1StartTS, ObjL1EndTS) + end, + lists:seq(1, 4)), + PlusFilterTime = lists:sum(PlusFilterTimes) div 4, + NoFilterStart = os:timestamp(), {async, R3A_NoFilterRunner} = leveled_bookie:book_headfold(Bookie1A, @@ -649,7 +648,7 @@ fetchclocks_modifiedbetween(_Config) -> true, false), R3A_NoFilter = R3A_NoFilterRunner(), - NoFilterTime = timer:now_diff(os:timestamp(), NoFilterStart)/1000, + NoFilterTime = timer:now_diff(os:timestamp(), NoFilterStart) div 1000, io:format("R3A_NoFilter ~w~n", [R3A_NoFilter]), true = {20000, 20000} == R3A_NoFilter, io:format("Filtered query ~w ms and unfiltered query ~w ms~n", @@ -777,7 +776,18 @@ fetchclocks_modifiedbetween(_Config) -> ok = leveled_bookie:book_destroy(Bookie1A), ok = leveled_bookie:book_destroy(Bookie1BS). - + +time_filtered_query(FoldRangesFun, Bookie, ObjL1StartTS, ObjL1EndTS) -> + PlusFilterStart = os:timestamp(), + R3A_PlusFilter = lists:foldl(FoldRangesFun(Bookie, + {ObjL1StartTS, ObjL1EndTS}, + 100000, + 100000), + {0, 0}, lists:seq(1, 1)), + PlusFilterTime = timer:now_diff(os:timestamp(), PlusFilterStart) div 1000, + io:format("R3A_PlusFilter ~w in ~w~n", [R3A_PlusFilter, PlusFilterTime]), + true = {20000, 20000} == R3A_PlusFilter, + PlusFilterTime. 
lmdrange_tester(Bookie1BS, SimpleCountFun, ObjL4StartTS, ObjL6StartTS, ObjL6EndTS, TooLate) -> From 54af7cc556d66ceaded42ddd3bcf1a485d7cbb2a Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 30 Nov 2022 16:46:45 +0000 Subject: [PATCH 27/37] Make configurable --- docs/STARTUP_OPTIONS.md | 6 ++++++ priv/leveled.schema | 19 ++++++++++++++++--- priv/leveled_multi.schema | 18 ++++++++++++++++++ src/leveled_bookie.erl | 24 +++++++++++++++++++----- src/leveled_monitor.erl | 26 +++++++++++++------------- test/end_to_end/basic_SUITE.erl | 14 +++++++++----- test/end_to_end/riak_SUITE.erl | 4 +++- 7 files changed, 84 insertions(+), 27 deletions(-) diff --git a/docs/STARTUP_OPTIONS.md b/docs/STARTUP_OPTIONS.md index d09ebd3f..e5604c5f 100644 --- a/docs/STARTUP_OPTIONS.md +++ b/docs/STARTUP_OPTIONS.md @@ -120,3 +120,9 @@ There are two snapshot timeouts that can be configured: These set the period in seconds before a snapshot which has not shutdown, is declared to have been released - so that any file deletions which are awaiting the snapshot's completion can go ahead. This covers only silently failing snapshots. Snapshots that shutdown neatly will be released from locking deleted files when they shutdown. The 'short' timeout is used for snapshots which support index queries and bucket listing. The 'long' timeout is used for all other folds (e.g. key lists, head folds and object folds). + +## Statistic gathering + +Leveled will gather monitoring statistics on HEAD/GET/PUT requests, with timing points taken throughout the store. These timings are gathered by the `leveled_monitor`, and there are three configuration options. The two primary options are: `stats_percentage` is an integer between 0 and 100 which informs the store of the proprtion of the requests which should be timed at each part; and `monitor_log_frequency` which controls the frequency (in seconds) with which the leveled_monitor will write a log file (for one of the stats types in its queue). + +The specific stats types logged can be found in the ?LOG_LIST within the leveled_monitor. If a subset only is of interest, than this list can be modified by setting `monitor_log_list`. This can also be used to repeat the frequency of individual log types by adding them to the list multiple times. \ No newline at end of file diff --git a/priv/leveled.schema b/priv/leveled.schema index 6f726931..f6501eb4 100644 --- a/priv/leveled.schema +++ b/priv/leveled.schema @@ -188,7 +188,20 @@ hidden ]}. +%% @doc Statistic monitoring proportion +%% The proportion of requests to be convered by stats, an integer between +%% 0 and 100. There is no flow control, so setting this too high could +%% possibly overflow the leveled_monitor mailbox. +{mapping, "leveled.stats_percentage", "leveled.stats_percentage", [ + {default, 10}, + {datatype, integer}, + {validators, ["range:0-100"]} +]}. - - - +%% @doc Statistic log frequency (seconds) +%% The wait in seconds between logs from each leveled_monitor (there is one +%% monitor per vnode) +{mapping, "leveled.stats_percentage", "leveled.stats_percentage", [ + {default, 30}, + {datatype, integer} +]}. \ No newline at end of file diff --git a/priv/leveled_multi.schema b/priv/leveled_multi.schema index 0b4c854e..3d66c60d 100644 --- a/priv/leveled_multi.schema +++ b/priv/leveled_multi.schema @@ -178,7 +178,25 @@ hidden ]}. +%% @doc Statistic monitoring proportion +%% The proportion of requests to be convered by stats, an integer between +%% 0 and 100. 
There is no flow control, so setting this too high could +%% possibly overflow the leveled_monitor mailbox. +{mapping, "multi_backend.$name.leveled.stats_percentage", "riak_kv.multi_backend", [ + {default, 10}, + {datatype, integer}, + {validators, ["range:0-100"]}, + hidden +]}. +%% @doc Statistic log frequency (seconds) +%% The wait in seconds between logs from each leveled_monitor (there is one +%% monitor per vnode) +{mapping, "multi_backend.$name.leveled.stats_percentage", "riak_kv.multi_backend", [ + {default, 30}, + {datatype, integer}, + hidden +]}. diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index bc0f9852..2812ac75 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -158,7 +158,11 @@ {override_functions, []}, {snapshot_timeout_short, ?SNAPTIMEOUT_SHORT}, {snapshot_timeout_long, ?SNAPTIMEOUT_LONG}, - {stats_frequency, ?DEFAULT_STATS_PERC}]). + {stats_percentage, ?DEFAULT_STATS_PERC}, + {monitor_log_frequency, + element(1, leveled_monitor:get_defaults())}, + {monitor_log_list, + element(2, leveled_monitor:get_defaults())}]). -record(ledger_cache, {mem :: ets:tab(), loader = leveled_tree:empty(?CACHE_TYPE) @@ -359,9 +363,15 @@ % assumed to have failed, and so requires to be torndown. The % short timeout is applied to queries where long_running is set to % true - {stats_frequency, 0..100} + {stats_percentage, 0..100} | % Probability that stats will be collected for an individual - % request + % request. + {monitor_log_frequency, pos_integer()} | + % Time in seconds before logging the next timing log. This covers + % the logs associated with the timing of GET/PUTs in various parts + % of the system. There are 7 such logs - so setting to 30s will + % mean that each inidividual log will occur every 210s + {monitor_log_list, list(leveled_monitor:log_type())} ]. -type initial_loadfun() :: @@ -1168,8 +1178,12 @@ init([Opts]) -> DatabaseID = proplists:get_value(database_id, Opts), leveled_log:set_databaseid(DatabaseID), - {ok, Monitor} = leveled_monitor:monitor_start(), - StatLogFrequency = proplists:get_value(stats_frequency, Opts), + {ok, Monitor} = + leveled_monitor:monitor_start( + proplists:get_value(monitor_log_frequency, Opts), + proplists:get_value(monitor_log_list, Opts) + ), + StatLogFrequency = proplists:get_value(stats_percentage, Opts), {InkerOpts, PencillerOpts} = set_options(Opts, {Monitor, StatLogFrequency}), diff --git a/src/leveled_monitor.erl b/src/leveled_monitor.erl index 4cfa45a8..9601e53c 100644 --- a/src/leveled_monitor.erl +++ b/src/leveled_monitor.erl @@ -20,7 +20,6 @@ code_change/3]). -export([ - monitor_start/0, monitor_start/2, add_stat/2, report_stats/2, @@ -29,12 +28,12 @@ step_time/1, log_level/2, log_add/2, - log_remove/2]). + log_remove/2, + get_defaults/0]). -include_lib("eunit/include/eunit.hrl"). --define( - LOG_ORDER, +-define(LOG_LIST, [bookie_get, bookie_put, bookie_head, bookie_snap, pcl_fetch, sst_fetch, cdb_get]). -define(LOG_FREQUENCY_SECONDS, 30). @@ -107,7 +106,7 @@ sst_fetch_timings = [] :: list(sst_fetch_timings()), cdb_get_timings = #cdb_get_timings{} :: cdb_get_timings(), log_frequency = ?LOG_FREQUENCY_SECONDS :: pos_integer(), - log_order = [] :: list(log_types())}). + log_order = [] :: list(log_type())}). -type bookie_get_timings() :: #bookie_get_timings{}. @@ -118,7 +117,7 @@ -type cdb_get_timings() :: #cdb_get_timings{}. -type sst_fetch_timings() :: {leveled_pmanifest:lsm_level(), #sst_fetch_timings{}}. 
--type log_types() :: +-type log_type() :: bookie_head|bookie_get|bookie_put|bookie_snap|pcl_fetch|sst_fetch|cdb_get. -type pcl_level() :: mem|leveled_pmanifest:lsm_level(). -type sst_fetch_type() :: @@ -149,17 +148,13 @@ bookie_snap_update()| pcl_fetch_update()|sst_fetch_update()|cdb_get_update(). --export_type([monitor/0, timing/0, sst_fetch_type/0]). +-export_type([monitor/0, timing/0, sst_fetch_type/0, log_type/0]). %%%============================================================================ %%% API %%%============================================================================ --spec monitor_start() -> {ok, pid()}. -monitor_start() -> - monitor_start(?LOG_FREQUENCY_SECONDS, ?LOG_ORDER). - --spec monitor_start(pos_integer(), list(log_types())) -> {ok, pid()}. +-spec monitor_start(pos_integer(), list(log_type())) -> {ok, pid()}. monitor_start(LogFreq, LogOrder) -> gen_server:start_link( ?MODULE, [leveled_log:get_opts(), LogFreq, LogOrder], []). @@ -168,7 +163,7 @@ monitor_start(LogFreq, LogOrder) -> add_stat(Watcher, Statistic) -> gen_server:cast(Watcher, Statistic). --spec report_stats(pid(), log_types()) -> ok. +-spec report_stats(pid(), log_type()) -> ok. report_stats(Watcher, StatsType) -> gen_server:cast(Watcher, {report_stats, StatsType}). @@ -208,6 +203,9 @@ step_time(TS) -> Now = os:timestamp(), {timer:now_diff(Now, TS), Now}. +-spec get_defaults() -> {pos_integer(), list(log_type())}. +get_defaults() -> + {?LOG_FREQUENCY_SECONDS, ?LOG_LIST}. %%%============================================================================ %%% gen_server callbacks @@ -539,6 +537,8 @@ coverage_cheat_test() -> {ok, M} = monitor_start(1, []), timer:sleep(2000), {ok, _State1} = code_change(null, #state{}, null), + ok = add_stat(M, {pcl_fetch_update, 4, 100}), + ok = report_stats(M, pcl_fetch), % Can close, so empty log_order hasn't crashed ok = monitor_close(M). 
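
The monitor settings introduced in this patch are ordinary bookie startup options, so they can be combined with the existing ones. A minimal sketch of how a store might be started with them (not part of the patch itself; the path and the exact values are illustrative only, with the numeric values taken from the defaults in priv/leveled.schema and leveled_monitor):

    %% Illustrative only: stats_percentage, monitor_log_frequency and
    %% monitor_log_list are the option names added in this patch; the other
    %% options already exist in leveled_bookie.
    StartOpts =
        [{root_path, "/tmp/leveled_data"},       % hypothetical path
         {stats_percentage, 10},                 % time ~10% of requests
         {monitor_log_frequency, 30},            % one stats log every 30s
         %% a log_type() can be listed more than once to report it more often
         {monitor_log_list, [bookie_get, bookie_put, sst_fetch]}],
    {ok, Bookie} = leveled_bookie:book_start(StartOpts),
    %% ... use the store ...
    ok = leveled_bookie:book_close(Bookie).
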
diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl index d657d079..c93ef76d 100644 --- a/test/end_to_end/basic_SUITE.erl +++ b/test/end_to_end/basic_SUITE.erl @@ -95,10 +95,12 @@ simple_test_withlog(LogLevel, ForcedLogs) -> many_put_fetch_head(_Config) -> RootPath = testutil:reset_filestructure(), - StartOpts1 = [{root_path, RootPath}, - {max_pencillercachesize, 16000}, - {sync_strategy, riak_sync}, - {compression_point, on_compact}], + StartOpts1 = + [{root_path, RootPath}, + {max_pencillercachesize, 16000}, + {sync_strategy, riak_sync}, + {compression_point, on_compact} + ], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), {TestObject, TestSpec} = testutil:generate_testobject(), ok = testutil:book_riakput(Bookie1, TestObject, TestSpec), @@ -540,7 +542,9 @@ load_and_count(JournalSize, BookiesMemSize, PencillerMemSize) -> {max_journalsize, JournalSize}, {cache_size, BookiesMemSize}, {max_pencillercachesize, PencillerMemSize}, - {sync_strategy, testutil:sync_strategy()}], + {sync_strategy, testutil:sync_strategy()}, + {monitor_log_frequency, 5}, + {stats_probability, 80}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), {TestObject, TestSpec} = testutil:generate_testobject(), ok = testutil:book_riakput(Bookie1, TestObject, TestSpec), diff --git a/test/end_to_end/riak_SUITE.erl b/test/end_to_end/riak_SUITE.erl index cb37dc86..39de7ac6 100644 --- a/test/end_to_end/riak_SUITE.erl +++ b/test/end_to_end/riak_SUITE.erl @@ -43,7 +43,9 @@ basic_riak_tester(Bucket, KeyCount) -> {max_journalsize, 500000000}, {max_pencillercachesize, 24000}, {sync_strategy, testutil:sync_strategy()}, - {database_id, 32}], + {database_id, 32}, + {monitor_log_frequency, 5}, + {stats_probability, 80}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), IndexGenFun = From 4aaecf4ab46823461296ad852c693b5ada52346f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 1 Dec 2022 09:33:51 +0000 Subject: [PATCH 28/37] Close in loop If the send_after occurs immediately before the close, the prompted handle_cast may fail due to termination. Now close will cast to reply after the cast, and any additional messages in the queue are processed. A message received by handle_info in between will be ignored and not cast to prompt a log --- src/leveled_monitor.erl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/leveled_monitor.erl b/src/leveled_monitor.erl index 9601e53c..74bdb08a 100644 --- a/src/leveled_monitor.erl +++ b/src/leveled_monitor.erl @@ -106,7 +106,8 @@ sst_fetch_timings = [] :: list(sst_fetch_timings()), cdb_get_timings = #cdb_get_timings{} :: cdb_get_timings(), log_frequency = ?LOG_FREQUENCY_SECONDS :: pos_integer(), - log_order = [] :: list(log_type())}). + log_order = [] :: list(log_type()), + closer = false :: false|gen_server:from()}). -type bookie_get_timings() :: #bookie_get_timings{}. @@ -226,8 +227,9 @@ init([LogOpts, LogFrequency, LogOrder]) -> erlang:send_after(InitialJitter, self(), report_next_stats), {ok, #state{log_frequency = LogFrequency, log_order = RandomLogOrder}}. -handle_call(close, _From, State) -> - {stop, normal, ok, State}. +handle_call(close, From, State) -> + gen_server:cast(self(), close), + {noreply, State#state{closer = From}}. 
handle_cast({bookie_head_update, FetchTime, RspTime, CacheHit}, State) -> Timings = State#state.bookie_head_timings, @@ -507,9 +509,12 @@ handle_cast({log_add, ForcedLogs}, State) -> {noreply, State}; handle_cast({log_remove, ForcedLogs}, State) -> ok = leveled_log:remove_forcedlogs(ForcedLogs), - {noreply, State}. + {noreply, State}; +handle_cast(close, State) -> + gen_server:reply(State#state.closer, ok), + {stop, normal, State}. -handle_info(report_next_stats, State) -> +handle_info(report_next_stats, State=#state{closer = C}) when C == false -> erlang:send_after( State#state.log_frequency * 1000, self(), report_next_stats), case State#state.log_order of @@ -518,7 +523,9 @@ handle_info(report_next_stats, State) -> [NextStat|TailLogOrder] -> ok = report_stats(self(), NextStat), {noreply, State#state{log_order = TailLogOrder ++ [NextStat]}} - end. + end; +handle_info(_Msg, State) -> + {noreply, State}. terminate(_Reason, _State) -> ok. @@ -539,6 +546,7 @@ coverage_cheat_test() -> {ok, _State1} = code_change(null, #state{}, null), ok = add_stat(M, {pcl_fetch_update, 4, 100}), ok = report_stats(M, pcl_fetch), + M ! timeout, % Can close, so empty log_order hasn't crashed ok = monitor_close(M). From c8b426b334c75c0598812e2c30790b698e5cbec9 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 1 Dec 2022 12:34:14 +0000 Subject: [PATCH 29/37] Revert "Close in loop" This reverts commit 4aaecf4ab46823461296ad852c693b5ada52346f. --- src/leveled_monitor.erl | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/leveled_monitor.erl b/src/leveled_monitor.erl index 74bdb08a..9601e53c 100644 --- a/src/leveled_monitor.erl +++ b/src/leveled_monitor.erl @@ -106,8 +106,7 @@ sst_fetch_timings = [] :: list(sst_fetch_timings()), cdb_get_timings = #cdb_get_timings{} :: cdb_get_timings(), log_frequency = ?LOG_FREQUENCY_SECONDS :: pos_integer(), - log_order = [] :: list(log_type()), - closer = false :: false|gen_server:from()}). + log_order = [] :: list(log_type())}). -type bookie_get_timings() :: #bookie_get_timings{}. @@ -227,9 +226,8 @@ init([LogOpts, LogFrequency, LogOrder]) -> erlang:send_after(InitialJitter, self(), report_next_stats), {ok, #state{log_frequency = LogFrequency, log_order = RandomLogOrder}}. -handle_call(close, From, State) -> - gen_server:cast(self(), close), - {noreply, State#state{closer = From}}. +handle_call(close, _From, State) -> + {stop, normal, ok, State}. handle_cast({bookie_head_update, FetchTime, RspTime, CacheHit}, State) -> Timings = State#state.bookie_head_timings, @@ -509,12 +507,9 @@ handle_cast({log_add, ForcedLogs}, State) -> {noreply, State}; handle_cast({log_remove, ForcedLogs}, State) -> ok = leveled_log:remove_forcedlogs(ForcedLogs), - {noreply, State}; -handle_cast(close, State) -> - gen_server:reply(State#state.closer, ok), - {stop, normal, State}. + {noreply, State}. -handle_info(report_next_stats, State=#state{closer = C}) when C == false -> +handle_info(report_next_stats, State) -> erlang:send_after( State#state.log_frequency * 1000, self(), report_next_stats), case State#state.log_order of @@ -523,9 +518,7 @@ handle_info(report_next_stats, State=#state{closer = C}) when C == false -> [NextStat|TailLogOrder] -> ok = report_stats(self(), NextStat), {noreply, State#state{log_order = TailLogOrder ++ [NextStat]}} - end; -handle_info(_Msg, State) -> - {noreply, State}. + end. terminate(_Reason, _State) -> ok. 
@@ -546,7 +539,6 @@ coverage_cheat_test() -> {ok, _State1} = code_change(null, #state{}, null), ok = add_stat(M, {pcl_fetch_update, 4, 100}), ok = report_stats(M, pcl_fetch), - M ! timeout, % Can close, so empty log_order hasn't crashed ok = monitor_close(M). From a7a8239099784fffe78898d308b40d3d7a38fdab Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 2 Dec 2022 02:12:18 +0000 Subject: [PATCH 30/37] Refactor leveled_log Switch the logbase to a map with atoms for keys --- src/leveled_bookie.erl | 63 ++- src/leveled_cdb.erl | 45 +- src/leveled_codec.erl | 2 +- src/leveled_iclerk.erl | 35 +- src/leveled_imanifest.erl | 16 +- src/leveled_inker.erl | 52 +-- src/leveled_log.erl | 738 ++++++++++++++------------------ src/leveled_monitor.erl | 83 ++-- src/leveled_pclerk.erl | 32 +- src/leveled_penciller.erl | 59 ++- src/leveled_pmanifest.erl | 18 +- src/leveled_pmem.erl | 2 +- src/leveled_runner.erl | 6 +- src/leveled_sst.erl | 46 +- test/end_to_end/basic_SUITE.erl | 5 +- 15 files changed, 562 insertions(+), 640 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 2812ac75..a855c43e 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -339,13 +339,12 @@ % moving to higher log levels will at present make the operator % blind to sample performance statistics of leveled sub-components % etc - {forced_logs, list(string())} | + {forced_logs, list(atom())} | % Forced logs allow for specific info level logs, such as those % logging stats to be logged even when the default log level has % been set to a higher log level. Using: % {forced_logs, - % ["B0015", "B0016", "B0017", "B0018", - % "P0032", "SST12", "CDB19", "SST13", "I0019"]} + % [b0015, b0016, b0017, b0018, p0032, sst12]} % Will log all timing points even when log_level is not set to % support info {database_id, non_neg_integer()} | @@ -719,7 +718,7 @@ book_indexfold(Pid, Bucket, FoldAccT, Range, TermHandling) -> % future release this code branch may be removed, and such queries may % instead return `error`. For now null is assumed to be lower than any % key - leveled_log:log("B0019", [Bucket]), + leveled_log:log(b0019, [Bucket]), book_indexfold(Pid, {Bucket, null}, FoldAccT, Range, TermHandling). 
@@ -1213,7 +1212,7 @@ init([Opts]) -> % and performance may be unpredictable case CacheRatio > 32 of true -> - leveled_log:log("B0020", [PCLMaxSize, ConfiguredCacheSize]); + leveled_log:log(b0020, [PCLMaxSize, ConfiguredCacheSize]); false -> ok end, @@ -1240,7 +1239,7 @@ init([Opts]) -> {Inker, Penciller} = startup(InkerOpts, PencillerOpts0), NewETS = ets:new(mem, [ordered_set]), - leveled_log:log("B0001", [Inker, Penciller]), + leveled_log:log(b0001, [Inker, Penciller]), {ok, #state{ cache_size = CacheSize, @@ -1255,7 +1254,7 @@ init([Opts]) -> {Bookie, undefined} -> {ok, Penciller, Inker} = book_snapshot(Bookie, store, undefined, true), - leveled_log:log("B0002", [Inker, Penciller]), + leveled_log:log(b0002, [Inker, Penciller]), {ok, #state{penciller = Penciller, inker = Inker, @@ -1273,16 +1272,13 @@ handle_call({put, Bucket, Key, Object, IndexSpecs, Tag, TTL, DataSync}, Object, {IndexSpecs, TTL}, DataSync), - {T0, SW1} = leveled_monitor:step_time(SW0), - Changes = preparefor_ledgercache(null, - LedgerKey, - SQN, - Object, - ObjSize, - {IndexSpecs, TTL}), + {T0, SW1} = leveled_monitor:step_time(SW0), + Changes = + preparefor_ledgercache( + null, LedgerKey, SQN, Object, ObjSize, {IndexSpecs, TTL}), + {T1, SW2} = leveled_monitor:step_time(SW1), Cache0 = addto_ledgercache(Changes, State#state.ledger_cache), - {T1, _SW2} = leveled_monitor:step_time(SW1), - + {T2, _SW3} = leveled_monitor:step_time(SW2), case State#state.slow_offer of true -> gen_server:reply(From, pause); @@ -1290,7 +1286,7 @@ handle_call({put, Bucket, Key, Object, IndexSpecs, Tag, TTL, DataSync}, gen_server:reply(From, ok) end, maybe_longrunning(SWLR, overall_put), - maybelog_put_timing(State#state.monitor, T0, T1, ObjSize), + maybelog_put_timing(State#state.monitor, T0, T1, T2, ObjSize), case maybepush_ledgercache( State#state.cache_size, State#state.cache_multiple, @@ -1330,11 +1326,10 @@ handle_call({get, Bucket, Key, Tag}, _From, State) when State#state.head_only == false -> LedgerKey = leveled_codec:to_ledgerkey(Bucket, Key, Tag), SW0 = leveled_monitor:maybe_time(State#state.monitor), - {H0, CacheHit} = + {H0, _CacheHit} = fetch_head(LedgerKey, State#state.penciller, State#state.ledger_cache), - {TS0, SW1} = leveled_monitor:step_time(SW0), HeadResult = case H0 of not_present -> @@ -1354,9 +1349,7 @@ handle_call({get, Bucket, Key, Tag}, _From, State) end end end, - {TS1, SW2} = leveled_monitor:step_time(SW1), - maybelog_head_timing( - State#state.monitor, TS0, TS1, HeadResult == not_found, CacheHit), + {TS0, SW1} = leveled_monitor:step_time(SW0), GetResult = case HeadResult of not_found -> @@ -1370,9 +1363,9 @@ handle_call({get, Bucket, Key, Tag}, _From, State) {ok, Object} end end, - {TS2, _SW3} = leveled_monitor:step_time(SW2), + {TS1, _SW2} = leveled_monitor:step_time(SW1), maybelog_get_timing( - State#state.monitor, TS1, TS2, GetResult == not_found), + State#state.monitor, TS0, TS1, GetResult == not_found), {reply, GetResult, State}; handle_call({head, Bucket, Key, Tag, SQNOnly}, _From, State) when State#state.head_lookup == true -> @@ -1507,7 +1500,7 @@ handle_call(close, _From, State) -> leveled_monitor:monitor_close(element(1, State#state.monitor)), {stop, normal, ok, State}; handle_call(destroy, _From, State=#state{is_snapshot=Snp}) when Snp == false -> - leveled_log:log("B0011", []), + leveled_log:log(b0011, []), {ok, InkPathList} = leveled_inker:ink_doom(State#state.inker), {ok, PCLPathList} = leveled_penciller:pcl_doom(State#state.penciller), leveled_monitor:monitor_close(element(1, State#state.monitor)), @@ 
-1553,7 +1546,7 @@ handle_info(_Info, State) -> {noreply, State}. terminate(Reason, _State) -> - leveled_log:log("B0003", [Reason]). + leveled_log:log(b0003, [Reason]). code_change(_OldVsn, State, _Extra) -> {ok, State}. @@ -1691,7 +1684,7 @@ startup(InkerOpts, PencillerOpts) -> {ok, Inker} = leveled_inker:ink_start(InkerOpts), {ok, Penciller} = leveled_penciller:pcl_start(PencillerOpts), LedgerSQN = leveled_penciller:pcl_getstartupsequencenumber(Penciller), - leveled_log:log("B0005", [LedgerSQN]), + leveled_log:log(b0005, [LedgerSQN]), ReloadStrategy = InkerOpts#inker_options.reload_strategy, LoadFun = get_loadfun(ReloadStrategy, Penciller), BatchFun = @@ -1700,7 +1693,7 @@ startup(InkerOpts, PencillerOpts) -> end, InitAccFun = fun(FN, CurrentMinSQN) -> - leveled_log:log("I0014", [FN, CurrentMinSQN]), + leveled_log:log(i0014, [FN, CurrentMinSQN]), empty_ledgercache() end, ok = leveled_inker:ink_loadpcl(Inker, @@ -2074,7 +2067,7 @@ return_ledger_keyrange(Tag, Bucket, KeyRange) -> maybe_longrunning(SW, Aspect) -> case timer:now_diff(os:timestamp(), SW) of N when N > ?LONG_RUNNING -> - leveled_log:log("B0013", [N, Aspect]); + leveled_log:log(b0013, [N, Aspect]); _ -> ok end. @@ -2434,7 +2427,7 @@ get_loadfun(ReloadStrat, Penciller) -> SQN when SQN < MinSQN -> {loop, Acc0}; SQN when SQN > MaxSQN -> - leveled_log:log("B0007", [MaxSQN, SQN]), + leveled_log:log(b0007, [MaxSQN, SQN]), {stop, Acc0}; _ -> {VBin, ValSize} = ExtractFun(ValueInJournal), @@ -2453,7 +2446,7 @@ get_loadfun(ReloadStrat, Penciller) -> end, case SQN of MaxSQN -> - leveled_log:log("B0006", [SQN]), + leveled_log:log(b0006, [SQN]), LC0 = addto_ledgercache(Chngs, LedgerCache, loader), {stop, {MinSQN, MaxSQN, LC0}}; _ -> @@ -2474,11 +2467,13 @@ delete_path(DirPath) -> leveled_monitor:monitor(), leveled_monitor:timing(), leveled_monitor:timing(), + leveled_monitor:timing(), pos_integer()) -> ok. -maybelog_put_timing(_Monitor, no_timing, no_timing, _Size) -> +maybelog_put_timing(_Monitor, no_timing, no_timing, no_timing, _Size) -> ok; -maybelog_put_timing({Pid, _StatsFreq}, MemTime, InkTime, Size) -> - leveled_monitor:add_stat(Pid, {bookie_put_update, MemTime, InkTime, Size}). +maybelog_put_timing({Pid, _StatsFreq}, InkTime, PrepTime, MemTime, Size) -> + leveled_monitor:add_stat( + Pid, {bookie_put_update, InkTime, PrepTime, MemTime, Size}). 
-spec maybelog_head_timing( leveled_monitor:monitor(), diff --git a/src/leveled_cdb.erl b/src/leveled_cdb.erl index 9bd68c00..77a27e34 100644 --- a/src/leveled_cdb.erl +++ b/src/leveled_cdb.erl @@ -476,10 +476,10 @@ init([Opts]) -> starting({open_writer, Filename}, _From, State) -> leveled_log:save(State#state.log_options), - leveled_log:log("CDB01", [Filename]), + leveled_log:log(cdb01, [Filename]), {LastPosition, HashTree, LastKey} = open_active_file(Filename), {WriteOps, UpdStrategy} = set_writeops(State#state.sync_strategy), - leveled_log:log("CDB13", [WriteOps]), + leveled_log:log(cdb13, [WriteOps]), {ok, Handle} = file:open(Filename, WriteOps), State0 = State#state{handle=Handle, current_count = size_hashtree(HashTree), @@ -491,7 +491,7 @@ starting({open_writer, Filename}, _From, State) -> {reply, ok, writer, State0, hibernate}; starting({open_reader, Filename}, _From, State) -> leveled_log:save(State#state.log_options), - leveled_log:log("CDB02", [Filename]), + leveled_log:log(cdb02, [Filename]), {Handle, Index, LastKey} = open_for_readonly(Filename, false), State0 = State#state{handle=Handle, last_key=LastKey, @@ -500,7 +500,7 @@ starting({open_reader, Filename}, _From, State) -> {reply, ok, reader, State0, hibernate}; starting({open_reader, Filename, LastKey}, _From, State) -> leveled_log:save(State#state.log_options), - leveled_log:log("CDB02", [Filename]), + leveled_log:log(cdb02, [Filename]), {Handle, Index, LastKey} = open_for_readonly(Filename, LastKey), State0 = State#state{handle=Handle, last_key=LastKey, @@ -629,7 +629,7 @@ rolling({return_hashtable, IndexList, HashTreeBin}, _From, State) -> ok = write_top_index_table(Handle, BasePos, IndexList), file:close(Handle), ok = rename_for_read(State#state.filename, NewName), - leveled_log:log("CDB03", [NewName]), + leveled_log:log(cdb03, [NewName]), ets:delete(State#state.hashtree), {NewHandle, Index, LastKey} = open_for_readonly(NewName, State#state.last_key), @@ -641,7 +641,7 @@ rolling({return_hashtable, IndexList, HashTreeBin}, _From, State) -> true -> {reply, ok, delete_pending, State0}; false -> - leveled_log:log_timer("CDB18", [], SW), + leveled_log:log_timer(cdb18, [], SW), {reply, ok, reader, State0, hibernate} end; rolling(check_hashtable, _From, State) -> @@ -713,7 +713,7 @@ reader({direct_fetch, PositionList, Info}, From, State) -> {next_state, reader, State} end; reader(cdb_complete, _From, State) -> - leveled_log:log("CDB05", [State#state.filename, reader, cdb_ccomplete]), + leveled_log:log(cdb05, [State#state.filename, reader, cdb_ccomplete]), ok = file:close(State#state.handle), {stop, normal, {ok, State#state.filename}, State#state{handle=undefined}}; reader(check_hashtable, _From, State) -> @@ -758,20 +758,20 @@ delete_pending(timeout, State=#state{delete_point=ManSQN}) when ManSQN > 0 -> self()), {next_state, delete_pending, State, ?DELETE_TIMEOUT}; false -> - leveled_log:log("CDB04", [State#state.filename, ManSQN]), + leveled_log:log(cdb04, [State#state.filename, ManSQN]), close_pendingdelete(State#state.handle, State#state.filename, State#state.waste_path), {stop, normal, State} end; delete_pending(delete_confirmed, State=#state{delete_point=ManSQN}) -> - leveled_log:log("CDB04", [State#state.filename, ManSQN]), + leveled_log:log(cdb04, [State#state.filename, ManSQN]), close_pendingdelete(State#state.handle, State#state.filename, State#state.waste_path), {stop, normal, State}; delete_pending(destroy, State) -> - leveled_log:log("CDB05", [State#state.filename, delete_pending, destroy]), + 
leveled_log:log(cdb05, [State#state.filename, delete_pending, destroy]), close_pendingdelete(State#state.handle, State#state.filename, State#state.waste_path), @@ -858,8 +858,7 @@ handle_sync_event({get_cachedscore, {NowMega, NowSecs, _}}, handle_sync_event({put_cachedscore, Score}, _From, StateName, State) -> {reply, ok, StateName, State#state{cached_score = {Score,os:timestamp()}}}; handle_sync_event(cdb_close, _From, delete_pending, State) -> - leveled_log:log("CDB05", - [State#state.filename, delete_pending, cdb_close]), + leveled_log:log(cdb05, [State#state.filename, delete_pending, cdb_close]), close_pendingdelete(State#state.handle, State#state.filename, State#state.waste_path), @@ -925,7 +924,7 @@ close_pendingdelete(Handle, Filename, WasteFP) -> false -> % This may happen when there has been a destroy while files are % still pending deletion - leveled_log:log("CDB21", [Filename]) + leveled_log:log(cdb21, [Filename]) end. -spec set_writeops(sync|riak_sync|none) -> {list(), sync|riak_sync|none}. @@ -967,7 +966,7 @@ open_active_file(FileName) when is_list(FileName) -> {?BASE_POSITION, 0} -> ok; _ -> - leveled_log:log("CDB06", [LastPosition, EndPosition]) + leveled_log:log(cdb06, [LastPosition, EndPosition]) end, {ok, _LastPosition} = file:position(Handle, LastPosition), ok = file:truncate(Handle), @@ -1142,7 +1141,7 @@ hashtable_calc(HashTree, StartPos) -> Seq = lists:seq(0, 255), SWC = os:timestamp(), {IndexList, HashTreeBin} = write_hash_tables(Seq, HashTree, StartPos), - leveled_log:log_timer("CDB07", [], SWC), + leveled_log:log_timer(cdb07, [], SWC), {IndexList, HashTreeBin}. %%%%%%%%%%%%%%%%%%%% @@ -1155,7 +1154,7 @@ determine_new_filename(Filename) -> rename_for_read(Filename, NewName) -> %% Rename file - leveled_log:log("CDB08", [Filename, NewName, filelib:is_file(NewName)]), + leveled_log:log(cdb08, [Filename, NewName, filelib:is_file(NewName)]), file:rename(Filename, NewName). @@ -1348,7 +1347,7 @@ scan_over_file(Handle, Position, FilterFun, Output, LastKey) -> % Not interesting that we've nothing to read at base ok; _ -> - leveled_log:log("CDB09", [Position]) + leveled_log:log(cdb09, [Position]) end, % Bring file back to that position {ok, Position} = file:position(Handle, {bof, Position}), @@ -1461,7 +1460,7 @@ safe_read_next(Handle, Length, ReadFun) -> loose_read(Handle, Length, ReadFun) catch error:ReadError -> - leveled_log:log("CDB20", [ReadError, Length]), + leveled_log:log(cdb20, [ReadError, Length]), false end. @@ -1488,11 +1487,11 @@ crccheck(<>, KeyBin) when is_binary(KeyBin) -> CRC -> Value; _ -> - leveled_log:log("CDB10", []), + leveled_log:log(cdb10, ["mismatch"]), false end; crccheck(_V, _KB) -> - leveled_log:log("CDB11", []), + leveled_log:log(cdb10, ["size"]), false. @@ -1585,7 +1584,7 @@ search_hash_table(Handle, end, case KV of missing -> - leveled_log:log("CDB15", [Hash]), + leveled_log:log(cdb15, [Hash]), search_hash_table( Handle, {FirstHashPosition, Slot, CycleCount + 1, TotalSlots}, @@ -1650,7 +1649,7 @@ perform_write_hash_tables(Handle, HashTreeBin, StartPos) -> ok = file:write(Handle, HashTreeBin), {ok, EndPos} = file:position(Handle, cur), ok = file:advise(Handle, StartPos, EndPos - StartPos, will_need), - leveled_log:log_timer("CDB12", [], SWW), + leveled_log:log_timer(cdb12, [], SWW), ok. 
@@ -1820,7 +1819,7 @@ write_hash_tables(Indexes, HashTree, CurrPos) -> write_hash_tables([], _HashTree, _CurrPos, _BasePos, IndexList, HT_BinList, {T1, T2, T3}) -> - leveled_log:log("CDB14", [T1, T2, T3]), + leveled_log:log(cdb14, [T1, T2, T3]), IL = lists:reverse(IndexList), {IL, list_to_binary(HT_BinList)}; write_hash_tables([Index|Rest], HashTree, CurrPos, BasePos, diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 3faec9cd..ca1a3ddd 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -414,7 +414,7 @@ get_tagstrategy(Tag, Strategy) -> {Tag, TagStrat} -> TagStrat; false -> - leveled_log:log("IC012", [Tag, Strategy]), + leveled_log:log(ic012, [Tag, Strategy]), retain end. diff --git a/src/leveled_iclerk.erl b/src/leveled_iclerk.erl index d7edc732..7ae3b445 100644 --- a/src/leveled_iclerk.erl +++ b/src/leveled_iclerk.erl @@ -304,7 +304,7 @@ handle_call(stop, _From, State) -> handle_cast({compact, Checker, InitiateFun, CloseFun, FilterFun, Manifest0}, State) -> - leveled_log:log("IC014", [State#state.reload_strategy, + leveled_log:log(ic014, [State#state.reload_strategy, State#state.max_run_length]), % Empty the waste folder clear_waste(State), @@ -386,7 +386,7 @@ handle_cast(scoring_complete, State) -> State#state.maxrunlength_compactionperc, State#state.singlefile_compactionperc}, {BestRun0, Score} = assess_candidates(Candidates, ScoreParams), - leveled_log:log_timer("IC003", [Score, length(BestRun0)], SW), + leveled_log:log_timer(ic003, [Score, length(BestRun0)], SW), case Score > 0.0 of true -> BestRun1 = sort_run(BestRun0), @@ -405,7 +405,7 @@ handle_cast(scoring_complete, State) -> undefined} end, BestRun1), - leveled_log:log("IC002", [length(FilesToDelete)]), + leveled_log:log(ic002, [length(FilesToDelete)]), ok = CloseFun(FilterServer), ok = leveled_inker:ink_clerkcomplete(State#state.inker, ManifestSlice, @@ -418,7 +418,7 @@ handle_cast(scoring_complete, State) -> handle_cast({trim, PersistedSQN, ManifestAsList}, State) -> FilesToDelete = leveled_imanifest:find_persistedentries(PersistedSQN, ManifestAsList), - leveled_log:log("IC007", []), + leveled_log:log(ic007, []), ok = leveled_inker:ink_clerkcomplete(State#state.inker, [], FilesToDelete), {noreply, State}; handle_cast({prompt_deletions, ManifestSQN, FilesToDelete}, State) -> @@ -455,7 +455,7 @@ handle_info(_Info, State) -> terminate(normal, _State) -> ok; terminate(Reason, _State) -> - leveled_log:log("IC001", [Reason]). + leveled_log:log(ic001, [Reason]). code_change(_OldVsn, State, _Extra) -> {ok, State}. @@ -576,12 +576,12 @@ check_single_file(CDB, FilterFun, FilterServer, MaxSQN, Score. safely_log_filescore([], FN, Score, SW) -> - leveled_log:log_timer("IC004", [Score, empty, FN], SW); + leveled_log:log_timer(ic004, [Score, empty, FN], SW); safely_log_filescore(PositionList, FN, Score, SW) -> AvgJump = (lists:last(PositionList) - lists:nth(1, PositionList)) div length(PositionList), - leveled_log:log_timer("IC004", [Score, AvgJump, FN], SW). + leveled_log:log_timer(ic004, [Score, AvgJump, FN], SW). -spec size_comparison_score(list(key_size() | corrupted_test_key_size()), leveled_inker:filterfun(), @@ -732,12 +732,13 @@ score_run(Run, {MaxRunLength, MR_CT, SF_CT}) -> print_compaction_run(BestRun, ScoreParams) -> - leveled_log:log("IC005", [length(BestRun), - score_run(BestRun, ScoreParams)]), - lists:foreach(fun(File) -> - leveled_log:log("IC006", [File#candidate.filename]) - end, - BestRun). 
+ leveled_log:log( + ic005, [length(BestRun), score_run(BestRun, ScoreParams)]), + lists:foreach( + fun(File) -> + leveled_log:log(ic006, [File#candidate.filename]) + end, + BestRun). sort_run(RunOfFiles) -> CompareFun = fun(Cand1, Cand2) -> @@ -795,7 +796,7 @@ get_all_positions([], PositionBatches) -> get_all_positions([HeadRef|RestOfBest], PositionBatches) -> SrcJournal = HeadRef#candidate.journal, Positions = leveled_cdb:cdb_getpositions(SrcJournal, all), - leveled_log:log("IC008", [HeadRef#candidate.filename, length(Positions)]), + leveled_log:log(ic008, [HeadRef#candidate.filename, length(Positions)]), Batches = split_positions_into_batches(lists:sort(Positions), SrcJournal, []), @@ -913,7 +914,7 @@ write_values(KVCList, CDBopts, Journal0, ManSlice0, PressMethod) -> {SQN, _LK} = leveled_codec:from_journalkey(TK), FP = CDBopts#cdb_options.file_path, FN = leveled_inker:filepath(FP, SQN, compact_journal), - leveled_log:log("IC009", [FN]), + leveled_log:log(ic009, [FN]), leveled_cdb:cdb_open_writer(FN, CDBopts); _ -> {ok, Journal0} @@ -941,9 +942,9 @@ clear_waste(State) -> case N - calendar:datetime_to_gregorian_seconds(LMD) of LMD_Delta when LMD_Delta >= WRP -> ok = file:delete(WP ++ DelJ), - leveled_log:log("IC010", [WP ++ DelJ]); + leveled_log:log(ic010, [WP ++ DelJ]); LMD_Delta -> - leveled_log:log("IC011", [WP ++ DelJ, LMD_Delta]), + leveled_log:log(ic011, [WP ++ DelJ, LMD_Delta]), ok end end, diff --git a/src/leveled_imanifest.erl b/src/leveled_imanifest.erl index e278ad7d..115e5f94 100644 --- a/src/leveled_imanifest.erl +++ b/src/leveled_imanifest.erl @@ -58,7 +58,7 @@ generate_entry(Journal) -> [{StartSQN, NewFN, PidR, LastKey}]; empty -> ok = leveled_cdb:cdb_close(PidR), - leveled_log:log("IC013", [NewFN]), + leveled_log:log(ic013, [NewFN]), [] end. @@ -100,7 +100,7 @@ append_lastkey(Manifest, Pid, LastKey) -> %% Remove an entry from a manifest (after compaction) remove_entry(Manifest, Entry) -> {SQN, FN, _PidR, _LastKey} = Entry, - leveled_log:log("I0013", [FN]), + leveled_log:log(i0013, [FN]), Man0 = lists:keydelete(SQN, 1, to_list(Manifest)), from_list(Man0). @@ -157,7 +157,7 @@ to_list(Manifest) -> %% loss on rollback. reader(SQN, RootPath) -> ManifestPath = leveled_inker:filepath(RootPath, manifest_dir), - leveled_log:log("I0015", [ManifestPath, SQN]), + leveled_log:log(i0015, [ManifestPath, SQN]), {ok, MBin} = file:read_file(filename:join(ManifestPath, integer_to_list(SQN) ++ ".man")), @@ -179,7 +179,7 @@ writer(Manifest, ManSQN, RootPath) -> %% check backwards compatible (so that the reader can read manifests both %% with and without a CRC check) MBin = term_to_binary(to_list(Manifest), [compressed]), - leveled_log:log("I0016", [ManSQN]), + leveled_log:log(i0016, [ManSQN]), ok = leveled_util:safe_rename(TmpFN, NewFN, MBin, true), GC_SQN = ManSQN - ?MANIFESTS_TO_RETAIN, GC_Man = filename:join(ManPath, @@ -196,9 +196,11 @@ writer(Manifest, ManSQN, RootPath) -> %% @doc %% Print the manifest to the log printer(Manifest) -> - lists:foreach(fun({SQN, FN, _PID, _LK}) -> - leveled_log:log("I0017", [SQN, FN]) end, - to_list(Manifest)). + lists:foreach( + fun({SQN, FN, _PID, _LK}) -> + leveled_log:log(i0017, [SQN, FN]) + end, + to_list(Manifest)). -spec complete_filex() -> string(). 
%% @doc diff --git a/src/leveled_inker.erl b/src/leveled_inker.erl index e33d2f31..fa07713d 100644 --- a/src/leveled_inker.erl +++ b/src/leveled_inker.erl @@ -517,7 +517,7 @@ handle_call({fetch, Key, SQN}, _From, State) -> {{SQN, Key}, {Value, _IndexSpecs}} -> {reply, {ok, Value}, State}; Other -> - leveled_log:log("I0001", [Key, SQN, Other]), + leveled_log:log(i0001, [Key, SQN, Other]), {reply, not_present, State} end; handle_call({get, Key, SQN}, _From, State) -> @@ -546,7 +546,7 @@ handle_call({register_snapshot, Requestor}, Rs = [{Requestor, os:timestamp(), State#state.manifest_sqn}|State#state.registered_snapshots], - leveled_log:log("I0002", [Requestor, State#state.manifest_sqn]), + leveled_log:log(i0002, [Requestor, State#state.manifest_sqn]), {reply, {State#state.manifest, State#state.active_journaldb, State#state.journal_sqn}, @@ -592,7 +592,7 @@ handle_call(roll, _From, State=#state{is_snapshot=Snap}) when Snap == false -> State#state.cdb_options, State#state.root_path, State#state.manifest_sqn), - leveled_log:log_timer("I0024", [NewSQN], SWroll), + leveled_log:log_timer(i0024, [NewSQN], SWroll), {reply, ok, State#state{journal_sqn = NewSQN, manifest = Manifest1, manifest_sqn = NewManSQN, @@ -604,7 +604,7 @@ handle_call({backup, BackupPath}, _from, State) BackupJFP = filepath(filename:join(BackupPath, ?JOURNAL_FP), journal_dir), ok = filelib:ensure_dir(BackupJFP), {ok, CurrentFNs} = file:list_dir(BackupJFP), - leveled_log:log("I0023", [length(CurrentFNs)]), + leveled_log:log(i0023, [length(CurrentFNs)]), BackupFun = fun({SQN, FN, PidR, LastKey}, {ManAcc, FTRAcc}) -> case SQN < State#state.journal_sqn of @@ -623,7 +623,7 @@ handle_call({backup, BackupPath}, _from, State) {[{SQN, BackupName, PidR, LastKey}|ManAcc], [ExtendedBaseFN|FTRAcc]}; false -> - leveled_log:log("I0021", [FN, SQN, State#state.journal_sqn]), + leveled_log:log(i0021, [FN, SQN, State#state.journal_sqn]), {ManAcc, FTRAcc} end end, @@ -635,7 +635,7 @@ handle_call({backup, BackupPath}, _from, State) FilesToRemove = lists:subtract(CurrentFNs, FilesToRetain), RemoveFun = fun(RFN) -> - leveled_log:log("I0022", [RFN]), + leveled_log:log(i0022, [RFN]), RemoveFile = filename:join(BackupJFP, RFN), case filelib:is_file(RemoveFile) and not filelib:is_dir(RemoveFile) of @@ -649,15 +649,15 @@ handle_call({backup, BackupPath}, _from, State) leveled_imanifest:writer(leveled_imanifest:from_list(BackupManifest), State#state.manifest_sqn, filename:join(BackupPath, ?JOURNAL_FP)), - leveled_log:log_timer("I0020", - [filename:join(BackupPath, ?JOURNAL_FP), - length(BackupManifest)], - SW), + leveled_log:log_timer( + i0020, + [filename:join(BackupPath, ?JOURNAL_FP), length(BackupManifest)], + SW), {reply, ok, State}; handle_call({check_sqn, LedgerSQN}, _From, State) -> case State#state.journal_sqn of JSQN when JSQN < LedgerSQN -> - leveled_log:log("I0025", [JSQN, LedgerSQN]), + leveled_log:log(i0025, [JSQN, LedgerSQN]), {reply, ok, State#state{journal_sqn = LedgerSQN}}; _JSQN -> {reply, ok, State} @@ -669,9 +669,9 @@ handle_call(close, _From, State) -> true -> ok = ink_releasesnapshot(State#state.source_inker, self()); false -> - leveled_log:log("I0005", [close]), - leveled_log:log("I0006", [State#state.journal_sqn, - State#state.manifest_sqn]), + leveled_log:log(i0005, [close]), + leveled_log:log( + i0006, [State#state.journal_sqn, State#state.manifest_sqn]), ok = leveled_iclerk:clerk_stop(State#state.clerk), shutdown_snapshots(State#state.registered_snapshots), shutdown_manifest(State#state.manifest) @@ -682,11 +682,11 @@ 
handle_call(doom, _From, State) -> filepath(State#state.root_path, manifest_dir), filepath(State#state.root_path, journal_compact_dir), filepath(State#state.root_path, journal_waste_dir)], - leveled_log:log("I0018", []), + leveled_log:log(i0018, []), - leveled_log:log("I0005", [doom]), - leveled_log:log("I0006", [State#state.journal_sqn, - State#state.manifest_sqn]), + leveled_log:log(i0005, [doom]), + leveled_log:log( + i0006, [State#state.journal_sqn, State#state.manifest_sqn]), ok = leveled_iclerk:clerk_stop(State#state.clerk), shutdown_snapshots(State#state.registered_snapshots), shutdown_manifest(State#state.manifest), @@ -748,12 +748,12 @@ handle_cast({confirm_delete, ManSQN, CDB}, State) -> end, {noreply, State#state{registered_snapshots = RegisteredSnapshots0}}; handle_cast({release_snapshot, Snapshot}, State) -> - leveled_log:log("I0003", [Snapshot]), + leveled_log:log(i0003, [Snapshot]), case lists:keydelete(Snapshot, 1, State#state.registered_snapshots) of [] -> {noreply, State#state{registered_snapshots=[]}}; Rs -> - leveled_log:log("I0004", [length(Rs)]), + leveled_log:log(i0004, [length(Rs)]), {noreply, State#state{registered_snapshots=Rs}} end; handle_cast({log_level, LogLevel}, State) -> @@ -875,7 +875,7 @@ shutdown_snapshots(Snapshots) -> %% @doc %% Shutdown all files in the manifest shutdown_manifest(Manifest) -> - leveled_log:log("I0007", []), + leveled_log:log(i0007, []), leveled_imanifest:printer(Manifest), ManAsList = leveled_imanifest:to_list(Manifest), close_allmanifest(ManAsList). @@ -940,7 +940,7 @@ put_object(LedgerKey, Object, KeyChanges, Sync, State) -> State#state.cdb_options, State#state.root_path, State#state.manifest_sqn), - leveled_log:log_timer("I0008", [], SWroll), + leveled_log:log_timer(i0008, [], SWroll), ok = leveled_cdb:cdb_put(NewJournalP, JournalKey, JournalBin), @@ -1051,13 +1051,13 @@ build_manifest(ManifestFilenames, UpdManifestSQN = if length(OpenManifest) > length(Manifest) -> - leveled_log:log("I0009", []), + leveled_log:log(i0009, []), leveled_imanifest:printer(OpenManifest), NextSQN = ManifestSQN + 1, leveled_imanifest:writer(OpenManifest, NextSQN, RootPath), NextSQN; true -> - leveled_log:log("I0010", []), + leveled_log:log(i0010, []), leveled_imanifest:printer(OpenManifest), ManifestSQN end, @@ -1082,7 +1082,7 @@ close_allmanifest([H|ManifestT]) -> %% Open all the files in the manifets, and updating the manifest with the PIDs %% of the opened files open_all_manifest([], RootPath, CDBOpts) -> - leveled_log:log("I0011", []), + leveled_log:log(i0011, []), leveled_imanifest:add_entry([], start_new_activejournal(0, RootPath, CDBOpts), true); @@ -1113,7 +1113,7 @@ open_all_manifest(Man0, RootPath, CDBOpts) -> PendingHeadFN = HeadFN ++ "." ++ ?PENDING_FILEX, case filelib:is_file(CompleteHeadFN) of true -> - leveled_log:log("I0012", [HeadFN]), + leveled_log:log(i0012, [HeadFN]), {ok, HeadR} = leveled_cdb:cdb_open_reader(CompleteHeadFN), LastKey = leveled_cdb:cdb_lastkey(HeadR), LastSQN = element(1, LastKey), diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 43e455e1..ec5c551c 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -20,9 +20,10 @@ return_settings/0]). --record(log_options, {log_level = info :: log_level(), - forced_logs = [] :: [string()], - database_id = 0 :: non_neg_integer()}). +-record(log_options, + {log_level = info :: log_level(), + forced_logs = [] :: [atom()], + database_id = 0 :: non_neg_integer()}). -type log_level() :: debug | info | warn | error | critical. -type log_options() :: #log_options{}. 
@@ -32,355 +33,279 @@ -define(LOG_LEVELS, [debug, info, warn, error, critical]). -define(DEFAULT_LOG_LEVEL, error). --define(LOGBASE, [ - - {"G0001", - {info, "Generic log point"}}, - {"G0002", - {info, "Generic log point with term ~w"}}, - {"D0001", - {debug, "Generic debug log"}}, - - {"B0001", - {info, "Bookie starting with Ink ~w Pcl ~w"}}, - {"B0002", - {info, "Snapshot starting with Ink ~w Pcl ~w"}}, - {"B0003", - {info, "Bookie closing for reason ~w"}}, - {"B0004", - {info, "Initialised PCL clone and length of increment in snapshot is ~w"}}, - {"B0005", - {info, "LedgerSQN=~w at startup"}}, - {"B0006", - {info, "Reached end of load batch with SQN ~w"}}, - {"B0007", - {info, "Skipping as exceeded MaxSQN ~w with SQN ~w"}}, - {"B0008", - {info, "Bucket list finds no more results"}}, - {"B0009", - {debug, "Bucket list finds Bucket ~w"}}, - {"B0011", - {warn, "Call to destroy the store and so all files to be removed"}}, - {"B0013", - {warn, "Long running task took ~w microseconds with task_type=~w"}}, - {"B0015", - {info, "Put timing with sample_count=~w and mem_time=~w ink_time=~w" - ++ " with total_object_size=~w"}}, - {"B0016", - {info, "Get timing with sample_count=~w and head_time=~w body_time=~w" - ++ " with fetch_count=~w"}}, - {"B0017", - {info, "Snapshot timing with sample_count=~w and bookie_time=~w pcl_time=~w"}}, - {"B0018", - {info, "Positive HEAD responses timed with sample_count=~w and " - ++ " fetch_time=~w rsp_time=~w found_count=~w cache_count=~w"}}, - {"B0019", - {warn, "Use of book_indexfold with constraint of Bucket ~w with " - ++ "no StartKey is deprecated"}}, - {"B0020", - {warn, "Ratio of penciller cache size ~w to bookie's memory " - ++ "cache size ~w is larger than expected"}}, - - {"R0001", - {debug, "Object fold to process batch of ~w objects"}}, - - {"P0001", - {debug, "Ledger snapshot ~w registered"}}, - {"P0003", - {debug, "Ledger snapshot ~w released"}}, - {"P0004", - {debug, "Remaining ledger snapshots are ~w"}}, - {"P0005", - {debug, "Delete confirmed as file ~s is removed from Manifest"}}, - {"P0007", - {debug, "Sent release message for cloned Penciller following close for " - ++ "reason ~w"}}, - {"P0008", - {info, "Penciller closing for reason ~w"}}, - {"P0010", - {info, "discarded=~w level zero on close of Penciller"}}, - {"P0011", - {info, "Shutdown complete for Penciller for reason ~w"}}, - {"P0012", - {info, "Store to be started based on manifest sequence number of ~w"}}, - {"P0013", - {warn, "Seqence number of 0 indicates no valid manifest"}}, - {"P0014", - {info, "Maximum sequence number of ~w found in nonzero levels"}}, - {"P0015", - {info, "L0 file found ~s"}}, - {"P0016", - {info, "L0 file had maximum sequence number of ~w"}}, - {"P0017", - {info, "No L0 file found"}}, - {"P0018", - {info, - "Response to push_mem of returned with cache_size=~w " - ++ "L0_pending=~w merge_backlog=~w cachelines_full=~w"}}, - {"P0019", - {info, "Rolling level zero to filename ~s at ledger sqn ~w"}}, - {"P0024", - {info, "Outstanding compaction work items of ~w with backlog status " - ++ "of ~w"}}, - {"P0029", - {info, "L0 completion confirmed and will transition to not pending"}}, - {"P0030", - {warn, "We're doomed - intention recorded to destroy all files"}}, - {"P0031", - {info, "Completion of update to levelzero" - ++ " with cache_size=~w level0_due=~w" - ++ " change_pending=~w" - ++ " MinSQN=~w MaxSQN=~w"}}, - {"P0032", - {info, "Fetch head timing with sample_count=~w and level timings of" - ++ " foundmem_time=~w found0_time=~w found1_time=~w" - ++ " 
found2_time=~w found3_time=~w foundlower_time=~w" - ++ " missed_time=~w" - ++ " with counts of" - ++ " foundmem_count=~w found0_count=~w found1_count=~w" - ++ " found2_count=~w found3_count=~w foundlower_count=~w" - ++ " missed_count=~w"}}, - {"P0033", - {error, "Corrupted manifest file at path ~s to be ignored " - ++ "due to error ~w"}}, - {"P0035", - {info, "Startup with Manifest SQN of ~w"}}, - {"P0037", - {debug, "Merging of penciller L0 tree from size ~w complete"}}, - {"P0038", - {info, "Timeout of snapshot with pid=~w at SQN=~w at TS ~w " - ++ "set to timeout=~w"}}, - {"P0039", - {debug, "Failed to release pid=~w " - ++ "leaving SnapshotCount=~w and MinSQN=~w"}}, - {"P0040", - {info, "Archiving filename ~s as unused at startup"}}, - {"P0041", - {info, "Penciller manifest switched from SQN ~w to ~w"}}, - - {"PC001", - {info, "Penciller's clerk ~w started with owner ~w"}}, - {"PC002", - {info, "Request for manifest change from clerk on closing"}}, - {"PC003", - {info, "Confirmation of manifest change on closing"}}, - {"PC004", - {info, "Prompted confirmation of manifest change"}}, - {"PC005", - {info, "Penciller's Clerk ~w shutdown now complete for reason ~w"}}, - {"PC006", - {debug, "Work prompted but none needed"}}, - {"PC007", - {debug, "Clerk prompting Penciller regarding manifest change"}}, - {"PC008", - {info, "Merge from level ~w to merge into ~w files below"}}, - {"PC009", - {debug, "File ~s to simply switch levels to level ~w"}}, - {"PC010", - {info, "Merge to be commenced for FileToMerge=~s with MSN=~w"}}, - {"PC011", - {info, "Merge completed with MSN=~w to Level=~w and FileCounter=~w"}}, - {"PC012", - {debug, "File to be created as part of MSN=~w Filename=~s " - ++ "IsBasement=~w"}}, - {"PC013", - {warn, "Merge resulted in empty file ~s"}}, - {"PC015", - {info, "File created"}}, - {"PC016", - {info, "Slow fetch from SFT ~w of ~w us at level ~w " - ++ "with result ~w"}}, - {"PC017", - {debug, "Notified clerk of manifest change"}}, - {"PC018", - {info, "Saved manifest file"}}, - {"PC019", - {debug, "After ~s level ~w is ~w"}}, - {"PC020", - {warn, "Empty prompt deletions at ManifestSQN=~w"}}, - {"PC021", - {debug, "Prompting deletions at ManifestSQN=~w"}}, - {"PC022", - {debug, "Storing reference to deletions at ManifestSQN=~w"}}, - {"PC023", - {info, "At level=~w file_count=~w avg_mem=~w " - ++ "file with most memory fn=~s p=~w mem=~w"}}, - {"PC024", - {info, "Grooming compaction picked file with tomb_count=~w"}}, - {"PM002", - {info, "Completed dump of L0 cache to list of l0cache_size=~w"}}, - - {"SST01", - {info, "SST timing for result=~w is sample=~w total=~w and max=~w"}}, - {"SST02", - {error, "False result returned from SST with filename ~s as " - ++ "slot ~w has failed crc check"}}, - {"SST03", - {info, "Opening SST file with filename ~s slot_count=~w and" - ++ " max sqn ~w"}}, - {"SST04", - {debug, "Exit called for reason ~w on filename ~s"}}, - {"SST05", - {warn, "Rename rogue filename ~s to ~s"}}, - {"SST06", - {debug, "File ~s has been set for delete"}}, - {"SST07", - {info, "Exit called and now clearing ~s"}}, - {"SST08", - {info, "Completed creation of ~s at level ~w with max sqn ~w"}}, - {"SST09", - {warn, "Read request exposes slot with bad CRC"}}, - {"SST10", - {debug, "Expansion sought to support pointer to pid ~w status ~w"}}, - {"SST11", - {info, "Level zero creation timings in microseconds " - ++ "pmem_fetch=~w merge_lists=~w build_slots=~w " - ++ "build_summary=~w read_switch=~w"}}, - {"SST12", - {info, "SST Timings at level=~w for sample_count=~w" - ++ 
" at timing points notfound_time=~w fetchcache_time=~w" - ++ " slotcached_time=~w slotnoncached_time=~w " - ++ " exiting at points notfound_count=~w fetchcache_count=~w" - ++ " slotcached_count=~w slotnoncached_count=~w"}}, - {"SST13", - {info, "SST merge list build timings of" - ++ " fold_toslot=~w slot_hashlist=~w" - ++ " slot_serialise=~w slot_finish=~w" - ++ " is_basement=~w level=~w"}}, - {"SST14", - {debug, "File ~s has completed BIC"}}, - - {"I0001", - {info, "Unexpected failure to fetch value for Key=~w SQN=~w " - ++ "with reason ~w"}}, - {"I0002", - {debug, "Journal snapshot ~w registered at SQN ~w"}}, - {"I0003", - {debug, "Journal snapshot ~w released"}}, - {"I0004", - {info, "Remaining number of journal snapshots is ~w"}}, - {"I0005", - {info, "Inker closing journal for reason ~w"}}, - {"I0006", - {info, "Close triggered with journal_sqn=~w and manifest_sqn=~w"}}, - {"I0007", - {info, "Inker manifest when closing is:"}}, - {"I0008", - {info, "Put to new active journal required roll and manifest write"}}, - {"I0009", - {info, "Updated manifest on startup:"}}, - {"I0010", - {info, "Unchanged manifest on startup:"}}, - {"I0011", - {info, "Manifest is empty, starting from manifest SQN 1"}}, - {"I0012", - {info, "Head manifest entry ~s is complete so new active journal " - ++ "required"}}, - {"I0013", - {info, "File ~s to be removed from manifest"}}, - {"I0014", - {info, "On startup loading from filename ~s from SQN ~w"}}, - {"I0015", - {info, "Opening manifest file at ~s with SQN ~w"}}, - {"I0016", - {info, "Writing new version of manifest for manifestSQN=~w"}}, - {"I0017", - {debug, "At SQN=~w journal has filename ~s"}}, - {"I0018", - {warn, "We're doomed - intention recorded to destroy all files"}}, - {"I0019", - {info, "After ~w PUTs total prepare time is ~w total cdb time is ~w " - ++ "and max prepare time is ~w and max cdb time is ~w"}}, - {"I0020", - {info, "Journal backup completed to path=~s with file_count=~w"}}, - {"I0021", - {info, "Ingoring filename=~s with SQN=~w and JournalSQN=~w"}}, - {"I0022", - {info, "Removing filename=~s from backup folder as not in backup"}}, - {"I0023", - {info, "Backup commencing into folder with ~w existing files"}}, - {"I0024", - {info, "Prompted roll at NewSQN=~w"}}, - {"I0025", - {warn, "Journal SQN of ~w is below Ledger SQN of ~w " ++ - "anti-entropy will be required"}}, - - {"IC001", - {info, "Closed for reason ~w so maybe leaving garbage"}}, - {"IC002", - {info, "Clerk updating Inker as compaction complete of ~w files"}}, - {"IC003", - {info, "Scoring of compaction runs complete with highest score=~w " - ++ "with run of run_length=~w"}}, - {"IC004", - {info, "Score=~w with mean_byte_jump=~w for filename ~s"}}, - {"IC005", - {info, "Compaction to be performed on ~w files with score of ~w"}}, - {"IC006", - {info, "Filename ~s is part of compaction run"}}, - {"IC007", - {info, "Clerk has completed compaction process"}}, - {"IC008", - {info, "Compaction source ~s has yielded ~w positions"}}, - {"IC009", - {info, "Generate journal for compaction with filename ~s"}}, - {"IC010", - {info, "Clearing journal with filename ~s"}}, - {"IC011", - {info, "Not clearing filename ~s as modified delta is only ~w seconds"}}, - {"IC012", - {warn, "Tag ~w not found in Strategy ~w - maybe corrupted"}}, - {"IC013", - {warn, "File with name ~s to be ignored in manifest as scanning for " - ++ "first key returned empty - maybe corrupted"}}, - {"IC014", - {info, "Compaction to be run with strategy ~w and max_run_length ~w"}}, - - {"CDB01", - {info, "Opening file 
for writing with filename ~s"}}, - {"CDB02", - {info, "Opening file for reading with filename ~s"}}, - {"CDB03", - {info, "Re-opening file for reading with filename ~s"}}, - {"CDB04", - {info, "Deletion confirmed for file ~s at ManifestSQN ~w"}}, - {"CDB05", - {info, "Closing of filename ~s from state ~w for reason ~w"}}, - {"CDB06", - {warn, "File to be truncated at last position of ~w with end of " - ++ "file at ~w"}}, - {"CDB07", - {info, "Hashtree computed"}}, - {"CDB08", - {info, "Renaming file from ~s to ~s for which existence is ~w"}}, - {"CDB09", - {info, "Failure to read Key/Value at Position ~w in scan " ++ - "this may be the end of the file"}}, - {"CDB10", - {info, "CRC check failed due to mismatch"}}, - {"CDB11", - {info, "CRC check failed due to size"}}, - {"CDB12", - {info, "HashTree written"}}, - {"CDB13", - {debug, "Write options of ~w"}}, - {"CDB14", - {info, "Microsecond timings for hashtree build of " - ++ "to_list=~w sort=~w build=~w"}}, - {"CDB15", - {info, "Collision in search for hash ~w"}}, - {"CDB18", - {info, "Handled return and write of hashtable"}}, - {"CDB19", - {info, "Sample timings in microseconds for sample_count=~w " - ++ "with totals of cycle_count=~w " - ++ "index_time=~w read_time=~w"}}, - {"CDB20", - {warn, "Error ~w caught when safe reading a file to length ~w"}}, - {"CDB21", - {warn, "File ~s to be deleted but already gone"}} - - ]). +-define(LOGBASE, + #{ + g0001 => + {info, <<"Generic log point">>}, + g0002 => + {info, <<"Generic log point with term ~w">>}, + d0001 => + {info, <<"Generic debug log">>}, + b0001 => + {info, <<"Bookie starting with Ink ~w Pcl ~w">>}, + b0002 => + {info, <<"Snapshot starting with Ink ~w Pcl ~w">>}, + b0003 => + {info, <<"Bookie closing for reason ~w">>}, + b0005 => + {info, <<"LedgerSQN=~w at startup">>}, + b0006 => + {info, <<"Reached end of load batch with SQN ~w">>}, + b0007 => + {info, <<"Skipping as exceeded MaxSQN ~w with SQN ~w">>}, + b0008 => + {info, <<"Bucket list finds no more results">>}, + b0009 => + {debug, <<"Bucket list finds Bucket ~w">>}, + b0011 => + {warn, <<"Call to destroy the store and so all files to be removed">>}, + b0013 => + {warn, <<"Long running task took ~w microseconds with task_type=~w">>}, + b0015 => + {info, <<"Put timing with sample_count=~w ink_time=~w prep_time=~w mem_time=~w with total_object_size=~w">>}, + b0016 => + {info, <<"Get timing with sample_count=~w and head_time=~w body_time=~w with fetch_count=~w">>}, + b0017 => + {info, <<"Snapshot timing with sample_count=~w and bookie_time=~w pcl_time=~w">>}, + b0018 => + {info, <<"Positive HEAD responses timed with sample_count=~w and cache_count=~w found_count=~w fetch_ledger_time=~w fetch_ledgercache_time=~w rsp_time=~w notfound_time=~w">>}, + b0019 => + {warn, <<"Use of book_indexfold with constraint of Bucket ~w with no StartKey is deprecated">>}, + b0020 => + {warn, <<"Ratio of penciller cache size ~w to bookie's memory cache size ~w is larger than expected">>}, + r0001 => + {debug, <<"Object fold to process batch of ~w objects">>}, + p0001 => + {debug, <<"Ledger snapshot ~w registered">>}, + p0003 => + {debug, <<"Ledger snapshot ~w released">>}, + p0004 => + {debug, <<"Remaining ledger snapshots are ~w">>}, + p0005 => + {debug, <<"Delete confirmed as file ~s is removed from Manifest">>}, + p0007 => + {debug, <<"Sent release message for cloned Penciller following close for reason ~w">>}, + p0008 => + {info, <<"Penciller closing for reason ~w">>}, + p0010 => + {info, <<"level zero discarded_count=~w on close of Penciller">>}, + 
p0011 => + {info, <<"Shutdown complete for Penciller for reason ~w">>}, + p0012 => + {info, <<"Store to be started based on manifest sequence number of ~w">>}, + p0013 => + {warn, <<"Seqence number of 0 indicates no valid manifest">>}, + p0014 => + {info, <<"Maximum sequence number of ~w found in nonzero levels">>}, + p0015 => + {info, <<"L0 file found ~s">>}, + p0016 => + {info, <<"L0 file had maximum sequence number of ~w">>}, + p0017 => + {info, <<"No L0 file found">>}, + p0018 => + {info, <<"Response to push_mem of returned with cache_size=~w L0_pending=~w merge_backlog=~w cachelines_full=~w">>}, + p0019 => + {info, <<"Rolling level zero to filename ~s at ledger sqn ~w">>}, + p0024 => + {info, <<"Outstanding compaction work items of ~w with backlog status of ~w">>}, + p0029 => + {info, <<"L0 completion confirmed and will transition to not pending">>}, + p0030 => + {warn, <<"We're doomed - intention recorded to destroy all files">>}, + p0031 => + {info, <<"Completion of update to levelzero with cache_size=~w level0_due=~w change_pending=~w MinSQN=~w MaxSQN=~w">>}, + p0032 => + {info, <<"Fetch head timing with sample_count=~w and level timings of foundmem_time=~w found0_time=~w found1_time=~w found2_time=~w found3_time=~w foundlower_time=~w missed_time=~w with counts of foundmem_count=~w found0_count=~w found1_count=~w found2_count=~w found3_count=~w foundlower_count=~w missed_count=~w">>}, + p0033 => + {error, <<"Corrupted manifest file at path ~s to be ignored due to error ~s">>}, + p0035 => + {info, <<"Startup with Manifest SQN of ~w">>}, + p0037 => + {debug, <<"Merging of penciller L0 tree from size ~w complete">>}, + p0038 => + {info, <<"Timeout of snapshot with pid=~w at SQN=~w at TS ~w set to timeout=~w">>}, + p0039 => + {debug, <<"Failed to release pid=~w leaving SnapshotCount=~w and MinSQN=~w">>}, + p0040 => + {info, <<"Archiving filename ~s as unused at startup">>}, + p0041 => + {info, <<"Penciller manifest switched from SQN ~w to ~w">>}, + pc001 => + {info, <<"Penciller's clerk ~w started with owner ~w">>}, + pc005 => + {info, <<"Penciller's Clerk ~w shutdown now complete for reason ~w">>}, + pc007 => + {debug, <<"Clerk prompting Penciller regarding manifest change">>}, + pc008 => + {info, <<"Merge from level ~w to merge into ~w files below">>}, + pc009 => + {debug, <<"File ~s to simply switch levels to level ~w">>}, + pc010 => + {info, <<"Merge to be commenced for FileToMerge=~s with MSN=~w">>}, + pc011 => + {info, <<"Merge completed with MSN=~w to Level=~w and FileCounter=~w">>}, + pc012 => + {debug, <<"File to be created as part of MSN=~w Filename=~s IsBasement=~w">>}, + pc013 => + {warn, <<"Merge resulted in empty file ~s">>}, + pc015 => + {info, <<"file created">>}, + pc016 => + {info, <<"Slow fetch from SFT ~w of ~w us at level ~w with result ~w">>}, + pc017 => + {debug, <<"Notified clerk of manifest change">>}, + pc018 => + {info, <<"Saved manifest file">>}, + pc019 => + {debug, <<"After ~s level ~w is ~w">>}, + pc021 => + {debug, <<"Prompting deletions at ManifestSQN=~w">>}, + pc022 => + {debug, <<"Storing reference to deletions at ManifestSQN=~w">>}, + pc023 => + {info, <<"At level=~w file_count=~w avg_mem=~w file with most memory fn=~s p=~w mem=~w">>}, + pc024 => + {info, <<"Grooming compaction picked file with tomb_count=~w">>}, + pm002 => + {info, <<"Completed dump of L0 cache to list of l0cache_size=~w">>}, + sst03 => + {info, <<"Opening SST file with filename ~s slot_count=~w and max sqn ~w">>}, + sst04 => + {debug, <<"Exit called for reason ~w on filename 
~s">>}, + sst05 => + {warn, <<"Rename rogue filename ~s to ~s">>}, + sst06 => + {debug, <<"File ~s has been set for delete">>}, + sst07 => + {info, <<"Exit called and now clearing ~s">>}, + sst08 => + {info, <<"Completed creation of ~s at level ~w with max sqn ~w">>}, + sst09 => + {warn, <<"Read request exposes slot with bad CRC">>}, + sst10 => + {debug, <<"Expansion sought to support pointer to pid ~w status ~w">>}, + sst11 => + {info, <<"Level zero creation timings in microseconds pmem_fetch=~w merge_lists=~w build_slots=~w build_summary=~w read_switch=~w">>}, + sst12 => + {info, <<"SST Timings at level=~w for sample_count=~w at timing points notfound_time=~w fetchcache_time=~w slotcached_time=~w slotnoncached_time=~w exiting at points notfound_count=~w fetchcache_count=~w slotcached_count=~w slotnoncached_count=~w">>}, + sst13 => + {info, <<"SST merge list build timings of fold_toslot=~w slot_hashlist=~w slot_serialise=~w slot_finish=~w is_basement=~w level=~w">>}, + sst14 => + {debug, <<"File ~s has completed BIC">>}, + i0001 => + {info, <<"Unexpected failure to fetch value for Key=~w SQN=~w with reason ~w">>}, + i0002 => + {debug, <<"Journal snapshot ~w registered at SQN ~w">>}, + i0003 => + {debug, <<"Journal snapshot ~w released">>}, + i0004 => + {info, <<"Remaining number of journal snapshots is ~w">>}, + i0005 => + {info, <<"Inker closing journal for reason ~w">>}, + i0006 => + {info, <<"Close triggered with journal_sqn=~w and manifest_sqn=~w">>}, + i0007 => + {info, <<"Inker manifest when closing is:">>}, + i0008 => + {info, <<"Put to new active journal required roll and manifest write">>}, + i0009 => + {info, <<"Updated manifest on startup:">>}, + i0010 => + {info, <<"Unchanged manifest on startup:">>}, + i0011 => + {info, <<"Manifest is empty, starting from manifest SQN 1">>}, + i0012 => + {info, <<"Head manifest entry ~s is complete so new active journal required">>}, + i0013 => + {info, <<"File ~s to be removed from manifest">>}, + i0014 => + {info, <<"On startup loading from filename ~s from SQN ~w">>}, + i0015 => + {info, <<"Opening manifest file at ~s with SQN ~w">>}, + i0016 => + {info, <<"Writing new version of manifest for manifestSQN=~w">>}, + i0017 => + {debug, <<"At SQN=~w journal has filename ~s">>}, + i0018 => + {warn, <<"We're doomed - intention recorded to destroy all files">>}, + i0020 => + {info, <<"Journal backup completed to path=~s with file_count=~w">>}, + i0021 => + {info, <<"Ingoring filename=~s with SQN=~w and JournalSQN=~w">>}, + i0022 => + {info, <<"Removing filename=~s from backup folder as not in backup">>}, + i0023 => + {info, <<"Backup commencing into folder with ~w existing files">>}, + i0024 => + {info, <<"Prompted roll at NewSQN=~w">>}, + i0025 => + {warn, <<"Journal SQN of ~w is below Ledger SQN of ~w anti-entropy will be required">>}, + ic001 => + {info, <<"Closed for reason ~w so maybe leaving garbage">>}, + ic002 => + {info, <<"Clerk updating Inker as compaction complete of ~w files">>}, + ic003 => + {info, <<"Scoring of compaction runs complete with highest score=~w with run of run_length=~w">>}, + ic004 => + {info, <<"Score=~w with mean_byte_jump=~w for filename ~s">>}, + ic005 => + {info, <<"Compaction to be performed on file_count=~w with compaction_score=~w">>}, + ic006 => + {info, <<"Filename ~s is part of compaction run">>}, + ic007 => + {info, <<"Clerk has completed compaction process">>}, + ic008 => + {info, <<"Compaction source ~s has yielded ~w positions">>}, + ic009 => + {info, <<"Generate journal for compaction with filename 
~s">>}, + ic010 => + {info, <<"Clearing journal with filename ~s">>}, + ic011 => + {info, <<"Not clearing filename ~s as modified delta is only ~w seconds">>}, + ic012 => + {warn, <<"Tag ~w not found in Strategy ~w - maybe corrupted">>}, + ic013 => + {warn, "File with name ~s to be ignored in manifest as scanning for first key returned empty - maybe corrupted"}, + ic014 => + {info, <<"Compaction to be run with strategy ~w and max_run_length ~w">>}, + cdb01 => + {info, <<"Opening file for writing with filename ~s">>}, + cdb02 => + {info, <<"Opening file for reading with filename ~s">>}, + cdb03 => + {info, <<"Re-opening file for reading with filename ~s">>}, + cdb04 => + {info, <<"Deletion confirmed for file ~s at ManifestSQN ~w">>}, + cdb05 => + {info, <<"Closing of filename ~s from state ~w for reason ~w">>}, + cdb06 => + {warn, <<"File to be truncated at last position of ~w with end of file at ~w">>}, + cdb07 => + {info, <<"Hashtree index computed">>}, + cdb08 => + {info, <<"Renaming file from ~s to ~s for which existence is ~w">>}, + cdb09 => + {info, <<"Failure to read Key/Value at Position ~w in scan this may be the end of the file">>}, + cdb10 => + {warn, <<"CRC check failed due to error=~s">>}, + cdb12 => + {info, <<"Hashtree index writte">>}, + cdb13 => + {debug, <<"Write options of ~w">>}, + cdb14 => + {info, <<"Microsecond timings for hashtree build of to_list=~w sort=~w build=~w">>}, + cdb15 => + {info, <<"Collision in search for hash ~w">>}, + cdb18 => + {info, <<"Handled return and write of hashtable">>}, + cdb19 => + {info, <<"Sample timings in microseconds for sample_count=~w with totals of cycle_count=~w index_time=~w read_time=~w">>}, + cdb20 => + {warn, <<"Error ~w caught when safe reading a file to length ~w">>}, + cdb21 => + {warn, <<"File ~s to be deleted but already gone">>} + }). %%%============================================================================ @@ -403,7 +328,7 @@ set_databaseid(DBid) when is_integer(DBid) -> UpdLO = LO#log_options{database_id = DBid}, save(UpdLO). --spec add_forcedlogs(list(string())) -> ok. +-spec add_forcedlogs(list(atom())) -> ok. %% @doc %% Add a forced log to the list of forced logs. this will cause the log of this %% logReference to be logged even if the log_level of the process would not @@ -415,7 +340,7 @@ add_forcedlogs(LogRefs) -> UpdLO = LO#log_options{forced_logs = lists:usort(LogRefs ++ ForcedLogs)}, save(UpdLO). --spec remove_forcedlogs(list(string())) -> ok. +-spec remove_forcedlogs(list(atom())) -> ok. %% @doc %% Remove a forced log from the list of forced logs remove_forcedlogs(LogRefs) -> @@ -454,26 +379,20 @@ return_settings() -> %%% Prompt Logs %%%============================================================================ - +-spec log(atom(), list()) -> ok. log(LogReference, Subs) -> log(LogReference, Subs, ?LOG_LEVELS). 
log(LogRef, Subs, SupportedLogLevels) -> - case lists:keyfind(LogRef, 1, ?LOGBASE) of - {LogRef, {LogLevel, LogText}} -> - LogOpts = get_opts(), - case should_i_log(LogLevel, SupportedLogLevels, LogRef, LogOpts) of - true -> - DBid = LogOpts#log_options.database_id, - io:format(format_time() ++ " " - ++ " log_level=" - ++ atom_to_list(LogLevel) ++ " log_ref=" - ++ LogRef ++ " db_id=~w pid=~w " - ++ LogText ++ "~n", - [DBid|[self()|Subs]]); - false -> - ok - end; + {LogLevel, Log} = maps:get(LogRef, ?LOGBASE), + LogOpts = get_opts(), + case should_i_log(LogLevel, SupportedLogLevels, LogRef, LogOpts) of + true -> + DBid = LogOpts#log_options.database_id, + Time = format_time(), + Prefix = log_prefix(LogLevel, LogRef, DBid, self()), + Suffix = "~n", + io:format(iolist_to_binary([Time, Prefix, Log, Suffix]), Subs); false -> ok end. @@ -502,31 +421,18 @@ log_timer(LogReference, Subs, StartTime) -> log_timer(LogReference, Subs, StartTime, ?LOG_LEVELS). log_timer(LogRef, Subs, StartTime, SupportedLevels) -> - case lists:keyfind(LogRef, 1, ?LOGBASE) of - {LogRef, {LogLevel, LogText}} -> - LogOpts = get_opts(), - case should_i_log(LogLevel, SupportedLevels, LogRef, LogOpts) of - true -> - DurationText = - case timer:now_diff(os:timestamp(), StartTime) of - US when US > 1000 -> - " with us_duration=" ++ integer_to_list(US) ++ - " or ms_duration=" - ++ integer_to_list(US div 1000); - US -> - " with us_duration=" ++ integer_to_list(US) - end, - DBid = LogOpts#log_options.database_id, - io:format(format_time() ++ " " - ++ " log_level=" - ++ atom_to_list(LogLevel) ++ " log_ref=" - ++ LogRef ++ " db_id=~w pid=~w " - ++ LogText - ++ DurationText ++ "~n", - [DBid|[self()|Subs]]); - false -> - ok - end; + {LogLevel, Log} = maps:get(LogRef, ?LOGBASE), + LogOpts = get_opts(), + case should_i_log(LogLevel, SupportedLevels, LogRef, LogOpts) of + true -> + Duration = duration_text(StartTime), + DBid = LogOpts#log_options.database_id, + Time = format_time(), + Prefix = log_prefix(LogLevel, LogRef, DBid, self()), + Suffix = <<"~n">>, + io:format( + iolist_to_binary([Time, Prefix, Log, Duration, Suffix]), + Subs); false -> ok end. @@ -540,6 +446,7 @@ log_randomtimer(LogReference, Subs, StartTime, RandomProb) -> ok end. +-spec format_time() -> io_lib:chars(). format_time() -> format_time(localtime_ms()). @@ -552,37 +459,48 @@ format_time({{Y, M, D}, {H, Mi, S, Ms}}) -> io_lib:format("~b-~2..0b-~2..0b", [Y, M, D]) ++ "T" ++ io_lib:format("~2..0b:~2..0b:~2..0b.~3..0b", [H, Mi, S, Ms]). +-spec log_prefix(atom(), atom(), non_neg_integer(), pid()) -> io_lib:chars(). +log_prefix(LogLevel, LogRef, DBid, Pid) -> + io_lib:format( + " log_level=~w log_ref=~w db_id=~w pid=~w ", + [LogLevel, LogRef, DBid, Pid]). + +-spec duration_text(erlang:timestamp()) -> io_lib:chars(). +duration_text(StartTime) -> + case timer:now_diff(os:timestamp(), StartTime) of + US when US > 1000 -> + io_lib:format( + " with us_duration=~w or ms_duration=~w", [US, US div 1000]); + US -> + io_lib:format(" with us_duration=~w", [US]) + end. %%%============================================================================ %%% Test %%%============================================================================ - - -ifdef(TEST). log_test() -> - log("D0001", []), - log_timer("D0001", [], os:timestamp()). + log(d0001, []), + log_timer(d0001, [], os:timestamp()). 
log_warn_test() -> - ok = log("G0001", [], [warn, error]), - ok = log("G8888", [], [info, warn, error]), - ok = log_timer("G0001", [], os:timestamp(), [warn, error]), - ok = log_timer("G8888", [], os:timestamp(), [info, warn, error]). + ok = log(g0001, [], [warn, error]), + ok = log_timer(g0001, [], os:timestamp(), [warn, error]). shouldilog_test() -> ok = set_loglevel(debug), - ?assertMatch(true, should_i_log(info, ?LOG_LEVELS, "G0001")), + ?assertMatch(true, should_i_log(info, ?LOG_LEVELS, g0001)), ok = set_loglevel(info), - ?assertMatch(true, should_i_log(info, ?LOG_LEVELS, "G0001")), - ok = add_forcedlogs(["G0001"]), + ?assertMatch(true, should_i_log(info, ?LOG_LEVELS, g0001)), + ok = add_forcedlogs([g0001]), ok = set_loglevel(error), - ?assertMatch(true, should_i_log(info, ?LOG_LEVELS, "G0001")), - ?assertMatch(false, should_i_log(info, ?LOG_LEVELS, "G0002")), - ok = remove_forcedlogs(["G0001"]), + ?assertMatch(true, should_i_log(info, ?LOG_LEVELS, g0001)), + ?assertMatch(false, should_i_log(info, ?LOG_LEVELS, g0002)), + ok = remove_forcedlogs([g0001]), ok = set_loglevel(info), - ?assertMatch(false, should_i_log(debug, ?LOG_LEVELS, "D0001")). + ?assertMatch(false, should_i_log(debug, ?LOG_LEVELS, d0001)). badloglevel_test() -> % Set a bad log level - and everything logs diff --git a/src/leveled_monitor.erl b/src/leveled_monitor.erl index 9601e53c..fb6d8276 100644 --- a/src/leveled_monitor.erl +++ b/src/leveled_monitor.erl @@ -47,15 +47,19 @@ -record(bookie_head_timings, {sample_count = 0 :: non_neg_integer(), - fetch_time = 0 :: non_neg_integer(), + cache_count = 0 :: non_neg_integer(), + found_count = 0 :: non_neg_integer(), + cache_hits = 0 :: non_neg_integer(), + fetch_ledger_time = 0 :: non_neg_integer(), + fetch_ledgercache_time = 0 :: non_neg_integer(), rsp_time = 0 :: non_neg_integer(), - fetch_count = 0 :: non_neg_integer(), - cache_count = 0 :: non_neg_integer()}). + notfound_time = 0 :: non_neg_integer()}). -record(bookie_put_timings, {sample_count = 0 :: non_neg_integer(), - mem_time = 0 :: non_neg_integer(), ink_time = 0 :: non_neg_integer(), + prep_time = 0 :: non_neg_integer(), + mem_time = 0 :: non_neg_integer(), total_size = 0 :: non_neg_integer()}). -record(bookie_snap_timings, @@ -133,7 +137,7 @@ -type bookie_head_update() :: {bookie_head_update, microsecs(), microsecs()|not_found, 0..1}. -type bookie_put_update() :: - {bookie_put_update, microsecs(), microsecs(), byte_size()}. + {bookie_put_update, microsecs(), microsecs(), microsecs(), byte_size()}. -type bookie_snap_update() :: {bookie_snap_update, microsecs(), microsecs()}. 
-type pcl_fetch_update() :: @@ -232,25 +236,31 @@ handle_call(close, _From, State) -> handle_cast({bookie_head_update, FetchTime, RspTime, CacheHit}, State) -> Timings = State#state.bookie_head_timings, SC0 = Timings#bookie_head_timings.sample_count + 1, - CC0 = Timings#bookie_head_timings.cache_count + CacheHit, - {FC0, PT0, RT0} = - case RspTime of - not_found -> - {Timings#bookie_head_timings.fetch_count, - Timings#bookie_head_timings.fetch_time + FetchTime, - Timings#bookie_head_timings.rsp_time}; - RspTime -> - {Timings#bookie_head_timings.fetch_count + 1, - Timings#bookie_head_timings.fetch_time + FetchTime, - Timings#bookie_head_timings.rsp_time + RspTime} + CC0 = Timings#bookie_head_timings.cache_count + CacheHit, + FC = Timings#bookie_head_timings.found_count, + FLT = Timings#bookie_head_timings.fetch_ledger_time, + FCT = Timings#bookie_head_timings.fetch_ledgercache_time, + RST = Timings#bookie_head_timings.rsp_time, + NFT = Timings#bookie_head_timings.notfound_time, + + {FC0, FLT0, FCT0, RST0, NFT0} = + case {RspTime, CacheHit} of + {not_found, _} -> + {FC, FLT, FCT, RST, NFT + FetchTime}; + {RspTime, 0} -> + {FC + 1, FLT + FetchTime, FCT, RST + RspTime, NFT}; + {RspTime, 1} -> + {FC + 1, FLT, FCT + FetchTime, RST + RspTime, NFT} end, UpdTimings = #bookie_head_timings{ sample_count = SC0, - fetch_time = PT0, - rsp_time = RT0, - fetch_count = FC0, - cache_count = CC0 + cache_count = CC0, + found_count = FC0, + fetch_ledger_time = FLT0, + fetch_ledgercache_time = FCT0, + rsp_time = RST0, + notfound_time = NFT0 }, {noreply, State#state{bookie_head_timings = UpdTimings}}; handle_cast({bookie_get_update, HeadTime, BodyTime}, State) -> @@ -275,17 +285,19 @@ handle_cast({bookie_get_update, HeadTime, BodyTime}, State) -> fetch_count = FC0 }, {noreply, State#state{bookie_get_timings = UpdTimings}}; -handle_cast({bookie_put_update, MemTime, InkTime, Size}, State) -> +handle_cast({bookie_put_update, InkTime, PrepTime, MemTime, Size}, State) -> Timings = State#state.bookie_put_timings, SC0 = Timings#bookie_put_timings.sample_count + 1, SZ0 = Timings#bookie_put_timings.total_size + Size, - MT0 = Timings#bookie_put_timings.mem_time + MemTime, IT0 = Timings#bookie_put_timings.ink_time + InkTime, + PT0 = Timings#bookie_put_timings.prep_time + PrepTime, + MT0 = Timings#bookie_put_timings.mem_time + MemTime, UpdTimings = #bookie_put_timings{ sample_count = SC0, - mem_time = MT0, ink_time = IT0, + prep_time = PT0, + mem_time = MT0, total_size = SZ0 }, {noreply, State#state{bookie_put_timings = UpdTimings}}; @@ -419,7 +431,7 @@ handle_cast({cdb_get_update, CycleCount, IndexTime, ReadTime}, State) -> handle_cast({report_stats, bookie_get}, State) -> Timings = State#state.bookie_get_timings, leveled_log:log( - "B0016", + b0016, [Timings#bookie_get_timings.sample_count, Timings#bookie_get_timings.head_time, Timings#bookie_get_timings.body_time, @@ -428,26 +440,29 @@ handle_cast({report_stats, bookie_get}, State) -> handle_cast({report_stats, bookie_head}, State) -> Timings = State#state.bookie_head_timings, leveled_log:log( - "B0018", + b0018, [Timings#bookie_head_timings.sample_count, - Timings#bookie_head_timings.fetch_time, + Timings#bookie_head_timings.cache_count, + Timings#bookie_head_timings.found_count, + Timings#bookie_head_timings.fetch_ledger_time, + Timings#bookie_head_timings.fetch_ledgercache_time, Timings#bookie_head_timings.rsp_time, - Timings#bookie_head_timings.fetch_count, - Timings#bookie_head_timings.cache_count]), + Timings#bookie_head_timings.notfound_time]), {noreply, 
State#state{bookie_head_timings = #bookie_head_timings{}}}; handle_cast({report_stats, bookie_put}, State) -> Timings = State#state.bookie_put_timings, leveled_log:log( - "B0015", + b0015, [Timings#bookie_put_timings.sample_count, - Timings#bookie_put_timings.mem_time, Timings#bookie_put_timings.ink_time, + Timings#bookie_put_timings.prep_time, + Timings#bookie_put_timings.mem_time, Timings#bookie_put_timings.total_size]), {noreply, State#state{bookie_put_timings = #bookie_put_timings{}}}; handle_cast({report_stats, bookie_snap}, State) -> Timings = State#state.bookie_snap_timings, leveled_log:log( - "B0017", + b0017, [Timings#bookie_snap_timings.sample_count, Timings#bookie_snap_timings.bookie_time, Timings#bookie_snap_timings.pcl_time]), @@ -455,7 +470,7 @@ handle_cast({report_stats, bookie_snap}, State) -> handle_cast({report_stats, pcl_fetch}, State) -> Timings = State#state.pcl_fetch_timings, leveled_log:log( - "P0032", + p0032, [Timings#pcl_fetch_timings.sample_count, Timings#pcl_fetch_timings.foundmem_time, Timings#pcl_fetch_timings.found0_time, @@ -476,7 +491,7 @@ handle_cast({report_stats, sst_fetch}, State) -> LogFun = fun({Level, Timings}) -> leveled_log:log( - "SST12", + sst12, [Level, Timings#sst_fetch_timings.sample_count, Timings#sst_fetch_timings.notfound_time, @@ -493,7 +508,7 @@ handle_cast({report_stats, sst_fetch}, State) -> handle_cast({report_stats, cdb_get}, State) -> Timings = State#state.cdb_get_timings, leveled_log:log( - "CDB19", + cdb19, [Timings#cdb_get_timings.sample_count, Timings#cdb_get_timings.cycle_count, Timings#cdb_get_timings.index_time, diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 2eca1f2a..c5c7618e 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -72,7 +72,7 @@ clerk_new(Owner, RootPath, OptsSST) -> {sst_options, OptsSST}], []), ok = gen_server:call(Pid, {load, Owner, RootPath}, infinity), - leveled_log:log("PC001", [Pid, Owner]), + leveled_log:log(pc001, [Pid, Owner]), {ok, Pid}. -spec clerk_prompt(pid()) -> ok. @@ -132,7 +132,7 @@ handle_cast({push_work, Work}, State) -> Work, State#state.root_path, State#state.sst_options, State#state.owner), PDs = dict:store(ManifestSQN, Deletions, State#state.pending_deletions), - leveled_log:log("PC022", [ManifestSQN]), + leveled_log:log(pc022, [ManifestSQN]), {noreply, State#state{pending_deletions = PDs}, ?MIN_TIMEOUT}; handle_cast({prompt_deletions, ManifestSQN}, State) -> {Deletions, UpdD} = return_deletions(ManifestSQN, @@ -165,7 +165,7 @@ handle_info(timeout, State) -> {noreply, State, ?MAX_TIMEOUT}. terminate(Reason, _State) -> - leveled_log:log("PC005", [self(), Reason]). + leveled_log:log(pc005, [self(), Reason]). code_change(_OldVsn, State, _Extra) -> {ok, State}. @@ -184,13 +184,13 @@ handle_work( {SrcLevel, Manifest}, RootPath, SSTOpts, Owner) -> {UpdManifest, EntriesToDelete} = merge(SrcLevel, Manifest, RootPath, SSTOpts), - leveled_log:log("PC007", []), + leveled_log:log(pc007, []), SWMC = os:timestamp(), ok = leveled_penciller:pcl_manifestchange(Owner, UpdManifest), - leveled_log:log_timer("PC017", [], SWMC), + leveled_log:log_timer(pc017, [], SWMC), SWSM = os:timestamp(), ok = leveled_pmanifest:save_manifest(UpdManifest, RootPath), - leveled_log:log_timer("PC018", [], SWSM), + leveled_log:log_timer(pc018, [], SWSM), {leveled_pmanifest:get_manifest_sqn(UpdManifest), EntriesToDelete}. 
-spec merge( @@ -203,7 +203,7 @@ merge(SrcLevel, Manifest, RootPath, OptsSST) -> {0, 0, undefined} -> ok; {FCnt, AvgMem, {MaxFN, MaxP, MaxMem}} -> - leveled_log:log("PC023", + leveled_log:log(pc023, [SrcLevel + 1, FCnt, AvgMem, MaxFN, MaxP, MaxMem]) end, SelectMethod = @@ -221,11 +221,11 @@ merge(SrcLevel, Manifest, RootPath, OptsSST) -> Src#manifest_entry.start_key, Src#manifest_entry.end_key), Candidates = length(SinkList), - leveled_log:log("PC008", [SrcLevel, Candidates]), + leveled_log:log(pc008, [SrcLevel, Candidates]), case Candidates of 0 -> NewLevel = SrcLevel + 1, - leveled_log:log("PC009", [Src#manifest_entry.filename, NewLevel]), + leveled_log:log(pc009, [Src#manifest_entry.filename, NewLevel]), leveled_sst:sst_switchlevels(Src#manifest_entry.owner, NewLevel), Man0 = leveled_pmanifest:switch_manifest_entry(Manifest, NewSQN, @@ -256,7 +256,7 @@ perform_merge(Manifest, Src, SinkList, SrcLevel, RootPath, NewSQN, OptsSST) -> - leveled_log:log("PC010", [Src#manifest_entry.filename, NewSQN]), + leveled_log:log(pc010, [Src#manifest_entry.filename, NewSQN]), SrcList = [{next, Src, all}], MaxSQN = leveled_sst:sst_getmaxsequencenumber(Src#manifest_entry.owner), SinkLevel = SrcLevel + 1, @@ -284,19 +284,19 @@ perform_merge(Manifest, {Man2, [Src|SinkManifestList]}. do_merge([], [], SinkLevel, _SinkB, _RP, NewSQN, _MaxSQN, _Opts, Additions) -> - leveled_log:log("PC011", [NewSQN, SinkLevel, length(Additions)]), + leveled_log:log(pc011, [NewSQN, SinkLevel, length(Additions)]), Additions; do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, OptsSST, Additions) -> FileName = leveled_penciller:sst_filename(NewSQN, SinkLevel, length(Additions)), - leveled_log:log("PC012", [NewSQN, FileName, SinkB]), + leveled_log:log(pc012, [NewSQN, FileName, SinkB]), TS1 = os:timestamp(), case leveled_sst:sst_newmerge(RP, FileName, KL1, KL2, SinkB, SinkLevel, MaxSQN, OptsSST) of empty -> - leveled_log:log("PC013", [FileName]), + leveled_log:log(pc013, [FileName]), do_merge([], [], SinkLevel, SinkB, RP, NewSQN, MaxSQN, @@ -309,7 +309,7 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, OptsSST, Additions) -> owner=Pid, filename=FileName, bloom=Bloom}, - leveled_log:log_timer("PC015", [], TS1), + leveled_log:log_timer(pc015, [], TS1), do_merge(KL1Rem, KL2Rem, SinkLevel, SinkB, RP, NewSQN, MaxSQN, @@ -323,7 +323,7 @@ do_merge(KL1, KL2, SinkLevel, SinkB, RP, NewSQN, MaxSQN, OptsSST, Additions) -> grooming_scorer([ME | MEs]) -> InitTombCount = leveled_sst:sst_gettombcount(ME#manifest_entry.owner), {HighestTC, BestME} = grooming_scorer(InitTombCount, ME, MEs), - leveled_log:log("PC024", [HighestTC]), + leveled_log:log(pc024, [HighestTC]), BestME. grooming_scorer(HighestTC, BestME, []) -> @@ -346,7 +346,7 @@ return_deletions(ManifestSQN, PendingDeletionD) -> % % So this is now allowed to crash again PendingDeletions = dict:fetch(ManifestSQN, PendingDeletionD), - leveled_log:log("PC021", [ManifestSQN]), + leveled_log:log(pc021, [ManifestSQN]), {PendingDeletions, dict:erase(ManifestSQN, PendingDeletionD)}. 
%%%============================================================================ diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 3ce6a32f..6b380776 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -636,7 +636,7 @@ init([LogOpts, PCLopts]) -> Query, BookiesMem, LongRunning), - leveled_log:log("P0001", [self()]), + leveled_log:log(p0001, [self()]), {ok, State#state{is_snapshot = true, bookie_monref = BookieMonitor, source_penciller = SrcPenciller}}; @@ -673,7 +673,7 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, % The Bookie must now retain the lesger cache and try to push the % updated cache at a later time leveled_log:log( - "P0018", + p0018, [L0Size, L0Pending, WorkBacklog, CacheAlreadyFull]), {reply, returned, State}; false -> @@ -698,10 +698,7 @@ handle_call({push_mem, {LedgerTable, PushedIdx, MinSQN, MaxSQN}}, State#state.levelzero_index, length(State#state.levelzero_cache) + 1), leveled_log:log_randomtimer( - "P0031", - [NewL0Size, true, true, MinSQN, MaxSQN], - SW, - 0.1), + p0031, [NewL0Size, true, true, MinSQN, MaxSQN], SW, 0.1), {reply, ok, State#state{ @@ -776,10 +773,8 @@ handle_call({fetch_keys, lists:filter(FilterFun, L0AsList) end, - leveled_log:log_randomtimer("P0037", - [State#state.levelzero_size], - SW, - 0.01), + leveled_log:log_randomtimer( + p0037, [State#state.levelzero_size], SW, 0.01), %% Rename any reference to loop state that may be used by the function %% to be returned - https://github.com/martinsumner/leveled/issues/326 @@ -858,10 +853,8 @@ handle_call({register_snapshot, Snapshot, Query, BookiesMem, LongRunning}, EndKey, State#state.levelzero_cache, LM1Cache), - leveled_log:log_randomtimer("P0037", - [State#state.levelzero_size], - SW, - 0.01), + leveled_log:log_randomtimer( + p0037, [State#state.levelzero_size], SW, 0.01), {#state{levelzero_astree = L0AsTree, ledger_sqn = MaxSQN, persisted_sqn = State#state.persisted_sqn}, @@ -915,7 +908,7 @@ handle_call(close, _From, State) -> % The penciller should close each file in the manifest, and call a close % on the clerk. 
ok = leveled_pclerk:clerk_close(State#state.clerk), - leveled_log:log("P0008", [close]), + leveled_log:log(p0008, [close]), L0Left = State#state.levelzero_size > 0, case (not State#state.levelzero_pending and L0Left) of true -> @@ -931,12 +924,12 @@ handle_call(close, _From, State) -> true), ok = leveled_sst:sst_close(Constructor); false -> - leveled_log:log("P0010", [State#state.levelzero_size]) + leveled_log:log(p0010, [State#state.levelzero_size]) end, shutdown_manifest(State#state.manifest, State#state.levelzero_constructor), {stop, normal, ok, State}; handle_call(doom, _From, State) -> - leveled_log:log("P0030", []), + leveled_log:log(p0030, []), ok = leveled_pclerk:clerk_close(State#state.clerk), shutdown_manifest(State#state.manifest, State#state.levelzero_constructor), @@ -970,7 +963,7 @@ handle_call(persisted_sqn, _From, State) -> handle_cast({manifest_change, Manifest}, State) -> NewManSQN = leveled_pmanifest:get_manifest_sqn(Manifest), OldManSQN = leveled_pmanifest:get_manifest_sqn(State#state.manifest), - leveled_log:log("P0041", [OldManSQN, NewManSQN]), + leveled_log:log(p0041, [OldManSQN, NewManSQN]), % Only safe to update the manifest if the SQN increments if NewManSQN > OldManSQN -> ok = @@ -995,7 +988,7 @@ handle_cast({manifest_change, Manifest}, State) -> handle_cast({release_snapshot, Snapshot}, State) -> Manifest0 = leveled_pmanifest:release_snapshot(State#state.manifest, Snapshot), - leveled_log:log("P0003", [Snapshot]), + leveled_log:log(p0003, [Snapshot]), {noreply, State#state{manifest=Manifest0}}; handle_cast({confirm_delete, PDFN, FilePid}, State=#state{is_snapshot=Snap}) when Snap == false -> @@ -1015,7 +1008,7 @@ handle_cast({confirm_delete, PDFN, FilePid}, State=#state{is_snapshot=Snap}) % will be cleared from pending using the maybe_release boolean case leveled_pmanifest:ready_to_delete(State#state.manifest, PDFN) of true -> - leveled_log:log("P0005", [PDFN]), + leveled_log:log(p0005, [PDFN]), ok = leveled_sst:sst_deleteconfirmed(FilePid), case State#state.work_ongoing of true -> @@ -1047,7 +1040,7 @@ handle_cast({confirm_delete, PDFN, FilePid}, State=#state{is_snapshot=Snap}) end end; handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) -> - leveled_log:log("P0029", []), + leveled_log:log(p0029, []), ManEntry = #manifest_entry{start_key=StartKey, end_key=EndKey, owner=State#state.levelzero_constructor, @@ -1108,7 +1101,7 @@ handle_cast(work_for_clerk, State) -> {noreply, State#state{work_backlog=false}}; N -> Backlog = N > ?WORKQUEUE_BACKLOG_TOLERANCE, - leveled_log:log("P0024", [N, Backlog]), + leveled_log:log(p0024, [N, Backlog]), [TL|_Tail] = WL, ok = leveled_pclerk:clerk_push( @@ -1161,9 +1154,9 @@ handle_info(_Info, State) -> {noreply, State}. terminate(Reason, _State=#state{is_snapshot=Snap}) when Snap == true -> - leveled_log:log("P0007", [Reason]); + leveled_log:log(p0007, [Reason]); terminate(Reason, _State) -> - leveled_log:log("P0011", [Reason]). + leveled_log:log(p0011, [Reason]). 
format_status(normal, [_PDict, State]) -> State; @@ -1238,15 +1231,15 @@ start_from_file(PCLopts) -> SQNFun = fun leveled_sst:sst_getmaxsequencenumber/1, {MaxSQN, Manifest1, FileList} = leveled_pmanifest:load_manifest(Manifest0, OpenFun, SQNFun), - leveled_log:log("P0014", [MaxSQN]), + leveled_log:log(p0014, [MaxSQN]), ManSQN = leveled_pmanifest:get_manifest_sqn(Manifest1), - leveled_log:log("P0035", [ManSQN]), + leveled_log:log(p0035, [ManSQN]), %% Find any L0 files L0FN = sst_filename(ManSQN + 1, 0, 0), {State0, FileList0} = case filelib:is_file(filename:join(sst_rootpath(RootPath), L0FN)) of true -> - leveled_log:log("P0015", [L0FN]), + leveled_log:log(p0015, [L0FN]), L0Open = leveled_sst:sst_open(sst_rootpath(RootPath), L0FN, OptsSST, @@ -1263,14 +1256,14 @@ start_from_file(PCLopts) -> ManSQN + 1, 0, L0Entry), - leveled_log:log("P0016", [L0SQN]), + leveled_log:log(p0016, [L0SQN]), LedgerSQN = max(MaxSQN, L0SQN), {InitState#state{manifest = Manifest2, ledger_sqn = LedgerSQN, persisted_sqn = LedgerSQN}, [L0FN|FileList]}; false -> - leveled_log:log("P0017", []), + leveled_log:log(p0017, []), {InitState#state{manifest = Manifest1, ledger_sqn = MaxSQN, persisted_sqn = MaxSQN}, @@ -1336,7 +1329,7 @@ archive_files(RootPath, UsedFileList) -> true -> UnusedFiles; false -> - leveled_log:log("P0040", [FN0]), + leveled_log:log(p0040, [FN0]), [FN0|UnusedFiles] end; _ -> @@ -1401,7 +1394,7 @@ maybe_cache_too_big(NewL0Size, L0MaxSize, CoinToss) -> roll_memory(NextManSQN, LedgerSQN, RootPath, none, CL, SSTOpts, false) -> L0Path = sst_rootpath(RootPath), L0FN = sst_filename(NextManSQN, 0, 0), - leveled_log:log("P0019", [L0FN, LedgerSQN]), + leveled_log:log(p0019, [L0FN, LedgerSQN]), PCL = self(), FetchFun = fun(Slot, ReturnFun) -> pcl_fetchlevelzero(PCL, Slot, ReturnFun) end, @@ -1516,10 +1509,10 @@ log_slowfetch(T0, R, PID, Level, FetchTolerance) -> {T, R} when T < FetchTolerance -> R; {T, not_present} -> - leveled_log:log("PC016", [PID, T, Level, not_present]), + leveled_log:log(pc016, [PID, T, Level, not_present]), not_present; {T, R} -> - leveled_log:log("PC016", [PID, T, Level, found]), + leveled_log:log(pc016, [PID, T, Level, found]), R end. 
diff --git a/src/leveled_pmanifest.erl b/src/leveled_pmanifest.erl index b3827622..5286e153 100644 --- a/src/leveled_pmanifest.erl +++ b/src/leveled_pmanifest.erl @@ -325,7 +325,7 @@ replace_manifest_entry(Manifest, ManSQN, LevelIdx, Removals, Additions) -> {UpdBlooms, StrippedAdditions} = update_blooms(Removals, Additions, Manifest#manifest.blooms), UpdLevel = replace_entry(LevelIdx, Level, Removals, StrippedAdditions), - leveled_log:log("PC019", ["insert", LevelIdx, UpdLevel]), + leveled_log:log(pc019, ["insert", LevelIdx, UpdLevel]), PendingDeletes = update_pendingdeletes(ManSQN, Removals, @@ -358,7 +358,7 @@ insert_manifest_entry(Manifest, ManSQN, LevelIdx, Entry) -> {UpdBlooms, UpdEntry} = update_blooms([], Entry, Manifest#manifest.blooms), UpdLevel = add_entry(LevelIdx, Level, UpdEntry), - leveled_log:log("PC019", ["insert", LevelIdx, UpdLevel]), + leveled_log:log(pc019, ["insert", LevelIdx, UpdLevel]), Basement = max(LevelIdx, Manifest#manifest.basement), Manifest#manifest{levels = array:set(LevelIdx, UpdLevel, Levels), basement = Basement, @@ -375,7 +375,7 @@ remove_manifest_entry(Manifest, ManSQN, LevelIdx, Entry) -> {UpdBlooms, []} = update_blooms(Entry, [], Manifest#manifest.blooms), UpdLevel = remove_entry(LevelIdx, Level, Entry), - leveled_log:log("PC019", ["remove", LevelIdx, UpdLevel]), + leveled_log:log(pc019, ["remove", LevelIdx, UpdLevel]), PendingDeletes = update_pendingdeletes(ManSQN, Entry, Manifest#manifest.pending_deletes), @@ -560,7 +560,7 @@ release_snapshot(Manifest, Pid) -> _ -> case seconds_now() > (ST + TO) of true -> - leveled_log:log("P0038", [P, SQN, ST, TO]), + leveled_log:log(p0038, [P, SQN, ST, TO]), {Acc, MinSQN, Found}; false -> {[{P, SQN, ST, TO}|Acc], min(SQN, MinSQN), Found} @@ -572,7 +572,7 @@ release_snapshot(Manifest, Pid) -> Manifest#manifest.snapshots), case Hit of false -> - leveled_log:log("P0039", [Pid, length(SnapList0), MinSnapSQN]); + leveled_log:log(p0039, [Pid, length(SnapList0), MinSnapSQN]); true -> ok end, @@ -581,7 +581,7 @@ release_snapshot(Manifest, Pid) -> Manifest#manifest{snapshots = SnapList0, min_snapshot_sqn = 0}; _ -> - leveled_log:log("P0004", [SnapList0]), + leveled_log:log(p0004, [SnapList0]), Manifest#manifest{snapshots = SnapList0, min_snapshot_sqn = MinSnapSQN} end. @@ -995,7 +995,7 @@ filepath(RootPath, NewMSN, pending_manifest) -> open_manifestfile(_RootPath, L) when L == [] orelse L == [0] -> - leveled_log:log("P0013", []), + leveled_log:log(p0013, []), new_manifest(); open_manifestfile(RootPath, [TopManSQN|Rest]) -> CurrManFile = filepath(RootPath, TopManSQN, current_manifest), @@ -1003,10 +1003,10 @@ open_manifestfile(RootPath, [TopManSQN|Rest]) -> <> = FileBin, case erlang:crc32(BinaryOfTerm) of CRC -> - leveled_log:log("P0012", [TopManSQN]), + leveled_log:log(p0012, [TopManSQN]), binary_to_term(BinaryOfTerm); _ -> - leveled_log:log("P0033", [CurrManFile, "crc wonky"]), + leveled_log:log(p0033, [CurrManFile, "crc wonky"]), open_manifestfile(RootPath, Rest) end. diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index cbbdff87..025fa7b9 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -146,7 +146,7 @@ to_list(Slots, FetchFun) -> end, [], SlotList), - leveled_log:log_timer("PM002", [length(FullList)], SW), + leveled_log:log_timer(pm002, [length(FullList)], SW), FullList. 
-spec check_levelzero(tuple(), list(integer()), list()) diff --git a/src/leveled_runner.erl b/src/leveled_runner.erl index f9dda1f9..1d8dcc70 100644 --- a/src/leveled_runner.erl +++ b/src/leveled_runner.erl @@ -423,7 +423,7 @@ foldobjects_allkeys(SnapFun, Tag, FoldObjectsFun, sqn_order) -> Acc end end, - leveled_log:log("R0001", [length(BatchAcc)]), + leveled_log:log(r0001, [length(BatchAcc)]), lists:foldr(ObjFun, ObjAcc, BatchAcc) end, @@ -527,10 +527,10 @@ get_nextbucket(NextBucket, NextKey, Tag, LedgerSnapshot, BKList, {C, L}) -> null), case R of {1, null} -> - leveled_log:log("B0008",[]), + leveled_log:log(b0008,[]), BKList; {0, {{B, K}, _V}} -> - leveled_log:log("B0009",[B]), + leveled_log:log(b0009,[B]), get_nextbucket(leveled_codec:next_key(B), null, Tag, diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index af657878..c749ae6a 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -576,9 +576,8 @@ starting({sst_new, State#state{root_path=RootPath, yield_blockquery=YBQ}, OptsSST#sst_options.pagecache_level >= Level), Summary = UpdState#state.summary, - leveled_log:log_timer("SST08", - [ActualFilename, Level, Summary#summary.max_sqn], - SW), + leveled_log:log_timer( + sst08, [ActualFilename, Level, Summary#summary.max_sqn], SW), erlang:send_after(?STARTUP_TIMEOUT, self(), start_complete), {reply, {ok, {Summary#summary.first_key, Summary#summary.last_key}, Bloom}, @@ -657,10 +656,9 @@ starting(complete_l0startup, State) -> Summary = UpdState#state.summary, Time4 = timer:now_diff(os:timestamp(), SW4), - leveled_log:log_timer("SST08", - [ActualFilename, 0, Summary#summary.max_sqn], - SW0), - leveled_log:log("SST11", [Time0, Time1, Time2, Time3, Time4]), + leveled_log:log_timer( + sst08, [ActualFilename, 0, Summary#summary.max_sqn], SW0), + leveled_log:log(sst11, [Time0, Time1, Time2, Time3, Time4]), case Penciller of undefined -> @@ -824,7 +822,7 @@ reader(get_maxsequencenumber, _From, State) -> Summary = State#state.summary, {reply, Summary#summary.max_sqn, reader, State}; reader({set_for_delete, Penciller}, _From, State) -> - leveled_log:log("SST06", [State#state.filename]), + leveled_log:log(sst06, [State#state.filename]), {reply, ok, delete_pending, @@ -923,7 +921,7 @@ delete_pending({get_slots, SlotList, SegList, LowLastMod}, _From, State) -> State, ?DELETE_TIMEOUT}; delete_pending(close, _From, State) -> - leveled_log:log("SST07", [State#state.filename]), + leveled_log:log(sst07, [State#state.filename]), ok = file:close(State#state.handle), ok = file:delete(filename:join(State#state.root_path, State#state.filename)), @@ -941,7 +939,7 @@ delete_pending(timeout, State) -> % back-off {next_state, delete_pending, State, leveled_rand:uniform(10) * ?DELETE_TIMEOUT}; delete_pending(close, State) -> - leveled_log:log("SST07", [State#state.filename]), + leveled_log:log(sst07, [State#state.filename]), ok = file:close(State#state.handle), ok = file:delete(filename:join(State#state.root_path, State#state.filename)), @@ -970,7 +968,7 @@ handle_info(bic_complete, StateName, State) -> % The block index cache is complete, so the memory footprint should be % relatively stable from this point. 
Hibernate to help minimise % fragmentation - leveled_log:log("SST14", [State#state.filename]), + leveled_log:log(sst14, [State#state.filename]), {next_state, StateName, State, hibernate}; handle_info(start_complete, StateName, State) -> % The SST file will be started by a clerk, but the clerk may be shut down @@ -990,7 +988,7 @@ handle_info(start_complete, StateName, State) -> terminate(normal, delete_pending, _State) -> ok; terminate(Reason, _StateName, State) -> - leveled_log:log("SST04", [Reason, State#state.filename]). + leveled_log:log(sst04, [Reason, State#state.filename]). code_change(_OldVsn, StateName, State, _Extra) -> {ok, StateName, State}. @@ -1053,7 +1051,7 @@ expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey}, expand_list_by_pointer({next, ManEntry, StartKey, EndKey}, Tail, Width, SegList, LowLastMod) -> SSTPid = ManEntry#manifest_entry.owner, - leveled_log:log("SST10", [SSTPid, is_process_alive(SSTPid)]), + leveled_log:log(sst10, [SSTPid, is_process_alive(SSTPid)]), ExpPointer = sst_getfilteredrange(SSTPid, StartKey, EndKey, @@ -1551,7 +1549,7 @@ write_file(RootPath, Filename, SummaryBin, SlotsBin, true -> AltName = filename:join(RootPath, filename:basename(FinalName)) ++ ?DISCARD_EXT, - leveled_log:log("SST05", [FinalName, AltName]), + leveled_log:log(sst05, [FinalName, AltName]), ok = file:rename(filename:join(RootPath, FinalName), AltName); false -> ok @@ -1581,9 +1579,8 @@ read_file(Filename, State, LoadPageCache) -> from_list( SlotList, Summary#summary.first_key, Summary#summary.last_key), UpdSummary = Summary#summary{index = SlotIndex}, - leveled_log:log("SST03", [Filename, - Summary#summary.size, - Summary#summary.max_sqn]), + leveled_log:log( + sst03, [Filename, Summary#summary.size, Summary#summary.max_sqn]), {UpdState1#state{summary = UpdSummary, handle = Handle, filename = Filename, @@ -2627,7 +2624,7 @@ crc_check_slot(FullBin) -> {CRC32H, CRC32PBL} -> {Header, Blocks}; _ -> - leveled_log:log("SST09", []), + leveled_log:log(sst09, []), crc_wonky end. @@ -3061,12 +3058,13 @@ update_buildtimings(SW, Timings, Stage) -> %% %% Log out the time spent during the merge lists part of the SST build log_buildtimings(Timings, LI) -> - leveled_log:log("SST13", [Timings#build_timings.fold_toslot, - Timings#build_timings.slot_hashlist, - Timings#build_timings.slot_serialise, - Timings#build_timings.slot_finish, - element(1, LI), - element(2, LI)]). + leveled_log:log( + sst13, + [Timings#build_timings.fold_toslot, + Timings#build_timings.slot_hashlist, + Timings#build_timings.slot_serialise, + Timings#build_timings.slot_finish, + element(1, LI), element(2, LI)]). -spec maybelog_fetch_timing( leveled_monitor:monitor(), diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl index c93ef76d..24c58020 100644 --- a/test/end_to_end/basic_SUITE.erl +++ b/test/end_to_end/basic_SUITE.erl @@ -40,8 +40,9 @@ simple_put_fetch_head_delete(_Config) -> io:format("simple test with error and no forced logs~n"), simple_test_withlog(error, []), io:format("simple test with error and stats logs~n"), - simple_test_withlog(error, ["B0015", "B0016", "B0017", "B0018", - "P0032", "SST12", "CDB19", "SST13", "I0019"]). + simple_test_withlog( + error, + [b0015, b0016, b0017, b0018, p0032, sst12, cdb19, sst13, i0019]). 
simple_test_withlog(LogLevel, ForcedLogs) -> From eaa36cc59a22e04f0b6df26e618d95a940960253 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 2 Dec 2022 11:08:01 +0000 Subject: [PATCH 31/37] Efficient time format Borrow from lager to make the formatting of time more efficient. The io:format/io_lib:format appear to be the most expensive parts of logging by far. --- src/leveled_log.erl | 84 ++++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index ec5c551c..eee603ac 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -150,7 +150,7 @@ pc013 => {warn, <<"Merge resulted in empty file ~s">>}, pc015 => - {info, <<"file created">>}, + {info, <<"File created">>}, pc016 => {info, <<"Slow fetch from SFT ~w of ~w us at level ~w with result ~w">>}, pc017 => @@ -389,10 +389,12 @@ log(LogRef, Subs, SupportedLogLevels) -> case should_i_log(LogLevel, SupportedLogLevels, LogRef, LogOpts) of true -> DBid = LogOpts#log_options.database_id, - Time = format_time(), - Prefix = log_prefix(LogLevel, LogRef, DBid, self()), - Suffix = "~n", - io:format(iolist_to_binary([Time, Prefix, Log, Suffix]), Subs); + Prefix = + iolist_to_binary( + log_prefix( + localtime_ms(), LogLevel, LogRef, DBid, self())), + Suffix = <<"~n">>, + io:format(iolist_to_binary([Prefix, Log, Suffix]), Subs); false -> ok end. @@ -425,13 +427,15 @@ log_timer(LogRef, Subs, StartTime, SupportedLevels) -> LogOpts = get_opts(), case should_i_log(LogLevel, SupportedLevels, LogRef, LogOpts) of true -> - Duration = duration_text(StartTime), DBid = LogOpts#log_options.database_id, - Time = format_time(), - Prefix = log_prefix(LogLevel, LogRef, DBid, self()), + Prefix = + iolist_to_binary( + log_prefix( + localtime_ms(), LogLevel, LogRef, DBid, self())), Suffix = <<"~n">>, + Duration = duration_text(StartTime), io:format( - iolist_to_binary([Time, Prefix, Log, Duration, Suffix]), + iolist_to_binary([Prefix, Log, Duration, Suffix]), Subs); false -> ok @@ -446,33 +450,33 @@ log_randomtimer(LogReference, Subs, StartTime, RandomProb) -> ok end. --spec format_time() -> io_lib:chars(). -format_time() -> - format_time(localtime_ms()). - localtime_ms() -> {_, _, Micro} = Now = os:timestamp(), {Date, {Hours, Minutes, Seconds}} = calendar:now_to_local_time(Now), {Date, {Hours, Minutes, Seconds, Micro div 1000 rem 1000}}. -format_time({{Y, M, D}, {H, Mi, S, Ms}}) -> - io_lib:format("~b-~2..0b-~2..0b", [Y, M, D]) ++ "T" ++ - io_lib:format("~2..0b:~2..0b:~2..0b.~3..0b", [H, Mi, S, Ms]). +-spec log_prefix( + tuple(), atom(), atom(), non_neg_integer(), pid()) -> io_lib:chars(). +log_prefix({{Y, M, D}, {H, Mi, S, Ms}}, LogLevel, LogRef, DBid, Pid) -> + [integer_to_list(Y), $-, i2l(M), $-, i2l(D), + $T, i2l(H), $:, i2l(Mi), $:, i2l(S), $., i3l(Ms), + " log_level=", atom_to_list(LogLevel), " log_ref=", atom_to_list(LogRef), + " db_id=", integer_to_list(DBid), " pid=", pid_to_list(Pid), " "]. + +i2l(I) when I < 10 -> [$0, $0+I]; +i2l(I) -> integer_to_list(I). --spec log_prefix(atom(), atom(), non_neg_integer(), pid()) -> io_lib:chars(). -log_prefix(LogLevel, LogRef, DBid, Pid) -> - io_lib:format( - " log_level=~w log_ref=~w db_id=~w pid=~w ", - [LogLevel, LogRef, DBid, Pid]). +i3l(I) when I < 100 -> [$0 | i2l(I)]; +i3l(I) -> integer_to_list(I). -spec duration_text(erlang:timestamp()) -> io_lib:chars(). 
duration_text(StartTime) -> case timer:now_diff(os:timestamp(), StartTime) of US when US > 1000 -> - io_lib:format( - " with us_duration=~w or ms_duration=~w", [US, US div 1000]); + [" with us_duration=", integer_to_list(US), + " or ms_duration=", integer_to_list(US div 1000)]; US -> - io_lib:format(" with us_duration=~w", [US]) + [" with us_duration=", integer_to_list(US)] end. %%%============================================================================ @@ -507,4 +511,36 @@ badloglevel_test() -> ?assertMatch(true, is_active_level(?LOG_LEVELS, debug, unsupported)), ?assertMatch(true, is_active_level(?LOG_LEVELS, critical, unsupported)). +timing_test() -> + % Timing test + % Previous LOGBASE used a list with string-based keys and values + % The size of the LOGBASE was 19,342 words (>150KB), and logs took + % O(100) microseconds. + % Changing the LOGBASE to a map with binary-based keys and values does not + % appear to improve the speed of logging, but does reduce the size of the + % LOGBASE to just over 2,000 words (so an order of magnitude improvement) + timer:sleep(10), + io:format(user, "Log timings:~n", []), + io:format(user, "Logbase size ~w~n", [erts_debug:flat_size(?LOGBASE)]), + io:format( + user, + "Front log timing ~p~n", + [timer:tc(fun() -> log(cdb21, ["test_file"]) end)] + ), + io:format( + user, + "Mid log timing ~p~n", + [timer:tc(fun() -> log(pc013, ["test_file"]) end)] + ), + io:format( + user, + "End log timing ~p~n", + [timer:tc(fun() -> log(b0003, ["testing"]) end)] + ), + io:format( + user, + "Big log timing ~p~n", + [timer:tc(fun() -> log(sst13, [100,100,100,100,true,1]) end)] + ). + -endif. From 6967d3e83439ad500c1d6b2738d9673ee71d964b Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 2 Dec 2022 11:26:44 +0000 Subject: [PATCH 32/37] Minor tidy --- src/leveled_log.erl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index eee603ac..d27d771a 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -290,7 +290,7 @@ cdb10 => {warn, <<"CRC check failed due to error=~s">>}, cdb12 => - {info, <<"Hashtree index writte">>}, + {info, <<"Hashtree index written">>}, cdb13 => {debug, <<"Write options of ~w">>}, cdb14 => @@ -390,9 +390,8 @@ log(LogRef, Subs, SupportedLogLevels) -> true -> DBid = LogOpts#log_options.database_id, Prefix = - iolist_to_binary( - log_prefix( - localtime_ms(), LogLevel, LogRef, DBid, self())), + log_prefix( + localtime_ms(), LogLevel, LogRef, DBid, self()), Suffix = <<"~n">>, io:format(iolist_to_binary([Prefix, Log, Suffix]), Subs); false -> @@ -429,9 +428,8 @@ log_timer(LogRef, Subs, StartTime, SupportedLevels) -> true -> DBid = LogOpts#log_options.database_id, Prefix = - iolist_to_binary( - log_prefix( - localtime_ms(), LogLevel, LogRef, DBid, self())), + log_prefix( + localtime_ms(), LogLevel, LogRef, DBid, self()), Suffix = <<"~n">>, Duration = duration_text(StartTime), io:format( @@ -541,6 +539,11 @@ timing_test() -> user, "Big log timing ~p~n", [timer:tc(fun() -> log(sst13, [100,100,100,100,true,1]) end)] + ), + io:format( + user, + "Timer log timing ~p~n", + [timer:tc(fun() -> log_timer(pc015, [], os:timestamp()) end)] ).
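Note on the approach in the two patches above: the saving comes from building the log prefix and duration text as iolists of small lists and characters, rather than interpreting a format string via io_lib:format/2. The sketch below is illustrative only and is not code from this series; the module name prefix_sketch, the pad2/1 helper and the 10,000-iteration loop are assumptions chosen for the example.

-module(prefix_sketch).
-export([compare/0]).

%% Zero-pad an integer to two characters, in the style of the i2l helper.
pad2(I) when I < 10 -> [$0, $0 + I];
pad2(I) -> integer_to_list(I).

%% Timestamp prefix built via io_lib:format/2 (the slower path).
via_format({{Y, M, D}, {H, Mi, S}}) ->
    io_lib:format("~b-~2..0b-~2..0bT~2..0b:~2..0b:~2..0b ", [Y, M, D, H, Mi, S]).

%% Timestamp prefix built directly as an iolist (the faster path).
via_iolist({{Y, M, D}, {H, Mi, S}}) ->
    [integer_to_list(Y), $-, pad2(M), $-, pad2(D),
        $T, pad2(H), $:, pad2(Mi), $:, pad2(S), $\s].

%% Time both approaches over 10,000 iterations and print the totals.
compare() ->
    Now = calendar:local_time(),
    {FormatUS, _} =
        timer:tc(fun() -> [via_format(Now) || _ <- lists:seq(1, 10000)] end),
    {IolistUS, _} =
        timer:tc(fun() -> [via_iolist(Now) || _ <- lists:seq(1, 10000)] end),
    io:format("io_lib:format ~w us, iolist ~w us~n", [FormatUS, IolistUS]).

The iolist version avoids format-string interpretation entirely, which is consistent with the commit message above identifying io:format/io_lib:format as the most expensive part of logging.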
From 228db26f9bc67aa0ee4a4055898065273d1bc33c Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Fri, 2 Dec 2022 13:34:38 +0000 Subject: [PATCH 33/37] Add dialyzer specs --- src/leveled_log.erl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index d27d771a..1010eaf9 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -418,6 +418,7 @@ is_active_level([L|_], L, _) -> true; is_active_level([L|_], _, L) -> false; is_active_level([_|T], C, L) -> is_active_level(T, C, L). +-spec log_timer(atom(), list(), erlang:timestamp()) -> ok. log_timer(LogReference, Subs, StartTime) -> log_timer(LogReference, Subs, StartTime, ?LOG_LEVELS). @@ -439,6 +440,7 @@ log_timer(LogRef, Subs, StartTime, SupportedLevels) -> ok end. +-spec log_randomtimer(atom(), list(), erlang:timestamp(), float()) -> ok. log_randomtimer(LogReference, Subs, StartTime, RandomProb) -> R = leveled_rand:uniform(), case R < RandomProb of @@ -461,11 +463,17 @@ log_prefix({{Y, M, D}, {H, Mi, S, Ms}}, LogLevel, LogRef, DBid, Pid) -> " log_level=", atom_to_list(LogLevel), " log_ref=", atom_to_list(LogRef), " db_id=", integer_to_list(DBid), " pid=", pid_to_list(Pid), " "]. -i2l(I) when I < 10 -> [$0, $0+I]; -i2l(I) -> integer_to_list(I). - -i3l(I) when I < 100 -> [$0 | i2l(I)]; -i3l(I) -> integer_to_list(I). +-spec i2l(non_neg_integer()) -> list(). +i2l(I) when I < 10 -> + [$0, $0+I]; +i2l(I) -> + integer_to_list(I). + +-spec i3l(non_neg_integer()) -> list(). +i3l(I) when I < 100 -> + [$0 | i2l(I)]; +i3l(I) -> + integer_to_list(I). -spec duration_text(erlang:timestamp()) -> io_lib:chars(). duration_text(StartTime) -> From e7f38f5014737146be6bd96ae8f648b3f83993c1 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 6 Dec 2022 14:50:49 +0000 Subject: [PATCH 34/37] Name config options correctly --- docs/STARTUP_OPTIONS.md | 4 ++-- priv/leveled.schema | 2 +- priv/leveled_multi.schema | 2 +- src/leveled_bookie.erl | 12 ++++++------ test/end_to_end/basic_SUITE.erl | 2 +- test/end_to_end/riak_SUITE.erl | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/STARTUP_OPTIONS.md b/docs/STARTUP_OPTIONS.md index e5604c5f..870b0db0 100644 --- a/docs/STARTUP_OPTIONS.md +++ b/docs/STARTUP_OPTIONS.md @@ -123,6 +123,6 @@ This covers only silently failing snapshots. Snapshots that shutdown neatly wil ## Statistic gathering -Leveled will gather monitoring statistics on HEAD/GET/PUT requests, with timing points taken throughout the store. These timings are gathered by the `leveled_monitor`, and there are three configuration options. The two primary options are: `stats_percentage` is an integer between 0 and 100 which informs the store of the proprtion of the requests which should be timed at each part; and `monitor_log_frequency` which controls the frequency (in seconds) with which the leveled_monitor will write a log file (for one of the stats types in its queue). +Leveled will gather monitoring statistics on HEAD/GET/PUT requests, with timing points taken throughout the store. These timings are gathered by the `leveled_monitor`, and there are three configuration options. The two primary options are: `stats_percentage` is an integer between 0 and 100 which informs the store of the proprtion of the requests which should be timed at each part; and `stats_logfrequency` which controls the frequency (in seconds) with which the leveled_monitor will write a log file (for one of the stats types in its queue). 
-The specific stats types logged can be found in the ?LOG_LIST within the leveled_monitor. If a subset only is of interest, than this list can be modified by setting `monitor_log_list`. This can also be used to repeat the frequency of individual log types by adding them to the list multiple times. \ No newline at end of file +The specific stats types logged can be found in the ?LOG_LIST within the leveled_monitor. If a subset only is of interest, than this list can be modified by setting `monitor_loglist`. This can also be used to repeat the frequency of individual log types by adding them to the list multiple times. \ No newline at end of file diff --git a/priv/leveled.schema b/priv/leveled.schema index f6501eb4..85e4a397 100644 --- a/priv/leveled.schema +++ b/priv/leveled.schema @@ -201,7 +201,7 @@ %% @doc Statistic log frequency (seconds) %% The wait in seconds between logs from each leveled_monitor (there is one %% monitor per vnode) -{mapping, "leveled.stats_percentage", "leveled.stats_percentage", [ +{mapping, "leveled.stats_logfrequency", "leveled.stats_logfrequency", [ {default, 30}, {datatype, integer} ]}. \ No newline at end of file diff --git a/priv/leveled_multi.schema b/priv/leveled_multi.schema index 3d66c60d..ff0f4a4b 100644 --- a/priv/leveled_multi.schema +++ b/priv/leveled_multi.schema @@ -192,7 +192,7 @@ %% @doc Statistic log frequency (seconds) %% The wait in seconds between logs from each leveled_monitor (there is one %% monitor per vnode) -{mapping, "multi_backend.$name.leveled.stats_percentage", "riak_kv.multi_backend", [ +{mapping, "multi_backend.$name.leveled.stats_logfrequency", "riak_kv.multi_backend", [ {default, 30}, {datatype, integer}, hidden diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index a855c43e..a013dcec 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -159,9 +159,9 @@ {snapshot_timeout_short, ?SNAPTIMEOUT_SHORT}, {snapshot_timeout_long, ?SNAPTIMEOUT_LONG}, {stats_percentage, ?DEFAULT_STATS_PERC}, - {monitor_log_frequency, + {stats_logfrequency, element(1, leveled_monitor:get_defaults())}, - {monitor_log_list, + {monitor_loglist, element(2, leveled_monitor:get_defaults())}]). -record(ledger_cache, {mem :: ets:tab(), @@ -365,12 +365,12 @@ {stats_percentage, 0..100} | % Probability that stats will be collected for an individual % request. - {monitor_log_frequency, pos_integer()} | + {stats_logfrequency, pos_integer()} | % Time in seconds before logging the next timing log. This covers % the logs associated with the timing of GET/PUTs in various parts % of the system. There are 7 such logs - so setting to 30s will % mean that each inidividual log will occur every 210s - {monitor_log_list, list(leveled_monitor:log_type())} + {monitor_loglist, list(leveled_monitor:log_type())} ]. 
-type initial_loadfun() :: @@ -1179,8 +1179,8 @@ init([Opts]) -> {ok, Monitor} = leveled_monitor:monitor_start( - proplists:get_value(monitor_log_frequency, Opts), - proplists:get_value(monitor_log_list, Opts) + proplists:get_value(stats_logfrequency, Opts), + proplists:get_value(monitor_loglist, Opts) ), StatLogFrequency = proplists:get_value(stats_percentage, Opts), diff --git a/test/end_to_end/basic_SUITE.erl b/test/end_to_end/basic_SUITE.erl index 24c58020..43d3f5eb 100644 --- a/test/end_to_end/basic_SUITE.erl +++ b/test/end_to_end/basic_SUITE.erl @@ -544,7 +544,7 @@ load_and_count(JournalSize, BookiesMemSize, PencillerMemSize) -> {cache_size, BookiesMemSize}, {max_pencillercachesize, PencillerMemSize}, {sync_strategy, testutil:sync_strategy()}, - {monitor_log_frequency, 5}, + {stats_logfrequency, 5}, {stats_probability, 80}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), {TestObject, TestSpec} = testutil:generate_testobject(), diff --git a/test/end_to_end/riak_SUITE.erl b/test/end_to_end/riak_SUITE.erl index 39de7ac6..90419835 100644 --- a/test/end_to_end/riak_SUITE.erl +++ b/test/end_to_end/riak_SUITE.erl @@ -44,7 +44,7 @@ basic_riak_tester(Bucket, KeyCount) -> {max_pencillercachesize, 24000}, {sync_strategy, testutil:sync_strategy()}, {database_id, 32}, - {monitor_log_frequency, 5}, + {stats_logfrequency, 5}, {stats_probability, 80}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), From 9fd49879f0a03879c1199c464ae5869bf65f4f9c Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 12 Dec 2022 11:37:11 +0000 Subject: [PATCH 35/37] Add missing specs, and extra test --- src/leveled_head.erl | 21 +++++++++++++++++---- src/leveled_log.erl | 20 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/leveled_head.erl b/src/leveled_head.erl index 3ee66c2d..7224e7fe 100644 --- a/src/leveled_head.erl +++ b/src/leveled_head.erl @@ -75,8 +75,21 @@ key_to_canonicalbinary | build_head | extract_metadata | diff_indexspecs. % Functions for which default behaviour can be over-written for the % application's own tags +-type appdefinable_keyfun() :: + fun((tuple()) -> binary()). +-type appdefinable_headfun() :: + fun((object_tag(), object_metadata()) -> head()). +-type appdefinable_metadatafun() :: + fun(({leveled_codec:tag(), non_neg_integer(), any()}) -> + {object_metadata(), list(erlang:timestamp())}). +-type appdefinable_indexspecsfun() :: + fun((object_tag(), object_metadata(), object_metadata()|not_present) -> + leveled_codec:index_specs()). +-type appdefinable_function_fun() :: + appdefinable_keyfun() | appdefinable_headfun() | + appdefinable_metadatafun() | appdefinable_indexspecsfun(). -type appdefinable_function_tuple() :: - {appdefinable_function(), fun()}. + {appdefinable_function(), appdefinable_function_fun()}. -type index_op() :: add | remove. -type index_value() :: integer() | binary(). @@ -265,9 +278,9 @@ standard_hash(Obj) -> %%% Handling Override Functions %%%============================================================================ --spec get_appdefined_function(appdefinable_function(), - fun(), - non_neg_integer()) -> fun(). +-spec get_appdefined_function( + appdefinable_function(), appdefinable_function_fun(), non_neg_integer()) -> + appdefinable_function_fun(). 
%% @doc %% If a keylist of [{function_name, fun()}] has been set as an environment %% variable for a tag, then this FunctionName can be used instead of the diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 1010eaf9..533407f9 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -19,6 +19,9 @@ save/1, return_settings/0]). +-ifdef(TEST). +-export([format_time/1, log_prefix/5]). +-endif. -record(log_options, {log_level = info :: log_level(), @@ -491,6 +494,23 @@ duration_text(StartTime) -> -ifdef(TEST). +format_time({{Y, M, D}, {H, Mi, S, Ms}}) -> + io_lib:format("~b-~2..0b-~2..0b", [Y, M, D]) ++ "T" ++ + io_lib:format("~2..0b:~2..0b:~2..0b.~3..0b", [H, Mi, S, Ms]). + +prefix_compare_test() -> + Time = localtime_ms(), + DBid = 64, + LogLevel = info, + LogRef = b0001, + {TS0, OldTS} = + timer:tc(?MODULE, format_time, [Time]), + {TS1, NewPrefix} = + timer:tc(?MODULE, log_prefix, [Time, LogLevel, LogRef, DBid, self()]), + {NewTS, _Rest} = lists:split(23, lists:flatten(NewPrefix)), + ?assertMatch(OldTS, NewTS), + io:format(user, "~nTimestamp timings old ~w new ~w~n", [TS0, TS1]). + log_test() -> log(d0001, []), log_timer(d0001, [], os:timestamp()). From 0ec6be897ef205d82d0a37fb9ee6fd70b2cd7792 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 15 Dec 2022 20:06:57 +0000 Subject: [PATCH 36/37] Clarify monitoring logs Make clear over what period was the sample taken --- src/leveled_log.erl | 14 ++++---- src/leveled_monitor.erl | 77 +++++++++++++++++++++++++++++++++-------- 2 files changed, 70 insertions(+), 21 deletions(-) diff --git a/src/leveled_log.erl b/src/leveled_log.erl index 533407f9..d609f77b 100644 --- a/src/leveled_log.erl +++ b/src/leveled_log.erl @@ -65,13 +65,13 @@ b0013 => {warn, <<"Long running task took ~w microseconds with task_type=~w">>}, b0015 => - {info, <<"Put timing with sample_count=~w ink_time=~w prep_time=~w mem_time=~w with total_object_size=~w">>}, + {info, <<"Put timing with sample_count=~w ink_time=~w prep_time=~w mem_time=~w with total_object_size=~w with sample_period=~w seconds">>}, b0016 => - {info, <<"Get timing with sample_count=~w and head_time=~w body_time=~w with fetch_count=~w">>}, + {info, <<"Get timing with sample_count=~w and head_time=~w body_time=~w with fetch_count=~w with sample_period=~w seconds">>}, b0017 => - {info, <<"Snapshot timing with sample_count=~w and bookie_time=~w pcl_time=~w">>}, + {info, <<"Snapshot timing with sample_count=~w and bookie_time=~w pcl_time=~w with sample_period=~w seconds">>}, b0018 => - {info, <<"Positive HEAD responses timed with sample_count=~w and cache_count=~w found_count=~w fetch_ledger_time=~w fetch_ledgercache_time=~w rsp_time=~w notfound_time=~w">>}, + {info, <<"Positive HEAD responses timed with sample_count=~w and cache_count=~w found_count=~w fetch_ledger_time=~w fetch_ledgercache_time=~w rsp_time=~w notfound_time=~w with sample_period=~w seconds">>}, b0019 => {warn, <<"Use of book_indexfold with constraint of Bucket ~w with no StartKey is deprecated">>}, b0020 => @@ -119,7 +119,7 @@ p0031 => {info, <<"Completion of update to levelzero with cache_size=~w level0_due=~w change_pending=~w MinSQN=~w MaxSQN=~w">>}, p0032 => - {info, <<"Fetch head timing with sample_count=~w and level timings of foundmem_time=~w found0_time=~w found1_time=~w found2_time=~w found3_time=~w foundlower_time=~w missed_time=~w with counts of foundmem_count=~w found0_count=~w found1_count=~w found2_count=~w found3_count=~w foundlower_count=~w missed_count=~w">>}, + {info, <<"Fetch head timing with sample_count=~w 
and level timings of foundmem_time=~w found0_time=~w found1_time=~w found2_time=~w found3_time=~w foundlower_time=~w missed_time=~w with counts of foundmem_count=~w found0_count=~w found1_count=~w found2_count=~w found3_count=~w foundlower_count=~w missed_count=~w with sample_period=~w seconds">>}, p0033 => {error, <<"Corrupted manifest file at path ~s to be ignored due to error ~s">>}, p0035 => @@ -191,7 +191,7 @@ sst11 => {info, <<"Level zero creation timings in microseconds pmem_fetch=~w merge_lists=~w build_slots=~w build_summary=~w read_switch=~w">>}, sst12 => - {info, <<"SST Timings at level=~w for sample_count=~w at timing points notfound_time=~w fetchcache_time=~w slotcached_time=~w slotnoncached_time=~w exiting at points notfound_count=~w fetchcache_count=~w slotcached_count=~w slotnoncached_count=~w">>}, + {info, <<"SST Timings at level=~w for sample_count=~w at timing points notfound_time=~w fetchcache_time=~w slotcached_time=~w slotnoncached_time=~w exiting at points notfound_count=~w fetchcache_count=~w slotcached_count=~w slotnoncached_count=~w with sample_period=~w seconds">>}, sst13 => {info, <<"SST merge list build timings of fold_toslot=~w slot_hashlist=~w slot_serialise=~w slot_finish=~w is_basement=~w level=~w">>}, sst14 => @@ -303,7 +303,7 @@ cdb18 => {info, <<"Handled return and write of hashtable">>}, cdb19 => - {info, <<"Sample timings in microseconds for sample_count=~w with totals of cycle_count=~w index_time=~w read_time=~w">>}, + {info, <<"Sample timings in microseconds for sample_count=~w with totals of cycle_count=~w index_time=~w read_time=~w with sample_period=~w seconds">>}, cdb20 => {warn, <<"Error ~w caught when safe reading a file to length ~w">>}, cdb21 => diff --git a/src/leveled_monitor.erl b/src/leveled_monitor.erl index fb6d8276..44609b08 100644 --- a/src/leveled_monitor.erl +++ b/src/leveled_monitor.erl @@ -43,7 +43,8 @@ {sample_count = 0 :: non_neg_integer(), head_time = 0 :: non_neg_integer(), body_time = 0 :: non_neg_integer(), - fetch_count = 0 :: non_neg_integer()}). + fetch_count = 0 :: non_neg_integer(), + sample_start_time = os:timestamp() :: erlang:timestamp()}). -record(bookie_head_timings, {sample_count = 0 :: non_neg_integer(), @@ -53,19 +54,22 @@ fetch_ledger_time = 0 :: non_neg_integer(), fetch_ledgercache_time = 0 :: non_neg_integer(), rsp_time = 0 :: non_neg_integer(), - notfound_time = 0 :: non_neg_integer()}). + notfound_time = 0 :: non_neg_integer(), + sample_start_time = os:timestamp() :: erlang:timestamp()}). -record(bookie_put_timings, {sample_count = 0 :: non_neg_integer(), ink_time = 0 :: non_neg_integer(), prep_time = 0 :: non_neg_integer(), mem_time = 0 :: non_neg_integer(), - total_size = 0 :: non_neg_integer()}). + total_size = 0 :: non_neg_integer(), + sample_start_time = os:timestamp() :: erlang:timestamp()}). -record(bookie_snap_timings, {sample_count = 0 :: non_neg_integer(), bookie_time = 0 :: non_neg_integer(), - pcl_time = 0 :: non_neg_integer()}). + pcl_time = 0 :: non_neg_integer(), + sample_start_time = os:timestamp() :: erlang:timestamp()}). -record(pcl_fetch_timings, {sample_count = 0 :: non_neg_integer(), @@ -82,7 +86,8 @@ found2_count = 0 :: non_neg_integer(), found3_count = 0 :: non_neg_integer(), foundlower_count = 0 :: non_neg_integer(), - notfound_count = 0 :: non_neg_integer()}). + notfound_count = 0 :: non_neg_integer(), + sample_start_time = os:timestamp() :: erlang:timestamp()}). 
-record(sst_fetch_timings, {sample_count = 0 :: non_neg_integer(), @@ -93,13 +98,15 @@ fetchcache_count = 0 :: non_neg_integer(), slotcached_count = 0 :: non_neg_integer(), slotnoncached_count = 0 :: non_neg_integer(), - notfound_count = 0 :: non_neg_integer()}). + notfound_count = 0 :: non_neg_integer(), + sample_start_time = os:timestamp() :: erlang:timestamp()}). -record(cdb_get_timings, {sample_count = 0 :: non_neg_integer(), cycle_count = 0 :: non_neg_integer(), index_time = 0 :: non_neg_integer(), - read_time = 0 :: non_neg_integer()}). + read_time = 0 :: non_neg_integer(), + sample_start_time = os:timestamp() :: erlang:timestamp()}). -record(state, {bookie_get_timings = #bookie_get_timings{} :: bookie_get_timings(), @@ -430,15 +437,25 @@ handle_cast({cdb_get_update, CycleCount, IndexTime, ReadTime}, State) -> {noreply, State#state{cdb_get_timings = UpdTimings}}; handle_cast({report_stats, bookie_get}, State) -> Timings = State#state.bookie_get_timings, + SamplePeriod = + timer:now_diff( + os:timestamp(), + Timings#bookie_get_timings.sample_start_time) div 1000000, leveled_log:log( b0016, [Timings#bookie_get_timings.sample_count, Timings#bookie_get_timings.head_time, Timings#bookie_get_timings.body_time, - Timings#bookie_get_timings.fetch_count]), + Timings#bookie_get_timings.fetch_count, + SamplePeriod + ]), {noreply, State#state{bookie_get_timings = #bookie_get_timings{}}}; handle_cast({report_stats, bookie_head}, State) -> Timings = State#state.bookie_head_timings, + SamplePeriod = + timer:now_diff( + os:timestamp(), + Timings#bookie_head_timings.sample_start_time) div 1000000, leveled_log:log( b0018, [Timings#bookie_head_timings.sample_count, @@ -447,28 +464,46 @@ handle_cast({report_stats, bookie_head}, State) -> Timings#bookie_head_timings.fetch_ledger_time, Timings#bookie_head_timings.fetch_ledgercache_time, Timings#bookie_head_timings.rsp_time, - Timings#bookie_head_timings.notfound_time]), + Timings#bookie_head_timings.notfound_time, + SamplePeriod + ]), {noreply, State#state{bookie_head_timings = #bookie_head_timings{}}}; handle_cast({report_stats, bookie_put}, State) -> Timings = State#state.bookie_put_timings, + SamplePeriod = + timer:now_diff( + os:timestamp(), + Timings#bookie_put_timings.sample_start_time) div 1000000, leveled_log:log( b0015, [Timings#bookie_put_timings.sample_count, Timings#bookie_put_timings.ink_time, Timings#bookie_put_timings.prep_time, Timings#bookie_put_timings.mem_time, - Timings#bookie_put_timings.total_size]), + Timings#bookie_put_timings.total_size, + SamplePeriod + ]), {noreply, State#state{bookie_put_timings = #bookie_put_timings{}}}; handle_cast({report_stats, bookie_snap}, State) -> Timings = State#state.bookie_snap_timings, + SamplePeriod = + timer:now_diff( + os:timestamp(), + Timings#bookie_snap_timings.sample_start_time) div 1000000, leveled_log:log( b0017, [Timings#bookie_snap_timings.sample_count, Timings#bookie_snap_timings.bookie_time, - Timings#bookie_snap_timings.pcl_time]), + Timings#bookie_snap_timings.pcl_time, + SamplePeriod + ]), {noreply, State#state{bookie_snap_timings = #bookie_snap_timings{}}}; handle_cast({report_stats, pcl_fetch}, State) -> Timings = State#state.pcl_fetch_timings, + SamplePeriod = + timer:now_diff( + os:timestamp(), + Timings#pcl_fetch_timings.sample_start_time) div 1000000, leveled_log:log( p0032, [Timings#pcl_fetch_timings.sample_count, @@ -485,11 +520,17 @@ handle_cast({report_stats, pcl_fetch}, State) -> Timings#pcl_fetch_timings.found2_count, Timings#pcl_fetch_timings.found3_count, 
Timings#pcl_fetch_timings.foundlower_count, - Timings#pcl_fetch_timings.notfound_count]), + Timings#pcl_fetch_timings.notfound_count, + SamplePeriod + ]), {noreply, State#state{pcl_fetch_timings = #pcl_fetch_timings{}}}; handle_cast({report_stats, sst_fetch}, State) -> LogFun = fun({Level, Timings}) -> + SamplePeriod = + timer:now_diff( + os:timestamp(), + Timings#sst_fetch_timings.sample_start_time) div 1000000, leveled_log:log( sst12, [Level, @@ -501,18 +542,26 @@ handle_cast({report_stats, sst_fetch}, State) -> Timings#sst_fetch_timings.notfound_count, Timings#sst_fetch_timings.fetchcache_count, Timings#sst_fetch_timings.slotcached_count, - Timings#sst_fetch_timings.slotnoncached_count]) + Timings#sst_fetch_timings.slotnoncached_count, + SamplePeriod + ]) end, lists:foreach(LogFun, State#state.sst_fetch_timings), {noreply, State#state{sst_fetch_timings = []}}; handle_cast({report_stats, cdb_get}, State) -> Timings = State#state.cdb_get_timings, + SamplePeriod = + timer:now_diff( + os:timestamp(), + Timings#cdb_get_timings.sample_start_time) div 1000000, leveled_log:log( cdb19, [Timings#cdb_get_timings.sample_count, Timings#cdb_get_timings.cycle_count, Timings#cdb_get_timings.index_time, - Timings#cdb_get_timings.read_time]), + Timings#cdb_get_timings.read_time, + SamplePeriod + ]), {noreply, State#state{cdb_get_timings = #cdb_get_timings{}}}; handle_cast({log_level, LogLevel}, State) -> ok = leveled_log:set_loglevel(LogLevel), From fc015f3dd357827acab04a9c35d68edc180f2213 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 15 Dec 2022 21:08:17 +0000 Subject: [PATCH 37/37] Clarify module comment --- src/leveled_monitor.erl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/leveled_monitor.erl b/src/leveled_monitor.erl index 44609b08..0727c72e 100644 --- a/src/leveled_monitor.erl +++ b/src/leveled_monitor.erl @@ -1,11 +1,19 @@ %% -------- MONITOR --------- %% -%% The bookie's monitor is a process dedciated to gathering and reporting -%% stats related to performance of the store. +%% The bookie's monitor is a process dedicated to gathering and reporting +%% stats related to performance of the leveled store. %% -%% The monitor was introduced as a sedicated process to reduce the number of -%% LoopState mutations otherwise necessary to track statistics, requiring -%% State copies even on read events. +%% Depending on the sample frequency, a process will randomly determine whether +%% or not to take a timing of a transaction. If a timing is taken the result +%% is cast to the monitor. +%% +%% The monitor gathers stats across the store, and then on a timing loop logs +%% out the gathered stats for one of the monitored stat types once every +%% ?LOG_FREQUENCY_SECONDS. On each timing trigger the monitor should move on +%% to the next timing stat in its list. +%% +%% The different types of timing stats are defined within the ?LOG_LIST. Each +%% type of timing stat has its own record maintained in the monitor loop state. -module(leveled_monitor).
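The sampling behaviour described in the rewritten module comment can be sketched as follows. This is illustrative only and is not the leveled_monitor API: the module name sample_sketch, the function maybe_time/2 and the use of rand:uniform/1 are assumptions for the example. A request is timed only when a uniform draw falls at or below the configured percentage, and in leveled the resulting timing would then be cast to the monitor.

-module(sample_sketch).
-export([maybe_time/2]).

%% Run Fun, timing it for roughly StatsPercentage percent of calls.
-spec maybe_time(0..100, fun(() -> any())) ->
    {timed, non_neg_integer(), any()} | {untimed, any()}.
maybe_time(StatsPercentage, Fun) ->
    case rand:uniform(100) =< StatsPercentage of
        true ->
            %% Timed path: measure the call in microseconds; a real store
            %% would cast the measurement to its monitor process here.
            {Micros, Result} = timer:tc(Fun),
            {timed, Micros, Result};
        false ->
            %% Untimed path: just run the request with no extra bookkeeping.
            {untimed, Fun()}
    end.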