stats: separate quantile and average fields

Add separate quantile (`latency_quantile_recent`) and average ( `latency_average`) fields to `crud.stats()` output. `latency` field and tarantool/metrics output remains unchanged. Before this patch, `latency` displayed `latency_quantile_recent` or `latency_average` and there wasn't any was to see pre-computed average if quantiles are enabled. But it may be useful if quantile is `nan`. Quantiles may display `-nan` if there were no observations for a several ages. Such behavior is expected [1] and valid: for example, Grafana should ignore such values and they will be displayed as `No data` for a window when there wasn't any requests. 1. tarantool/metrics#303 Closes #286
tarantool · May 20, 2022 · 4a76d92 · 4a76d92
1 parent 712c50b
commit 4a76d92
Show file tree

Hide file tree

Showing 7 changed files with 112 additions and 50 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### Added
 * Make metrics quantile collector age params configurable (#286).
+* Add separate `latency_average` and `latency_quantile_recent`
+  fields to `crud.stats()` output (#286).
 
 ### Changed
 

diff --git a/README.md b/README.md
@@ -721,23 +721,31 @@ crud.stats()
     my_space:
       insert:
         ok:
-          latency: 0.002
+          latency: 0.0015
+          latency_average: 0.002
+          latency_quantile_recent: 0.0015
           count: 19800
           time: 39.6
         error:
-          latency: 0.000001
+          latency: 0.0000008
+          latency_average: 0.000001
+          latency_quantile_recent: 0.0000008
           count: 4
           time: 0.000004
 ...
 crud.stats('my_space')
 ---
 - insert:
     ok:
-      latency: 0.002
+      latency: 0.0015
+      latency_average: 0.002
+      latency_quantile_recent: 0.0015
       count: 19800
       time: 39.6
     error:
-      latency: 0.000001
+      latency: 0.0000008
+      latency_average: 0.000001
+      latency_quantile_recent: 0.0000008
       count: 4
       time: 0.000004
 ...
@@ -759,10 +767,17 @@ and `borders` (for `min` and `max` calls).
 Each operation section consists of different collectors
 for success calls and error (both error throw and `nil, err`)
 returns. `count` is the total requests count since instance start
-or stats restart. `latency` is the 0.99 quantile of request execution
-time if `metrics` driver used and quantiles enabled,
-otherwise `latency` is the total average.
-`time` is the total time of requests execution.
+or stats restart.  `time` is the total time of requests execution.
+`latency_average` is `time` / `count`.
+`latency_quantile_recent` is the 0.99 quantile of request execution
+time for a recent period (see 
+[`metrics` summary API](https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary)).
+It is computed only if `metrics` driver is used and quantiles are
+enabled. `latency_quantile_recent` value may be `-nan` if there
+wasn't any observations for several ages, see
+[tarantool/metrics#303](https://github.com/tarantool/metrics/issues/303).
+`latency` is a `latency_quantile_recent` if the field is not empty,
+otherwise it's `latency_average`.
 
 In [`metrics`](https://www.tarantool.io/en/doc/latest/book/monitoring/)
 registry statistics are stored as `tnt_crud_stats` metrics

diff --git a/crud/stats/local_registry.lua b/crud/stats/local_registry.lua
@@ -121,7 +121,8 @@ function registry.observe(latency, space_name, op, status)
 
     collectors.count = collectors.count + 1
     collectors.time = collectors.time + latency
-    collectors.latency = collectors.time / collectors.count
+    collectors.latency_average = collectors.time / collectors.count
+    collectors.latency = collectors.latency_average
 
     return true
 end

diff --git a/crud/stats/metrics_registry.lua b/crud/stats/metrics_registry.lua
@@ -162,50 +162,35 @@ function registry.destroy()
     return true
 end
 
---- Compute `latency` field of an observation.
+--- Compute `latency_average` and set `latency` fields of each observation.
 --
---  If it is a `{ time = ..., count = ... }` observation,
---  compute latency as overall average and store it
---  inside observation object.
+--  `latency` is `latency_average` if quantiles disabled
+--  and `latency_quantile` otherwise.
 --
--- @function compute_obs_latency
--- @local
---
--- @tab obs
---  Objects from `registry_utils`
---  `stats.spaces[name][op][status]`.
---  If something like `details` collector
---  passed, do nothing.
---
-local function compute_obs_latency(obs)
-    if obs.count == nil or obs.time == nil then
-        return
-    end
-
-    if obs.count == 0 then
-        obs.latency = 0
-    else
-        obs.latency = obs.time / obs.count
-    end
-end
-
---- Compute `latency` field of each observation.
---
---  If quantiles disabled, we need to compute
---  latency as overall average from `time` and
---  `count` values.
---
--- @function compute_latencies
+-- @function compute_aggregates
 -- @local
 --
 -- @tab stats
 --  Object from registry_utils stats.
 --
-local function compute_latencies(stats)
+local function compute_aggregates(stats)
     for _, space_stats in pairs(stats.spaces) do
         for _, op_stats in pairs(space_stats) do
             for _, obs in pairs(op_stats) do
-                compute_obs_latency(obs)
+                -- There are no count in `details`.
+                if obs.count ~= nil then
+                    if obs.count == 0 then
+                        obs.latency_average = 0
+                    else
+                        obs.latency_average = obs.time / obs.count
+                    end
+
+                    if obs.latency_quantile_recent ~= nil then
+                        obs.latency = obs.latency_quantile_recent
+                    else
+                        obs.latency = obs.latency_average
+                    end
+                end
             end
         end
     end
@@ -250,7 +235,7 @@ function registry.get(space_name)
         -- metric_name.stats presents only if quantiles enabled.
         if obs.metric_name == metric_name.stats then
             if obs.label_pairs.quantile == LATENCY_QUANTILE then
-                space_stats[op][status].latency = obs.value
+                space_stats[op][status].latency_quantile_recent = obs.value
             end
         elseif obs.metric_name == metric_name.stats_sum then
             space_stats[op][status].time = obs.value
@@ -261,9 +246,7 @@ function registry.get(space_name)
         :: stats_continue ::
     end
 
-    if not internal.opts.quantiles then
-        compute_latencies(stats)
-    end
+    compute_aggregates(stats)
 
     -- Fill select/pairs detail statistics values.
     for stat_name, metric_name in pairs(metric_name.details) do

diff --git a/crud/stats/registry_utils.lua b/crud/stats/registry_utils.lua
@@ -16,7 +16,8 @@ local registry_utils = {}
 --  Use `require('crud.stats').op` to pick one.
 --
 -- @treturn table Returns collectors for success and error requests.
---  Collectors store 'count', 'latency' and 'time' values. Also
+--  Collectors store 'count', 'latency', 'latency_average',
+--  'latency_quantile_recent' and 'time' values. Also
 --  returns additional collectors for select operation.
 --
 function registry_utils.build_collectors(op)
@@ -26,11 +27,19 @@ function registry_utils.build_collectors(op)
         ok = {
             count = 0,
             latency = 0,
+            latency_average = 0,
+            -- latency_quantile_recent presents only if driver
+            -- is 'metrics' and quantiles enabled.
+            latency_quantile_recent = nil,
             time = 0,
         },
         error = {
             count = 0,
             latency = 0,
+            latency_average = 0,
+            -- latency_quantile_recent presents only if driver
+            -- is 'metrics' and quantiles enabled.
+            latency_quantile_recent = nil,
             time = 0,
         },
     }

diff --git a/test/integration/stats_test.lua b/test/integration/stats_test.lua
@@ -529,6 +529,24 @@ for name, case in pairs(simple_operation_cases) do
         t.assert_le(changed_after.latency, ok_latency_max,
             'Changed latency has appropriate value')
 
+        local ok_average_max = math.max(changed_before.latency_average, after_finish - before_start)
+
+        t.assert_gt(changed_after.latency_average, 0,
+            'Changed average has appropriate value')
+        t.assert_le(changed_after.latency_average, ok_average_max,
+            'Changed average has appropriate value')
+
+        if g.params.quantiles == true then
+            local ok_quantile_max = math.max(
+                changed_before.latency_quantile_recent or 0,
+                after_finish - before_start)
+
+            t.assert_gt(changed_after.latency_quantile_recent, 0,
+                'Changed quantile has appropriate value')
+            t.assert_le(changed_after.latency_quantile_recent, ok_quantile_max,
+                'Changed quantile has appropriate value')
+        end
+
         local time_diff = changed_after.time - changed_before.time
 
         t.assert_gt(time_diff, 0, 'Total time increase has appropriate value')

diff --git a/test/unit/stats_test.lua b/test/unit/stats_test.lua
@@ -170,9 +170,25 @@ for name, case in pairs(observe_cases) do
             t.assert_equals(changed.count, run_count, 'Count incremented by count of runs')
 
             local sleep_time = helpers.simple_functions_params().sleep_time
+
             t.assert_ge(changed.latency, sleep_time, 'Latency has appropriate value')
             t.assert_le(changed.latency, time_diffs[#time_diffs], 'Latency has appropriate value')
 
+            t.assert_ge(changed.latency_average, sleep_time, 'Average has appropriate value')
+            t.assert_le(changed.latency_average, time_diffs[#time_diffs], 'Average has appropriate value')
+
+            if g.params.quantiles == true then
+                t.assert_ge(
+                    changed.latency_quantile_recent,
+                    sleep_time,
+                    'Quantile has appropriate value')
+
+                t.assert_le(
+                    changed.latency_quantile_recent,
+                    time_diffs[#time_diffs],
+                    'Quantile has appropriate value')
+            end
+
             t.assert_ge(changed.time, sleep_time * run_count,
                 'Total time increase has appropriate value')
             t.assert_le(changed.time, total_time, 'Total time increase has appropriate value')
@@ -187,6 +203,7 @@ for name, case in pairs(observe_cases) do
                 {
                     count = 0,
                     latency = 0,
+                    latency_average = 0,
                     time = 0
                 },
                 'Other status collectors initialized after observations'
@@ -387,15 +404,31 @@ for name, case in pairs(pairs_cases) do
 
         t.assert_equals(changed.count, 1, 'Count incremented by 1')
 
-        t.assert_ge(changed.latency,
+        t.assert_ge(
+            changed.latency,
             params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected),
             'Latency has appropriate value')
         t.assert_le(changed.latency, time_diff, 'Latency has appropriate value')
 
+        t.assert_ge(
+            changed.latency_average,
+            params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected),
+            'Average has appropriate value')
+        t.assert_le(changed.latency_average, time_diff, 'Average has appropriate value')
+
+        if g.params.quantiles == true then
+            t.assert_ge(
+                changed.latency_quantile_recent,
+                params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected),
+                'Quantile has appropriate value')
+
+            t.assert_le(changed.latency_quantile_recent, time_diff, 'Quantile has appropriate value')
+        end
+
         t.assert_ge(changed.time,
             params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected),
             'Total time has appropriate value')
-        t.assert_le(changed.time, time_diff, 'Total time  has appropriate value')
+        t.assert_le(changed.time, time_diff, 'Total time has appropriate value')
 
         -- Other collectors (unchanged_coll: 'error' or 'ok')
         -- have been initialized and have default values.
@@ -407,6 +440,7 @@ for name, case in pairs(pairs_cases) do
             {
                 count = 0,
                 latency = 0,
+                latency_average = 0,
                 time = 0
             },
             'Other status collectors initialized after observations'