stats: make quantile tolerated error configurable

Make metrics quantile collector tolerated error [1] configurable. Change metrics quantile collector default tolerated error from 1e-2 to 1e-3. The motivation of this patch is a tarantool/metrics bug [2]. Sometimes quantile values turn to `-Inf` under high load when observations are small. It was reproduced in process of developing Grafana dashboard panels for CRUD stats [3]. Quantile tolerated error could be changed with crud.cfg: crud.cfg{stats_quantile_tolerated_error = 1e-4} 1. https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary 2. tarantool/metrics#189 3. https://github.com/tarantool/grafana-dashboard/tree/DifferentialOrange/crud-report
tarantool · May 6, 2022 · f4d44a7 · f4d44a7
1 parent 6a4d4e9
commit f4d44a7
Show file tree

Hide file tree

Showing 8 changed files with 121 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,8 +8,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]
 
 ### Added
+* Make metrics quantile collector tolerated error configurable (#281).
 
 ### Changed
+* Change metrics quantile collector default tolerated error
+  from 1e-2 to 1e-3 (#281).
 
 ### Fixed
 * Requests no more fail with "Sharding hash mismatch" error

diff --git a/README.md b/README.md
@@ -793,6 +793,13 @@ metrics:collect()
     metric_name: tnt_crud_stats
 ...
 ```
+If you see `-Inf` value in quantile metrics, try to decrease the tolerated error:
+```lua
+crud.cfg{stats_quantile_tolerated_error = 1e-4}
+```
+See [tarantool/metrics#189](https://github.com/tarantool/metrics/issues/189) for
+details about the issue.
+
 
 `select` section additionally contains `details` collectors.
 ```lua

diff --git a/crud/cfg.lua b/crud/cfg.lua
@@ -25,6 +25,10 @@ local function set_defaults_if_empty(cfg)
         cfg.stats_quantiles = false
     end
 
+    if cfg.stats_quantile_tolerated_error == nil then
+        cfg.stats_quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
+    end
+
     return cfg
 end
 
@@ -33,7 +37,8 @@ local cfg = set_defaults_if_empty(stash.get(stash.name.cfg))
 local function configure_stats(cfg, opts)
     if  (opts.stats == nil)
     and (opts.stats_driver == nil)
-    and (opts.stats_quantiles == nil) then
+    and (opts.stats_quantiles == nil)
+    and (opts.stats_quantile_tolerated_error == nil) then
         return
     end
 
@@ -49,15 +54,24 @@ local function configure_stats(cfg, opts)
         opts.stats_quantiles = cfg.stats_quantiles
     end
 
+    if opts.stats_quantiles == nil then
+        opts.stats_quantile_tolerated_error = cfg.stats_quantile_tolerated_error
+    end
+
     if opts.stats == true then
-        stats.enable{ driver = opts.stats_driver, quantiles = opts.stats_quantiles }
+        stats.enable{
+            driver = opts.stats_driver,
+            quantiles = opts.stats_quantiles,
+            quantile_tolerated_error = opts.stats_quantile_tolerated_error,
+        }
     else
         stats.disable()
     end
 
     rawset(cfg, 'stats', opts.stats)
     rawset(cfg, 'stats_driver', opts.stats_driver)
     rawset(cfg, 'stats_quantiles', opts.stats_quantiles)
+    rawset(cfg, 'stats_quantile_tolerated_error', opts.stats_quantile_tolerated_error)
 end
 
 --- Configure CRUD module.
@@ -86,13 +100,21 @@ end
 --  Enable or disable statistics quantiles (only for metrics driver).
 --  Quantiles computations increases performance overhead up to 10%.
 --
+-- @number[opt=1e-3] opts.stats_quantile_tolerated_error
+--  See tarantool/metrics summary API for details:
+--  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
+--  If quantile value is -Inf, try to decrease quantile tolerance.
+--  See https://github.com/tarantool/metrics/issues/189 for issue details.
+--  Decreasing the value increases computational load.
+--
 -- @return Configuration table.
 --
 local function __call(self, opts)
     checks('table', {
         stats = '?boolean',
         stats_driver = '?string',
-        stats_quantiles = '?boolean'
+        stats_quantiles = '?boolean',
+        stats_quantile_tolerated_error = '?number',
     })
 
     opts = table.deepcopy(opts) or {}

diff --git a/crud/stats/init.lua b/crud/stats/init.lua
@@ -84,10 +84,20 @@ end
 --  computing requests latency as 0.99 quantile with aging.
 --  Performance overhead for enabling is near 10%.
 --
+-- @number[opt=1e-3] opts.quantile_tolerated_error
+--  See tarantool/metrics summary API for details:
+--  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
+--  If quantile value is -Inf, try to decrease quantile tolerance.
+--  See https://github.com/tarantool/metrics/issues/189 for issue details.
+--
 -- @treturn boolean Returns `true`.
 --
 function stats.enable(opts)
-    checks({ driver = '?string', quantiles = '?boolean' })
+    checks({
+        driver = '?string',
+        quantiles = '?boolean',
+        quantile_tolerated_error = '?number',
+    })
 
     StatsError:assert(
         rawget(_G, 'crud') ~= nil,
@@ -108,19 +118,29 @@ function stats.enable(opts)
         opts.quantiles = false
     end
 
+    if opts.quantile_tolerated_error == nil then
+        opts.quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
+    end
+
     -- Do not reinit if called with same options.
     if internal.driver == opts.driver
-    and internal.quantiles == opts.quantiles then
+    and internal.quantiles == opts.quantiles
+    and internal.quantile_tolerated_error == opts.quantile_tolerated_error then
         return true
     end
 
     -- Disable old driver registry, if another one was requested.
     stats.disable()
 
     internal.driver = opts.driver
-    internal.quantiles = opts.quantiles
 
-    internal:get_registry().init({ quantiles = internal.quantiles })
+    internal:get_registry().init{
+        quantiles = opts.quantiles,
+        quantile_tolerated_error = opts.quantile_tolerated_error
+    }
+
+    internal.quantiles = opts.quantiles
+    internal.quantile_tolerated_error = opts.quantile_tolerated_error
 
     return true
 end
@@ -140,7 +160,10 @@ function stats.reset()
     end
 
     internal:get_registry().destroy()
-    internal:get_registry().init({ quantiles = internal.quantiles })
+    internal:get_registry().init{
+        quantiles = internal.quantiles,
+        quantile_tolerated_error = internal.quantile_tolerated_error
+    }
 
     return true
 end
@@ -469,4 +492,7 @@ stats.op = op_module
 -- @tfield[opt] boolean quantiles Is quantiles computed.
 stats.internal = internal
 
+--- Default metrics quantile precision.
+stats.DEFAULT_QUANTILE_TOLERATED_ERROR = 1e-3
+
 return stats
diff --git a/crud/stats/local_registry.lua b/crud/stats/local_registry.lua
@@ -25,10 +25,16 @@ local StatsLocalError = errors.new_class('StatsLocalError', {capture_stack = fal
 -- @bool opts.quantiles
 --  Quantiles is not supported for local, only `false` is valid.
 --
+-- @number opts.quantile_tolerated_error
+--  Quantiles is not supported for local, so the value is ignored.
+--
 -- @treturn boolean Returns `true`.
 --
 function registry.init(opts)
-    dev_checks({ quantiles = 'boolean' })
+    dev_checks({
+        quantiles = 'boolean',
+        quantile_tolerated_error = 'number',
+    })
 
     StatsLocalError:assert(opts.quantiles == false,
         "Quantiles are not supported for 'local' statistics registry")

diff --git a/crud/stats/metrics_registry.lua b/crud/stats/metrics_registry.lua
@@ -31,11 +31,6 @@ local metric_name = {
 
 local LATENCY_QUANTILE = 0.99
 
--- Increasing tolerance threshold affects performance.
-local DEFAULT_QUANTILES = {
-    [LATENCY_QUANTILE] = 1e-2,
-}
-
 local DEFAULT_AGE_PARAMS = {
     age_buckets_count = 2,
     max_age_time = 60,
@@ -86,17 +81,24 @@ end
 -- @bool opts.quantiles
 --  If `true`, computes latency as 0.99 quantile with aging.
 --
+-- @number[opt=1e-3] opts.quantile_tolerated_error
+--  See metrics summary API for details:
+--  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
+--  If quantile value is -Inf, try to decrease quantile tolerance.
+--  See https://github.com/tarantool/metrics/issues/189 for issue details.
+--
 -- @treturn boolean Returns `true`.
 --
 function registry.init(opts)
-    dev_checks({ quantiles = 'boolean' })
-
-    internal.opts = table.deepcopy(opts)
+    dev_checks({
+        quantiles = 'boolean',
+        quantile_tolerated_error = 'number',
+    })
 
     local quantile_params = nil
     local age_params = nil
     if opts.quantiles == true then
-        quantile_params = DEFAULT_QUANTILES
+        quantile_params = {[LATENCY_QUANTILE] = opts.quantile_tolerated_error}
         age_params = DEFAULT_AGE_PARAMS
     end
 
@@ -119,6 +121,8 @@ function registry.init(opts)
         metric_name.details.map_reduces,
         'Map reduces planned during CRUD select/pairs')
 
+    internal.opts = table.deepcopy(opts)
+
     return true
 end
 

diff --git a/test/integration/cfg_test.lua b/test/integration/cfg_test.lua
@@ -26,6 +26,7 @@ group.test_defaults = function(g)
         stats = false,
         stats_driver = stats.get_default_driver(),
         stats_quantiles = false,
+        stats_quantile_tolerated_error = 1e-3,
     })
 end
 

diff --git a/test/unit/stats_test.lua b/test/unit/stats_test.lua
@@ -658,6 +658,40 @@ group_driver.test_default_quantiles = function(g)
 end
 
 
+group_driver.test_default_quantile_tolerated_error = function(g)
+    enable_stats(g)
+
+    local quantile_tolerated_error = g.router:eval(" return stats_module.internal.quantile_tolerated_error ")
+    t.assert_equals(quantile_tolerated_error, 1e-3)
+end
+
+
+group_driver.before_test(
+    'test_custom_quantile_tolerated_error',
+    function(g)
+        t.skip_if(g.is_metrics_supported == false, 'Metrics registry is unsupported')
+    end
+)
+
+group_driver.test_custom_quantile_tolerated_error = function(g)
+    g.router:call('crud.cfg', {{
+        stats = true,
+        stats_driver = 'metrics',
+        stats_quantiles = true,
+        stats_quantile_tolerated_error = 5e-4,
+    }})
+
+    local resp = g.router:eval([[
+        local metrics = require('metrics')
+
+        local summary = metrics.summary('tnt_crud_stats')
+        return summary.objectives
+    ]])
+
+    t.assert_equals(resp, {[0.99] = 5e-4})
+end
+
+
 group_driver.before_test(
     'test_stats_reenable_with_different_driver_reset_stats',
     function(g)