Skip to content

Commit

Permalink
stats: make quantile tolerated error configurable
Browse files Browse the repository at this point in the history
Make metrics quantile collector tolerated error [1] configurable. Change
metrics quantile collector default tolerated error from 1e-2 to 1e-3.

The motivation of this patch is a tarantool/metrics bug [2]. Sometimes
quantile values turn to `-Inf` under high load when observations are
small. It was reproduced in process of developing Grafana dashboard
panels for CRUD stats [3].

Quantile tolerated error could be changed with crud.cfg:

  crud.cfg{stats_quantile_tolerated_error = 1e-4}

1. https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
2. tarantool/metrics#189
3. https://github.com/tarantool/grafana-dashboard/tree/DifferentialOrange/crud-report
  • Loading branch information
DifferentialOrange committed May 6, 2022
1 parent 6a4d4e9 commit f4d44a7
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 18 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
* Make metrics quantile collector tolerated error configurable (#281).

### Changed
* Change metrics quantile collector default tolerated error
from 1e-2 to 1e-3 (#281).

### Fixed
* Requests no more fail with "Sharding hash mismatch" error
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,13 @@ metrics:collect()
metric_name: tnt_crud_stats
...
```
If you see `-Inf` value in quantile metrics, try to decrease the tolerated error:
```lua
crud.cfg{stats_quantile_tolerated_error = 1e-4}
```
See [tarantool/metrics#189](https://github.com/tarantool/metrics/issues/189) for
details about the issue.


`select` section additionally contains `details` collectors.
```lua
Expand Down
28 changes: 25 additions & 3 deletions crud/cfg.lua
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ local function set_defaults_if_empty(cfg)
cfg.stats_quantiles = false
end

if cfg.stats_quantile_tolerated_error == nil then
cfg.stats_quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
end

return cfg
end

Expand All @@ -33,7 +37,8 @@ local cfg = set_defaults_if_empty(stash.get(stash.name.cfg))
local function configure_stats(cfg, opts)
if (opts.stats == nil)
and (opts.stats_driver == nil)
and (opts.stats_quantiles == nil) then
and (opts.stats_quantiles == nil)
and (opts.stats_quantile_tolerated_error == nil) then
return
end

Expand All @@ -49,15 +54,24 @@ local function configure_stats(cfg, opts)
opts.stats_quantiles = cfg.stats_quantiles
end

if opts.stats_quantiles == nil then
opts.stats_quantile_tolerated_error = cfg.stats_quantile_tolerated_error
end

if opts.stats == true then
stats.enable{ driver = opts.stats_driver, quantiles = opts.stats_quantiles }
stats.enable{
driver = opts.stats_driver,
quantiles = opts.stats_quantiles,
quantile_tolerated_error = opts.stats_quantile_tolerated_error,
}
else
stats.disable()
end

rawset(cfg, 'stats', opts.stats)
rawset(cfg, 'stats_driver', opts.stats_driver)
rawset(cfg, 'stats_quantiles', opts.stats_quantiles)
rawset(cfg, 'stats_quantile_tolerated_error', opts.stats_quantile_tolerated_error)
end

--- Configure CRUD module.
Expand Down Expand Up @@ -86,13 +100,21 @@ end
-- Enable or disable statistics quantiles (only for metrics driver).
-- Quantiles computations increases performance overhead up to 10%.
--
-- @number[opt=1e-3] opts.stats_quantile_tolerated_error
-- See tarantool/metrics summary API for details:
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
-- If quantile value is -Inf, try to decrease quantile tolerance.
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
-- Decreasing the value increases computational load.
--
-- @return Configuration table.
--
local function __call(self, opts)
checks('table', {
stats = '?boolean',
stats_driver = '?string',
stats_quantiles = '?boolean'
stats_quantiles = '?boolean',
stats_quantile_tolerated_error = '?number',
})

opts = table.deepcopy(opts) or {}
Expand Down
36 changes: 31 additions & 5 deletions crud/stats/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,20 @@ end
-- computing requests latency as 0.99 quantile with aging.
-- Performance overhead for enabling is near 10%.
--
-- @number[opt=1e-3] opts.quantile_tolerated_error
-- See tarantool/metrics summary API for details:
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
-- If quantile value is -Inf, try to decrease quantile tolerance.
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
--
-- @treturn boolean Returns `true`.
--
function stats.enable(opts)
checks({ driver = '?string', quantiles = '?boolean' })
checks({
driver = '?string',
quantiles = '?boolean',
quantile_tolerated_error = '?number',
})

StatsError:assert(
rawget(_G, 'crud') ~= nil,
Expand All @@ -108,19 +118,29 @@ function stats.enable(opts)
opts.quantiles = false
end

if opts.quantile_tolerated_error == nil then
opts.quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
end

-- Do not reinit if called with same options.
if internal.driver == opts.driver
and internal.quantiles == opts.quantiles then
and internal.quantiles == opts.quantiles
and internal.quantile_tolerated_error == opts.quantile_tolerated_error then
return true
end

-- Disable old driver registry, if another one was requested.
stats.disable()

internal.driver = opts.driver
internal.quantiles = opts.quantiles

internal:get_registry().init({ quantiles = internal.quantiles })
internal:get_registry().init{
quantiles = opts.quantiles,
quantile_tolerated_error = opts.quantile_tolerated_error
}

internal.quantiles = opts.quantiles
internal.quantile_tolerated_error = opts.quantile_tolerated_error

return true
end
Expand All @@ -140,7 +160,10 @@ function stats.reset()
end

internal:get_registry().destroy()
internal:get_registry().init({ quantiles = internal.quantiles })
internal:get_registry().init{
quantiles = internal.quantiles,
quantile_tolerated_error = internal.quantile_tolerated_error
}

return true
end
Expand Down Expand Up @@ -469,4 +492,7 @@ stats.op = op_module
-- @tfield[opt] boolean quantiles Is quantiles computed.
stats.internal = internal

--- Default metrics quantile precision.
stats.DEFAULT_QUANTILE_TOLERATED_ERROR = 1e-3

return stats
8 changes: 7 additions & 1 deletion crud/stats/local_registry.lua
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,16 @@ local StatsLocalError = errors.new_class('StatsLocalError', {capture_stack = fal
-- @bool opts.quantiles
-- Quantiles is not supported for local, only `false` is valid.
--
-- @number opts.quantile_tolerated_error
-- Quantiles is not supported for local, so the value is ignored.
--
-- @treturn boolean Returns `true`.
--
function registry.init(opts)
dev_checks({ quantiles = 'boolean' })
dev_checks({
quantiles = 'boolean',
quantile_tolerated_error = 'number',
})

StatsLocalError:assert(opts.quantiles == false,
"Quantiles are not supported for 'local' statistics registry")
Expand Down
22 changes: 13 additions & 9 deletions crud/stats/metrics_registry.lua
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,6 @@ local metric_name = {

local LATENCY_QUANTILE = 0.99

-- Increasing tolerance threshold affects performance.
local DEFAULT_QUANTILES = {
[LATENCY_QUANTILE] = 1e-2,
}

local DEFAULT_AGE_PARAMS = {
age_buckets_count = 2,
max_age_time = 60,
Expand Down Expand Up @@ -86,17 +81,24 @@ end
-- @bool opts.quantiles
-- If `true`, computes latency as 0.99 quantile with aging.
--
-- @number[opt=1e-3] opts.quantile_tolerated_error
-- See metrics summary API for details:
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
-- If quantile value is -Inf, try to decrease quantile tolerance.
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
--
-- @treturn boolean Returns `true`.
--
function registry.init(opts)
dev_checks({ quantiles = 'boolean' })

internal.opts = table.deepcopy(opts)
dev_checks({
quantiles = 'boolean',
quantile_tolerated_error = 'number',
})

local quantile_params = nil
local age_params = nil
if opts.quantiles == true then
quantile_params = DEFAULT_QUANTILES
quantile_params = {[LATENCY_QUANTILE] = opts.quantile_tolerated_error}
age_params = DEFAULT_AGE_PARAMS
end

Expand All @@ -119,6 +121,8 @@ function registry.init(opts)
metric_name.details.map_reduces,
'Map reduces planned during CRUD select/pairs')

internal.opts = table.deepcopy(opts)

return true
end

Expand Down
1 change: 1 addition & 0 deletions test/integration/cfg_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ group.test_defaults = function(g)
stats = false,
stats_driver = stats.get_default_driver(),
stats_quantiles = false,
stats_quantile_tolerated_error = 1e-3,
})
end

Expand Down
34 changes: 34 additions & 0 deletions test/unit/stats_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,40 @@ group_driver.test_default_quantiles = function(g)
end


group_driver.test_default_quantile_tolerated_error = function(g)
enable_stats(g)

local quantile_tolerated_error = g.router:eval(" return stats_module.internal.quantile_tolerated_error ")
t.assert_equals(quantile_tolerated_error, 1e-3)
end


group_driver.before_test(
'test_custom_quantile_tolerated_error',
function(g)
t.skip_if(g.is_metrics_supported == false, 'Metrics registry is unsupported')
end
)

group_driver.test_custom_quantile_tolerated_error = function(g)
g.router:call('crud.cfg', {{
stats = true,
stats_driver = 'metrics',
stats_quantiles = true,
stats_quantile_tolerated_error = 5e-4,
}})

local resp = g.router:eval([[
local metrics = require('metrics')
local summary = metrics.summary('tnt_crud_stats')
return summary.objectives
]])

t.assert_equals(resp, {[0.99] = 5e-4})
end


group_driver.before_test(
'test_stats_reenable_with_different_driver_reset_stats',
function(g)
Expand Down

0 comments on commit f4d44a7

Please sign in to comment.