diff --git a/CHANGELOG.md b/CHANGELOG.md index 81e76d672..a3f360af6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/). - Ability to configure client certificate chain depth [PR #1006](https://github.com/3scale/APIcast/pull/1006) - You can filter services by endpoint name using Regexp [PR #1022](https://github.com/3scale/APIcast/pull/1022) [THREESCALE-1524](https://issues.jboss.org/browse/THREESCALE-1524) +- Enable APICAST_EXTENDED_METRICS env variable to provide more in-depth details [PR #1024](https://github.com/3scale/APIcast/pull/1024) [THREESCALE-2150](https://issues.jboss.org/browse/THREESCALE-2150) ### Fixed diff --git a/doc/parameters.md b/doc/parameters.md index b83c77f2f..487a59404 100644 --- a/doc/parameters.md +++ b/doc/parameters.md @@ -374,3 +374,18 @@ Defines a HTTP proxy to be used for connecting to HTTPS services. Authentication **Example:** `foo,bar.com,.extra.dot.com` Defines a comma-separated list of hostnames and domain names for which the requests should not be proxied. Setting to a single `*` character, which matches all hosts, effectively disables the proxy. + +### `APICAST_EXTENDED_METRICS` + +**Default:** false +**Value:** boolean +**Example:** "true" + +Enables additional information on Prometheus metrics; some labels will be used +with specific information that will provide more in-depth details about APIcast. 
+ +The metrics that will have extended information are: + +- total_response_time_seconds: label service +- upstream_response_time_seconds: label service +- upstream_status: label service diff --git a/doc/prometheus-metrics.md b/doc/prometheus-metrics.md index 6510ccf40..405ec2f53 100644 --- a/doc/prometheus-metrics.md +++ b/doc/prometheus-metrics.md @@ -7,9 +7,9 @@ | openresty_shdict_capacity | Capacity of the dictionaries shared between workers | gauge | dict(one for every dictionary) | Default | | openresty_shdict_free_space | Free space of the dictionaries shared between workers | gauge | dict(one for every dictionary) | Default | | nginx_metric_errors_total | Number of errors of the Lua library that manages the metrics | counter | - | Default | -| total_response_time_seconds | Time needed to sent a response to the client (in seconds) | histogram | - | Default | -| upstream_response_time_seconds | Response times from upstream servers (in seconds) | histogram | - | Default | -| upstream_status | HTTP status from upstream servers | counter | status | Default | +| total_response_time_seconds | Time needed to send a response to the client (in seconds) | histogram | service | Default | +| upstream_response_time_seconds | Response times from upstream servers (in seconds) | histogram | service | Default | +| upstream_status | HTTP status from upstream servers | counter | status, service | Default | | threescale_backend_calls | Authorize and report requests to the 3scale backend (Apisonator) | counter | endpoint(authrep, auth, report), status(2xx, 4xx, 5xx) | APIcast | | batching_policy_auths_cache_hits | Hits in the auths cache of the 3scale batching policy | counter | - | 3scale Batcher | | batching_policy_auths_cache_misses | Misses in the auths cache of the 3scale batching policy | counter | - | 3scale Batcher | diff --git a/gateway/src/apicast/metrics/upstream.lua b/gateway/src/apicast/metrics/upstream.lua index 308dc164c..ab6100047 100644 --- 
a/gateway/src/apicast/metrics/upstream.lua +++ b/gateway/src/apicast/metrics/upstream.lua @@ -4,36 +4,40 @@ local prometheus = require('apicast.prometheus') local _M = {} +local service_label = 'service' +local status_label = 'status' + local upstream_status_codes = prometheus( 'counter', 'upstream_status', 'HTTP status from upstream servers', - { 'status' } + { status_label, service_label } ) local upstream_resp_times = prometheus( 'histogram', 'upstream_response_time_seconds', - 'Response times from upstream servers' + 'Response times from upstream servers', + { service_label } ) -local function inc_status_codes_counter(status) +local function inc_status_codes_counter(status, service) if tonumber(status) and upstream_status_codes then - upstream_status_codes:inc(1, { status }) + upstream_status_codes:inc(1, { status, service }) end end -local function add_resp_time(response_time) +local function add_resp_time(response_time, service) local time = tonumber(response_time) if time and upstream_resp_times then - upstream_resp_times:observe(time) + upstream_resp_times:observe(time, { service }) end end -function _M.report(status, response_time) - inc_status_codes_counter(status) - add_resp_time(response_time) +function _M.report(status, response_time, service) + inc_status_codes_counter(status, service) + add_resp_time(response_time, service) end return _M diff --git a/gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua b/gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua index b31d9f366..5d4f16391 100644 --- a/gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua +++ b/gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua @@ -9,6 +9,14 @@ local select = select local find = string.find local pairs = pairs +-- extended_metrics is a variable used to report multiple labels in some +-- metrics. This can be useful in small environments, but can be problematic +-- for large users because this can create a large matrix of metrics. 
+-- More info about this can be found in Prometheus doc: +-- https://prometheus.io/docs/practices/naming/#labels + +local extended_metrics = resty_env.enabled('APICAST_EXTENDED_METRICS') + local upstream_metrics = require('apicast.metrics.upstream') local new = _M.new @@ -76,7 +84,8 @@ local shdict_free_space_metric = prometheus('gauge', 'openresty_shdict_free_spac local response_times = prometheus( 'histogram', 'total_response_time_seconds', - 'Time needed to sent a response to the client (in seconds).' + 'Time needed to sent a response to the client (in seconds).', + { 'service' } ) function _M.init() @@ -118,19 +127,23 @@ function _M:metrics() end end -local function report_req_response_time() +local function report_req_response_time(service) -- Use ngx.var.original_request_time instead of ngx.var.request_time so -- the time spent in the post_action phase is not taken into account. local resp_time = tonumber(ngx.var.original_request_time) - if resp_time and response_times then - response_times:observe(resp_time) + response_times:observe(resp_time, { service }) end end -function _M.log() - upstream_metrics.report(ngx.var.upstream_status, ngx.var.upstream_response_time) - report_req_response_time() +function _M.log(_, context) + local service = "" + -- Only label by service id when APICAST_EXTENDED_METRICS is enabled; otherwise keep the label empty to limit metric cardinality. + if extended_metrics and context.service and context.service.id then + service = context.service.id + end + upstream_metrics.report(ngx.var.upstream_status, ngx.var.upstream_response_time, service) + report_req_response_time(service) end return _M diff --git a/spec/metrics/upstream_spec.lua b/spec/metrics/upstream_spec.lua index 75fe852cb..12655d16a 100644 --- a/spec/metrics/upstream_spec.lua +++ b/spec/metrics/upstream_spec.lua @@ -1,6 +1,7 @@ describe('upstream metrics', function() describe('report', function() local upstream_metrics + local service_metric_name = "42" local test_counter = { inc = function() end } local test_histogram = { observe = function() end } @@ -36,8 +37,8 @@ describe('upstream metrics', function() end) it('adds the 
latency to the histogram', function() - upstream_metrics.report(200, 0.1) - assert.stub(test_histogram.observe).was_called_with(test_histogram, 0.1) + upstream_metrics.report(200, 0.1, service_metric_name) + assert.stub(test_histogram.observe).was_called_with(test_histogram, 0.1, { service_metric_name }) end) describe('when the status is nil or empty', function()