[THREESCALE-2150][Metrics] Add services on metrics.
This commit adds support for additional information on some metrics[0]:
it enables users to identify the service in each metric.

Because this can be problematic for Prometheus[1], the feature is gated
behind a new ENV variable (`APICAST_EXTENDED_METRICS`). Users with only a
few services can run the extended metrics without affecting Prometheus
stability; if the APIcast instance has thousands of services, the
recommendation is to keep this option disabled.

When extended metrics are disabled, all service labels use the value
"all" and Prometheus performance is not affected.
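To make the cardinality concern concrete, here is a rough, illustrative estimate (a sketch, not measured data: it assumes the 14 `le` buckets plus `_count`/`_sum` per histogram shown below, and an arbitrary count of distinct upstream status codes):

```python
# Rough estimate of the Prometheus time series produced by the three
# affected metrics. Numbers are illustrative assumptions, not APIcast data.
HISTOGRAM_SERIES = 14 + 2  # 14 "le" buckets + _count + _sum

def series_count(num_services, num_status_codes=5):
    """Series for total_response_time_seconds,
    upstream_response_time_seconds and upstream_status when each
    service gets its own label value."""
    histograms = 2 * HISTOGRAM_SERIES * num_services
    counters = num_status_codes * num_services
    return histograms + counters

print(series_count(1))     # extended metrics disabled: service="all" only
print(series_count(1000))  # a gateway with thousands of services
```

With one label value the footprint is trivial; with thousands of services every bucket, counter, and status combination is multiplied, which is exactly the label-cardinality trap the Prometheus docs warn about.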

The new metrics list looks like this:

```
bash-4.2$ curl http://localhost:9421/metrics -s | grep service
total_response_time_seconds_bucket{service="all",le="00.100"} 91
total_response_time_seconds_bucket{service="all",le="00.200"} 189
total_response_time_seconds_bucket{service="all",le="00.300"} 220
total_response_time_seconds_bucket{service="all",le="00.400"} 220
total_response_time_seconds_bucket{service="all",le="00.500"} 222
total_response_time_seconds_bucket{service="all",le="00.750"} 223
total_response_time_seconds_bucket{service="all",le="01.000"} 223
total_response_time_seconds_bucket{service="all",le="01.500"} 224
total_response_time_seconds_bucket{service="all",le="02.000"} 224
total_response_time_seconds_bucket{service="all",le="03.000"} 224
total_response_time_seconds_bucket{service="all",le="04.000"} 224
total_response_time_seconds_bucket{service="all",le="05.000"} 224
total_response_time_seconds_bucket{service="all",le="10.000"} 224
total_response_time_seconds_bucket{service="all",le="+Inf"} 224
total_response_time_seconds_count{service="all"} 224
total_response_time_seconds_sum{service="all"} 33.616
upstream_response_time_seconds_bucket{service="all",le="00.100"} 93
upstream_response_time_seconds_bucket{service="all",le="00.200"} 190
upstream_response_time_seconds_bucket{service="all",le="00.300"} 220
upstream_response_time_seconds_bucket{service="all",le="00.400"} 220
upstream_response_time_seconds_bucket{service="all",le="00.500"} 223
upstream_response_time_seconds_bucket{service="all",le="00.750"} 224
upstream_response_time_seconds_bucket{service="all",le="01.000"} 224
upstream_response_time_seconds_bucket{service="all",le="01.500"} 224
upstream_response_time_seconds_bucket{service="all",le="02.000"} 224
upstream_response_time_seconds_bucket{service="all",le="03.000"} 224
upstream_response_time_seconds_bucket{service="all",le="04.000"} 224
upstream_response_time_seconds_bucket{service="all",le="05.000"} 224
upstream_response_time_seconds_bucket{service="all",le="10.000"} 224
upstream_response_time_seconds_bucket{service="all",le="+Inf"} 224
upstream_response_time_seconds_count{service="all"} 224
upstream_response_time_seconds_sum{service="all"} 32.226
upstream_status{status="200",service="all"} 224
```
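Once the `service` label is present, per-service aggregations become possible on the Prometheus side. A hedged PromQL sketch against the metrics above (the 0.95 quantile and the 5m rate window are arbitrary choices):

```promql
# 95th-percentile upstream response time, broken down per service
histogram_quantile(0.95,
  sum by (le, service) (rate(upstream_response_time_seconds_bucket[5m]))
)
```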

[0] List of affected metrics:
    total_response_time_seconds
    upstream_response_time_seconds
    upstream_status

[1] https://prometheus.io/docs/practices/naming/#labels

Signed-off-by: Eloy Coto <eloy.coto@gmail.com>
eloycoto committed May 2, 2019
1 parent 66168b7 commit 6cb6dd0
Showing 6 changed files with 54 additions and 21 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).

- Ability to configure client certificate chain depth [PR #1006](https://github.com/3scale/APIcast/pull/1006)
- You can filter services by endpoint name using Regexp [PR #1022](https://github.com/3scale/APIcast/pull/1022) [THREESCALE-1524](https://issues.jboss.org/browse/THREESCALE-1524)
- Enable APICAST_EXTENDED_METRICS env variable to provide more in-depth details [PR #1024](https://github.com/3scale/APIcast/pull/1024) [THREESCALE-2150](https://issues.jboss.org/browse/THREESCALE-2150)

### Fixed

15 changes: 15 additions & 0 deletions doc/parameters.md
@@ -374,3 +374,18 @@ Defines a HTTP proxy to be used for connecting to HTTPS services. Authentication
**Example:** `foo,bar.com,.extra.dot.com`

Defines a comma-separated list of hostnames and domain names for which the requests should not be proxied. Setting to a single `*` character, which matches all hosts, effectively disables the proxy.

### `APICAST_EXTENDED_METRICS`

**Default:** false
**Value:** boolean
**Example:** "true"

Enables additional label information on Prometheus metrics: some metrics
gain a `service` label that provides more in-depth detail about APIcast.

The metrics that will have extended information are:

- total_response_time_seconds: label service
- upstream_response_time_seconds: label service
- upstream_status: label service
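As a deployment sketch, the flag can be set like any other APIcast environment variable; the container image, tag, port mapping, and portal endpoint below are illustrative placeholders, not part of this change:

```shell
docker run -p 9421:9421 \
  -e APICAST_EXTENDED_METRICS=true \
  -e THREESCALE_PORTAL_ENDPOINT=https://ACCESS_TOKEN@ADMIN_PORTAL_DOMAIN \
  quay.io/3scale/apicast:master
```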
6 changes: 3 additions & 3 deletions doc/prometheus-metrics.md
@@ -7,9 +7,9 @@
| openresty_shdict_capacity | Capacity of the dictionaries shared between workers | gauge | dict(one for every dictionary) | Default |
| openresty_shdict_free_space | Free space of the dictionaries shared between workers | gauge | dict(one for every dictionary) | Default |
| nginx_metric_errors_total | Number of errors of the Lua library that manages the metrics | counter | - | Default |
| total_response_time_seconds | Time needed to send a response to the client (in seconds) | histogram | - | Default |
| upstream_response_time_seconds | Response times from upstream servers (in seconds) | histogram | - | Default |
| upstream_status | HTTP status from upstream servers | counter | status | Default |
| total_response_time_seconds | Time needed to send a response to the client (in seconds) | histogram | service | Default |
| upstream_response_time_seconds | Response times from upstream servers (in seconds) | histogram | service | Default |
| upstream_status | HTTP status from upstream servers | counter | status, service | Default |
| threescale_backend_calls | Authorize and report requests to the 3scale backend (Apisonator) | counter | endpoint(authrep, auth, report), status(2xx, 4xx, 5xx) | APIcast |
| batching_policy_auths_cache_hits | Hits in the auths cache of the 3scale batching policy | counter | - | 3scale Batcher |
| batching_policy_auths_cache_misses | Misses in the auths cache of the 3scale batching policy | counter | - | 3scale Batcher |
22 changes: 13 additions & 9 deletions gateway/src/apicast/metrics/upstream.lua
@@ -4,36 +4,40 @@ local prometheus = require('apicast.prometheus')

local _M = {}

local service_label = 'service'
local status_label = 'status'

local upstream_status_codes = prometheus(
'counter',
'upstream_status',
'HTTP status from upstream servers',
{ 'status' }
{ status_label, service_label }
)

local upstream_resp_times = prometheus(
'histogram',
'upstream_response_time_seconds',
'Response times from upstream servers'
'Response times from upstream servers',
{ service_label }
)

local function inc_status_codes_counter(status)
local function inc_status_codes_counter(status, service)
if tonumber(status) and upstream_status_codes then
upstream_status_codes:inc(1, { status })
upstream_status_codes:inc(1, { status, service })
end
end

local function add_resp_time(response_time)
local function add_resp_time(response_time, service)
local time = tonumber(response_time)

if time and upstream_resp_times then
upstream_resp_times:observe(time)
upstream_resp_times:observe(time, { service })
end
end

function _M.report(status, response_time)
inc_status_codes_counter(status)
add_resp_time(response_time)
function _M.report(status, response_time, service)
inc_status_codes_counter(status, service)
add_resp_time(response_time, service)
end

return _M
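Note that the label values passed to `inc`/`observe` must be ordered exactly like the label names declared at metric creation (`{ status_label, service_label }` above); swapping them would silently mislabel the samples. A minimal Python analogue of that positional contract (an illustration only, not APIcast code):

```python
# Toy counter that pairs positional label values with declared label names,
# mirroring how upstream_status_codes:inc(1, { status, service }) works.
class Counter:
    def __init__(self, name, label_names):
        self.name = name
        self.label_names = label_names
        self.samples = {}

    def inc(self, value, label_values):
        # zip pairs each value with the label name at the same position
        key = tuple(zip(self.label_names, label_values))
        self.samples[key] = self.samples.get(key, 0) + value

upstream_status = Counter('upstream_status', ['status', 'service'])
upstream_status.inc(1, ['200', '42'])    # a per-service sample
upstream_status.inc(1, ['200', 'all'])   # extended metrics disabled
```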
26 changes: 19 additions & 7 deletions gateway/src/apicast/policy/nginx_metrics/nginx_metrics.lua
@@ -9,6 +9,14 @@ local select = select
local find = string.find
local pairs = pairs

-- extended_metrics controls whether additional labels are reported on some
-- metrics. This can be useful in small environments, but can be problematic
-- in large ones because it creates a large matrix of metrics.
-- More info about this can be found in the Prometheus docs:
-- https://prometheus.io/docs/practices/naming/#labels

local extended_metrics = resty_env.enabled('APICAST_EXTENDED_METRICS')

local upstream_metrics = require('apicast.metrics.upstream')

local new = _M.new
@@ -76,7 +84,8 @@ local shdict_free_space_metric = prometheus('gauge', 'openresty_shdict_free_spac
local response_times = prometheus(
'histogram',
'total_response_time_seconds',
'Time needed to send a response to the client (in seconds).'
'Time needed to send a response to the client (in seconds).',
{ 'service' }
)

function _M.init()
@@ -118,19 +127,22 @@ function _M:metrics()
end
end

local function report_req_response_time()
local function report_req_response_time(service)
-- Use ngx.var.original_request_time instead of ngx.var.request_time so
-- the time spent in the post_action phase is not taken into account.
local resp_time = tonumber(ngx.var.original_request_time)

if resp_time and response_times then
response_times:observe(resp_time)
response_times:observe(resp_time, { service })
end
end

function _M.log()
upstream_metrics.report(ngx.var.upstream_status, ngx.var.upstream_response_time)
report_req_response_time()
function _M.log(_, context)
local service = ""
if context.service and context.service.id then
service = context.service.id
end
upstream_metrics.report(ngx.var.upstream_status, ngx.var.upstream_response_time, service)
report_req_response_time(service)
end

return _M
5 changes: 3 additions & 2 deletions spec/metrics/upstream_spec.lua
@@ -1,6 +1,7 @@
describe('upstream metrics', function()
describe('report', function()
local upstream_metrics
local service_metric_name = "42"
local test_counter = { inc = function() end }
local test_histogram = { observe = function() end }

@@ -36,8 +37,8 @@ describe('upstream metrics', function()
end)

it('adds the latency to the histogram', function()
upstream_metrics.report(200, 0.1)
assert.stub(test_histogram.observe).was_called_with(test_histogram, 0.1)
upstream_metrics.report(200, 0.1, service_metric_name)
assert.stub(test_histogram.observe).was_called_with(test_histogram, 0.1, { service_metric_name })
end)

describe('when the status is nil or empty', function()
