diff --git a/CHANGELOG.md b/CHANGELOG.md index edf3294b..fdb86cf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ * [CHANGE] Renamed `CortexInconsistentConfig` alert to `CortexInconsistentRuntimeConfig` and increased severity to `critical`. #335 * [CHANGE] Increased `CortexBadRuntimeConfig` alert severity to `critical` and removed support for `cortex_overrides_last_reload_successful` metric (was removed in Cortex 1.3.0). #335 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 +* [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. #324 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index 95ddc0df..dacd06ea 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -58,5 +58,11 @@ // The label used to differentiate between different nodes (i.e. servers). 
per_node_label: 'instance', + + // Whether certain dashboard description headers should be shown + show_dashboard_descriptions: { + writes: true, + reads: true, + }, }, } diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet index 657cfce7..aeb64491 100644 --- a/cortex-mixin/dashboards/compactor.libsonnet +++ b/cortex-mixin/dashboards/compactor.libsonnet @@ -6,12 +6,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addClusterSelectorTemplates() .addRow( $.row('Summary') - .addPanel( - $.textPanel('', ||| - - **Per-instance runs**: number of times a compactor instance triggers a compaction across all tenants its shard manage. - - **Tenants compaction progress**: in a multi-tenant cluster it shows the progress of tenants compacted while compaction is running. Reset to 0 once the compaction run is completed for all tenants in the shard. - |||), - ) .addPanel( $.startedCompletedFailedPanel( 'Per-instance runs / sec', @@ -20,7 +14,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor) ) + $.bars + - { yaxes: $.yaxes('ops') }, + { yaxes: $.yaxes('ops') } + + $.panelDescription( + 'Per-instance runs', + ||| + Number of times a compactor instance triggers a compaction across all tenants that it manages. 
+ ||| + ), ) .addPanel( $.panel('Tenants compaction progress') + @@ -31,42 +31,55 @@ local utils = import 'mixin-utils/utils.libsonnet'; cortex_compactor_tenants_skipped{%s} ) / cortex_compactor_tenants_discovered{%s} ||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], '{{%s}}' % $._config.per_instance_label) + - { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) } + + $.panelDescription( + 'Tenants compaction progress', + ||| + In a multi-tenant cluster, display the progress of tenants that are compacted while compaction is running. + Reset to 0 after the compaction run is completed for all tenants in the shard. + ||| + ), ) ) .addRow( $.row('') - .addPanel( - $.textPanel('', ||| - - **Compacted blocks**: number of blocks generated as a result of a compaction operation. - - **Per-block compaction duration**: time taken to generate a single compacted block. - |||), - ) .addPanel( $.panel('Compacted blocks / sec') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') }, + { yaxes: $.yaxes('ops') } + + $.panelDescription( + 'Compacted blocks / sec', + ||| + Rate of blocks that are generated as a result of a compaction operation. + ||| + ), ) .addPanel( $.panel('Per-block compaction duration') + - $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + + $.panelDescription( + 'Per-block compaction duration', + ||| + Display the amount of time that it has taken to generate a single compacted block. 
+ ||| + ), ) ) .addRow( $.row('') - .addPanel( - $.textPanel('', ||| - - **Average blocks / tenant**: the average number of blocks per tenant. - - **Tenants with largest number of blocks**: the 10 tenants with the largest number of blocks. - |||), - ) .addPanel( $.panel('Average blocks / tenant') + $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'), ) .addPanel( $.panel('Tenants with largest number of blocks') + - $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}'), + $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}') + + $.panelDescription( + 'Tenants with largest number of blocks', + ||| + The 10 tenants with the largest number of blocks. + ||| + ), ) ) .addRow( @@ -103,6 +116,5 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)), ) ) - .addRow($.objectStorePanels1('Object Store', 'compactor')) - .addRow($.objectStorePanels2('', 'compactor')), + .addRows($.getObjectStoreRows('Object Store', 'compactor')), } diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index ded63ddc..c965b265 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -14,6 +14,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; then self.addRow(row) else self, + // Calls addRow once per element of `rows`, in array order, when `condition` is true; otherwise yields the dashboard unchanged. + addRowsIf(condition, rows):: + if condition + then std.foldl(function(dashboard, row) dashboard.addRow(row), rows, self) + else self, + + // Same as addRowsIf with the condition fixed to true. + addRows(rows):: + self.addRowsIf(true, 
rows), + addClusterSelectorTemplates(multi=true):: local d = self { tags: $._config.tags, @@ -43,7 +53,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; else d .addTemplate('cluster', 'cortex_build_info', 'cluster') .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'), - }, // The mixin allow specialism of the job selector depending on if its a single binary @@ -274,7 +283,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; type: 'text', } + options, - objectStorePanels1(title, component):: + getObjectStoreRows(title, component):: [ super.row(title) .addPanel( $.panel('Operations / sec') + @@ -288,62 +297,135 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') }, ) .addPanel( - $.panel('Op: Attributes') + + $.panel('Latency of Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Op: Exists') + + $.panel('Latency of Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="exists"}' % [$.namespaceMatcher(), component]), ), - - // Second row of Object Store stats - objectStorePanels2(title, component):: - super.row(title) + $.row('') .addPanel( - $.panel('Op: Get') + + $.panel('Latency of Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Op: GetRange') + + $.panel('Latency of Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get_range"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Op: Upload') + + $.panel('Latency of Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="upload"}' % [$.namespaceMatcher(), component]), ) .addPanel( - 
$.panel('Op: Delete') + + $.panel('Latency of Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), + ], thanosMemcachedCache(title, jobName, component, cacheName):: + local config = { + jobMatcher: $.jobMatcher(jobName), + component: component, + cacheName: cacheName, + }; super.row(title) .addPanel( - $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="%s",name="%s"}[$__rate_interval]))' % [$.jobMatcher(jobName), component, cacheName], '{{operation}}') + + $.panel('Requests / sec') + + $.queryPanel( + ||| + sum by(operation) ( + rate( + thanos_memcached_operations_total{ + %(jobMatcher)s, + component="%(component)s", + name="%(cacheName)s" + }[$__rate_interval] + ) + ) + ||| % config, + '{{operation}}' + ) + $.stack + - { yaxes: $.yaxes('ops') }, + { yaxes: $.yaxes('ops') } ) .addPanel( $.panel('Latency (getmulti)') + - $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="%s",name="%s"}' % [$.jobMatcher(jobName), component, cacheName]) + $.latencyPanel( + 'thanos_memcached_operation_duration_seconds', + ||| + { + %(jobMatcher)s, + operation="getmulti", + component="%(component)s", + name="%(cacheName)s" + } + ||| % config + ) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="%s",name="%s"}[$__rate_interval])) / sum(rate(thanos_cache_memcached_requests_total{%s,component="%s",name="%s"}[$__rate_interval]))' % - [ - $.jobMatcher(jobName), - component, - cacheName, - $.jobMatcher(jobName), - component, - cacheName, - ], 'items') + - { yaxes: $.yaxes('percentunit') }, + $.queryPanel( + ||| + sum( + rate( + thanos_cache_memcached_hits_total{ + %(jobMatcher)s, + component="%(component)s", + name="%(cacheName)s" + }[$__rate_interval] + ) + ) + / + sum( + rate( + thanos_cache_memcached_requests_total{ + 
%(jobMatcher)s, + component="%(component)s", + name="%(cacheName)s" + }[$__rate_interval] + ) + ) + ||| % config, + 'items' + ) + + { yaxes: $.yaxes('percentunit') } ), filterNodeDiskContainer(containerName):: ||| - ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) - ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], + ignoring(%s) group_right() ( + label_replace( + count by( + %s, + %s, + device + ) + ( + container_fs_writes_bytes_total{ + %s, + container="%s", + device!~".*sda.*" + } + ), + "device", + "$1", + "device", + "/dev/(.*)" + ) * 0 + ) + ||| % [ + $._config.per_instance_label, + $._config.per_node_label, + $._config.per_instance_label, + $.namespaceMatcher(), + containerName, + ], + + panelDescription(title, description):: { + description: ||| + ### %s + %s + ||| % [title, description], + }, } diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 9f98308c..965e0e76 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -4,10 +4,95 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-reads.json': ($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' }) .addClusterSelectorTemplates() + .addRowIf( + $._config.show_dashboard_descriptions.reads, + ($.row('Reads dashboard description') { height: '175px', showTitle: false }) + .addPanel( + $.textPanel('', ||| +
+ This dashboard shows health metrics for the Cortex read path.
+ It is broken into sections for each service on the read path, and organized by the order in which the read request flows.
+
+ Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the time range of the query).
+
+ For each service, there are 3 panels showing (1) requests per second to that service, (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service.
+
+ The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex:
+ the query results cache, the metadata cache, the chunks cache, and the index cache.
+
+ These panels will show “no data” if the caches are not deployed.
+
+ Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage. +
+ |||), + ) + ) + .addRow( + ($.row('Headlines') + + { + height: '100px', + showTitle: false, + }) + .addPanel( + $.panel('Instant queries / sec') + + $.statPanel(||| + sum( + rate( + cortex_request_duration_seconds_count{ + %(queryFrontend)s, + route=~"(prometheus|api_prom)_api_v1_query" + }[$__rate_interval] + ) + ) + + sum( + rate( + cortex_prometheus_rule_evaluations_total{ + %(ruler)s + }[$__rate_interval] + ) + ) + ||| % { + queryFrontend: $.jobMatcher($._config.job_names.query_frontend), + ruler: $.jobMatcher($._config.job_names.ruler), + }, format='reqps') + + $.panelDescription( + 'Instant queries per second', + ||| + Rate of instant queries per second being made to the system. + Includes both queries made to the /prometheus API as + well as queries from the ruler. + ||| + ), + ) + .addPanel( + $.panel('Range queries / sec') + + $.statPanel(||| + sum( + rate( + cortex_request_duration_seconds_count{ + %(queryFrontend)s, + route=~"(prometheus|api_prom)_api_v1_query_range" + }[$__rate_interval] + ) + ) + ||| % { + queryFrontend: $.jobMatcher($._config.job_names.query_frontend), + }, format='reqps') + + $.panelDescription( + 'Range queries per second', + ||| + Rate of range queries per second being made to + Cortex via the /prometheus API. 
+ ||| + ), + ) + ) .addRow( $.row('Gateway') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( @@ -25,7 +110,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( @@ -43,7 +128,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Scheduler') .addPanel( - $.panel('QPS') + + $.textPanel( + '', + ||| ++ The query scheduler is an optional service that moves + the internal queue from the query-frontend into a + separate component. + If this service is not deployed, + these panels will show "No data." +
+ ||| + ) + ) + .addPanel( + $.panel('Requests / sec') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( @@ -54,7 +153,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Cache - Query Results') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( @@ -65,7 +164,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -83,7 +182,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -102,7 +201,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( @@ -121,7 +220,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Index') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ 
-133,7 +232,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Chunks') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -143,41 +242,109 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Memcached – Blocks Storage – Block Index (Store-gateway)') - .addPanel( - $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + + $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache + .addPanel( + $.panel('Requests / sec') + + $.queryPanel( + ||| + sum by(operation) ( + rate( + thanos_memcached_operations_total{ + component="store-gateway", + name="index-cache", + %s + }[$__rate_interval] + ) + ) + ||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}' + ) + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Latency (getmulti)') + - $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="store-gateway",name="index-cache"}' % $.jobMatcher($._config.job_names.store_gateway)) + $.latencyPanel( + 'thanos_memcached_operation_duration_seconds', + ||| + { + %s, + operation="getmulti", + component="store-gateway", + name="index-cache" + } + ||| % $.jobMatcher($._config.job_names.store_gateway) + ) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum by(item_type) (rate(thanos_store_index_cache_hits_total{component="store-gateway",%s}[$__rate_interval])) / sum by(item_type) 
(rate(thanos_store_index_cache_requests_total{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') + - { yaxes: $.yaxes('percentunit') }, + $.queryPanel( + ||| + sum by(item_type) ( + rate( + thanos_store_index_cache_hits_total{ + component="store-gateway", + %s + }[$__rate_interval] + ) + ) + / + sum by(item_type) ( + rate( + thanos_store_index_cache_requests_total{ + component="store-gateway", + %s + }[$__rate_interval] + ) + ) + ||| % [ + $.jobMatcher($._config.job_names.store_gateway), + $.jobMatcher($._config.job_names.store_gateway), + ], + '{{item_type}}' + ) + + { yaxes: $.yaxes('percentunit') } + + $.panelDescription( + 'Hit Ratio', + ||| + Even if you do not set up memcached for the blocks index cache, you will still see data in this panel because Cortex by default has an + in-memory blocks index cache. + ||| + ), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.thanosMemcachedCache('Memcached – Blocks Storage – Chunks (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache') + $.thanosMemcachedCache( + 'Memcached – Blocks storage – Chunks cache (store-gateway accesses)', + $._config.job_names.store_gateway, + 'store-gateway', + 'chunks-cache' + ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache') + $.thanosMemcachedCache( + 'Memcached – Blocks storage – Metadata cache (store-gateway accesses)', + $._config.job_names.store_gateway, + 'store-gateway', + 'metadata-cache' + ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache') + $.thanosMemcachedCache( + 'Memcached – Blocks storage – Metadata cache 
(querier accesses)', + $._config.job_names.querier, + 'querier', + 'metadata-cache' + ) ) .addRowIf( std.member($._config.storage_engine, 'chunks') && std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -190,7 +357,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -203,7 +370,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -216,7 +383,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -225,21 +392,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) // Object store metrics for the store-gateway. 
- .addRowIf( - std.member($._config.storage_engine, 'blocks'), - $.objectStorePanels1('Store-gateway - Blocks Object Store', 'store-gateway'), - ) - .addRowIf( + .addRowsIf( std.member($._config.storage_engine, 'blocks'), - $.objectStorePanels2('', 'store-gateway'), + $.getObjectStoreRows('Blocks Object Store (Store-gateway accesses)', 'store-gateway') ) // Object store metrics for the querier. - .addRowIf( - std.member($._config.storage_engine, 'blocks'), - $.objectStorePanels1('Querier - Blocks Object Store', 'querier'), - ) - .addRowIf( + .addRowsIf( std.member($._config.storage_engine, 'blocks'), - $.objectStorePanels2('', 'querier'), + $.getObjectStoreRows('Blocks Object Store (Querier accesses)', 'querier') ), } diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index ea2ce3c3..8a77be1c 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -4,21 +4,44 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-writes.json': ($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' }) .addClusterSelectorTemplates() - .addRow( + .addRowIf( + $._config.show_dashboard_descriptions.writes, + ($.row('Writes dashboard description') { height: '125px', showTitle: false }) + .addPanel( + $.textPanel('', ||| +
+ This dashboard shows various health metrics for the Cortex write path.
+ It is broken into sections for each service on the write path,
+ and organized by the order in which the write request flows.
+
+ Incoming metrics data travels from the gateway → distributor → ingester.
+
+ For each service, there are 3 panels showing
+ (1) requests per second to that service,
+ (2) average, median, and p99 latency of requests to that service, and
+ (3) p99 latency of requests to each instance of that service.
+
+ It also includes metrics for the key-value (KV) stores used to manage + the high-availability tracker and the ingesters. +
+ |||), + ) + ).addRow( ($.row('Headlines') + { height: '100px', showTitle: false, }) .addPanel( - $.panel('Samples / s') + + $.panel('Samples / sec') + $.statPanel( 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { job: $.jobMatcher($._config.job_names.distributor), } ), - format='reqps' + format='short' ) ) .addPanel( @@ -37,14 +60,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') ) .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( @@ -62,7 +85,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Distributor') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( @@ -78,9 +101,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('KV Store (HA Dedupe)') + $.row('Key-value store for high-availability (HA) deduplication') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( @@ -91,7 +114,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % 
$.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -107,9 +130,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('KV Store (Ring)') + $.row('Key-value store for the ingesters ring') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -121,7 +144,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -134,7 +157,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -147,7 +170,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -160,7 +183,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -173,7 +196,7 @@ local utils = import 
'mixin-utils/utils.libsonnet'; std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( - $.panel('QPS') + + $.panel('Requests / sec') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -189,11 +212,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Uploaded blocks / sec', 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + ) + + $.panelDescription( + 'Uploaded blocks / sec', + ||| + The rate of blocks being uploaded from the ingesters + to object storage. + ||| ), ) .addPanel( $.panel('Upload latency') + - $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)), + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + + $.panelDescription( + 'Upload latency', + ||| + The average, median (50th percentile), and 99th percentile time + the ingesters take to upload blocks to object storage. + ||| + ), ) ) .addRowIf( @@ -204,21 +241,43 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Compactions / sec', 'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + ) + + $.panelDescription( + 'Compactions per second', + ||| + Ingesters maintain a local TSDB per-tenant on disk. 
Each TSDB maintains a head block for each + active time series; these blocks get periodically compacted (by default, every 2h). + This panel shows the rate of compaction operations across all TSDBs on all ingesters. + ||| ), ) .addPanel( $.panel('Compactions latency') + - $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)), + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) + + $.panelDescription( + 'Compaction latency', + ||| + The average, median (50th percentile), and 99th percentile time ingesters take to compact TSDB head blocks + on the local filesystem. + ||| + ), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Ingester - Blocks storage - TSDB WAL') + $.row('Ingester - Blocks storage - TSDB write ahead log (WAL)') .addPanel( $.successFailurePanel( 'WAL truncations / sec', 'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + ) + + $.panelDescription( + 'WAL truncations per second', + ||| + The WAL is truncated each time a new TSDB block is written. This panel measures the rate of + truncations. 
+ ||| ), ) .addPanel( @@ -226,12 +285,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Checkpoints created / sec', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + ) + + $.panelDescription( + 'Checkpoints created per second', + ||| + Checkpoints are created as part of the WAL truncation process. + This metric measures the rate of checkpoint creation. + ||| ), ) .addPanel( $.panel('WAL truncations latency (includes checkpointing)') + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + - { yaxes: $.yaxes('s') }, + { yaxes: $.yaxes('s') } + + $.panelDescription( + 'WAL truncations latency (including checkpointing)', + ||| + Average time taken to perform a full WAL truncation, + including the time taken for the checkpointing to complete. + ||| + ), ) .addPanel( $.panel('Corruptions / sec') +