From 47c52ade2d1a5ea30f75a910a5e01db162e48320 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Sat, 16 Nov 2019 13:25:58 +0000 Subject: [PATCH 001/364] Initial commit just move everything over Signed-off-by: Goutham Veeramachaneni --- jsonnet/mimir-mixin/alerts.jsonnet | 1 + jsonnet/mimir-mixin/alerts.libsonnet | 520 ++++++++++++++ jsonnet/mimir-mixin/dashboards.jsonnet | 6 + jsonnet/mimir-mixin/dashboards.libsonnet | 649 ++++++++++++++++++ jsonnet/mimir-mixin/jsonnetfile.json | 24 + jsonnet/mimir-mixin/jsonnetfile.lock.json | 26 + jsonnet/mimir-mixin/mixin.libsonnet | 3 + jsonnet/mimir-mixin/recording_rules.jsonnet | 1 + jsonnet/mimir-mixin/recording_rules.libsonnet | 114 +++ 9 files changed, 1344 insertions(+) create mode 100644 jsonnet/mimir-mixin/alerts.jsonnet create mode 100644 jsonnet/mimir-mixin/alerts.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards.jsonnet create mode 100644 jsonnet/mimir-mixin/dashboards.libsonnet create mode 100644 jsonnet/mimir-mixin/jsonnetfile.json create mode 100644 jsonnet/mimir-mixin/jsonnetfile.lock.json create mode 100644 jsonnet/mimir-mixin/mixin.libsonnet create mode 100644 jsonnet/mimir-mixin/recording_rules.jsonnet create mode 100644 jsonnet/mimir-mixin/recording_rules.libsonnet diff --git a/jsonnet/mimir-mixin/alerts.jsonnet b/jsonnet/mimir-mixin/alerts.jsonnet new file mode 100644 index 00000000000..e54b1704020 --- /dev/null +++ b/jsonnet/mimir-mixin/alerts.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheus_alerts) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet new file mode 100644 index 00000000000..a5a547b38e7 --- /dev/null +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -0,0 +1,520 @@ +// According to https://developers.soundcloud.com/blog/alerting-on-slos : +local windows = [ + { long_period: '1h', short_period: '5m', for_period: '2m', factor: 14.4, severity: 'critical' }, + { long_period: '6h', short_period: '30m', for_period: '15m', factor: 6, severity: 'critical' }, + { long_period: '1d', short_period: '2h', for_period: '1h', factor: 3, severity: 'warning' }, + { long_period: '3d', short_period: '6h', for_period: '3h', factor: 1, severity: 'warning' }, +]; + +{ + _config+:: { + cortex_p99_latency_threshold_seconds: 2.5, + }, + + prometheus_alerts+:: { + groups+: [ + { + name: 'cortex_alerts', + rules: [ + { + alert: 'CortexIngesterUnhealthy', + 'for': '15m', + expr: ||| + min(cortex_ring_members{state="Unhealthy", job=~"[a-z]+/distributor"}) by (namespace, job) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.job }} reports more than one unhealthy ingester.', + }, + }, + { + alert: 'CortexFlushStuck', + expr: ||| + (cortex_ingester_memory_chunks / cortex_ingester_memory_series) > 1.3 + |||, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.job }}/{{ $labels.instance }} is stuck flushing chunks.', + }, + }, + { + alert: 'CortexRequestErrors', + expr: ||| + 100 * sum(rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) + / + sum(rate(cortex_request_duration_seconds_count[1m])) by (namespace, job, route) + > 1 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. 
+ |||, + }, + }, + { + alert: 'CortexRequestLatency', + expr: ||| + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} + > + %(cortex_p99_latency_threshold_seconds)s + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + |||, + }, + }, + { + // We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail + // and we will never trigger the alert. + // We also have a 3h grace-period for creation of tables which means the we can fail for 3h before it's an outage. + alert: 'CortexTableSyncFailure', + expr: ||| + 100 * rate(cortex_dynamo_sync_tables_seconds_count{status_code!~"2.."}[15m]) + / + rate(cortex_dynamo_sync_tables_seconds_count[15m]) + > 10 + |||, + 'for': '30m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables. + |||, + }, + }, + { + alert: 'CortexQueriesIncorrect', + expr: ||| + 100 * sum by (job, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) + / + sum by (job, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} is reporting incorrect results for {{ printf "%.2f" $value }}% of queries. + |||, + }, + }, + { + alert: 'CortexBadOverrides', + expr: ||| + cortex_overrides_last_reload_successful{job!~".+/table-manager"} == 0 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} failed to reload overrides. + |||, + }, + }, + { + alert: 'CortexQuerierCapacityFull', + expr: ||| + prometheus_engine_queries_concurrent_max{job=~".+/querier"} - prometheus_engine_queries{job=~".+/querier"} == 0 + |||, + 'for': '5m', // We don't want to block for longer. + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} is at capacity processing queries. + |||, + }, + }, + { + alert: 'CortexFrontendQueriesStuck', + expr: ||| + sum by (namespace) (cortex_query_frontend_queue_length{job=~".+/query-frontend"}) > 1 + |||, + 'for': '5m', // We don't want to block for longer. + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} has {{ $value }} queued up queries. + |||, + }, + }, + { + alert: 'CortexCacheRequestErrors', + expr: ||| + 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, method) + / + sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (namespace, job, method) + > 1 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. 
+ |||, + }, + }, + { + alert: 'CortexIngesterRestarts', + expr: ||| + rate(kube_pod_container_status_restarts_total{container="ingester"}[30m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.namespace }}/{{ $labels.pod }} is restarting', + }, + }, + { + alert: 'CortexTransferFailed', + expr: ||| + max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m]) + |||, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.namespace }}/{{ $labels.pod }} transfer failed.', + }, + }, + { + alert: 'CortexOldChunkInMemory', + // We flush chunks after 6h and then keep them in memory for extra 15m. If chunks are older + // than 7h (= 25200 seconds), raise an alert. Ignore cortex_oldest_unflushed_chunk_timestamp_seconds + // that are zero (eg. distributors). + expr: ||| + (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 25200) and cortex_oldest_unflushed_chunk_timestamp_seconds > 0 + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.pod }} has very old unflushed chunk in memory. + |||, + }, + }, + ], + }, + { + name: 'cortex_slo_alerts', + rules: [ + { + alert: 'CortexWriteErrorBudgetBurn', + expr: ||| + ( + ( + 100 * namespace_job:cortex_gateway_write_slo_errors_per_request:ratio_rate%(long_period)s + > 0.1 * %(factor)f + ) + and + ( + 100 * namespace_job:cortex_gateway_write_slo_errors_per_request:ratio_rate%(short_period)s + > 0.1 * %(factor)f + ) + ) + ||| % window, + 'for': window.for_period, + labels: { + severity: window.severity, + period: window.long_period, // The annotation alone doesn't make this alert unique. + }, + annotations: { + summary: 'Cortex burns its write error budget too fast.', + description: "{{ $value | printf `%%.2f` }}%% of {{ $labels.job }}'s write requests in the last %(long_period)s are failing or too slow to meet the SLO." % window, + }, + } + for window in windows + ] + [ + { + alert: 'CortexReadErrorBudgetBurn', + expr: ||| + ( + ( + 100 * namespace_job:cortex_gateway_read_slo_errors_per_request:ratio_rate%(long_period)s + > 0.5 * %(factor)f + ) + and + ( + 100 * namespace_job:cortex_gateway_read_slo_errors_per_request:ratio_rate%(short_period)s + > 0.5 * %(factor)f + ) + ) + ||| % window, + 'for': window.for_period, + labels: { + severity: window.severity, + period: window.long_period, // The annotation alone doesn't make this alert unique. + }, + annotations: { + summary: 'Cortex burns its read error budget too fast.', + description: "{{ $value | printf `%%.2f` }}%% of {{ $labels.job }}'s read requests in the last %(long_period)s are failing or too slow to meet the SLO." % window, + }, + } + for window in windows + ] + [ + { + alert: 'LegacyCortexWriteErrorBudgetBurn', + expr: ||| + ( + ( + 100 * namespace_job:cortex_gw_write_slo_errors_per_request:ratio_rate%(long_period)s + > 0.1 * %(factor)f + ) + and + ( + 100 * namespace_job:cortex_gw_write_slo_errors_per_request:ratio_rate%(short_period)s + > 0.1 * %(factor)f + ) + ) + ||| % window, + 'for': window.for_period, + labels: { + severity: window.severity, + period: window.long_period, // The annotation alone doesn't make this alert unique. + }, + annotations: { + summary: 'Cortex burns its write error budget too fast.', + description: "{{ $value | printf `%%.2f` }}%% of {{ $labels.job }}'s write requests in the last %(long_period)s are failing or too slow to meet the SLO." 
% window, + }, + } + for window in windows + ] + [ + { + alert: 'LegacyCortexReadErrorBudgetBurn', + expr: ||| + ( + ( + 100 * namespace_job:cortex_gw_read_slo_errors_per_request:ratio_rate%(long_period)s + > 0.5 * %(factor)f + ) + and + ( + 100 * namespace_job:cortex_gw_read_slo_errors_per_request:ratio_rate%(short_period)s + > 0.5 * %(factor)f + ) + ) + ||| % window, + 'for': window.for_period, + labels: { + severity: window.severity, + period: window.long_period, // The annotation alone doesn't make this alert unique. + }, + annotations: { + summary: 'Cortex burns its read error budget too fast.', + description: "{{ $value | printf `%%.2f` }}%% of {{ $labels.job }}'s read requests in the last %(long_period)s are failing or too slow to meet the SLO." % window, + }, + } + for window in windows + ], + }, + { + name: 'cortex_gw_alerts', + rules: [ + { + alert: 'CortexGWRequestErrors', + expr: ||| + 100 * sum(rate(cortex_gw_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) + / + sum(rate(cortex_gw_request_duration_seconds_count[1m])) by (namespace, job, route) + > 0.1 + |||, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + |||, + }, + }, + { + alert: 'CortexGWRequestLatency', + expr: ||| + namespace_job_route:cortex_gw_request_duration_seconds:99quantile{route!="metrics"} + > + %(cortex_p99_latency_threshold_seconds)s + ||| % $._config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + |||, + }, + }, + ], + }, + + { + name: 'cortex-provisioning', + rules: [ + { + alert: 'CortexProvisioningMemcachedTooSmall', + // 4 x in-memory series size = 24hrs of data. + expr: ||| + ( + 4 * + sum by(cluster, namespace) (cortex_ingester_memory_series{job=~".+/ingester"} * cortex_ingester_chunk_size_bytes_sum{job=~".+/ingester"} / cortex_ingester_chunk_size_bytes_count{job=~".+/ingester"}) + / 1e9 + ) + > + ( + sum by (cluster, namespace) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 + ) + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Chunk memcached cluster for namespace {{ $labels.namespace }} are too small, should be at least {{ printf "%.2f" $value }}GB. + |||, + }, + }, + { + alert: 'CortexProvisioningTooManyActiveSeries', + // 1 million active series per ingester max. + expr: ||| + avg by (cluster, namespace) (cortex_ingester_memory_series{job=~".+/ingester"}) > 1.1e6 + and + sum by (cluster, namespace) (rate(cortex_ingester_received_chunks{job=~".+/ingester"}[1h])) == 0 + |||, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Too many active series for ingesters in namespace {{ $labels.namespace }}, add more ingesters. + |||, + }, + }, + { + alert: 'CortexProvisioningTooManyWrites', + // 80k writes / s per ingester max. + expr: ||| + avg by (cluster,namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Too much write QPS for ingesters in namespace {{ $labels.namespace }}, add more ingesters. 
+ |||, + }, + }, + { + alert: 'CortexProvisioningTooMuchMemory', + expr: ||| + avg by (cluster, namespace) (container_memory_working_set_bytes{container_name="ingester"} / container_spec_memory_limit_bytes{container_name="ingester"}) > 0.7 + |||, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Too much memory being used by ingesters in namespace {{ $labels.namespace }}, add more ingesters. + |||, + }, + }, + ], + }, + { + name: 'memcached', + rules: [ + { + alert: 'MemcachedDown', + expr: ||| + memcached_up == 0 + |||, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Memcached Instance {{ $labels.instance }} is down for more than 15mins. + |||, + }, + }, + ], + }, + { + name: 'ruler_alerts', + rules: [ + { + alert: 'CortexRulerFailedEvaluations', + expr: ||| + sum(rate(cortex_prometheus_rule_evaluation_failures_total[1m])) by (namespace, job) + / + sum(rate(cortex_prometheus_rule_evaluation_total[1m])) by (namespace, job) + > 0.01 + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors. + |||, + }, + }, + { + alert: 'CortexRulerMissedEvaluations', + expr: ||| + sum(rate(cortex_prometheus_rule_group_missed_iterations_total[1m])) by (namespace, job) + / + sum(rate(cortex_prometheus_rule_group_iterations_total[1m])) by (namespace, job) + > 0.01 + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% missed iterations. + |||, + }, + }, + ], + }, + ], + }, +} diff --git a/jsonnet/mimir-mixin/dashboards.jsonnet b/jsonnet/mimir-mixin/dashboards.jsonnet new file mode 100644 index 00000000000..fb102817cd9 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards.jsonnet @@ -0,0 +1,6 @@ +local dashboards = (import 'mixin.libsonnet').dashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet new file mode 100644 index 00000000000..4400365532e --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -0,0 +1,649 @@ +local utils = (import 'mixin-utils/utils.libsonnet'); + +local g = (import 'grafana-builder/grafana.libsonnet') + { + qpsPanel(selector):: + super.qpsPanel(selector) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + }, + + latencyPanel(metricName, selector, multiplier='1e3'):: + super.latencyPanel(metricName, selector, multiplier) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + }, +}; + +{ + _config+:: { + storage_backend: error 'must specify storage backend (cassandra, gcp)', + gcs_enabled: false, + }, + + dashboards+: { + 'cortex-writes.json': + if $._config.gcs_enabled then + $.cortex_writes_dashboard.addRow( + g.row('GCS') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_gcs_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="POST"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'POST')]) + ) + ) + else $.cortex_writes_dashboard, + + 'cortex-reads.json': + if $._config.gcs_enabled then + $.cortex_reads_dashboard.addRow( + g.row('GCS') + .addPanel( + g.panel('QPS') 
+ + g.qpsPanel('cortex_gcs_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="GET"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'GET')]) + ) + ) + else $.cortex_reads_dashboard, + + 'cortex-chunks.json': + g.dashboard('Cortex / Chunks') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + g.row('Active Series / Chunks') + .addPanel( + g.panel('Series') + + g.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), + ) + .addPanel( + g.panel('Chunks per series') + + g.queryPanel('sum(cortex_ingester_memory_chunks{cluster=~"$cluster", job=~"($namespace)/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'chunks'), + ) + ) + .addRow( + g.row('Flush Stats') + .addPanel( + g.panel('Utilization') + + g.latencyPanel('cortex_ingester_chunk_utilization', '{cluster=~"$cluster", job=~"($namespace)/ingester"}', multiplier='1') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Age') + + g.latencyPanel('cortex_ingester_chunk_age_seconds', '{cluster=~"$cluster", job=~"($namespace)/ingester"}'), + ), + ) + .addRow( + g.row('Flush Stats') + .addPanel( + g.panel('Size') + + g.latencyPanel('cortex_ingester_chunk_length', '{cluster=~"$cluster", job=~"($namespace)/ingester"}', multiplier='1') + + { yaxes: g.yaxes('short') }, + ) + .addPanel( + g.panel('Entries') + + g.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{cluster=~"$cluster", job=~"($namespace)/ingester"}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{cluster=~"$cluster", job=~"($namespace)/ingester"}[5m]))', 'entries'), + ), + ) + .addRow( + g.row('Flush Stats') + .addPanel( + g.panel('Queue Length') + + g.queryPanel('cortex_ingester_flush_queue_length{cluster=~"$cluster", job=~"($namespace)/ingester"}', '{{instance}}'), + ) + .addPanel( + g.panel('Flush Rate') + + g.qpsPanel('cortex_ingester_chunk_age_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}'), + ), + ), + + 'cortex-queries.json': + g.dashboard('Cortex / Queries') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + g.row('Query Frontend') + .addPanel( + g.panel('Queue Duration') + + g.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/query-frontend"}'), + ) + .addPanel( + g.panel('Retries') + + g.latencyPanel('cortex_query_frontend_retries', '{cluster=~"$cluster", job=~"($namespace)/query-frontend"}', multiplier=1) + + { yaxes: g.yaxes('short') }, + ) + .addPanel( + g.panel('Queue Length') + + g.queryPanel('cortex_query_frontend_queue_length{cluster=~"$cluster", job=~"($namespace)/query-frontend"}', '{{cluster}} / {{namespace}} / {{instance}}'), + ) + ) + .addRow( + g.row('Query Frontend - Results Cache') + .addPanel( + g.panel('Cache Hit %') + + g.queryPanel('sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) / sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 
'Hit Rate') + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Cache misses') + + g.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 'Miss Rate'), + ) + ) + .addRow( + g.row('Querier') + .addPanel( + g.panel('Stages') + + g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster",job=~"($namespace)/querier"}) * 1e3', '{{slice}}') + + { yaxes: g.yaxes('ms') } + + g.stack, + ) + .addPanel( + g.panel('Chunk cache misses') + + g.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/querier",name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/querier",name="chunksmemcache"}[1m]))', 'Hit rate'), + ) + .addPanel( + g.panel('Chunk cache corruptions') + + g.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'Corrupt chunks'), + ) + ) + .addRow( + g.row('Querier - Index Cache') + .addPanel( + g.panel('Total entries') + + g.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"})', 'Entries'), + ) + .addPanel( + g.panel('Cache Hit %') + + g.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'hit rate') + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Churn Rate') + + g.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'churn rate'), + ) + ) + .addRow( + g.row('Ingester') + .addPanel( + g.panel('Series per Query') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + { yaxes: g.yaxes('short') }, + ) + .addPanel( + g.panel('Chunks per Query') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + { yaxes: g.yaxes('short') }, + ) + .addPanel( + g.panel('Samples per Query') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + { yaxes: g.yaxes('short') }, + ) + ) + .addRow( + g.row('Chunk Store') + .addPanel( + g.panel('Index Lookups per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + { yaxes: g.yaxes('short') }, + ) + .addPanel( + g.panel('Series (pre-intersection) per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', [utils.selector.re('cluster', '$cluster'), 
utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + { yaxes: g.yaxes('short') }, + ) + .addPanel( + g.panel('Series (post-intersection) per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + { yaxes: g.yaxes('short') }, + ) + .addPanel( + g.panel('Chunks per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + { yaxes: g.yaxes('short') }, + ) + ), + + 'frontend.json': + g.dashboard('Frontend') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + g.row('Cortex Reqs (cortex_gw)') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw')]) + ) + ), + + 'ruler.json': + g.dashboard('Cortex / Ruler') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + g.row('Rule Evaluations') + .addPanel( + g.panel('EPS') + + g.queryPanel('sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'rules processed'), + ) + .addPanel( + g.panel('Latency') + + g.queryPanel( + ||| + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + / + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + |||, 'average' + ), + ) + ) + .addRow( + g.row('Group Evaluations') + .addPanel( + g.panel('Missed Iterations') + + g.queryPanel('sum(rate(prometheus_rule_group_iterations_missed_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'iterations missed'), + ) + .addPanel( + g.panel('Latency') + + g.queryPanel( + ||| + sum (rate(cortex_prometheus_rule_group_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + / + sum (rate(cortex_prometheus_rule_group_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + |||, 'average' + ), + ) + ), + + 'cortex-scaling.json': + g.dashboard('Cortex / Scaling') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + g.row('Workload-based scaling') + .addPanel( + g.panel('Workload-based scaling') + { sort: { col: 1, desc: false } } + + g.tablePanel([ + ||| + sum by (cluster, namespace, deployment) ( + kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"} + or + label_replace( + kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}, + "deployment", "$1", "statefulset", "(.*)" + ) + ) + |||, + ||| + quantile_over_time(0.99, sum by (cluster, namespace, deployment) 
(label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:]) + * 3 / 80e3 + |||, + ||| + label_replace( + sum by(cluster, namespace) ( + cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"} + ) / 1e+6, + "deployment", "ingester", "cluster", ".*" + ) + or + label_replace( + sum by (cluster, namespace) ( + 4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} + * + cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} + / + cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} + ) + / + avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}), + "deployment", "memcached", "namespace", ".*" + ) + |||, + ], { + cluster: { alias: 'Cluster' }, + namespace: { alias: 'Namespace' }, + deployment: { alias: 'Deployment' }, + 'Value #A': { alias: 'Current Replicas', decimals: 0 }, + 'Value #B': { alias: 'Required Replicas, by ingestion rate', decimals: 0 }, + 'Value #C': { alias: 'Required Replicas, by active series', decimals: 0 }, + }) + ) + ) + .addRow( + (g.row('Resource-based scaling') + { height: '500px' }) + .addPanel( + g.panel('Resource-based scaling') + { sort: { col: 1, desc: false } } + + g.tablePanel([ + ||| + sum by (cluster, namespace, deployment) ( + kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} + or + label_replace( + kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, + "deployment", "$1", "statefulset", "(.*)" + ) + ) + |||, + ||| + sum by (cluster, namespace, deployment) ( + kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} + or + label_replace( + kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, + "deployment", "$1", "statefulset", "(.*)" + ) + ) + * + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:]) + / + sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) + |||, + ||| + sum by (cluster, namespace, deployment) ( + kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} + or + label_replace( + kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, + "deployment", "$1", "statefulset", "(.*)" + ) + ) + * + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m]) + / + sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) + |||, + ], { + cluster: { alias: 'Cluster' }, + namespace: { alias: 'Namespace' }, + deployment: { alias: 'Deployment' }, + 'Value #A': { alias: 'Current Replicas', decimals: 0 }, + 'Value #B': { alias: 'Required Replicas, by CPU usage', decimals: 0 }, + 'Value #C': { alias: 
'Required Replicas, by Memory usage', decimals: 0 }, + }) + ) + ), + }, + + cortex_writes_dashboard:: + g.dashboard('Cortex / Writes') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + (g.row('Headlines') + + { + height: '100px', + showTitle: false, + }) + .addPanel( + g.panel('Samples / s') + + g.statPanel('sum(cluster_namespace:cortex_distributor_received_samples:rate5m{cluster=~"$cluster", namespace=~"$namespace"})', format='reqps') + ) + .addPanel( + g.panel('Active Series') + + g.statPanel(||| + sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"} + / on(namespace) group_left + max by (namespace) (cortex_distributor_replication_factor{cluster=~"$cluster", job=~"($namespace)/distributor"})) + |||, format='short') + ) + .addPanel( + g.panel('QPS') + + g.statPanel('sum(rate(cortex_gw_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="cortex-write"}[5m]))', format='reqps') + ) + ) + .addRow( + g.row('Legacy Gateway') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="cortex-write"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'cortex-write')]) + ) + ) + .addRow( + g.row('Gateway') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'api_prom_push')]) + ) + ) + .addRow( + g.row('Distributor') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + ) + ) + .addRow( + g.row('Etcd (HA Dedupe)') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + ) + ) + .addRow( + g.row('Ingester') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route="/cortex.Ingester/Push"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + ) + ) + .addRow( + g.row('Consul (Ring)') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}') + ) + .addPanel( + g.panel('Latency') + + 
utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')]) + ) + ) + .addRow( + g.row('Memcached') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_memcache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",method="Memcache.Put"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('method', 'Memcache.Put')]) + ) + ) + .addRow({ + cassandra: + g.row('Cassandra') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="INSERT"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'INSERT')]) + ), + + gcp: + g.row('BigTable') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="/google.bigtable.v2.Bigtable/MutateRows"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) + ), + + dynamodb: + g.row('DynamoDB') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="DynamoDB.BatchWriteItem"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) + ), + }[$._config.storage_backend]), + + cortex_reads_dashboard:: + g.dashboard('Cortex / Reads') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + g.row('Legacy Gateway') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="cortex-read"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'cortex-read')]) + ) + ) + .addRow( + g.row('Gateway') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route=~"(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.re('route', '(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)')]) + ) + ) + 
.addRow( + g.row('Query Frontend') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend'), utils.selector.neq('route', '/frontend.Frontend/Process')]) + ) + ) + .addRow( + g.row('Cache - Query Results') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend')]) + ) + ) + .addRow( + g.row('Querier') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')]) + ) + ) + .addRow( + g.row('Ingester') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route!~"/cortex.Ingester/Push|metrics|ready|traces"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.nre('route', '/cortex.Ingester/Push|metrics|ready')]) + ) + ) + .addRow( + g.row('Memcached - Index') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="store.index-cache-read.memcache.fetch"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) + ) + ) + .addRow( + g.row('Memcached - Chunks') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="chunksmemcache.fetch"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'chunksmemcache.fetch')]) + ) + ) + .addRow({ + cassandra: + g.row('Cassandra') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="SELECT"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'SELECT')]) + ), + + gcp: + g.row('BigTable') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="/google.bigtable.v2.Bigtable/ReadRows"}') + ) + .addPanel( + g.panel('Latency') + + 
utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) + ), + + dynamodb: + g.row('DynamoDB') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="DynamoDB.QueryPages"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'DynamoDB.QueryPages')]) + ), + }[$._config.storage_backend]), +} diff --git a/jsonnet/mimir-mixin/jsonnetfile.json b/jsonnet/mimir-mixin/jsonnetfile.json new file mode 100644 index 00000000000..87e724d5ed1 --- /dev/null +++ b/jsonnet/mimir-mixin/jsonnetfile.json @@ -0,0 +1,24 @@ +{ + "dependencies": [ + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "grafana-builder" + } + }, + "version": "master" + }, + { + "name": "mixin-utils", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "mixin-utils" + } + }, + "version": "master" + } + ] +} diff --git a/jsonnet/mimir-mixin/jsonnetfile.lock.json b/jsonnet/mimir-mixin/jsonnetfile.lock.json new file mode 100644 index 00000000000..fe50a404cb0 --- /dev/null +++ b/jsonnet/mimir-mixin/jsonnetfile.lock.json @@ -0,0 +1,26 @@ +{ + "dependencies": [ + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "grafana-builder" + } + }, + "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", + "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" + }, + { + "name": "mixin-utils", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "mixin-utils" + } + }, + "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", + "sum": "J1iExBloZLjVEvdzHVjvP9AVTqDOJSfFOtBoeQ7EhKk=" + } + ] +} diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/jsonnet/mimir-mixin/mixin.libsonnet new file mode 100644 index 00000000000..b2b2f10dd23 --- /dev/null +++ b/jsonnet/mimir-mixin/mixin.libsonnet @@ -0,0 +1,3 @@ +(import 'dashboards.libsonnet') + +(import 'alerts.libsonnet') + +(import 'recording_rules.libsonnet') diff --git a/jsonnet/mimir-mixin/recording_rules.jsonnet b/jsonnet/mimir-mixin/recording_rules.jsonnet new file mode 100644 index 00000000000..4cda6c6f256 --- /dev/null +++ b/jsonnet/mimir-mixin/recording_rules.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheus_rules) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet new file mode 100644 index 00000000000..7291ac5c3f7 --- /dev/null +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -0,0 +1,114 @@ +local utils = import 'mixin-utils/utils.libsonnet'; +local windows = [ + { period: '5m' }, + { period: '30m' }, + { period: '1h' }, + { period: '2h' }, + { period: '6h' }, + { period: '1d' }, + { period: '3d' }, +]; + +{ + prometheus_rules+:: { + groups+: [{ + name: 'cortex_rules', + rules: + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job', 'route']) + + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'namespace', 
'job', 'route']) + + utils.histogramRules('cortex_memcache_request_duration_seconds', ['cluster', 'job', 'method']) + + utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job', 'method']) + + utils.histogramRules('cortex_bigtable_request_duration_seconds', ['cluster', 'job', 'operation']) + + utils.histogramRules('cortex_cassandra_request_duration_seconds', ['cluster', 'job', 'operation']) + + utils.histogramRules('cortex_dynamo_request_duration_seconds', ['cluster', 'job', 'operation']) + + utils.histogramRules('cortex_query_frontend_retries', ['cluster', 'job']) + + utils.histogramRules('cortex_query_frontend_queue_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_series', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_chunks', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_samples', ['cluster', 'job']) + + utils.histogramRules('cortex_chunk_store_index_lookups_per_query', ['cluster', 'job']) + + utils.histogramRules('cortex_chunk_store_series_pre_intersection_per_query', ['cluster', 'job']) + + utils.histogramRules('cortex_chunk_store_series_post_intersection_per_query', ['cluster', 'job']) + + utils.histogramRules('cortex_chunk_store_chunks_per_query', ['cluster', 'job']) + + utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) + + utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + + utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), + }, { + name: 'frontend_rules', + rules: + utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'job', 'route']) + + utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']) + + utils.histogramRules('cortex_gw_request_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_gw_request_duration_seconds', ['cluster', 'job', 'route']) + + utils.histogramRules('cortex_gw_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), + }, { + name: 'cortex_slo_rules', + rules: [ + { + record: 'namespace_job:cortex_gateway_write_slo_errors_per_request:ratio_rate%(period)s' % window, + expr: ||| + 1 - + ( + sum by (namespace, job) (rate(cortex_request_duration_seconds_bucket{status_code!~"5..", le="1", route="api_prom_push", job=~".*/cortex-gw"}[%(period)s])) + / + sum by (namespace, job) (rate(cortex_request_duration_seconds_count{route="api_prom_push", job=~".*/cortex-gw"}[%(period)s])) + ) + ||| % window, + } + for window in windows + ] + [ + { + record: 'namespace_job:cortex_gateway_read_slo_errors_per_request:ratio_rate%(period)s' % window, + expr: ||| + 1 - + ( + sum by (namespace, job) (rate(cortex_request_duration_seconds_bucket{status_code!~"5..",le="2.5",route=~"api_prom_api_v1_query.*", job=~".*/cortex-gw"}[%(period)s])) + / + sum by (namespace, job) (rate(cortex_request_duration_seconds_count{route=~"api_prom_api_v1_query.*", job=~".*/cortex-gw"}[%(period)s])) + ) + ||| % window, + } + for window in windows + ] + [ + { + record: 'namespace_job:cortex_gw_write_slo_errors_per_request:ratio_rate%(period)s' % window, + expr: ||| + 1 - + ( + sum by (namespace, job) (rate(cortex_gw_request_duration_seconds_bucket{status_code!~"error|5..",le="1",route="cortex-write"}[%(period)s])) + / + sum by 
(namespace, job) (rate(cortex_gw_request_duration_seconds_count{route="cortex-write"}[%(period)s])) + ) + ||| % window, + } + for window in windows + ] + [ + { + record: 'namespace_job:cortex_gw_read_slo_errors_per_request:ratio_rate%(period)s' % window, + expr: ||| + 1 - + ( + sum by (namespace, job) (rate(cortex_gw_request_duration_seconds_bucket{status_code!~"error|5..",le="2.5",route="cortex-read"}[%(period)s])) + / + sum by (namespace, job) (rate(cortex_gw_request_duration_seconds_count{route="cortex-read"}[%(period)s])) + ) + ||| % window, + } + for window in windows + ], + }, { + name: 'cortex_received_samples', + rules: [ + { + record: 'cluster_namespace:cortex_distributor_received_samples:rate5m', + expr: ||| + sum by (cluster, namespace) (rate(cortex_distributor_received_samples_total{job=~".*/distributor"}[5m])) + |||, + }, + ], + }], + }, +} From 06169e7df7612c86575bbde52ffd3091f46f084c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Wed, 4 Mar 2020 12:08:03 +0100 Subject: [PATCH 002/364] Copy internal changes to public mixin. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Peter Štibraný --- jsonnet/mimir-mixin/alerts.libsonnet | 133 +--- jsonnet/mimir-mixin/dashboards.libsonnet | 695 ++++++++++++------ jsonnet/mimir-mixin/recording_rules.libsonnet | 31 +- 3 files changed, 494 insertions(+), 365 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index a5a547b38e7..f106cc3889f 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -118,7 +118,7 @@ local windows = [ { alert: 'CortexBadOverrides', expr: ||| - cortex_overrides_last_reload_successful{job!~".+/table-manager"} == 0 + cortex_overrides_last_reload_successful{job!~".+/table-manager|.+/alertmanager"} == 0 |||, 'for': '15m', labels: { @@ -200,16 +200,16 @@ local windows = [ severity: 'critical', }, annotations: { - message: '{{ $labels.namespace }}/{{ $labels.pod }} transfer failed.', + message: '{{ $labels.namespace }}/{{ $labels.instance }} transfer failed.', }, }, { alert: 'CortexOldChunkInMemory', - // We flush chunks after 6h and then keep them in memory for extra 15m. If chunks are older - // than 7h (= 25200 seconds), raise an alert. Ignore cortex_oldest_unflushed_chunk_timestamp_seconds - // that are zero (eg. distributors). + // Even though we should flush chunks after 6h, we see that 99p of age of flushed chunks is closer + // to 10 hours. + // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). expr: ||| - (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 25200) and cortex_oldest_unflushed_chunk_timestamp_seconds > 0 + (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) and cortex_oldest_unflushed_chunk_timestamp_seconds > 0 |||, 'for': '5m', labels: { @@ -217,7 +217,7 @@ local windows = [ }, annotations: { message: ||| - {{ $labels.namespace }}/{{ $labels.pod }} has very old unflushed chunk in memory. + {{ $labels.namespace }}/{{ $labels.instance }} has very old unflushed chunk in memory. 
|||, }, }, @@ -279,103 +279,8 @@ local windows = [ }, } for window in windows - ] + [ - { - alert: 'LegacyCortexWriteErrorBudgetBurn', - expr: ||| - ( - ( - 100 * namespace_job:cortex_gw_write_slo_errors_per_request:ratio_rate%(long_period)s - > 0.1 * %(factor)f - ) - and - ( - 100 * namespace_job:cortex_gw_write_slo_errors_per_request:ratio_rate%(short_period)s - > 0.1 * %(factor)f - ) - ) - ||| % window, - 'for': window.for_period, - labels: { - severity: window.severity, - period: window.long_period, // The annotation alone doesn't make this alert unique. - }, - annotations: { - summary: 'Cortex burns its write error budget too fast.', - description: "{{ $value | printf `%%.2f` }}%% of {{ $labels.job }}'s write requests in the last %(long_period)s are failing or too slow to meet the SLO." % window, - }, - } - for window in windows - ] + [ - { - alert: 'LegacyCortexReadErrorBudgetBurn', - expr: ||| - ( - ( - 100 * namespace_job:cortex_gw_read_slo_errors_per_request:ratio_rate%(long_period)s - > 0.5 * %(factor)f - ) - and - ( - 100 * namespace_job:cortex_gw_read_slo_errors_per_request:ratio_rate%(short_period)s - > 0.5 * %(factor)f - ) - ) - ||| % window, - 'for': window.for_period, - labels: { - severity: window.severity, - period: window.long_period, // The annotation alone doesn't make this alert unique. - }, - annotations: { - summary: 'Cortex burns its read error budget too fast.', - description: "{{ $value | printf `%%.2f` }}%% of {{ $labels.job }}'s read requests in the last %(long_period)s are failing or too slow to meet the SLO." % window, - }, - } - for window in windows ], }, - { - name: 'cortex_gw_alerts', - rules: [ - { - alert: 'CortexGWRequestErrors', - expr: ||| - 100 * sum(rate(cortex_gw_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) - / - sum(rate(cortex_gw_request_duration_seconds_count[1m])) by (namespace, job, route) - > 0.1 - |||, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, - }, - }, - { - alert: 'CortexGWRequestLatency', - expr: ||| - namespace_job_route:cortex_gw_request_duration_seconds:99quantile{route!="metrics"} - > - %(cortex_p99_latency_threshold_seconds)s - ||| % $._config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. 
- |||, - }, - }, - ], - }, - { name: 'cortex-provisioning', rules: [ @@ -482,7 +387,7 @@ local windows = [ expr: ||| sum(rate(cortex_prometheus_rule_evaluation_failures_total[1m])) by (namespace, job) / - sum(rate(cortex_prometheus_rule_evaluation_total[1m])) by (namespace, job) + sum(rate(cortex_prometheus_rule_evaluations_total[1m])) by (namespace, job) > 0.01 |||, 'for': '5m', @@ -498,7 +403,7 @@ local windows = [ { alert: 'CortexRulerMissedEvaluations', expr: ||| - sum(rate(cortex_prometheus_rule_group_missed_iterations_total[1m])) by (namespace, job) + sum(rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) by (namespace, job) / sum(rate(cortex_prometheus_rule_group_iterations_total[1m])) by (namespace, job) > 0.01 @@ -515,6 +420,26 @@ local windows = [ }, ], }, + { + name: 'gossip_alerts', + rules: [ + { + alert: 'CortexGossipMembersMismatch', + expr: ||| + memberlist_client_cluster_members_count + != on (cluster,namespace) group_left + sum(up{job=~".+/(distributor|ingester|querier)"}) by (cluster,namespace) + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.', + }, + }, + ], + }, ], }, } diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 4400365532e..9aced072b0a 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -20,18 +20,75 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { for target in super.targets ], }, + + successFailurePanel(title, successMetric, failureMetric):: + g.panel(title) + + g.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + + g.stack + { + aliasColors: { + successful: '#7EB26D', + failed: '#E24D42', + }, + }, + + objectStorePanels1(title, metricPrefix):: + local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix]; + local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix]; + local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; + local interval = '$__interval'; + super.row(title) + .addPanel( + // We use 'up{cluster=~"$cluster", job="($namespace)/.+"}' to add 0 if there are no failed operations. + self.successFailurePanel( + 'Operations/sec', + 'sum(rate(%s{cluster=~"$cluster"}[%s])) - sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotal, interval, opsTotalFailures, interval], + 'sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotalFailures, interval] + ) + ) + .addPanel( + g.panel('Op: ObjectSize') + + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="objectsize"}'), + ) + // oper="iter" is also available, but not used. 
+ .addPanel( + g.panel('Op: Exists') + + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="exists"}'), + ), + + // Second row of Object Store stats + objectStorePanels2(title, metricPrefix):: + local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; + super.row(title) + .addPanel( + g.panel('Op: Get') + + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get"}'), + ) + .addPanel( + g.panel('Op: GetRange') + + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get_range"}'), + ) + .addPanel( + g.panel('Op: Upload') + + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="upload"}'), + ) + .addPanel( + g.panel('Op: Delete') + + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="delete"}'), + ), }; { _config+:: { storage_backend: error 'must specify storage backend (cassandra, gcp)', + // may contain 'chunks', 'tsdb' or both. Enables chunks- or tsdb- specific panels and dashboards. + storage_engine: ['chunks'], gcs_enabled: false, }, dashboards+: { 'cortex-writes.json': - if $._config.gcs_enabled then - $.cortex_writes_dashboard.addRow( + local addGcsRow(dashboard) = if $._config.gcs_enabled then + dashboard.addRow( g.row('GCS') .addPanel( g.panel('QPS') + @@ -41,12 +98,29 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.panel('Latency') + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'POST')]) ) + ) else dashboard; + + local addBlocksRows(dashboard) = if std.setMember('tsdb', $._config.storage_engine) then + // Used by ingester when using TSDB storage engine. + dashboard.addRow( + g.row('Blocks Shipper') + .addPanel( + g.successFailurePanel( + 'Uploaded blocks / sec', + 'sum(rate(cortex_ingester_shipper_uploads_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~"$cluster"}[$__interval]))', + 'sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~"$cluster"}[$__interval]))' + ) + ) ) - else $.cortex_writes_dashboard, + .addRow(g.objectStorePanels1('Blocks Object Store Stats (Ingester)', 'cortex_ingester')) + .addRow(g.objectStorePanels2('', 'cortex_ingester')) + else dashboard; + + addBlocksRows(addGcsRow($.cortex_writes_dashboard)), 'cortex-reads.json': - if $._config.gcs_enabled then - $.cortex_reads_dashboard.addRow( + local addGcsRows(dashboard) = if $._config.gcs_enabled then + dashboard.addRow( g.row('GCS') .addPanel( g.panel('QPS') + @@ -57,9 +131,41 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'GET')]) ) ) - else $.cortex_reads_dashboard, + else dashboard; + + local addBlocksRows(dashboard) = if std.setMember('tsdb', $._config.storage_engine) then + dashboard.addRow( + g.row('Querier - Blocks Storage') + .addPanel( + g.successFailurePanel( + 'Block Loads / sec', + 'sum(rate(cortex_querier_bucket_store_block_loads_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_load_failures_total{cluster=~"$cluster"}[$__interval]))', + 'sum(rate(cortex_querier_bucket_store_block_load_failures_total{cluster=~"$cluster"}[$__interval]))' + ) + ) + .addPanel( + g.successFailurePanel( + 'Block Drops / sec', 
+ 'sum(rate(cortex_querier_bucket_store_block_drops_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_drop_failures_total{cluster=~"$cluster"}[$__interval]))', + 'sum(rate(cortex_querier_bucket_store_block_drop_failures_total{cluster=~"$cluster"}[$__interval]))' + ) + ) + .addPanel( + g.panel('Per-block prepares and preloads duration') + + g.latencyPanel('cortex_querier_bucket_store_series_get_all_duration_seconds', '{cluster=~"$cluster"}'), + ) + .addPanel( + g.panel('Series merge duration') + + g.latencyPanel('cortex_querier_bucket_store_series_merge_duration_seconds', '{cluster=~"$cluster"}'), + ) + ) + .addRow(g.objectStorePanels1('Blocks Object Store Stats (Querier)', 'cortex_querier')) + .addRow(g.objectStorePanels2('', 'cortex_querier')) + else dashboard; - 'cortex-chunks.json': + addBlocksRows(addGcsRows($.cortex_reads_dashboard)), + + [if std.setMember('chunks', $._config.storage_engine) then 'cortex-chunks.json' else null]: g.dashboard('Cortex / Chunks') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') @@ -217,22 +323,6 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ) ), - 'frontend.json': - g.dashboard('Frontend') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - g.row('Cortex Reqs (cortex_gw)') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw')]) - ) - ), - 'ruler.json': g.dashboard('Cortex / Ruler') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') @@ -258,7 +348,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.row('Group Evaluations') .addPanel( g.panel('Missed Iterations') + - g.queryPanel('sum(rate(prometheus_rule_group_iterations_missed_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'iterations missed'), + g.queryPanel('sum(rate(cortex_prometheus_rule_group_iterations_missed_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'iterations missed'), ) .addPanel( g.panel('Latency') + @@ -379,114 +469,255 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { }) ) ), - }, - cortex_writes_dashboard:: - g.dashboard('Cortex / Writes') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - (g.row('Headlines') + - { - height: '100px', - showTitle: false, - }) - .addPanel( - g.panel('Samples / s') + - g.statPanel('sum(cluster_namespace:cortex_distributor_received_samples:rate5m{cluster=~"$cluster", namespace=~"$namespace"})', format='reqps') - ) - .addPanel( - g.panel('Active Series') + - g.statPanel(||| - sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"} - / on(namespace) group_left - max by (namespace) (cortex_distributor_replication_factor{cluster=~"$cluster", job=~"($namespace)/distributor"})) - |||, format='short') - ) - .addPanel( - g.panel('QPS') + - 
g.statPanel('sum(rate(cortex_gw_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="cortex-write"}[5m]))', format='reqps') - ) - ) - .addRow( - g.row('Legacy Gateway') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="cortex-write"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'cortex-write')]) + [if std.setMember('tsdb', $._config.storage_engine) then 'cortex-blocks.json' else null]: + g.dashboard('Cortex / Blocks') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + // repeated from Cortex / Chunks + .addRow( + g.row('Active Series / Chunks') + .addPanel( + g.panel('Series') + + g.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), + ) + // Chunks per series doesn't make sense for Blocks storage ) - ) - .addRow( - g.row('Gateway') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}') + .addRow( + g.row('Compactor') + .addPanel( + g.successFailurePanel( + 'Compactor Runs / second', + 'sum(rate(cortex_compactor_runs_completed_total{cluster=~"$cluster"}[$__interval]))', + 'sum(rate(cortex_compactor_runs_failed_total{cluster=~"$cluster"}[$__interval]))' + ) + ) + .addPanel( + g.successFailurePanel( + 'Per-tenant Compaction Runs / seconds', + 'sum(rate(cortex_compactor_group_compactions_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_group_compactions_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + 'sum(rate(cortex_compactor_group_compactions_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + ) + ) ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'api_prom_push')]) + .addRow( + g.row('Compactor – Blocks Garbage Collections') + .addPanel( + g.successFailurePanel( + 'Collections Rate', + 'sum(rate(cortex_compactor_garbage_collection_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_garbage_collection_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + 'sum(rate(cortex_compactor_garbage_collection_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + ) + ) + .addPanel( + g.panel('Collections Duration') + + g.latencyPanel('cortex_compactor_garbage_collection_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + ) + .addPanel( + g.panel('Collected Blocks Rate') + + g.queryPanel('sum(rate(cortex_compactor_garbage_collected_blocks_total{cluster=~"$cluster"}[$__interval]))', 'blocks') + ) ) - ) - .addRow( - g.row('Distributor') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + .addRow( + g.row('Compactor - Meta Syncs') + .addPanel( + g.successFailurePanel( + 
'Meta Syncs / sec', + 'sum(rate(cortex_compactor_sync_meta_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_sync_meta_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + 'sum(rate(cortex_compactor_sync_meta_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + ) + ) + .addPanel( + g.panel('Meta Sync Durations') + + g.latencyPanel('cortex_compactor_sync_meta_duration_seconds', '{cluster=~"$cluster"}'), + ) ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + .addRow( + g.row('Prometheus TSDB Compactions') + .addPanel( + g.panel('Compactions Rate') + + g.queryPanel('sum(rate(prometheus_tsdb_compactions_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', 'rate') + ) + .addPanel( + g.panel('Compaction Duration') + + g.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + ) + .addPanel( + g.panel('Chunk Size Bytes') + + g.latencyPanel('prometheus_tsdb_compaction_chunk_size_bytes', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + + { yaxes: g.yaxes('bytes') } + ) + .addPanel( + g.panel('Chunk Samples') + + g.latencyPanel('prometheus_tsdb_compaction_chunk_samples', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + + { yaxes: g.yaxes('short') } + ) + .addPanel( + g.panel('Chunk Range (seconds)') + + g.latencyPanel('prometheus_tsdb_compaction_chunk_range_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + ) ) - ) - .addRow( - g.row('Etcd (HA Dedupe)') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + .addRow(g.objectStorePanels1('Object Store Stats', 'cortex_compactor')) + .addRow(g.objectStorePanels2('', 'cortex_compactor')), + + [if std.setMember('tsdb', $._config.storage_engine) && std.setMember('chunks', $._config.storage_engine) then 'cortex-blocks-vs-chunks.json' else null]: + g.dashboard('Cortex / Blocks vs Chunks') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addTemplate('blocks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addTemplate('chunks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + g.row('Ingesters') + .addPanel( + g.panel('Samples / sec') + + g.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[1m]))', 'blocks') + + g.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[1m]))', 'chunks') + ) ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + .addRow( + g.row('') + .addPanel( + g.panel('Blocks Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($blocks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + ) + .addPanel( + g.panel('Chunks Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', 
'($chunks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + ) ) - ) - .addRow( - g.row('Ingester') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route="/cortex.Ingester/Push"}') + .addRow( + g.row('') + .addPanel( + g.panel('CPU per sample') + + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[1m])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[1m]))', 'blocks') + + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[1m])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[1m]))', 'chunks') + ) + .addPanel( + g.panel('Memory per active series') + + g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + + g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + + g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + + g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + + { yaxes: g.yaxes('bytes') } + ) ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + .addRow( + g.row('') + .addPanel( + g.panel('CPU') + + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[1m]))', 'blocks') + + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[1m]))', 'chunks') + ) + .addPanel( + g.panel('Memory') + + g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"})', 'blocks - working set') + + g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"})', 'chunks - working set') + + g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + + g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + + { yaxes: g.yaxes('bytes') } + ) + ), + }, + + cortex_writes_dashboard:: + local out = + g.dashboard('Cortex / Writes') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + (g.row('Headlines') + + { + height: '100px', + 
showTitle: false, + }) + .addPanel( + g.panel('Samples / s') + + g.statPanel('sum(cluster_namespace:cortex_distributor_received_samples:rate5m{cluster=~"$cluster", namespace=~"$namespace"})', format='reqps') + ) + .addPanel( + g.panel('Active Series') + + g.statPanel(||| + sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"} + / on(namespace) group_left + max by (namespace) (cortex_distributor_replication_factor{cluster=~"$cluster", job=~"($namespace)/distributor"})) + |||, format='short') + ) + .addPanel( + g.panel('QPS') + + g.statPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}[5m]))', format='reqps') + ) ) - ) - .addRow( - g.row('Consul (Ring)') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}') + .addRow( + g.row('Gateway') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'api_prom_push')]) + ) ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')]) + .addRow( + g.row('Distributor') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + ) ) - ) - .addRow( - g.row('Memcached') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_memcache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",method="Memcache.Put"}') + .addRow( + g.row('Etcd (HA Dedupe)') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + ) ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('method', 'Memcache.Put')]) + .addRow( + g.row('Ingester') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route="/cortex.Ingester/Push"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + ) ) - ) - .addRow({ - cassandra: + .addRow( + g.row('Consul (Ring)') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}') + ) + .addPanel( + g.panel('Latency') + + 
utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')]) + ) + ); + + local addChunksRows(dashboard) = + if std.setMember('chunks', $._config.storage_engine) then + dashboard.addRow( + g.row('Memcached') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_memcache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",method="Memcache.Put"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('method', 'Memcache.Put')]) + ) + ) else dashboard; + + local addStorageRows(dashboard) = if $._config.storage_backend == 'cassandra' then + dashboard.addRow( g.row('Cassandra') .addPanel( g.panel('QPS') + @@ -495,9 +726,10 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { .addPanel( g.panel('Latency') + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'INSERT')]) - ), - - gcp: + ) + ) + else if $._config.storage_backend == 'gcp' && std.setMember('chunks', $._config.storage_engine) then // only show BigTable if chunks panels are enabled + dashboard.addRow( g.row('BigTable') .addPanel( g.panel('QPS') + @@ -506,9 +738,10 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { .addPanel( g.panel('Latency') + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) - ), - - dynamodb: + ) + ) + else if $._config.storage_backend == 'dynamodb' then + dashboard.addRow( g.row('DynamoDB') .addPanel( g.panel('QPS') + @@ -517,133 +750,133 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { .addPanel( g.panel('Latency') + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) - ), - }[$._config.storage_backend]), + ) + ) else dashboard; + + addStorageRows(addChunksRows(out)), cortex_reads_dashboard:: - g.dashboard('Cortex / Reads') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - g.row('Legacy Gateway') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="cortex-read"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'cortex-read')]) - ) - ) - .addRow( - g.row('Gateway') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route=~"(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', 
[utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.re('route', '(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)')]) - ) - ) - .addRow( - g.row('Query Frontend') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend'), utils.selector.neq('route', '/frontend.Frontend/Process')]) - ) - ) - .addRow( - g.row('Cache - Query Results') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend')]) - ) - ) - .addRow( - g.row('Querier') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')]) - ) - ) - .addRow( - g.row('Ingester') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route!~"/cortex.Ingester/Push|metrics|ready|traces"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.nre('route', '/cortex.Ingester/Push|metrics|ready')]) - ) - ) - .addRow( - g.row('Memcached - Index') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="store.index-cache-read.memcache.fetch"}') + local out = + g.dashboard('Cortex / Reads') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + g.row('Gateway') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route=~"(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.re('route', '(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)')]) + ) ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) + .addRow( + g.row('Query Frontend') + .addPanel( + g.panel('QPS') + + 
g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend'), utils.selector.neq('route', '/frontend.Frontend/Process')]) + ) ) - ) - .addRow( - g.row('Memcached - Chunks') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="chunksmemcache.fetch"}') + .addRow( + g.row('Cache - Query Results') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend')]) + ) ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'chunksmemcache.fetch')]) + .addRow( + g.row('Querier') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')]) + ) ) - ) - .addRow({ - cassandra: - g.row('Cassandra') + .addRow( + g.row('Ingester') .addPanel( g.panel('QPS') + - g.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="SELECT"}') + g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route!~"/cortex.Ingester/Push|metrics|ready|traces"}') ) .addPanel( g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'SELECT')]) - ), + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.nre('route', '/cortex.Ingester/Push|metrics|ready')]) + ) + ); - gcp: - g.row('BigTable') + local addChunksRows(dashboard) = if std.setMember('chunks', $._config.storage_engine) then + dashboard.addRow( + g.row('Memcached - Index') .addPanel( g.panel('QPS') + - g.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="/google.bigtable.v2.Bigtable/ReadRows"}') + g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="store.index-cache-read.memcache.fetch"}') ) .addPanel( g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) - ), - - dynamodb: - g.row('DynamoDB') + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), 
utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) + ) + ) + .addRow( + g.row('Memcached - Chunks') .addPanel( g.panel('QPS') + - g.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="DynamoDB.QueryPages"}') + g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="chunksmemcache.fetch"}') ) .addPanel( g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'DynamoDB.QueryPages')]) - ), - }[$._config.storage_backend]), + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'chunksmemcache.fetch')]) + ) + ) else dashboard; + + local addStorageRows(dashboard) = + if $._config.storage_backend == 'cassandra' then + dashboard.addRow( + g.row('Cassandra') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="SELECT"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'SELECT')]) + ), + ) + else if $._config.storage_backend == 'gcp' && std.setMember('chunks', $._config.storage_engine) then // only show BigTable if chunks panels are enabled + dashboard.addRow( + g.row('BigTable') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="/google.bigtable.v2.Bigtable/ReadRows"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) + ), + ) + else if $._config.storage_backend == 'dynamodb' then + dashboard.addRow( + g.row('DynamoDB') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="DynamoDB.QueryPages"}') + ) + .addPanel( + g.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'DynamoDB.QueryPages')]) + ), + ) else dashboard; + + addStorageRows(addChunksRows(out)), } diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 7291ac5c3f7..fdb9cf22a80 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -40,10 +40,7 @@ local windows = [ rules: utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'job']) + utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'job', 'route']) + - utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']) + - utils.histogramRules('cortex_gw_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_gw_request_duration_seconds', ['cluster', 'job', 
'route']) + - utils.histogramRules('cortex_gw_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), + utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), }, { name: 'cortex_slo_rules', rules: [ @@ -72,32 +69,6 @@ local windows = [ ||| % window, } for window in windows - ] + [ - { - record: 'namespace_job:cortex_gw_write_slo_errors_per_request:ratio_rate%(period)s' % window, - expr: ||| - 1 - - ( - sum by (namespace, job) (rate(cortex_gw_request_duration_seconds_bucket{status_code!~"error|5..",le="1",route="cortex-write"}[%(period)s])) - / - sum by (namespace, job) (rate(cortex_gw_request_duration_seconds_count{route="cortex-write"}[%(period)s])) - ) - ||| % window, - } - for window in windows - ] + [ - { - record: 'namespace_job:cortex_gw_read_slo_errors_per_request:ratio_rate%(period)s' % window, - expr: ||| - 1 - - ( - sum by (namespace, job) (rate(cortex_gw_request_duration_seconds_bucket{status_code!~"error|5..",le="2.5",route="cortex-read"}[%(period)s])) - / - sum by (namespace, job) (rate(cortex_gw_request_duration_seconds_count{route="cortex-read"}[%(period)s])) - ) - ||| % window, - } - for window in windows ], }, { name: 'cortex_received_samples', From 7fdbc97a0adc6237a80b9566b95b7817de675431 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Wed, 4 Mar 2020 12:45:56 +0100 Subject: [PATCH 003/364] Remove obsolete tsdb_gw_request_duration_seconds metrics. --- jsonnet/mimir-mixin/recording_rules.libsonnet | 6 ------ 1 file changed, 6 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index fdb9cf22a80..56ee5266cc7 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -35,12 +35,6 @@ local windows = [ utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) + utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), - }, { - name: 'frontend_rules', - rules: - utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'job', 'route']) + - utils.histogramRules('tsdb_gw_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), }, { name: 'cortex_slo_rules', rules: [ From 800bdac010543b9adbf7e95bc433314ab4f58383 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 5 Mar 2020 17:40:02 +0100 Subject: [PATCH 004/364] Added queriers to Cortex blocks vs chunks dashboard Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards.libsonnet | 56 +++++++++++++++++++++--- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 9aced072b0a..a5ae6684e6e 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -569,8 +569,8 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.row('Ingesters') .addPanel( g.panel('Samples / sec') + - g.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[1m]))', 'blocks') + - g.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[1m]))', 'chunks') + 
g.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[$__interval]))', 'blocks') + + g.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[$__interval]))', 'chunks') ) ) .addRow( @@ -588,8 +588,8 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.row('') .addPanel( g.panel('CPU per sample') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[1m])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[1m]))', 'blocks') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[1m])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[1m]))', 'chunks') + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__interval]))', 'blocks') + + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__interval]))', 'chunks') ) .addPanel( g.panel('Memory per active series') + @@ -604,8 +604,8 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.row('') .addPanel( g.panel('CPU') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[1m]))', 'blocks') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[1m]))', 'chunks') + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval]))', 'blocks') + + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval]))', 'chunks') ) .addPanel( g.panel('Memory') + @@ -615,6 +615,50 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + { yaxes: g.yaxes('bytes') } ) + ) + .addRow( + g.row('Queriers') + .addPanel( + g.panel('Queries / sec (query-frontend)') + + g.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$blocks_namespace/query-frontend",route!="metrics"}[$__interval]))', 'blocks') + + g.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$chunks_namespace/query-frontend",route!="metrics"}[$__interval]))', 'chunks') + ) + .addPanel( + g.panel('Queries / sec (query-tee)') + + g.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval]))', 'blocks') + + g.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval]))', 'chunks') + ) + ) + .addRow( + g.row('') + .addPanel( + g.panel('Latency 99th') + + g.queryPanel('histogram_quantile(0.99, sum 
by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval])))', 'blocks') + + g.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval])))', 'chunks') + + { yaxes: g.yaxes('s') } + ) + .addPanel( + g.panel('Latency average') + + g.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval]))', 'blocks') + + g.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval]))', 'chunks') + + { yaxes: g.yaxes('s') } + ) + ) + .addRow( + g.row('') + .addPanel( + g.panel('CPU') + + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"}[$__interval]))', 'blocks') + + g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"}[$__interval]))', 'chunks') + ) + .addPanel( + g.panel('Memory') + + g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"})', 'blocks - working set') + + g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"})', 'chunks - working set') + + g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/querier"})', 'blocks - heap inuse') + + g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/querier"})', 'chunks - heap inuse') + + { yaxes: g.yaxes('bytes') } + ) ), }, From 3b2f167212383546cb84b6fb477798d9c7a2d8c4 Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Wed, 11 Mar 2020 12:38:18 -0400 Subject: [PATCH 005/364] Add the ability to filter alerts by namespace. 
Signed-off-by: Cyril Tovena --- jsonnet/mimir-mixin/alerts.libsonnet | 35 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index f106cc3889f..6c388ded20f 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -9,6 +9,7 @@ local windows = [ { _config+:: { cortex_p99_latency_threshold_seconds: 2.5, + alert_namespace_matcher: '.*', }, prometheus_alerts+:: { @@ -20,8 +21,8 @@ local windows = [ alert: 'CortexIngesterUnhealthy', 'for': '15m', expr: ||| - min(cortex_ring_members{state="Unhealthy", job=~"[a-z]+/distributor"}) by (namespace, job) > 0 - |||, + min(cortex_ring_members{state="Unhealthy", job=~"[a-z]+/distributor", namespace=~"%(alert_namespace_matcher)s"}) by (namespace, job) > 0 + ||| % $._config, labels: { severity: 'critical', }, @@ -118,8 +119,8 @@ local windows = [ { alert: 'CortexBadOverrides', expr: ||| - cortex_overrides_last_reload_successful{job!~".+/table-manager|.+/alertmanager"} == 0 - |||, + cortex_overrides_last_reload_successful{job!~".+/table-manager|.+/alertmanager", namespace=~"%(alert_namespace_matcher)s"} == 0 + ||| % $._config, 'for': '15m', labels: { severity: 'warning', @@ -148,8 +149,8 @@ local windows = [ { alert: 'CortexFrontendQueriesStuck', expr: ||| - sum by (namespace) (cortex_query_frontend_queue_length{job=~".+/query-frontend"}) > 1 - |||, + sum by (namespace) (cortex_query_frontend_queue_length{job=~".+/query-frontend", namespace=~"%(alert_namespace_matcher)s"}) > 1 + ||| % $._config, 'for': '5m', // We don't want to block for longer. labels: { severity: 'critical', @@ -163,11 +164,11 @@ local windows = [ { alert: 'CortexCacheRequestErrors', expr: ||| - 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, method) + 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5..", namespace=~"%(alert_namespace_matcher)s"}[1m])) by (namespace, job, method) / - sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (namespace, job, method) + sum(rate(cortex_cache_request_duration_seconds_count{namespace=~"%(alert_namespace_matcher)s"}[1m])) by (namespace, job, method) > 1 - |||, + ||| % $._config, 'for': '15m', labels: { severity: 'warning', @@ -181,8 +182,8 @@ local windows = [ { alert: 'CortexIngesterRestarts', expr: ||| - rate(kube_pod_container_status_restarts_total{container="ingester"}[30m]) > 0 - |||, + rate(kube_pod_container_status_restarts_total{container="ingester", namespace=~"%(alert_namespace_matcher)s"}[30m]) > 0 + ||| % $._config, labels: { severity: 'critical', }, @@ -193,8 +194,8 @@ local windows = [ { alert: 'CortexTransferFailed', expr: ||| - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m]) - |||, + max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success", namespace=~"%(alert_namespace_matcher)s"}[15m]) + ||| % $._config, 'for': '5m', labels: { severity: 'critical', @@ -345,8 +346,8 @@ local windows = [ { alert: 'CortexProvisioningTooMuchMemory', expr: ||| - avg by (cluster, namespace) (container_memory_working_set_bytes{container_name="ingester"} / container_spec_memory_limit_bytes{container_name="ingester"}) > 0.7 - |||, + avg by (cluster, namespace) (container_memory_working_set_bytes{container_name="ingester", namespace=~"%(alert_namespace_matcher)s} / container_spec_memory_limit_bytes{container_name="ingester", 
namespace=~"%(alert_namespace_matcher)s}) > 0.7 + ||| % $._config, 'for': '15m', labels: { severity: 'critical', @@ -426,10 +427,10 @@ local windows = [ { alert: 'CortexGossipMembersMismatch', expr: ||| - memberlist_client_cluster_members_count + memberlist_client_cluster_members_count{namespace=~"%(alert_namespace_matcher)s"}, != on (cluster,namespace) group_left sum(up{job=~".+/(distributor|ingester|querier)"}) by (cluster,namespace) - |||, + ||| % $._config, 'for': '5m', labels: { severity: 'warning', From 7b27255bd97ccdefd90b711573db8e5842b6066d Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Wed, 11 Mar 2020 13:37:46 -0400 Subject: [PATCH 006/364] Do not add namespace matcher if not set. Signed-off-by: Cyril Tovena --- jsonnet/mimir-mixin/alerts.libsonnet | 40 +++++++++++++++------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 6c388ded20f..d017b0cf0d0 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -9,10 +9,14 @@ local windows = [ { _config+:: { cortex_p99_latency_threshold_seconds: 2.5, - alert_namespace_matcher: '.*', + alert_namespace_matcher: '', }, prometheus_alerts+:: { + local namespace_matcher(prefix='') = + if std.length($._config.alert_namespace_matcher) != 0 + then '%s namespace=~"%s"' % [prefix, $._config.alert_namespace_matcher] + else '', groups+: [ { name: 'cortex_alerts', @@ -21,8 +25,8 @@ local windows = [ alert: 'CortexIngesterUnhealthy', 'for': '15m', expr: ||| - min(cortex_ring_members{state="Unhealthy", job=~"[a-z]+/distributor", namespace=~"%(alert_namespace_matcher)s"}) by (namespace, job) > 0 - ||| % $._config, + min(cortex_ring_members{state="Unhealthy", job=~"[a-z]+/distributor" %s}) by (namespace, job) > 0 + ||| % namespace_matcher(','), labels: { severity: 'critical', }, @@ -119,8 +123,8 @@ local windows = [ { alert: 'CortexBadOverrides', expr: ||| - cortex_overrides_last_reload_successful{job!~".+/table-manager|.+/alertmanager", namespace=~"%(alert_namespace_matcher)s"} == 0 - ||| % $._config, + cortex_overrides_last_reload_successful{job!~".+/table-manager|.+/alertmanager" %s} == 0 + ||| % namespace_matcher(','), 'for': '15m', labels: { severity: 'warning', @@ -149,8 +153,8 @@ local windows = [ { alert: 'CortexFrontendQueriesStuck', expr: ||| - sum by (namespace) (cortex_query_frontend_queue_length{job=~".+/query-frontend", namespace=~"%(alert_namespace_matcher)s"}) > 1 - ||| % $._config, + sum by (namespace) (cortex_query_frontend_queue_length{job=~".+/query-frontend" %s}) > 1 + ||| % namespace_matcher(','), 'for': '5m', // We don't want to block for longer. labels: { severity: 'critical', @@ -164,11 +168,11 @@ local windows = [ { alert: 'CortexCacheRequestErrors', expr: ||| - 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5..", namespace=~"%(alert_namespace_matcher)s"}[1m])) by (namespace, job, method) + 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." 
%s) by (namespace, job, method) / - sum(rate(cortex_cache_request_duration_seconds_count{namespace=~"%(alert_namespace_matcher)s"}[1m])) by (namespace, job, method) + sum(rate(cortex_cache_request_duration_seconds_count{%s}[1m])) by (namespace, job, method) > 1 - ||| % $._config, + ||| % [namespace_matcher(','), namespace_matcher()], 'for': '15m', labels: { severity: 'warning', @@ -182,8 +186,8 @@ local windows = [ { alert: 'CortexIngesterRestarts', expr: ||| - rate(kube_pod_container_status_restarts_total{container="ingester", namespace=~"%(alert_namespace_matcher)s"}[30m]) > 0 - ||| % $._config, + rate(kube_pod_container_status_restarts_total{container="ingester" %s}[30m]) > 0 + ||| % namespace_matcher(','), labels: { severity: 'critical', }, @@ -194,8 +198,8 @@ local windows = [ { alert: 'CortexTransferFailed', expr: ||| - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success", namespace=~"%(alert_namespace_matcher)s"}[15m]) - ||| % $._config, + max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success" %s}[15m]) + ||| % namespace_matcher(','), 'for': '5m', labels: { severity: 'critical', @@ -346,8 +350,8 @@ local windows = [ { alert: 'CortexProvisioningTooMuchMemory', expr: ||| - avg by (cluster, namespace) (container_memory_working_set_bytes{container_name="ingester", namespace=~"%(alert_namespace_matcher)s} / container_spec_memory_limit_bytes{container_name="ingester", namespace=~"%(alert_namespace_matcher)s}) > 0.7 - ||| % $._config, + avg by (cluster, namespace) (container_memory_working_set_bytes{container_name="ingester" %s} / container_spec_memory_limit_bytes{container_name="ingester" %s}) > 0.7 + ||| % [namespace_matcher(','), namespace_matcher(',')], 'for': '15m', labels: { severity: 'critical', @@ -427,10 +431,10 @@ local windows = [ { alert: 'CortexGossipMembersMismatch', expr: ||| - memberlist_client_cluster_members_count{namespace=~"%(alert_namespace_matcher)s"}, + memberlist_client_cluster_members_count{%s}, != on (cluster,namespace) group_left sum(up{job=~".+/(distributor|ingester|querier)"}) by (cluster,namespace) - ||| % $._config, + ||| % namespace_matcher(), 'for': '5m', labels: { severity: 'warning', From e54ce681442f8a91e659ef1fdbf9cdf8ce848553 Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Wed, 11 Mar 2020 14:01:32 -0400 Subject: [PATCH 007/364] Fix errors in rate. Signed-off-by: Cyril Tovena --- jsonnet/mimir-mixin/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index d017b0cf0d0..06c49f64cf4 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -168,7 +168,7 @@ local windows = [ { alert: 'CortexCacheRequestErrors', expr: ||| - 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." %s) by (namespace, job, method) + 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." %s}[1m])) by (namespace, job, method) / sum(rate(cortex_cache_request_duration_seconds_count{%s}[1m])) by (namespace, job, method) > 1 From a0c06c583a06f0eb9cf5c44006c89e4ed49a9683 Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Wed, 11 Mar 2020 15:23:05 -0400 Subject: [PATCH 008/364] Fix alerts, my previous commit introduced a bug in the gossip alert. 
Signed-off-by: Cyril Tovena --- jsonnet/mimir-mixin/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 06c49f64cf4..8812c0c1e18 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -431,7 +431,7 @@ local windows = [ { alert: 'CortexGossipMembersMismatch', expr: ||| - memberlist_client_cluster_members_count{%s}, + memberlist_client_cluster_members_count{%s} != on (cluster,namespace) group_left sum(up{job=~".+/(distributor|ingester|querier)"}) by (cluster,namespace) ||| % namespace_matcher(), From beb538bcb455452cbf5cbf66351abe1a359b9637 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Wed, 1 Apr 2020 13:05:29 -0400 Subject: [PATCH 009/364] sharding + interval panels Signed-off-by: Owen Diehl --- jsonnet/mimir-mixin/dashboards.libsonnet | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index a5ae6684e6e..75e37c1af10 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -265,6 +265,21 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'Corrupt chunks'), ) ) + .addRow( + g.row('Query Frontend - Sharding/Splitting') + .addPanel( + g.panel('Intervals per Query') + + g.queryPanel('sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster="$cluster", namespace="$namespace", method="split_by_interval"}[1m]))', 'partition rate'), + ) + .addPanel( + g.panel('Sharded Queries %') + + g.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) * 100', 'shard rate'), + ) + .addPanel( + g.panel('Sharding factor') + + g.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m]))', 'Average'), + ) + ) .addRow( g.row('Querier - Index Cache') .addPanel( From aae85de0aeb60b53cd4c6dcd19a8f6bbdb572f19 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Wed, 1 Apr 2020 13:13:21 -0400 Subject: [PATCH 010/364] aligns frontend panels together Signed-off-by: Owen Diehl --- jsonnet/mimir-mixin/dashboards.libsonnet | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 75e37c1af10..72a41970aea 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -248,6 +248,21 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 'Miss Rate'), ) ) + .addRow( + g.row('Query Frontend - Sharding/Splitting') + .addPanel( + g.panel('Intervals per Query') + + g.queryPanel('sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster="$cluster", 
namespace="$namespace", method="split_by_interval"}[1m]))', 'partition rate'), + ) + .addPanel( + g.panel('Sharded Queries %') + + g.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) * 100', 'shard rate'), + ) + .addPanel( + g.panel('Sharding factor') + + g.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m]))', 'Average'), + ) + ) .addRow( g.row('Querier') .addPanel( @@ -265,21 +280,6 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'Corrupt chunks'), ) ) - .addRow( - g.row('Query Frontend - Sharding/Splitting') - .addPanel( - g.panel('Intervals per Query') + - g.queryPanel('sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster="$cluster", namespace="$namespace", method="split_by_interval"}[1m]))', 'partition rate'), - ) - .addPanel( - g.panel('Sharded Queries %') + - g.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) * 100', 'shard rate'), - ) - .addPanel( - g.panel('Sharding factor') + - g.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m]))', 'Average'), - ) - ) .addRow( g.row('Querier - Index Cache') .addPanel( From fc482ae140e3982b1aba211726557862a5821305 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Mon, 6 Apr 2020 13:00:17 +0200 Subject: [PATCH 011/364] Added "Memcached - Blocks Index" to the Cortex / Reads dashboard. 
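The new row reuses the conditional-row pattern already applied to the chunks caches; a minimal sketch of that pattern with simplified names (the row titles and config value below are illustrative, not the real dashboard objects):

```
// Each helper returns the dashboard untouched unless the matching storage
// engine is enabled in the config; the helpers then compose, so a cluster
// running both engines gets both sets of rows.
local config = { storage_engine: ['chunks', 'tsdb'] };  // example value

local addChunksRows(dashboard) =
  if std.setMember('chunks', config.storage_engine)
  then dashboard { rows+: ['Memcached - Chunks'] }
  else dashboard;

local addBlocksRows(dashboard) =
  if std.setMember('tsdb', config.storage_engine)
  then dashboard { rows+: ['Memcached - Blocks Index'] }
  else dashboard;

addBlocksRows(addChunksRows({ rows: [] }))
// => { rows: ['Memcached - Chunks', 'Memcached - Blocks Index'] }
```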
--- jsonnet/mimir-mixin/dashboards.libsonnet | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 72a41970aea..e2f870f5cb5 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -899,6 +899,20 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ) ) else dashboard; + local addBlocksRows(dashboard) = if std.setMember('tsdb', $._config.storage_engine) then + dashboard.addRow( + g.row('Memcached - Blocks Index') + .addPanel( + g.panel('QPS') + + g.qpsPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",operation="getmulti"}') + ) + .addPanel( + g.panel('Latency') + + g.latencyPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/querier", operation="getmulti"}') + ) + ) + else dashboard; + local addStorageRows(dashboard) = if $._config.storage_backend == 'cassandra' then dashboard.addRow( @@ -937,5 +951,5 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), ) else dashboard; - addStorageRows(addChunksRows(out)), + addStorageRows(addBlocksRows(addChunksRows(out))), } From 6fe6a530f92eb3e9dc98e708f48f10f34db71d0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 6 Apr 2020 14:39:58 +0200 Subject: [PATCH 012/364] Expose Iter operation. (https://github.com/grafana/cortex-jsonnet/pull/31) --- jsonnet/mimir-mixin/dashboards.libsonnet | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index e2f870f5cb5..50ddf9e4bfb 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -49,7 +49,10 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.panel('Op: ObjectSize') + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="objectsize"}'), ) - // oper="iter" is also available, but not used. + .addPanel( + g.panel('Op: Iter') + + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="iter"}'), + ) .addPanel( g.panel('Op: Exists') + g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="exists"}'), From 421f1ee871a9c17b3161b260174c5e6b8fb184da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 6 Apr 2020 15:22:27 +0200 Subject: [PATCH 013/364] Dashboard links (https://github.com/grafana/cortex-jsonnet/pull/32) * Added links to other Cortex dashboards, preserving variables and time range. * Restored mistakenly removed change. 
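Because both the per-dashboard `tags` and the new links dropdown read `$._config.tags`, a downstream deployment can retarget them in one place. A hedged sketch of such an override; the `acme-cortex` tag and the `gcp` backend are invented example values, not something this change ships:

```
local mixin = (import 'mixin.libsonnet') {
  _config+:: {
    storage_backend: 'gcp',  // the config requires a backend; 'gcp' is just an example
    tags: ['acme-cortex'],   // retargets dashboard tags and the links dropdown together
  },
};

// Every generated dashboard now carries the new tag, and its "Cortex Dashboards"
// dropdown lists only dashboards sharing that tag, carrying the template
// variables and time range along (includeVars / keepTime).
mixin.dashboards
```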
--- jsonnet/mimir-mixin/dashboards.libsonnet | 33 +++++++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 50ddf9e4bfb..42e76035427 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -81,11 +81,30 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { }; { + dashboardWithTagsAndLinks(title):: + g.dashboard(title) + { + tags: $._config.tags, + + links: [ + { + asDropdown: true, + icon: 'external link', + includeVars: true, + keepTime: true, + tags: $._config.tags, + targetBlank: false, + title: 'Cortex Dashboards', + type: 'dashboards', + }, + ], + }, + _config+:: { storage_backend: error 'must specify storage backend (cassandra, gcp)', // may contain 'chunks', 'tsdb' or both. Enables chunks- or tsdb- specific panels and dashboards. storage_engine: ['chunks'], gcs_enabled: false, + tags: ['cortex'], }, dashboards+: { @@ -169,7 +188,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { addBlocksRows(addGcsRows($.cortex_reads_dashboard)), [if std.setMember('chunks', $._config.storage_engine) then 'cortex-chunks.json' else null]: - g.dashboard('Cortex / Chunks') + $.dashboardWithTagsAndLinks('Cortex / Chunks') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -220,7 +239,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), 'cortex-queries.json': - g.dashboard('Cortex / Queries') + $.dashboardWithTagsAndLinks('Cortex / Queries') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -342,7 +361,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), 'ruler.json': - g.dashboard('Cortex / Ruler') + $.dashboardWithTagsAndLinks('Cortex / Ruler') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -381,7 +400,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), 'cortex-scaling.json': - g.dashboard('Cortex / Scaling') + $.dashboardWithTagsAndLinks('Cortex / Scaling') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -489,7 +508,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), [if std.setMember('tsdb', $._config.storage_engine) then 'cortex-blocks.json' else null]: - g.dashboard('Cortex / Blocks') + $.dashboardWithTagsAndLinks('Cortex / Blocks') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') // repeated from Cortex / Chunks @@ -682,7 +701,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { cortex_writes_dashboard:: local out = - g.dashboard('Cortex / Writes') + $.dashboardWithTagsAndLinks('Cortex / Writes') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -819,7 +838,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') 
+ { cortex_reads_dashboard:: local out = - g.dashboard('Cortex / Reads') + $.dashboardWithTagsAndLinks('Cortex / Reads') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( From fa688a8778e88bd2d9264c0b27859d26f0cdd9f3 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Mon, 6 Apr 2020 13:03:08 -0700 Subject: [PATCH 014/364] remove gateway recording rules and alerts Signed-off-by: Callum Styan --- jsonnet/mimir-mixin/alerts.libsonnet | 66 ------------------- jsonnet/mimir-mixin/recording_rules.libsonnet | 48 -------------- 2 files changed, 114 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 8812c0c1e18..dc8168de0a8 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -1,11 +1,3 @@ -// According to https://developers.soundcloud.com/blog/alerting-on-slos : -local windows = [ - { long_period: '1h', short_period: '5m', for_period: '2m', factor: 14.4, severity: 'critical' }, - { long_period: '6h', short_period: '30m', for_period: '15m', factor: 6, severity: 'critical' }, - { long_period: '1d', short_period: '2h', for_period: '1h', factor: 3, severity: 'warning' }, - { long_period: '3d', short_period: '6h', for_period: '3h', factor: 1, severity: 'warning' }, -]; - { _config+:: { cortex_p99_latency_threshold_seconds: 2.5, @@ -228,64 +220,6 @@ local windows = [ }, ], }, - { - name: 'cortex_slo_alerts', - rules: [ - { - alert: 'CortexWriteErrorBudgetBurn', - expr: ||| - ( - ( - 100 * namespace_job:cortex_gateway_write_slo_errors_per_request:ratio_rate%(long_period)s - > 0.1 * %(factor)f - ) - and - ( - 100 * namespace_job:cortex_gateway_write_slo_errors_per_request:ratio_rate%(short_period)s - > 0.1 * %(factor)f - ) - ) - ||| % window, - 'for': window.for_period, - labels: { - severity: window.severity, - period: window.long_period, // The annotation alone doesn't make this alert unique. - }, - annotations: { - summary: 'Cortex burns its write error budget too fast.', - description: "{{ $value | printf `%%.2f` }}%% of {{ $labels.job }}'s write requests in the last %(long_period)s are failing or too slow to meet the SLO." % window, - }, - } - for window in windows - ] + [ - { - alert: 'CortexReadErrorBudgetBurn', - expr: ||| - ( - ( - 100 * namespace_job:cortex_gateway_read_slo_errors_per_request:ratio_rate%(long_period)s - > 0.5 * %(factor)f - ) - and - ( - 100 * namespace_job:cortex_gateway_read_slo_errors_per_request:ratio_rate%(short_period)s - > 0.5 * %(factor)f - ) - ) - ||| % window, - 'for': window.for_period, - labels: { - severity: window.severity, - period: window.long_period, // The annotation alone doesn't make this alert unique. - }, - annotations: { - summary: 'Cortex burns its read error budget too fast.', - description: "{{ $value | printf `%%.2f` }}%% of {{ $labels.job }}'s read requests in the last %(long_period)s are failing or too slow to meet the SLO." 
% window, - }, - } - for window in windows - ], - }, { name: 'cortex-provisioning', rules: [ diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 56ee5266cc7..41ab97d7607 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -1,13 +1,4 @@ local utils = import 'mixin-utils/utils.libsonnet'; -local windows = [ - { period: '5m' }, - { period: '30m' }, - { period: '1h' }, - { period: '2h' }, - { period: '6h' }, - { period: '1d' }, - { period: '3d' }, -]; { prometheus_rules+:: { @@ -35,45 +26,6 @@ local windows = [ utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) + utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), - }, { - name: 'cortex_slo_rules', - rules: [ - { - record: 'namespace_job:cortex_gateway_write_slo_errors_per_request:ratio_rate%(period)s' % window, - expr: ||| - 1 - - ( - sum by (namespace, job) (rate(cortex_request_duration_seconds_bucket{status_code!~"5..", le="1", route="api_prom_push", job=~".*/cortex-gw"}[%(period)s])) - / - sum by (namespace, job) (rate(cortex_request_duration_seconds_count{route="api_prom_push", job=~".*/cortex-gw"}[%(period)s])) - ) - ||| % window, - } - for window in windows - ] + [ - { - record: 'namespace_job:cortex_gateway_read_slo_errors_per_request:ratio_rate%(period)s' % window, - expr: ||| - 1 - - ( - sum by (namespace, job) (rate(cortex_request_duration_seconds_bucket{status_code!~"5..",le="2.5",route=~"api_prom_api_v1_query.*", job=~".*/cortex-gw"}[%(period)s])) - / - sum by (namespace, job) (rate(cortex_request_duration_seconds_count{route=~"api_prom_api_v1_query.*", job=~".*/cortex-gw"}[%(period)s])) - ) - ||| % window, - } - for window in windows - ], - }, { - name: 'cortex_received_samples', - rules: [ - { - record: 'cluster_namespace:cortex_distributor_received_samples:rate5m', - expr: ||| - sum by (cluster, namespace) (rate(cortex_distributor_received_samples_total{job=~".*/distributor"}[5m])) - |||, - }, - ], }], }, } From e91bf6911768df52a40fe654588c7a284e9a7cd2 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 7 Apr 2020 16:08:01 +0100 Subject: [PATCH 015/364] Use camelCase from mixin spec. 
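These are the field names the monitoring-mixins convention (and tooling such as mixtool) keys on. A rough sketch of how a consumer might render them once renamed; the output file names and config values are illustrative:

```
local mixin = (import 'mixin.libsonnet') {
  _config+:: {
    storage_backend: 'gcp',  // must be set before the dashboards can be evaluated
  },
};

{
  // Alerting and recording rules manifest to YAML documents.
  'alerts.yaml': std.manifestYamlDoc(mixin.prometheusAlerts),
  'rules.yaml': std.manifestYamlDoc(mixin.prometheusRules),
}
+ {
  // One JSON file per dashboard, keyed by the dashboard file name.
  [name]: std.manifestJsonEx(mixin.grafanaDashboards[name], '  ')
  for name in std.objectFields(mixin.grafanaDashboards)
}
```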
Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/alerts.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards.libsonnet | 2 +- jsonnet/mimir-mixin/recording_rules.libsonnet | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 8812c0c1e18..1b5a360d031 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -12,7 +12,7 @@ local windows = [ alert_namespace_matcher: '', }, - prometheus_alerts+:: { + prometheusAlerts+:: { local namespace_matcher(prefix='') = if std.length($._config.alert_namespace_matcher) != 0 then '%s namespace=~"%s"' % [prefix, $._config.alert_namespace_matcher] diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 42e76035427..665a55f8e2e 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -107,7 +107,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { tags: ['cortex'], }, - dashboards+: { + grafanaDashboards+: { 'cortex-writes.json': local addGcsRow(dashboard) = if $._config.gcs_enabled then dashboard.addRow( diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 56ee5266cc7..6b62b1fa817 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -10,7 +10,7 @@ local windows = [ ]; { - prometheus_rules+:: { + prometheusRules+:: { groups+: [{ name: 'cortex_rules', rules: From 8512245735882867c6e5912159ce76bd35078abd Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Tue, 7 Apr 2020 14:28:02 -0700 Subject: [PATCH 016/364] Forgot to remove the import of recording_rules.jsonet since we no longer have prometheus_rules for recording rules. Signed-off-by: Callum Styan --- jsonnet/mimir-mixin/mixin.libsonnet | 3 +-- jsonnet/mimir-mixin/recording_rules.jsonnet | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) delete mode 100644 jsonnet/mimir-mixin/recording_rules.jsonnet diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/jsonnet/mimir-mixin/mixin.libsonnet index b2b2f10dd23..a9f14e8837e 100644 --- a/jsonnet/mimir-mixin/mixin.libsonnet +++ b/jsonnet/mimir-mixin/mixin.libsonnet @@ -1,3 +1,2 @@ (import 'dashboards.libsonnet') + -(import 'alerts.libsonnet') + -(import 'recording_rules.libsonnet') +(import 'alerts.libsonnet') diff --git a/jsonnet/mimir-mixin/recording_rules.jsonnet b/jsonnet/mimir-mixin/recording_rules.jsonnet deleted file mode 100644 index 4cda6c6f256..00000000000 --- a/jsonnet/mimir-mixin/recording_rules.jsonnet +++ /dev/null @@ -1 +0,0 @@ -std.manifestYamlDoc((import 'mixin.libsonnet').prometheus_rules) From 891cc9e98a8f2a3825a3f72f13851c2ec1312990 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Tue, 7 Apr 2020 15:44:59 -0700 Subject: [PATCH 017/364] Actual fix for things I broke in https://github.com/grafana/cortex-jsonnet/pull/33. 
Signed-off-by: Callum Styan --- jsonnet/mimir-mixin/mixin.libsonnet | 3 ++- jsonnet/mimir-mixin/recording_rules.jsonnet | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 jsonnet/mimir-mixin/recording_rules.jsonnet diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/jsonnet/mimir-mixin/mixin.libsonnet index a9f14e8837e..b2b2f10dd23 100644 --- a/jsonnet/mimir-mixin/mixin.libsonnet +++ b/jsonnet/mimir-mixin/mixin.libsonnet @@ -1,2 +1,3 @@ (import 'dashboards.libsonnet') + -(import 'alerts.libsonnet') +(import 'alerts.libsonnet') + +(import 'recording_rules.libsonnet') diff --git a/jsonnet/mimir-mixin/recording_rules.jsonnet b/jsonnet/mimir-mixin/recording_rules.jsonnet new file mode 100644 index 00000000000..dbe13f417b4 --- /dev/null +++ b/jsonnet/mimir-mixin/recording_rules.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules) From d273b38959930cf8ed3c6da84eb6c5f3f4c6311a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 8 Apr 2020 09:33:09 +0200 Subject: [PATCH 018/364] Graph ops/sec for "iter" operation (https://github.com/grafana/cortex-jsonnet/pull/37) * Graph ops/sec for "iter" operation * Simplify query construction. --- jsonnet/mimir-mixin/dashboards.libsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 665a55f8e2e..3da773b1d97 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -50,8 +50,9 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="objectsize"}'), ) .addPanel( + // Cortex (Thanos) doesn't track timing for 'iter', so we use ops/sec instead. g.panel('Op: Iter') + - g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="iter"}'), + g.queryPanel('sum(rate(%s{cluster=~"$cluster", operation="iter"}[$__interval]))' % [opsTotal], 'ops/sec') ) .addPanel( g.panel('Op: Exists') + From 76ebb116a2e581a452b18197da3e23294a5051bf Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Wed, 8 Apr 2020 18:14:14 +0100 Subject: [PATCH 019/364] Build the mixin yamls/jsons in CI. 
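A sketch of what a CI job can now run against these entry points; the commands and output paths are illustrative, since the actual CI wiring is not part of this patch:

```
// alerts.jsonnet evaluates to a single YAML string, so render it raw:
//   jsonnet -S alerts.jsonnet > cortex-alerts.yaml
//
// dashboards.jsonnet evaluates to an object of file name -> JSON string
// (that is why it wraps each dashboard in std.manifestJsonEx), so multi-file
// output writes one dashboard per key:
//   jsonnet -m dashboards_out dashboards.jsonnet
//
// The same evaluation works in-process, e.g. to spot-check which dashboards
// get generated with the config baked into dashboards.jsonnet:
local dashboards = import 'dashboards.jsonnet';

std.objectFields(dashboards)  // e.g. ["cortex-chunks.json", "cortex-queries.json", ...]
```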
Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/alerts.jsonnet | 2 +- jsonnet/mimir-mixin/dashboards.jsonnet | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.jsonnet b/jsonnet/mimir-mixin/alerts.jsonnet index e54b1704020..75e7c1b297a 100644 --- a/jsonnet/mimir-mixin/alerts.jsonnet +++ b/jsonnet/mimir-mixin/alerts.jsonnet @@ -1 +1 @@ -std.manifestYamlDoc((import 'mixin.libsonnet').prometheus_alerts) +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts) diff --git a/jsonnet/mimir-mixin/dashboards.jsonnet b/jsonnet/mimir-mixin/dashboards.jsonnet index fb102817cd9..40341f06dab 100644 --- a/jsonnet/mimir-mixin/dashboards.jsonnet +++ b/jsonnet/mimir-mixin/dashboards.jsonnet @@ -1,6 +1,13 @@ -local dashboards = (import 'mixin.libsonnet').dashboards; +local mixin = (import 'mixin.libsonnet') { + _config: { + storage_backend: "cassandra", + storage_engine: ["chunks"], + tags: "cortex", + gcs_enabled: false, + }, +}; { - [name]: dashboards[name] - for name in std.objectFields(dashboards) + [name]: std.manifestJsonEx(mixin.grafanaDashboards[name], " ") + for name in std.objectFields(mixin.grafanaDashboards) } From b54f1136bfa6b0bb658f3032768e5a92c1ca677b Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Wed, 8 Apr 2020 18:15:40 +0100 Subject: [PATCH 020/364] make fmt Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/dashboards.jsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards.jsonnet b/jsonnet/mimir-mixin/dashboards.jsonnet index 40341f06dab..a70fad511bc 100644 --- a/jsonnet/mimir-mixin/dashboards.jsonnet +++ b/jsonnet/mimir-mixin/dashboards.jsonnet @@ -1,13 +1,13 @@ local mixin = (import 'mixin.libsonnet') { _config: { - storage_backend: "cassandra", - storage_engine: ["chunks"], - tags: "cortex", + storage_backend: 'cassandra', + storage_engine: ['chunks'], + tags: 'cortex', gcs_enabled: false, }, }; { - [name]: std.manifestJsonEx(mixin.grafanaDashboards[name], " ") + [name]: std.manifestJsonEx(mixin.grafanaDashboards[name], ' ') for name in std.objectFields(mixin.grafanaDashboards) } From d250ccf826b8593f4eb7f464886ac87b2f4dbc90 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 9 Apr 2020 18:19:35 +0100 Subject: [PATCH 021/364] Add grafanaDashboardFolder Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/mixin.libsonnet | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/jsonnet/mimir-mixin/mixin.libsonnet index b2b2f10dd23..06aa01566d8 100644 --- a/jsonnet/mimir-mixin/mixin.libsonnet +++ b/jsonnet/mimir-mixin/mixin.libsonnet @@ -1,3 +1,5 @@ (import 'dashboards.libsonnet') + (import 'alerts.libsonnet') + -(import 'recording_rules.libsonnet') +(import 'recording_rules.libsonnet') { + grafanaDashboardFolder: "Cortex", +} From ecf3768580150e30a305a3bafb0ac894c9429fdf Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Thu, 9 Apr 2020 10:53:26 -0700 Subject: [PATCH 022/364] Add playbooks for alerts. 
(https://github.com/grafana/cortex-jsonnet/pull/24) Signed-off-by: Callum Styan --- jsonnet/mimir-mixin/docs/playbooks.md | 67 +++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 jsonnet/mimir-mixin/docs/playbooks.md diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md new file mode 100644 index 00000000000..e4372a5c9e0 --- /dev/null +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -0,0 +1,67 @@ +# Playbooks +This document contains playbooks, or at least a checklist of what to look for, for alerts in the cortex-mixin. + +# Alerts + +## CortexIngesterRestarts +First, check if the alert is for a single ingester or multiple. Even if the alert is only for one ingester, it's best to follow up by checking `kubectl get pods --namespace=` every few minutes, or looking at the query `rate(kube_pod_container_status_restarts_total{container="ingester"}[30m]) > 0` just until you're sure there isn't a larger issue causing multiple restarts. + +Next, check `kubectl get events`, with and without the addition of the `--namespace` flag, to look for node restarts or other related issues. Grep or something similar to filter the output can be useful here. The most common cause of this alert is a single cloud providers node restarting and causing the ingester on that node to be rescheduled somewhere else. + +In events you're looking for things like: +``` +57m Normal NodeControllerEviction Pod Marking for deletion Pod ingester-01 from Node cloud-provider-node-01 +37m Normal SuccessfulDelete ReplicaSet (combined from similar events): Deleted pod: ingester-01 +32m Normal NodeNotReady Node Node cloud-provider-node-01 status is now: NodeNotReady +28m Normal DeletingAllPods Node Node cloud-provider-node-01 event: Deleting all Pods from Node cloud-provider-node-01. +``` + +## CortexRequest Latency +First establish if the alert is for read or write latency. The alert should say. + +### Write Latency +Using the Cortex write dashboard, find the cluster which reported the high write latency and deduce where in the stack the latency is being introduced: + +distributor: It is quite normal for the distributor P99 latency to be 50-100ms, and for the ingesters to be ~5ms. If the distributor latency is higher than this, you may need to scale up the distributors. If there is a high error rate being introduced at the distributors (400s or 500s) this has been know to induce latency. + +ingesters: It is very unusual for ingester latency to be high, as they just write to memory. They probably needs scaling up, but it is worth investigating what is going on first. + +### Read Latency +Query performance is an known problem. When you get this alert, you need to work out if: (a) this is a operation issue / configuration (b) this is because of algorithms and inherently limited (c) this is a bug + +Using the Cortex read dashboard, find the cluster which reported the high read latency and deduce where in the stack the latency is being introduced. + +query_frontend: If there is a significant P99 or avg latency difference between the frontend and the querier, you can't scale them up - we rely on their being two frontend. Is this latency coming from the cache? Scale that up. What the CPU usage of the query frontend service? Do we need to increase the CPU requests and have it scheduled to a less busy box? Note QPS on the querier will be higher than on the frontend as it splits queries into multiple smaller ones. 
+ +ingesters: Latency should be in the ~100ms - queries are in memory. If its more, check the CPU usage and consider scaling it up. NB scale ingesters slowly, 1-2 new replicas an hour. + +If you think its provisioning / scaling is the problem, consult the scaling dashboard. These are just recommendations - make reasonable adjustments. + +Right now most of the execution time will be spent in PromQL's innerEval. NB that the prepare (index and chunk fetch) are now interleaved with Eval, so you need to expand both to confirm if its flow execution of slow fetching. + +## CortexTransferFailed +This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n describe pod ` + +## CortexIngesterUnhealthy +This alert goes off when an ingester is marked as unhealthy. Check the ring web page to see which is marked as unhealthy. You could then check the logs to see if there are any related to that ingester ex: `kubectl logs -f ingester-01 --namespace=prod`. A simple way to resolve this may be to click the "Forgot" button on the ring page, especially if the pod doesn't exist anymore. It might not exist anymore because it was on a node that got shut down, so you could check to see if there are any logs related to the node that pod is/was on, ex: `kubectl get events --namespace=prod | grep cloud-provider-node`. + +## CortexFlushStuck +@todo + +## CortexLoadBalancerErrors +@todo + +## CortexTableSyncFailure +@todo + +## CortexQuerierCapacityFull +@todo + +## CortexFrontendQueriesStuck +@todo + +## CortexProvisioningTooMuchMemory +@todo + +## MemcachedDown +@todo \ No newline at end of file From 7427048607990d9183886c2c228a942ded3730b3 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Fri, 10 Apr 2020 13:46:58 +0100 Subject: [PATCH 023/364] Need 4 shards for the cortex dashboards. Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/mixin.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/jsonnet/mimir-mixin/mixin.libsonnet index 06aa01566d8..0f11bf17489 100644 --- a/jsonnet/mimir-mixin/mixin.libsonnet +++ b/jsonnet/mimir-mixin/mixin.libsonnet @@ -2,4 +2,5 @@ (import 'alerts.libsonnet') + (import 'recording_rules.libsonnet') { grafanaDashboardFolder: "Cortex", + grafanaDashboardShards: 4, } From 8d15dd1d74f9be846cadb4be1e4defe4a966fd11 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Tue, 14 Apr 2020 13:52:26 -0700 Subject: [PATCH 024/364] Fix lint errors in dashboards mixin.libsonnet. 
(https://github.com/grafana/cortex-jsonnet/pull/40) Signed-off-by: Callum Styan --- jsonnet/mimir-mixin/mixin.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/jsonnet/mimir-mixin/mixin.libsonnet index 0f11bf17489..0a1c20aac5b 100644 --- a/jsonnet/mimir-mixin/mixin.libsonnet +++ b/jsonnet/mimir-mixin/mixin.libsonnet @@ -1,6 +1,6 @@ (import 'dashboards.libsonnet') + (import 'alerts.libsonnet') + (import 'recording_rules.libsonnet') { - grafanaDashboardFolder: "Cortex", - grafanaDashboardShards: 4, + grafanaDashboardFolder: 'Cortex', + grafanaDashboardShards: 4, } From a684ba5178a6a35de86d7294ff8210dd76dcd729 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Wed, 15 Apr 2020 16:39:26 -0700 Subject: [PATCH 025/364] Add additional ingester restart debugging info from Ganesh. (https://github.com/grafana/cortex-jsonnet/pull/39) --- jsonnet/mimir-mixin/docs/playbooks.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index e4372a5c9e0..7be960930cf 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -16,6 +16,10 @@ In events you're looking for things like: 28m Normal DeletingAllPods Node Node cloud-provider-node-01 event: Deleting all Pods from Node cloud-provider-node-01. ``` +If nothing obvious from the above, check for increased load: +- If there is an increase in the number of active series and the memory provisioned is not enough, scale up the ingesters horizontally to have the same number of series as before per ingester. +- If we had an outage and once Cortex is back up, the incoming traffic increases. (or) The clients have their Prometheus remote-write lagging and starts to send samples at a higher rate (again, an increase in traffic but in terms of number of samples). Scale up the ingester horizontally in this case too. + ## CortexRequest Latency First establish if the alert is for read or write latency. The alert should say. From 2545610300c4000b2a13c61470ec4ad020c18e7f Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sun, 19 Apr 2020 14:50:31 +0100 Subject: [PATCH 026/364] Small refactors: - Put all the mixin config in one place. - Make dashboardWithTagsAndLinks just an override on the dashboard constructor. - Factor out the helper functions. Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/config.libsonnet | 12 ++ jsonnet/mimir-mixin/dashboard-utils.libsonnet | 102 ++++++++++++++ jsonnet/mimir-mixin/dashboards.libsonnet | 124 ++---------------- jsonnet/mimir-mixin/mixin.libsonnet | 6 +- 4 files changed, 125 insertions(+), 119 deletions(-) create mode 100644 jsonnet/mimir-mixin/config.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboard-utils.libsonnet diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet new file mode 100644 index 00000000000..236f5414962 --- /dev/null +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -0,0 +1,12 @@ +{ + grafanaDashboardFolder: 'Cortex', + grafanaDashboardShards: 4, + + _config+:: { + storage_backend: "cassandra", #error 'must specify storage backend (cassandra, gcp)', + // may contain 'chunks', 'tsdb' or both. Enables chunks- or tsdb- specific panels and dashboards. 
+ storage_engine: ['chunks'], + gcs_enabled: false, + tags: ['cortex'], + }, +} diff --git a/jsonnet/mimir-mixin/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboard-utils.libsonnet new file mode 100644 index 00000000000..3ae031ea041 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboard-utils.libsonnet @@ -0,0 +1,102 @@ +(import 'grafana-builder/grafana.libsonnet') { + + // Override the dashboard constructor to add: + // - default tags, + // - some links that propagate the selectred cluster. + dashboard(title):: + super.dashboard(title) + { + tags: $._config.tags, + + links: [ + { + asDropdown: true, + icon: 'external link', + includeVars: true, + keepTime: true, + tags: $._config.tags, + targetBlank: false, + title: 'Cortex Dashboards', + type: 'dashboards', + }, + ], + }, + + qpsPanel(selector):: + super.qpsPanel(selector) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + }, + + latencyPanel(metricName, selector, multiplier='1e3'):: + super.latencyPanel(metricName, selector, multiplier) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + }, + + successFailurePanel(title, successMetric, failureMetric):: + $.panel(title) + + $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + + $.stack + { + aliasColors: { + successful: '#7EB26D', + failed: '#E24D42', + }, + }, + + objectStorePanels1(title, metricPrefix):: + local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix]; + local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix]; + local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; + local interval = '$__interval'; + super.row(title) + .addPanel( + // We use 'up{cluster=~"$cluster", job="($namespace)/.+"}' to add 0 if there are no failed operations. + self.successFailurePanel( + 'Operations/sec', + 'sum(rate(%s{cluster=~"$cluster"}[%s])) - sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotal, interval, opsTotalFailures, interval], + 'sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotalFailures, interval] + ) + ) + .addPanel( + $.panel('Op: ObjectSize') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="objectsize"}'), + ) + .addPanel( + // Cortex (Thanos) doesn't track timing for 'iter', so we use ops/sec instead. 
+ $.panel('Op: Iter') + + $.queryPanel('sum(rate(%s{cluster=~"$cluster", operation="iter"}[$__interval]))' % [opsTotal], 'ops/sec') + ) + .addPanel( + $.panel('Op: Exists') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="exists"}'), + ), + + // Second row of Object Store stats + objectStorePanels2(title, metricPrefix):: + local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; + super.row(title) + .addPanel( + $.panel('Op: Get') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get"}'), + ) + .addPanel( + $.panel('Op: GetRange') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get_range"}'), + ) + .addPanel( + $.panel('Op: Upload') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="upload"}'), + ) + .addPanel( + $.panel('Op: Delete') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="delete"}'), + ), +} diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 3da773b1d97..fa448c29396 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -1,113 +1,7 @@ local utils = (import 'mixin-utils/utils.libsonnet'); +local g = (import 'dashboard-utils.libsonnet'); -local g = (import 'grafana-builder/grafana.libsonnet') + { - qpsPanel(selector):: - super.qpsPanel(selector) + { - targets: [ - target { - interval: '1m', - } - for target in super.targets - ], - }, - - latencyPanel(metricName, selector, multiplier='1e3'):: - super.latencyPanel(metricName, selector, multiplier) + { - targets: [ - target { - interval: '1m', - } - for target in super.targets - ], - }, - - successFailurePanel(title, successMetric, failureMetric):: - g.panel(title) + - g.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + - g.stack + { - aliasColors: { - successful: '#7EB26D', - failed: '#E24D42', - }, - }, - - objectStorePanels1(title, metricPrefix):: - local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix]; - local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix]; - local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; - local interval = '$__interval'; - super.row(title) - .addPanel( - // We use 'up{cluster=~"$cluster", job="($namespace)/.+"}' to add 0 if there are no failed operations. - self.successFailurePanel( - 'Operations/sec', - 'sum(rate(%s{cluster=~"$cluster"}[%s])) - sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotal, interval, opsTotalFailures, interval], - 'sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotalFailures, interval] - ) - ) - .addPanel( - g.panel('Op: ObjectSize') + - g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="objectsize"}'), - ) - .addPanel( - // Cortex (Thanos) doesn't track timing for 'iter', so we use ops/sec instead. 
- g.panel('Op: Iter') + - g.queryPanel('sum(rate(%s{cluster=~"$cluster", operation="iter"}[$__interval]))' % [opsTotal], 'ops/sec') - ) - .addPanel( - g.panel('Op: Exists') + - g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="exists"}'), - ), - - // Second row of Object Store stats - objectStorePanels2(title, metricPrefix):: - local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; - super.row(title) - .addPanel( - g.panel('Op: Get') + - g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get"}'), - ) - .addPanel( - g.panel('Op: GetRange') + - g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get_range"}'), - ) - .addPanel( - g.panel('Op: Upload') + - g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="upload"}'), - ) - .addPanel( - g.panel('Op: Delete') + - g.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="delete"}'), - ), -}; - -{ - dashboardWithTagsAndLinks(title):: - g.dashboard(title) + { - tags: $._config.tags, - - links: [ - { - asDropdown: true, - icon: 'external link', - includeVars: true, - keepTime: true, - tags: $._config.tags, - targetBlank: false, - title: 'Cortex Dashboards', - type: 'dashboards', - }, - ], - }, - - _config+:: { - storage_backend: error 'must specify storage backend (cassandra, gcp)', - // may contain 'chunks', 'tsdb' or both. Enables chunks- or tsdb- specific panels and dashboards. - storage_engine: ['chunks'], - gcs_enabled: false, - tags: ['cortex'], - }, - +g { grafanaDashboards+: { 'cortex-writes.json': local addGcsRow(dashboard) = if $._config.gcs_enabled then @@ -189,7 +83,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { addBlocksRows(addGcsRows($.cortex_reads_dashboard)), [if std.setMember('chunks', $._config.storage_engine) then 'cortex-chunks.json' else null]: - $.dashboardWithTagsAndLinks('Cortex / Chunks') + $.dashboard('Cortex / Chunks') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -240,7 +134,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), 'cortex-queries.json': - $.dashboardWithTagsAndLinks('Cortex / Queries') + $.dashboard('Cortex / Queries') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -362,7 +256,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), 'ruler.json': - $.dashboardWithTagsAndLinks('Cortex / Ruler') + $.dashboard('Cortex / Ruler') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -401,7 +295,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), 'cortex-scaling.json': - $.dashboardWithTagsAndLinks('Cortex / Scaling') + $.dashboard('Cortex / Scaling') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -509,7 +403,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { ), [if std.setMember('tsdb', $._config.storage_engine) then 'cortex-blocks.json' else null]: - $.dashboardWithTagsAndLinks('Cortex / Blocks') + $.dashboard('Cortex / Blocks') .addMultiTemplate('cluster', 
'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') // repeated from Cortex / Chunks @@ -702,7 +596,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { cortex_writes_dashboard:: local out = - $.dashboardWithTagsAndLinks('Cortex / Writes') + $.dashboard('Cortex / Writes') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( @@ -839,7 +733,7 @@ local g = (import 'grafana-builder/grafana.libsonnet') + { cortex_reads_dashboard:: local out = - $.dashboardWithTagsAndLinks('Cortex / Reads') + $.dashboard('Cortex / Reads') .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addRow( diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/jsonnet/mimir-mixin/mixin.libsonnet index 0a1c20aac5b..ed281b5b664 100644 --- a/jsonnet/mimir-mixin/mixin.libsonnet +++ b/jsonnet/mimir-mixin/mixin.libsonnet @@ -1,6 +1,4 @@ +(import 'config.libsonnet') + (import 'dashboards.libsonnet') + (import 'alerts.libsonnet') + -(import 'recording_rules.libsonnet') { - grafanaDashboardFolder: 'Cortex', - grafanaDashboardShards: 4, -} +(import 'recording_rules.libsonnet') From c3b49b69cb66a5a03d1184fb42e177c301f67675 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sun, 19 Apr 2020 15:42:04 +0100 Subject: [PATCH 027/364] More refactoring: - Move the dashboards to separate files and namespace them off so they only have access to the config. - Add a AddRowIf helper to massively rationalise the inclusion / exclusion of rates. - Alter how we conditionally include dashboards themselves. Should be a no-op change to the dashboards themselve. 
Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/config.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards.libsonnet | 892 +----------------- .../mimir-mixin/dashboards/blocks.libsonnet | 93 ++ .../mimir-mixin/dashboards/chunks.libsonnet | 54 ++ .../dashboards/comparison.libsonnet | 105 +++ .../dashboards/dashboard-utils.libsonnet | 109 +++ .../mimir-mixin/dashboards/queries.libsonnet | 126 +++ .../mimir-mixin/dashboards/reads.libsonnet | 185 ++++ .../mimir-mixin/dashboards/ruler.libsonnet | 43 + .../mimir-mixin/dashboards/scaling.libsonnet | 112 +++ .../mimir-mixin/dashboards/writes.libsonnet | 166 ++++ 11 files changed, 1015 insertions(+), 872 deletions(-) create mode 100644 jsonnet/mimir-mixin/dashboards/blocks.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/chunks.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/comparison.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/queries.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/reads.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/ruler.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/scaling.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/writes.libsonnet diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 236f5414962..0ba7f80d05b 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -3,7 +3,7 @@ grafanaDashboardShards: 4, _config+:: { - storage_backend: "cassandra", #error 'must specify storage backend (cassandra, gcp)', + storage_backend: 'cassandra', //error 'must specify storage backend (cassandra, gcp)', // may contain 'chunks', 'tsdb' or both. Enables chunks- or tsdb- specific panels and dashboards. storage_engine: ['chunks'], gcs_enabled: false, diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index fa448c29396..d9abe2bccc5 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -1,872 +1,22 @@ -local utils = (import 'mixin-utils/utils.libsonnet'); -local g = (import 'dashboard-utils.libsonnet'); - -g { - grafanaDashboards+: { - 'cortex-writes.json': - local addGcsRow(dashboard) = if $._config.gcs_enabled then - dashboard.addRow( - g.row('GCS') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_gcs_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="POST"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'POST')]) - ) - ) else dashboard; - - local addBlocksRows(dashboard) = if std.setMember('tsdb', $._config.storage_engine) then - // Used by ingester when using TSDB storage engine. 
- dashboard.addRow( - g.row('Blocks Shipper') - .addPanel( - g.successFailurePanel( - 'Uploaded blocks / sec', - 'sum(rate(cortex_ingester_shipper_uploads_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~"$cluster"}[$__interval]))', - 'sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~"$cluster"}[$__interval]))' - ) - ) - ) - .addRow(g.objectStorePanels1('Blocks Object Store Stats (Ingester)', 'cortex_ingester')) - .addRow(g.objectStorePanels2('', 'cortex_ingester')) - else dashboard; - - addBlocksRows(addGcsRow($.cortex_writes_dashboard)), - - 'cortex-reads.json': - local addGcsRows(dashboard) = if $._config.gcs_enabled then - dashboard.addRow( - g.row('GCS') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_gcs_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="GET"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'GET')]) - ) - ) - else dashboard; - - local addBlocksRows(dashboard) = if std.setMember('tsdb', $._config.storage_engine) then - dashboard.addRow( - g.row('Querier - Blocks Storage') - .addPanel( - g.successFailurePanel( - 'Block Loads / sec', - 'sum(rate(cortex_querier_bucket_store_block_loads_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_load_failures_total{cluster=~"$cluster"}[$__interval]))', - 'sum(rate(cortex_querier_bucket_store_block_load_failures_total{cluster=~"$cluster"}[$__interval]))' - ) - ) - .addPanel( - g.successFailurePanel( - 'Block Drops / sec', - 'sum(rate(cortex_querier_bucket_store_block_drops_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_drop_failures_total{cluster=~"$cluster"}[$__interval]))', - 'sum(rate(cortex_querier_bucket_store_block_drop_failures_total{cluster=~"$cluster"}[$__interval]))' - ) - ) - .addPanel( - g.panel('Per-block prepares and preloads duration') + - g.latencyPanel('cortex_querier_bucket_store_series_get_all_duration_seconds', '{cluster=~"$cluster"}'), - ) - .addPanel( - g.panel('Series merge duration') + - g.latencyPanel('cortex_querier_bucket_store_series_merge_duration_seconds', '{cluster=~"$cluster"}'), - ) - ) - .addRow(g.objectStorePanels1('Blocks Object Store Stats (Querier)', 'cortex_querier')) - .addRow(g.objectStorePanels2('', 'cortex_querier')) - else dashboard; - - addBlocksRows(addGcsRows($.cortex_reads_dashboard)), - - [if std.setMember('chunks', $._config.storage_engine) then 'cortex-chunks.json' else null]: - $.dashboard('Cortex / Chunks') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - g.row('Active Series / Chunks') - .addPanel( - g.panel('Series') + - g.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), - ) - .addPanel( - g.panel('Chunks per series') + - g.queryPanel('sum(cortex_ingester_memory_chunks{cluster=~"$cluster", job=~"($namespace)/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'chunks'), - ) - ) - .addRow( - g.row('Flush Stats') - .addPanel( - g.panel('Utilization') + - g.latencyPanel('cortex_ingester_chunk_utilization', '{cluster=~"$cluster", 
job=~"($namespace)/ingester"}', multiplier='1') + - { yaxes: g.yaxes('percentunit') }, - ) - .addPanel( - g.panel('Age') + - g.latencyPanel('cortex_ingester_chunk_age_seconds', '{cluster=~"$cluster", job=~"($namespace)/ingester"}'), - ), - ) - .addRow( - g.row('Flush Stats') - .addPanel( - g.panel('Size') + - g.latencyPanel('cortex_ingester_chunk_length', '{cluster=~"$cluster", job=~"($namespace)/ingester"}', multiplier='1') + - { yaxes: g.yaxes('short') }, - ) - .addPanel( - g.panel('Entries') + - g.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{cluster=~"$cluster", job=~"($namespace)/ingester"}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{cluster=~"$cluster", job=~"($namespace)/ingester"}[5m]))', 'entries'), - ), - ) - .addRow( - g.row('Flush Stats') - .addPanel( - g.panel('Queue Length') + - g.queryPanel('cortex_ingester_flush_queue_length{cluster=~"$cluster", job=~"($namespace)/ingester"}', '{{instance}}'), - ) - .addPanel( - g.panel('Flush Rate') + - g.qpsPanel('cortex_ingester_chunk_age_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}'), - ), - ), - - 'cortex-queries.json': - $.dashboard('Cortex / Queries') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - g.row('Query Frontend') - .addPanel( - g.panel('Queue Duration') + - g.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/query-frontend"}'), - ) - .addPanel( - g.panel('Retries') + - g.latencyPanel('cortex_query_frontend_retries', '{cluster=~"$cluster", job=~"($namespace)/query-frontend"}', multiplier=1) + - { yaxes: g.yaxes('short') }, - ) - .addPanel( - g.panel('Queue Length') + - g.queryPanel('cortex_query_frontend_queue_length{cluster=~"$cluster", job=~"($namespace)/query-frontend"}', '{{cluster}} / {{namespace}} / {{instance}}'), - ) - ) - .addRow( - g.row('Query Frontend - Results Cache') - .addPanel( - g.panel('Cache Hit %') + - g.queryPanel('sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) / sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 'Hit Rate') + - { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, - ) - .addPanel( - g.panel('Cache misses') + - g.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 'Miss Rate'), - ) - ) - .addRow( - g.row('Query Frontend - Sharding/Splitting') - .addPanel( - g.panel('Intervals per Query') + - g.queryPanel('sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster="$cluster", namespace="$namespace", method="split_by_interval"}[1m]))', 'partition rate'), - ) - .addPanel( - g.panel('Sharded Queries %') + - g.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) * 100', 'shard rate'), - ) - .addPanel( - g.panel('Sharding factor') + - g.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m]))', 
'Average'), - ) - ) - .addRow( - g.row('Querier') - .addPanel( - g.panel('Stages') + - g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster",job=~"($namespace)/querier"}) * 1e3', '{{slice}}') + - { yaxes: g.yaxes('ms') } + - g.stack, - ) - .addPanel( - g.panel('Chunk cache misses') + - g.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/querier",name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/querier",name="chunksmemcache"}[1m]))', 'Hit rate'), - ) - .addPanel( - g.panel('Chunk cache corruptions') + - g.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'Corrupt chunks'), - ) - ) - .addRow( - g.row('Querier - Index Cache') - .addPanel( - g.panel('Total entries') + - g.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"})', 'Entries'), - ) - .addPanel( - g.panel('Cache Hit %') + - g.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'hit rate') - { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, - ) - .addPanel( - g.panel('Churn Rate') + - g.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'churn rate'), - ) - ) - .addRow( - g.row('Ingester') - .addPanel( - g.panel('Series per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_series', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + - { yaxes: g.yaxes('short') }, - ) - .addPanel( - g.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + - { yaxes: g.yaxes('short') }, - ) - .addPanel( - g.panel('Samples per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + - { yaxes: g.yaxes('short') }, - ) - ) - .addRow( - g.row('Chunk Store') - .addPanel( - g.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + - { yaxes: g.yaxes('short') }, - ) - .addPanel( - g.panel('Series (pre-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + - { yaxes: g.yaxes('short') }, - ) - .addPanel( - g.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', 
'($namespace)/querier')], multiplier=1) + - { yaxes: g.yaxes('short') }, - ) - .addPanel( - g.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + - { yaxes: g.yaxes('short') }, - ) - ), - - 'ruler.json': - $.dashboard('Cortex / Ruler') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - g.row('Rule Evaluations') - .addPanel( - g.panel('EPS') + - g.queryPanel('sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'rules processed'), - ) - .addPanel( - g.panel('Latency') + - g.queryPanel( - ||| - sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) - / - sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) - |||, 'average' - ), - ) - ) - .addRow( - g.row('Group Evaluations') - .addPanel( - g.panel('Missed Iterations') + - g.queryPanel('sum(rate(cortex_prometheus_rule_group_iterations_missed_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'iterations missed'), - ) - .addPanel( - g.panel('Latency') + - g.queryPanel( - ||| - sum (rate(cortex_prometheus_rule_group_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) - / - sum (rate(cortex_prometheus_rule_group_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) - |||, 'average' - ), - ) - ), - - 'cortex-scaling.json': - $.dashboard('Cortex / Scaling') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - g.row('Workload-based scaling') - .addPanel( - g.panel('Workload-based scaling') + { sort: { col: 1, desc: false } } + - g.tablePanel([ - ||| - sum by (cluster, namespace, deployment) ( - kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"} - or - label_replace( - kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}, - "deployment", "$1", "statefulset", "(.*)" - ) - ) - |||, - ||| - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:]) - * 3 / 80e3 - |||, - ||| - label_replace( - sum by(cluster, namespace) ( - cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"} - ) / 1e+6, - "deployment", "ingester", "cluster", ".*" - ) - or - label_replace( - sum by (cluster, namespace) ( - 4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} - * - cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} - / - cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} - ) - / - avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}), - "deployment", "memcached", "namespace", ".*" - ) - |||, - ], { - cluster: { alias: 'Cluster' 
}, - namespace: { alias: 'Namespace' }, - deployment: { alias: 'Deployment' }, - 'Value #A': { alias: 'Current Replicas', decimals: 0 }, - 'Value #B': { alias: 'Required Replicas, by ingestion rate', decimals: 0 }, - 'Value #C': { alias: 'Required Replicas, by active series', decimals: 0 }, - }) - ) - ) - .addRow( - (g.row('Resource-based scaling') + { height: '500px' }) - .addPanel( - g.panel('Resource-based scaling') + { sort: { col: 1, desc: false } } + - g.tablePanel([ - ||| - sum by (cluster, namespace, deployment) ( - kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} - or - label_replace( - kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, - "deployment", "$1", "statefulset", "(.*)" - ) - ) - |||, - ||| - sum by (cluster, namespace, deployment) ( - kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} - or - label_replace( - kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, - "deployment", "$1", "statefulset", "(.*)" - ) - ) - * - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:]) - / - sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) - |||, - ||| - sum by (cluster, namespace, deployment) ( - kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} - or - label_replace( - kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, - "deployment", "$1", "statefulset", "(.*)" - ) - ) - * - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m]) - / - sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) - |||, - ], { - cluster: { alias: 'Cluster' }, - namespace: { alias: 'Namespace' }, - deployment: { alias: 'Deployment' }, - 'Value #A': { alias: 'Current Replicas', decimals: 0 }, - 'Value #B': { alias: 'Required Replicas, by CPU usage', decimals: 0 }, - 'Value #C': { alias: 'Required Replicas, by Memory usage', decimals: 0 }, - }) - ) - ), - - [if std.setMember('tsdb', $._config.storage_engine) then 'cortex-blocks.json' else null]: - $.dashboard('Cortex / Blocks') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - // repeated from Cortex / Chunks - .addRow( - g.row('Active Series / Chunks') - .addPanel( - g.panel('Series') + - g.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), - ) - // Chunks per series doesn't make sense for Blocks storage - ) - .addRow( - g.row('Compactor') - .addPanel( - g.successFailurePanel( - 'Compactor Runs / second', - 'sum(rate(cortex_compactor_runs_completed_total{cluster=~"$cluster"}[$__interval]))', - 'sum(rate(cortex_compactor_runs_failed_total{cluster=~"$cluster"}[$__interval]))' - ) - ) - .addPanel( - 
g.successFailurePanel( - 'Per-tenant Compaction Runs / seconds', - 'sum(rate(cortex_compactor_group_compactions_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_group_compactions_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - 'sum(rate(cortex_compactor_group_compactions_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - ) - ) - ) - .addRow( - g.row('Compactor – Blocks Garbage Collections') - .addPanel( - g.successFailurePanel( - 'Collections Rate', - 'sum(rate(cortex_compactor_garbage_collection_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_garbage_collection_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - 'sum(rate(cortex_compactor_garbage_collection_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - ) - ) - .addPanel( - g.panel('Collections Duration') + - g.latencyPanel('cortex_compactor_garbage_collection_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') - ) - .addPanel( - g.panel('Collected Blocks Rate') + - g.queryPanel('sum(rate(cortex_compactor_garbage_collected_blocks_total{cluster=~"$cluster"}[$__interval]))', 'blocks') - ) - ) - .addRow( - g.row('Compactor - Meta Syncs') - .addPanel( - g.successFailurePanel( - 'Meta Syncs / sec', - 'sum(rate(cortex_compactor_sync_meta_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_sync_meta_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - 'sum(rate(cortex_compactor_sync_meta_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - ) - ) - .addPanel( - g.panel('Meta Sync Durations') + - g.latencyPanel('cortex_compactor_sync_meta_duration_seconds', '{cluster=~"$cluster"}'), - ) - ) - .addRow( - g.row('Prometheus TSDB Compactions') - .addPanel( - g.panel('Compactions Rate') + - g.queryPanel('sum(rate(prometheus_tsdb_compactions_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', 'rate') - ) - .addPanel( - g.panel('Compaction Duration') + - g.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') - ) - .addPanel( - g.panel('Chunk Size Bytes') + - g.latencyPanel('prometheus_tsdb_compaction_chunk_size_bytes', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + - { yaxes: g.yaxes('bytes') } - ) - .addPanel( - g.panel('Chunk Samples') + - g.latencyPanel('prometheus_tsdb_compaction_chunk_samples', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + - { yaxes: g.yaxes('short') } - ) - .addPanel( - g.panel('Chunk Range (seconds)') + - g.latencyPanel('prometheus_tsdb_compaction_chunk_range_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') - ) - ) - .addRow(g.objectStorePanels1('Object Store Stats', 'cortex_compactor')) - .addRow(g.objectStorePanels2('', 'cortex_compactor')), - - [if std.setMember('tsdb', $._config.storage_engine) && std.setMember('chunks', $._config.storage_engine) then 'cortex-blocks-vs-chunks.json' else null]: - g.dashboard('Cortex / Blocks vs Chunks') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addTemplate('blocks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addTemplate('chunks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - 
.addRow( - g.row('Ingesters') - .addPanel( - g.panel('Samples / sec') + - g.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[$__interval]))', 'blocks') + - g.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[$__interval]))', 'chunks') - ) - ) - .addRow( - g.row('') - .addPanel( - g.panel('Blocks Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($blocks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - .addPanel( - g.panel('Chunks Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($chunks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - ) - .addRow( - g.row('') - .addPanel( - g.panel('CPU per sample') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__interval]))', 'blocks') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__interval]))', 'chunks') - ) - .addPanel( - g.panel('Memory per active series') + - g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + - g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + - g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + - g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + - { yaxes: g.yaxes('bytes') } - ) - ) - .addRow( - g.row('') - .addPanel( - g.panel('CPU') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval]))', 'blocks') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval]))', 'chunks') - ) - .addPanel( - g.panel('Memory') + - g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"})', 'blocks - working set') + - g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"})', 'chunks - working set') + - g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + - 
g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + - { yaxes: g.yaxes('bytes') } - ) - ) - .addRow( - g.row('Queriers') - .addPanel( - g.panel('Queries / sec (query-frontend)') + - g.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$blocks_namespace/query-frontend",route!="metrics"}[$__interval]))', 'blocks') + - g.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$chunks_namespace/query-frontend",route!="metrics"}[$__interval]))', 'chunks') - ) - .addPanel( - g.panel('Queries / sec (query-tee)') + - g.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval]))', 'blocks') + - g.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval]))', 'chunks') - ) - ) - .addRow( - g.row('') - .addPanel( - g.panel('Latency 99th') + - g.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval])))', 'blocks') + - g.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval])))', 'chunks') + - { yaxes: g.yaxes('s') } - ) - .addPanel( - g.panel('Latency average') + - g.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval]))', 'blocks') + - g.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval]))', 'chunks') + - { yaxes: g.yaxes('s') } - ) - ) - .addRow( - g.row('') - .addPanel( - g.panel('CPU') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"}[$__interval]))', 'blocks') + - g.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"}[$__interval]))', 'chunks') - ) - .addPanel( - g.panel('Memory') + - g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"})', 'blocks - working set') + - g.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"})', 'chunks - working set') + - g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/querier"})', 'blocks - heap inuse') + - g.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/querier"})', 'chunks - heap inuse') + - { yaxes: g.yaxes('bytes') } - ) - ), - }, - - cortex_writes_dashboard:: - local out = - $.dashboard('Cortex / Writes') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - 
(g.row('Headlines') + - { - height: '100px', - showTitle: false, - }) - .addPanel( - g.panel('Samples / s') + - g.statPanel('sum(cluster_namespace:cortex_distributor_received_samples:rate5m{cluster=~"$cluster", namespace=~"$namespace"})', format='reqps') - ) - .addPanel( - g.panel('Active Series') + - g.statPanel(||| - sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"} - / on(namespace) group_left - max by (namespace) (cortex_distributor_replication_factor{cluster=~"$cluster", job=~"($namespace)/distributor"})) - |||, format='short') - ) - .addPanel( - g.panel('QPS') + - g.statPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}[5m]))', format='reqps') - ) - ) - .addRow( - g.row('Gateway') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'api_prom_push')]) - ) - ) - .addRow( - g.row('Distributor') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) - ) - ) - .addRow( - g.row('Etcd (HA Dedupe)') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) - ) - ) - .addRow( - g.row('Ingester') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route="/cortex.Ingester/Push"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - ) - .addRow( - g.row('Consul (Ring)') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')]) - ) - ); - - local addChunksRows(dashboard) = - if std.setMember('chunks', $._config.storage_engine) then - dashboard.addRow( - g.row('Memcached') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_memcache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",method="Memcache.Put"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('method', 'Memcache.Put')]) - ) - ) else dashboard; - - local addStorageRows(dashboard) = if $._config.storage_backend == 'cassandra' then - 
dashboard.addRow( - g.row('Cassandra') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="INSERT"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'INSERT')]) - ) - ) - else if $._config.storage_backend == 'gcp' && std.setMember('chunks', $._config.storage_engine) then // only show BigTable if chunks panels are enabled - dashboard.addRow( - g.row('BigTable') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="/google.bigtable.v2.Bigtable/MutateRows"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) - ) - ) - else if $._config.storage_backend == 'dynamodb' then - dashboard.addRow( - g.row('DynamoDB') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="DynamoDB.BatchWriteItem"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) - ) - ) else dashboard; - - addStorageRows(addChunksRows(out)), - - cortex_reads_dashboard:: - local out = - $.dashboard('Cortex / Reads') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - g.row('Gateway') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route=~"(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.re('route', '(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)')]) - ) - ) - .addRow( - g.row('Query Frontend') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend'), utils.selector.neq('route', '/frontend.Frontend/Process')]) - ) - ) - .addRow( - g.row('Cache - Query Results') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', 
'($namespace)/query-frontend')]) - ) - ) - .addRow( - g.row('Querier') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')]) - ) - ) - .addRow( - g.row('Ingester') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route!~"/cortex.Ingester/Push|metrics|ready|traces"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.nre('route', '/cortex.Ingester/Push|metrics|ready')]) - ) - ); - - local addChunksRows(dashboard) = if std.setMember('chunks', $._config.storage_engine) then - dashboard.addRow( - g.row('Memcached - Index') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="store.index-cache-read.memcache.fetch"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) - ) - ) - .addRow( - g.row('Memcached - Chunks') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="chunksmemcache.fetch"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'chunksmemcache.fetch')]) - ) - ) else dashboard; - - local addBlocksRows(dashboard) = if std.setMember('tsdb', $._config.storage_engine) then - dashboard.addRow( - g.row('Memcached - Blocks Index') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",operation="getmulti"}') - ) - .addPanel( - g.panel('Latency') + - g.latencyPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/querier", operation="getmulti"}') - ) - ) - else dashboard; - - local addStorageRows(dashboard) = - if $._config.storage_backend == 'cassandra' then - dashboard.addRow( - g.row('Cassandra') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="SELECT"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'SELECT')]) - ), - ) - else if $._config.storage_backend == 'gcp' && std.setMember('chunks', $._config.storage_engine) then // only show BigTable if chunks panels are enabled - dashboard.addRow( - g.row('BigTable') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", 
operation="/google.bigtable.v2.Bigtable/ReadRows"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) - ), - ) - else if $._config.storage_backend == 'dynamodb' then - dashboard.addRow( - g.row('DynamoDB') - .addPanel( - g.panel('QPS') + - g.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="DynamoDB.QueryPages"}') - ) - .addPanel( - g.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'DynamoDB.QueryPages')]) - ), - ) else dashboard; - - addStorageRows(addBlocksRows(addChunksRows(out))), +{ + grafanaDashboards+: + { _config:: $._config } + + (import 'dashboards/queries.libsonnet') + + (import 'dashboards/reads.libsonnet') + + (import 'dashboards/ruler.libsonnet') + + (import 'dashboards/scaling.libsonnet') + + (import 'dashboards/writes.libsonnet') + + + (if std.setMember('tsdb', $._config.storage_engine) + then import 'dashboards/blocks.libsonnet' + else {}) + + + (if std.setMember('chunks', $._config.storage_engine) + then import 'dashboards/chunks.libsonnet' + else {}) + + + (if std.setMember('tsdb', $._config.storage_engine) + && std.setMember('chunks', $._config.storage_engine) + then import 'dashboards/comparison.libsonnet' + else {}), } diff --git a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet new file mode 100644 index 00000000000..62a121bfcb0 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet @@ -0,0 +1,93 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-blocks.json': + $.dashboard('Cortex / Blocks') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + // repeated from Cortex / Chunks + .addRow( + $.row('Active Series / Chunks') + .addPanel( + $.panel('Series') + + $.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), + ) + // Chunks per series doesn't make sense for Blocks storage + ) + .addRow( + $.row('Compactor') + .addPanel( + $.successFailurePanel( + 'Compactor Runs / second', + 'sum(rate(cortex_compactor_runs_completed_total{cluster=~"$cluster"}[$__interval]))', + 'sum(rate(cortex_compactor_runs_failed_total{cluster=~"$cluster"}[$__interval]))' + ) + ) + .addPanel( + $.successFailurePanel( + 'Per-tenant Compaction Runs / seconds', + 'sum(rate(cortex_compactor_group_compactions_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_group_compactions_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + 'sum(rate(cortex_compactor_group_compactions_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + ) + ) + ) + .addRow( + $.row('Compactor – Blocks Garbage Collections') + .addPanel( + $.successFailurePanel( + 'Collections Rate', + 'sum(rate(cortex_compactor_garbage_collection_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - 
sum(rate(cortex_compactor_garbage_collection_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + 'sum(rate(cortex_compactor_garbage_collection_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + ) + ) + .addPanel( + $.panel('Collections Duration') + + $.latencyPanel('cortex_compactor_garbage_collection_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + ) + .addPanel( + $.panel('Collected Blocks Rate') + + $.queryPanel('sum(rate(cortex_compactor_garbage_collected_blocks_total{cluster=~"$cluster"}[$__interval]))', 'blocks') + ) + ) + .addRow( + $.row('Compactor - Meta Syncs') + .addPanel( + $.successFailurePanel( + 'Meta Syncs / sec', + 'sum(rate(cortex_compactor_sync_meta_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_sync_meta_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + 'sum(rate(cortex_compactor_sync_meta_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + ) + ) + .addPanel( + $.panel('Meta Sync Durations') + + $.latencyPanel('cortex_compactor_sync_meta_duration_seconds', '{cluster=~"$cluster"}'), + ) + ) + .addRow( + $.row('Prometheus TSDB Compactions') + .addPanel( + $.panel('Compactions Rate') + + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', 'rate') + ) + .addPanel( + $.panel('Compaction Duration') + + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + ) + .addPanel( + $.panel('Chunk Size Bytes') + + $.latencyPanel('prometheus_tsdb_compaction_chunk_size_bytes', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + + { yaxes: $.yaxes('bytes') } + ) + .addPanel( + $.panel('Chunk Samples') + + $.latencyPanel('prometheus_tsdb_compaction_chunk_samples', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + + { yaxes: $.yaxes('short') } + ) + .addPanel( + $.panel('Chunk Range (seconds)') + + $.latencyPanel('prometheus_tsdb_compaction_chunk_range_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + ) + ) + .addRow($.objectStorePanels1('Object Store Stats', 'cortex_compactor')) + .addRow($.objectStorePanels2('', 'cortex_compactor')), +} diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet new file mode 100644 index 00000000000..21938d09a73 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -0,0 +1,54 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-chunks.json': + $.dashboard('Cortex / Chunks') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + $.row('Active Series / Chunks') + .addPanel( + $.panel('Series') + + $.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), + ) + .addPanel( + $.panel('Chunks per series') + + $.queryPanel('sum(cortex_ingester_memory_chunks{cluster=~"$cluster", job=~"($namespace)/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'chunks'), + ) + ) + .addRow( + $.row('Flush Stats') + .addPanel( + $.panel('Utilization') + + $.latencyPanel('cortex_ingester_chunk_utilization', '{cluster=~"$cluster", 
job=~"($namespace)/ingester"}', multiplier='1') + + { yaxes: $.yaxes('percentunit') }, + ) + .addPanel( + $.panel('Age') + + $.latencyPanel('cortex_ingester_chunk_age_seconds', '{cluster=~"$cluster", job=~"($namespace)/ingester"}'), + ), + ) + .addRow( + $.row('Flush Stats') + .addPanel( + $.panel('Size') + + $.latencyPanel('cortex_ingester_chunk_length', '{cluster=~"$cluster", job=~"($namespace)/ingester"}', multiplier='1') + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Entries') + + $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{cluster=~"$cluster", job=~"($namespace)/ingester"}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{cluster=~"$cluster", job=~"($namespace)/ingester"}[5m]))', 'entries'), + ), + ) + .addRow( + $.row('Flush Stats') + .addPanel( + $.panel('Queue Length') + + $.queryPanel('cortex_ingester_flush_queue_length{cluster=~"$cluster", job=~"($namespace)/ingester"}', '{{instance}}'), + ) + .addPanel( + $.panel('Flush Rate') + + $.qpsPanel('cortex_ingester_chunk_age_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}'), + ), + ), +} diff --git a/jsonnet/mimir-mixin/dashboards/comparison.libsonnet b/jsonnet/mimir-mixin/dashboards/comparison.libsonnet new file mode 100644 index 00000000000..2646fffb981 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/comparison.libsonnet @@ -0,0 +1,105 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') +{ + 'cortex-blocks-vs-chunks.json': + $.dashboard('Cortex / Blocks vs Chunks') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addTemplate('blocks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addTemplate('chunks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + $.row('Ingesters') + .addPanel( + $.panel('Samples / sec') + + $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[$__interval]))', 'chunks') + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Blocks Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($blocks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + ) + .addPanel( + $.panel('Chunks Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($chunks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('CPU per sample') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__interval]))', 'chunks') + ) + .addPanel( + $.panel('Memory per active series') + + 
$.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + + { yaxes: $.yaxes('bytes') } + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('CPU') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval]))', 'chunks') + ) + .addPanel( + $.panel('Memory') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"})', 'blocks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"})', 'chunks - working set') + + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + + { yaxes: $.yaxes('bytes') } + ) + ) + .addRow( + $.row('Queriers') + .addPanel( + $.panel('Queries / sec (query-frontend)') + + $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$blocks_namespace/query-frontend",route!="metrics"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$chunks_namespace/query-frontend",route!="metrics"}[$__interval]))', 'chunks') + ) + .addPanel( + $.panel('Queries / sec (query-tee)') + + $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval]))', 'chunks') + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Latency 99th') + + $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval])))', 'blocks') + + $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval])))', 'chunks') + + { yaxes: $.yaxes('s') } + ) + .addPanel( + $.panel('Latency average') + + $.queryPanel('sum by(backend) 
(rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval]))', 'blocks') +
+        $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval]))', 'chunks') +
+        { yaxes: $.yaxes('s') }
+      )
+    )
+    .addRow(
+      $.row('')
+      .addPanel(
+        $.panel('CPU') +
+        $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"}[$__interval]))', 'blocks') +
+        $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"}[$__interval]))', 'chunks')
+      )
+      .addPanel(
+        $.panel('Memory') +
+        $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"})', 'blocks - working set') +
+        $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"})', 'chunks - working set') +
+        $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/querier"})', 'blocks - heap inuse') +
+        $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/querier"})', 'chunks - heap inuse') +
+        { yaxes: $.yaxes('bytes') }
+      )
+    ),
+}
diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet
new file mode 100644
index 00000000000..df5b18fbce9
--- /dev/null
+++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet
@@ -0,0 +1,109 @@
+(import 'grafana-builder/grafana.libsonnet') {
+
+  //_config:: error "must provide _config",
+
+  // Override the dashboard constructor to add:
+  // - default tags,
+  // - some links that propagate the selected cluster.
+  dashboard(title)::
+    super.dashboard(title) + {
+      tags: $._config.tags,
+
+      links: [
+        {
+          asDropdown: true,
+          icon: 'external link',
+          includeVars: true,
+          keepTime: true,
+          tags: $._config.tags,
+          targetBlank: false,
+          title: 'Cortex Dashboards',
+          type: 'dashboards',
+        },
+      ],
+
+      addRowIf(condition, row)::
+        if condition
+        then self.addRow(row)
+        else self,
+    },
+
+  qpsPanel(selector)::
+    super.qpsPanel(selector) + {
+      targets: [
+        target {
+          interval: '1m',
+        }
+        for target in super.targets
+      ],
+    },
+
+  latencyPanel(metricName, selector, multiplier='1e3')::
+    super.latencyPanel(metricName, selector, multiplier) + {
+      targets: [
+        target {
+          interval: '1m',
+        }
+        for target in super.targets
+      ],
+    },
+
+  successFailurePanel(title, successMetric, failureMetric)::
+    $.panel(title) +
+    $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) +
+    $.stack + {
+      aliasColors: {
+        successful: '#7EB26D',
+        failed: '#E24D42',
+      },
+    },
+
+  objectStorePanels1(title, metricPrefix)::
+    local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix];
+    local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix];
+    local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix];
+    local interval = '$__interval';
+    super.row(title)
+    .addPanel(
+      // We use 'up{cluster=~"$cluster", job="($namespace)/.+"}' to add 0 if there are no failed operations.
+      self.successFailurePanel(
+        'Operations/sec',
+        'sum(rate(%s{cluster=~"$cluster"}[%s])) - sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotal, interval, opsTotalFailures, interval],
+        'sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotalFailures, interval]
+      )
+    )
+    .addPanel(
+      $.panel('Op: ObjectSize') +
+      $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="objectsize"}'),
+    )
+    .addPanel(
+      // Cortex (Thanos) doesn't track timing for 'iter', so we use ops/sec instead.
+ $.panel('Op: Iter') + + $.queryPanel('sum(rate(%s{cluster=~"$cluster", operation="iter"}[$__interval]))' % [opsTotal], 'ops/sec') + ) + .addPanel( + $.panel('Op: Exists') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="exists"}'), + ), + + // Second row of Object Store stats + objectStorePanels2(title, metricPrefix):: + local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; + super.row(title) + .addPanel( + $.panel('Op: Get') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get"}'), + ) + .addPanel( + $.panel('Op: GetRange') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get_range"}'), + ) + .addPanel( + $.panel('Op: Upload') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="upload"}'), + ) + .addPanel( + $.panel('Op: Delete') + + $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="delete"}'), + ), +} diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet new file mode 100644 index 00000000000..8bc9a641f35 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -0,0 +1,126 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + + 'cortex-queries.json': + $.dashboard('Cortex / Queries') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + $.row('Query Frontend') + .addPanel( + $.panel('Queue Duration') + + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/query-frontend"}'), + ) + .addPanel( + $.panel('Retries') + + $.latencyPanel('cortex_query_frontend_retries', '{cluster=~"$cluster", job=~"($namespace)/query-frontend"}', multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Queue Length') + + $.queryPanel('cortex_query_frontend_queue_length{cluster=~"$cluster", job=~"($namespace)/query-frontend"}', '{{cluster}} / {{namespace}} / {{instance}}'), + ) + ) + .addRow( + $.row('Query Frontend - Results Cache') + .addPanel( + $.panel('Cache Hit %') + + $.queryPanel('sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) / sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 'Hit Rate') + + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + $.panel('Cache misses') + + $.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 'Miss Rate'), + ) + ) + .addRow( + $.row('Query Frontend - Sharding/Splitting') + .addPanel( + $.panel('Intervals per Query') + + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster="$cluster", namespace="$namespace", method="split_by_interval"}[1m]))', 'partition rate'), + ) + .addPanel( + $.panel('Sharded Queries %') + + $.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) * 100', 'shard rate'), + ) + .addPanel( + $.panel('Sharding factor') + + 
$.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m]))', 'Average'), + ) + ) + .addRow( + $.row('Querier') + .addPanel( + $.panel('Stages') + + $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster",job=~"($namespace)/querier"}) * 1e3', '{{slice}}') + + { yaxes: $.yaxes('ms') } + + $.stack, + ) + .addPanel( + $.panel('Chunk cache misses') + + $.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/querier",name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/querier",name="chunksmemcache"}[1m]))', 'Hit rate'), + ) + .addPanel( + $.panel('Chunk cache corruptions') + + $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'Corrupt chunks'), + ) + ) + .addRow( + $.row('Querier - Index Cache') + .addPanel( + $.panel('Total entries') + + $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"})', 'Entries'), + ) + .addPanel( + $.panel('Cache Hit %') + + $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'hit rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + $.panel('Churn Rate') + + $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'churn rate'), + ) + ) + .addRow( + $.row('Ingester') + .addPanel( + $.panel('Series per Query') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Chunks per Query') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Samples per Query') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + ) + .addRow( + $.row('Chunk Store') + .addPanel( + $.panel('Index Lookups per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Series (pre-intersection) per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + 
$.panel('Series (post-intersection) per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Chunks per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + ), +} diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet new file mode 100644 index 00000000000..289dbb863c2 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -0,0 +1,185 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-reads.json': + $.dashboard('Cortex / Reads') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + $.row('Gateway') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route=~"(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.re('route', '(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)')]) + ) + ) + .addRow( + $.row('Query Frontend') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend'), utils.selector.neq('route', '/frontend.Frontend/Process')]) + ) + ) + .addRow( + $.row('Cache - Query Results') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend')]) + ) + ) + .addRow( + $.row('Querier') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')]) + ) + ) + .addRow( + $.row('Ingester') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route!~"/cortex.Ingester/Push|metrics|ready|traces"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.nre('route', 
'/cortex.Ingester/Push|metrics|ready')]) + ) + ) + .addRowIf( + std.setMember('chunks', $._config.storage_engine), + $.row('Memcached - Index') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="store.index-cache-read.memcache.fetch"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) + ) + ) + .addRowIf( + std.setMember('chunks', $._config.storage_engine), + $.row('Memcached - Chunks') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="chunksmemcache.fetch"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'chunksmemcache.fetch')]) + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Memcached - Blocks Index') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",operation="getmulti"}') + ) + .addPanel( + $.panel('Latency') + + $.latencyPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/querier", operation="getmulti"}') + ) + ) + .addRowIf( + $._config.storage_backend == 'cassandra', + $.row('Cassandra') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="SELECT"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'SELECT')]) + ) + ) + .addRowIf( + // only show BigTable if chunks panels are enabled + $._config.storage_backend == 'gcp' && std.setMember('chunks', $._config.storage_engine), + + $.row('BigTable') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="/google.bigtable.v2.Bigtable/ReadRows"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) + ), + ) + .addRowIf( + $._config.storage_backend == 'dynamodb', + $.row('DynamoDB') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="DynamoDB.QueryPages"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'DynamoDB.QueryPages')]) + ), + ) + .addRowIf( + + $._config.gcs_enabled, + + $.row('GCS') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_gcs_request_duration_seconds_count{cluster=~"$cluster", 
job=~"($namespace)/querier", operation="GET"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'GET')]) + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Querier - Blocks Storage') + .addPanel( + $.successFailurePanel( + 'Block Loads / sec', + 'sum(rate(cortex_querier_bucket_store_block_loads_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_load_failures_total{cluster=~"$cluster"}[$__interval]))', + 'sum(rate(cortex_querier_bucket_store_block_load_failures_total{cluster=~"$cluster"}[$__interval]))' + ) + ) + .addPanel( + $.successFailurePanel( + 'Block Drops / sec', + 'sum(rate(cortex_querier_bucket_store_block_drops_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_drop_failures_total{cluster=~"$cluster"}[$__interval]))', + 'sum(rate(cortex_querier_bucket_store_block_drop_failures_total{cluster=~"$cluster"}[$__interval]))' + ) + ) + .addPanel( + $.panel('Per-block prepares and preloads duration') + + $.latencyPanel('cortex_querier_bucket_store_series_get_all_duration_seconds', '{cluster=~"$cluster"}'), + ) + .addPanel( + $.panel('Series merge duration') + + $.latencyPanel('cortex_querier_bucket_store_series_merge_duration_seconds', '{cluster=~"$cluster"}'), + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.objectStorePanels1('Blocks Object Store Stats (Querier)', 'cortex_querier'), + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.objectStorePanels2('', 'cortex_querier'), + ), +} diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet new file mode 100644 index 00000000000..5e87d55858e --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -0,0 +1,43 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + + 'ruler.json': + $.dashboard('Cortex / Ruler') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + $.row('Rule Evaluations') + .addPanel( + $.panel('EPS') + + $.queryPanel('sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'rules processed'), + ) + .addPanel( + $.panel('Latency') + + $.queryPanel( + ||| + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + / + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + |||, 'average' + ), + ) + ) + .addRow( + $.row('Group Evaluations') + .addPanel( + $.panel('Missed Iterations') + + $.queryPanel('sum(rate(cortex_prometheus_rule_group_iterations_missed_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'iterations missed'), + ) + .addPanel( + $.panel('Latency') + + $.queryPanel( + ||| + sum (rate(cortex_prometheus_rule_group_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + / + sum (rate(cortex_prometheus_rule_group_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + |||, 'average' + ), + ) + ), +} diff --git 
a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet new file mode 100644 index 00000000000..8142d23bf87 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet @@ -0,0 +1,112 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + + 'cortex-scaling.json': + $.dashboard('Cortex / Scaling') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + $.row('Workload-based scaling') + .addPanel( + $.panel('Workload-based scaling') + { sort: { col: 1, desc: false } } + + $.tablePanel([ + ||| + sum by (cluster, namespace, deployment) ( + kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"} + or + label_replace( + kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}, + "deployment", "$1", "statefulset", "(.*)" + ) + ) + |||, + ||| + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:]) + * 3 / 80e3 + |||, + ||| + label_replace( + sum by(cluster, namespace) ( + cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"} + ) / 1e+6, + "deployment", "ingester", "cluster", ".*" + ) + or + label_replace( + sum by (cluster, namespace) ( + 4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} + * + cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} + / + cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} + ) + / + avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}), + "deployment", "memcached", "namespace", ".*" + ) + |||, + ], { + cluster: { alias: 'Cluster' }, + namespace: { alias: 'Namespace' }, + deployment: { alias: 'Deployment' }, + 'Value #A': { alias: 'Current Replicas', decimals: 0 }, + 'Value #B': { alias: 'Required Replicas, by ingestion rate', decimals: 0 }, + 'Value #C': { alias: 'Required Replicas, by active series', decimals: 0 }, + }) + ) + ) + .addRow( + ($.row('Resource-based scaling') + { height: '500px' }) + .addPanel( + $.panel('Resource-based scaling') + { sort: { col: 1, desc: false } } + + $.tablePanel([ + ||| + sum by (cluster, namespace, deployment) ( + kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} + or + label_replace( + kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, + "deployment", "$1", "statefulset", "(.*)" + ) + ) + |||, + ||| + sum by (cluster, namespace, deployment) ( + kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} + or + label_replace( + kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, + "deployment", "$1", "statefulset", "(.*)" + ) + ) + * + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:]) + / + sum by (cluster, namespace, deployment) 
(label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) + |||, + ||| + sum by (cluster, namespace, deployment) ( + kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} + or + label_replace( + kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, + "deployment", "$1", "statefulset", "(.*)" + ) + ) + * + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m]) + / + sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) + |||, + ], { + cluster: { alias: 'Cluster' }, + namespace: { alias: 'Namespace' }, + deployment: { alias: 'Deployment' }, + 'Value #A': { alias: 'Current Replicas', decimals: 0 }, + 'Value #B': { alias: 'Required Replicas, by CPU usage', decimals: 0 }, + 'Value #C': { alias: 'Required Replicas, by Memory usage', decimals: 0 }, + }) + ) + ), +} diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet new file mode 100644 index 00000000000..bf60cca1176 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -0,0 +1,166 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-writes.json': + $.dashboard('Cortex / Writes') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') + .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addRow( + ($.row('Headlines') + + { + height: '100px', + showTitle: false, + }) + .addPanel( + $.panel('Samples / s') + + $.statPanel('sum(cluster_namespace:cortex_distributor_received_samples:rate5m{cluster=~"$cluster", namespace=~"$namespace"})', format='reqps') + ) + .addPanel( + $.panel('Active Series') + + $.statPanel(||| + sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"} + / on(namespace) group_left + max by (namespace) (cortex_distributor_replication_factor{cluster=~"$cluster", job=~"($namespace)/distributor"})) + |||, format='short') + ) + .addPanel( + $.panel('QPS') + + $.statPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}[5m]))', format='reqps') + ) + ) + .addRow( + $.row('Gateway') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'api_prom_push')]) + ) + ) + .addRow( + $.row('Distributor') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + ) + ) + .addRow( + $.row('Etcd (HA Dedupe)') + 
.addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + ) + ) + .addRow( + $.row('Ingester') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route="/cortex.Ingester/Push"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + ) + ) + .addRow( + $.row('Consul (Ring)') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')]) + ) + ) + .addRowIf( + std.setMember('chunks', $._config.storage_engine), + $.row('Memcached') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_memcache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",method="Memcache.Put"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('method', 'Memcache.Put')]) + ) + ) + .addRowIf( + $._config.storage_backend == 'cassandra', + $.row('Cassandra') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="INSERT"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'INSERT')]) + ) + ) + .addRowIf( + // only show BigTable if chunks panels are enabled + $._config.storage_backend == 'gcp' && std.setMember('chunks', $._config.storage_engine), + $.row('BigTable') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="/google.bigtable.v2.Bigtable/MutateRows"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) + ) + ) + .addRowIf( + $._config.storage_backend == 'dynamodb', + $.row('DynamoDB') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="DynamoDB.BatchWriteItem"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) + ) + ) + .addRowIf( + $._config.gcs_enabled, + $.row('GCS') + .addPanel( + $.panel('QPS') + + 
$.qpsPanel('cortex_gcs_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="POST"}') + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'POST')]) + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Blocks Shipper') + .addPanel( + $.successFailurePanel( + 'Uploaded blocks / sec', + 'sum(rate(cortex_ingester_shipper_uploads_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~"$cluster"}[$__interval]))', + 'sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~"$cluster"}[$__interval]))' + ), + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.objectStorePanels1('Blocks Object Store Stats (Ingester)', 'cortex_ingester'), + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.objectStorePanels2('', 'cortex_ingester'), + ), +} From 0fbc8712c59d3174c45cc1c1eff61cc803fd8ce5 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sun, 19 Apr 2020 15:58:54 +0100 Subject: [PATCH 028/364] Use set-style selector for bigtable/cassandra/dynamodb. Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/config.libsonnet | 18 ++++++++++++++---- jsonnet/mimir-mixin/dashboards.jsonnet | 9 +-------- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 16 ++++++++-------- .../mimir-mixin/dashboards/writes.libsonnet | 13 ++++++++----- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 0ba7f80d05b..193ed8761cb 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -3,10 +3,20 @@ grafanaDashboardShards: 4, _config+:: { - storage_backend: 'cassandra', //error 'must specify storage backend (cassandra, gcp)', - // may contain 'chunks', 'tsdb' or both. Enables chunks- or tsdb- specific panels and dashboards. - storage_engine: ['chunks'], - gcs_enabled: false, + // Switch for overall storage engine. + // May contain 'chunks', 'tsdb' or both. + // Enables chunks- or tsdb- specific panels and dashboards. + storage_engine: ['chunks', 'tsdb'], + + // For chunks backend, switch for chunk index type. + // May contain 'bigtable', 'dynamodb' or 'cassandra'. + chunk_index_backend: ['bigtable', 'dyamodb', 'cassandra'], + + // For chunks backend, switch for chunk store type. + // May contain 'bigtable', 'dynamodb', 'cassandra', 's3' or 'gcs'. + chunk_store_backend: ['bigtable', 'dyamodb', 'cassandra', 's3', 'gcs'], + + // Tags for dashboards. 
tags: ['cortex'], }, } diff --git a/jsonnet/mimir-mixin/dashboards.jsonnet b/jsonnet/mimir-mixin/dashboards.jsonnet index a70fad511bc..c3ec625a279 100644 --- a/jsonnet/mimir-mixin/dashboards.jsonnet +++ b/jsonnet/mimir-mixin/dashboards.jsonnet @@ -1,11 +1,4 @@ -local mixin = (import 'mixin.libsonnet') { - _config: { - storage_backend: 'cassandra', - storage_engine: ['chunks'], - tags: 'cortex', - gcs_enabled: false, - }, -}; +local mixin = import 'mixin.libsonnet'; { [name]: std.manifestJsonEx(mixin.grafanaDashboards[name], ' ') diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 289dbb863c2..52e0c83d6e4 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -97,7 +97,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - $._config.storage_backend == 'cassandra', + std.setMember('chunks', $._config.storage_engine) && + std.setMember('cassandra', $._config.chunk_index_backend + $._config.chunk_store_backend), $.row('Cassandra') .addPanel( $.panel('QPS') + @@ -109,9 +110,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - // only show BigTable if chunks panels are enabled - $._config.storage_backend == 'gcp' && std.setMember('chunks', $._config.storage_engine), - + std.setMember('chunks', $._config.storage_engine) && + std.setMember('bigtable', $._config.chunk_index_backend + $._config.chunk_store_backend), $.row('BigTable') .addPanel( $.panel('QPS') + @@ -123,7 +123,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addRowIf( - $._config.storage_backend == 'dynamodb', + std.setMember('chunks', $._config.storage_engine) && + std.setMember('dynamodb', $._config.chunk_index_backend + $._config.chunk_store_backend), $.row('DynamoDB') .addPanel( $.panel('QPS') + @@ -135,9 +136,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addRowIf( - - $._config.gcs_enabled, - + std.setMember('chunks', $._config.storage_engine) && + std.setMember('gcs', $._config.chunk_store_backend), $.row('GCS') .addPanel( $.panel('QPS') + diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index bf60cca1176..a7ad66880ad 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -96,7 +96,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - $._config.storage_backend == 'cassandra', + std.setMember('chunks', $._config.storage_engine) && + std.setMember('cassandra', $._config.chunk_index_backend + $._config.chunk_store_backend), $.row('Cassandra') .addPanel( $.panel('QPS') + @@ -108,8 +109,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - // only show BigTable if chunks panels are enabled - $._config.storage_backend == 'gcp' && std.setMember('chunks', $._config.storage_engine), + std.setMember('chunks', $._config.storage_engine) && + std.setMember('bigtable', $._config.chunk_index_backend + $._config.chunk_store_backend), $.row('BigTable') .addPanel( $.panel('QPS') + @@ -121,7 +122,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - $._config.storage_backend == 'dynamodb', + std.setMember('chunks', $._config.storage_engine) && + std.setMember('dynamodb', $._config.chunk_index_backend + $._config.chunk_store_backend), $.row('DynamoDB') .addPanel( $.panel('QPS') + @@ -133,7 +135,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - 
$._config.gcs_enabled, + std.setMember('chunks', $._config.storage_engine) && + std.setMember('gcs', $._config.chunk_store_backend), $.row('GCS') .addPanel( $.panel('QPS') + From e60e545f4a449f6906e508561a48129cb67c399e Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sun, 19 Apr 2020 16:40:44 +0100 Subject: [PATCH 029/364] Make selectors in PromQL queries on the writes dashboard support single process mode. Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/config.libsonnet | 4 ++ jsonnet/mimir-mixin/dashboards.libsonnet | 5 +- .../mimir-mixin/dashboards/blocks.libsonnet | 3 +- .../mimir-mixin/dashboards/chunks.libsonnet | 3 +- .../dashboards/dashboard-utils.libsonnet | 26 +++++++- .../mimir-mixin/dashboards/queries.libsonnet | 3 +- .../mimir-mixin/dashboards/reads.libsonnet | 3 +- .../mimir-mixin/dashboards/ruler.libsonnet | 3 +- .../mimir-mixin/dashboards/scaling.libsonnet | 3 +- .../mimir-mixin/dashboards/writes.libsonnet | 60 ++++++++++--------- 10 files changed, 69 insertions(+), 44 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 193ed8761cb..e5c32b90bb5 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -18,5 +18,9 @@ // Tags for dashboards. tags: ['cortex'], + + // If Cortex is deployed as a single binary, set to true to + // modify the job selectors in the dashboard queries. + singleBinary: false, }, } diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index d9abe2bccc5..73ac28fb46b 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -1,6 +1,5 @@ { grafanaDashboards+: - { _config:: $._config } + (import 'dashboards/queries.libsonnet') + (import 'dashboards/reads.libsonnet') + (import 'dashboards/ruler.libsonnet') + @@ -18,5 +17,7 @@ (if std.setMember('tsdb', $._config.storage_engine) && std.setMember('chunks', $._config.storage_engine) then import 'dashboards/comparison.libsonnet' - else {}), + else {}) + + + { _config:: $._config }, } diff --git a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet index 62a121bfcb0..2944169dd75 100644 --- a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet @@ -3,8 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-blocks.json': $.dashboard('Cortex / Blocks') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addClusterSelectorTemplates() // repeated from Cortex / Chunks .addRow( $.row('Active Series / Chunks') diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet index 21938d09a73..cd83106b4ea 100644 --- a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -3,8 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-chunks.json': $.dashboard('Cortex / Chunks') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addClusterSelectorTemplates() .addRow( $.row('Active Series / Chunks') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet 
b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index df5b18fbce9..75cc0d09ba4 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -1,6 +1,8 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + (import 'grafana-builder/grafana.libsonnet') { - //_config:: error "must provide _config", + _config:: error 'must provide _config', // Override the dashboard constructor to add: // - default tags, @@ -26,8 +28,30 @@ if condition then self.addRow(row) else self, + + addClusterSelectorTemplates():: + if $._config.singleBinary + then self.addMultiTemplate('job', 'cortex_build_info', 'job') + else self + .addMultiTemplate('cluster', 'cortex_build_info', 'cluster') + .addMultiTemplate('namespace', 'cortex_build_info', 'namespace'), }, + // The mixin allows specialisation of the job selector depending on whether it is a single binary + // deployment or a namespaced one. + jobMatcher(job):: + if $._config.singleBinary + then 'job=~"$job"' + else 'cluster=~"$cluster", job=~"($namespace)/%s"' % job, + + namespaceMatcher():: + if $._config.singleBinary + then 'job=~"$job"' + else 'cluster=~"$cluster", namespace=~"$namespace"', + + jobSelector(job):: + [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/%s' % job)], + qpsPanel(selector):: super.qpsPanel(selector) + { targets: [ diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 8bc9a641f35..f57027e4d97 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -4,8 +4,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-queries.json': $.dashboard('Cortex / Queries') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addClusterSelectorTemplates() .addRow( $.row('Query Frontend') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 52e0c83d6e4..9d161ca42cf 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -3,8 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-reads.json': $.dashboard('Cortex / Reads') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addClusterSelectorTemplates() .addRow( $.row('Gateway') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 5e87d55858e..21d5dc183a4 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -4,8 +4,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'ruler.json': $.dashboard('Cortex / Ruler') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addClusterSelectorTemplates() .addRow( $.row('Rule Evaluations') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet index 8142d23bf87..4f37132e154 100644 --- a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet +++ 
b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet @@ -4,8 +4,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-scaling.json': $.dashboard('Cortex / Scaling') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addClusterSelectorTemplates() .addRow( $.row('Workload-based scaling') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index a7ad66880ad..5a9ae75eb4c 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -3,8 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-writes.json': $.dashboard('Cortex / Writes') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addMultiTemplate('namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') + .addClusterSelectorTemplates() .addRow( ($.row('Headlines') + { @@ -13,74 +12,77 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Samples / s') + - $.statPanel('sum(cluster_namespace:cortex_distributor_received_samples:rate5m{cluster=~"$cluster", namespace=~"$namespace"})', format='reqps') + $.statPanel('sum(cluster_namespace:cortex_distributor_received_samples:rate5m{%s})' % $.namespaceMatcher(), format='reqps') ) .addPanel( $.panel('Active Series') + $.statPanel(||| - sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"} + sum(cortex_ingester_memory_series{%(ingester)s} / on(namespace) group_left - max by (namespace) (cortex_distributor_replication_factor{cluster=~"$cluster", job=~"($namespace)/distributor"})) - |||, format='short') + max by (namespace) (cortex_distributor_replication_factor{%(distributor)s})) + ||| % { + ingester: $.jobMatcher('ingester'), + distributor: $.jobMatcher('distributor'), + }, format='short') ) .addPanel( $.panel('QPS') + - $.statPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}[5m]))', format='reqps') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route="api_prom_push"}[5m]))' % $.jobMatcher('cortex-gw'), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route="api_prom_push"}') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route="api_prom_push"}' % $.jobMatcher('cortex-gw')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.eq('route', 'api_prom_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('cortex-gw') + [utils.selector.eq('route', 'api_prom_push')]) ) ) .addRow( $.row('Distributor') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route="api_prom_push"}' % $.jobMatcher('distributor')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + 
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('distributor') + [utils.selector.eq('route', 'api_prom_push')]) ) ) .addRow( $.row('Etcd (HA Dedupe)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/distributor"}') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher('distributor')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/distributor')]) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector('distributor')) ) ) .addRow( $.row('Ingester') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route="/cortex.Ingester/Push"}') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher('ingester')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) ) .addRow( $.row('Consul (Ring)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher('ingester')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')]) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector('ingester')) ) ) .addRowIf( @@ -88,11 +90,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_memcache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",method="Memcache.Put"}') + $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher('ingester')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('method', 'Memcache.Put')]) + utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('method', 'Memcache.Put')]) ) ) .addRowIf( @@ -101,11 +103,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Cassandra') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="INSERT"}') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher('ingester')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'INSERT')]) + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector('ingester') + 
[utils.selector.eq('operation', 'INSERT')]) ) ) .addRowIf( @@ -114,11 +116,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('BigTable') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="/google.bigtable.v2.Bigtable/MutateRows"}') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher('ingester')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) ) ) .addRowIf( @@ -127,11 +129,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('DynamoDB') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester", operation="DynamoDB.BatchWriteItem"}') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher('ingester')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) ) ) .addRowIf( @@ -140,11 +142,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('GCS') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="POST"}') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher('ingester')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'POST')]) + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('operation', 'POST')]) ) ) .addRowIf( @@ -153,8 +155,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.successFailurePanel( 'Uploaded blocks / sec', - 'sum(rate(cortex_ingester_shipper_uploads_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~"$cluster"}[$__interval]))', - 'sum(rate(cortex_ingester_shipper_upload_failures_total{cluster=~"$cluster"}[$__interval]))' + 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], + 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % [$.namespaceMatcher()], ), ) ) From e0e72ae586dae6d2ed2a2a3f7a6dd0a74ccbc2fe Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sun, 19 Apr 2020 17:11:38 +0100 Subject: [PATCH 030/364] Get all the panels on the write dashboard working with the single binary. 
Signed-off-by: Tom Wilkie --- .../dashboards/dashboard-utils.libsonnet | 4 +- .../mimir-mixin/dashboards/writes.libsonnet | 6 +- jsonnet/mimir-mixin/recording_rules.libsonnet | 63 +++++++++++-------- 3 files changed, 44 insertions(+), 29 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 75cc0d09ba4..6d52773b994 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -50,7 +50,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; else 'cluster=~"$cluster", namespace=~"$namespace"', jobSelector(job):: - [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/%s' % job)], + if $._config.singleBinary + then [utils.selector.noop('cluster'), utils.selector.re('job', '$job')] + else [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/%s' % job)], qpsPanel(selector):: super.qpsPanel(selector) + { diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 5a9ae75eb4c..00fb3d3e613 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -12,7 +12,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Samples / s') + - $.statPanel('sum(cluster_namespace:cortex_distributor_received_samples:rate5m{%s})' % $.namespaceMatcher(), format='reqps') + $.statPanel('sum(cluster_namespace_job:cortex_distributor_received_samples:rate5m{%s})' % $.jobMatcher('distributor'), format='reqps') ) .addPanel( $.panel('Active Series') + @@ -53,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Etcd (HA Dedupe)') + $.row('KV Store (HA Dedupe)') .addPanel( $.panel('QPS') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher('distributor')) @@ -75,7 +75,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Consul (Ring)') + $.row('KV Store (Ring)') .addPanel( $.panel('QPS') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher('ingester')) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index fb0a1f258e1..acdd35e4a9d 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -2,30 +2,43 @@ local utils = import 'mixin-utils/utils.libsonnet'; { prometheusRules+:: { - groups+: [{ - name: 'cortex_rules', - rules: - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job', 'route']) + - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']) + - utils.histogramRules('cortex_memcache_request_duration_seconds', ['cluster', 'job', 'method']) + - utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job', 'method']) + - utils.histogramRules('cortex_bigtable_request_duration_seconds', ['cluster', 'job', 'operation']) + - utils.histogramRules('cortex_cassandra_request_duration_seconds', ['cluster', 'job', 'operation']) + - utils.histogramRules('cortex_dynamo_request_duration_seconds', ['cluster', 'job', 'operation']) + - utils.histogramRules('cortex_query_frontend_retries', ['cluster', 'job']) + - 
utils.histogramRules('cortex_query_frontend_queue_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_series', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_chunks', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_samples', ['cluster', 'job']) + - utils.histogramRules('cortex_chunk_store_index_lookups_per_query', ['cluster', 'job']) + - utils.histogramRules('cortex_chunk_store_series_pre_intersection_per_query', ['cluster', 'job']) + - utils.histogramRules('cortex_chunk_store_series_post_intersection_per_query', ['cluster', 'job']) + - utils.histogramRules('cortex_chunk_store_chunks_per_query', ['cluster', 'job']) + - utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) + - utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + - utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), - }], + groups+: [ + { + name: 'cortex_rules', + rules: + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job', 'route']) + + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']) + + utils.histogramRules('cortex_memcache_request_duration_seconds', ['cluster', 'job', 'method']) + + utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job', 'method']) + + utils.histogramRules('cortex_bigtable_request_duration_seconds', ['cluster', 'job', 'operation']) + + utils.histogramRules('cortex_cassandra_request_duration_seconds', ['cluster', 'job', 'operation']) + + utils.histogramRules('cortex_dynamo_request_duration_seconds', ['cluster', 'job', 'operation']) + + utils.histogramRules('cortex_query_frontend_retries', ['cluster', 'job']) + + utils.histogramRules('cortex_query_frontend_queue_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_series', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_chunks', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_samples', ['cluster', 'job']) + + utils.histogramRules('cortex_chunk_store_index_lookups_per_query', ['cluster', 'job']) + + utils.histogramRules('cortex_chunk_store_series_pre_intersection_per_query', ['cluster', 'job']) + + utils.histogramRules('cortex_chunk_store_series_post_intersection_per_query', ['cluster', 'job']) + + utils.histogramRules('cortex_chunk_store_chunks_per_query', ['cluster', 'job']) + + utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) + + utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + + utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), + }, + { + name: 'cortex_received_samples', + rules: [ + { + record: 'cluster_namespace_job:cortex_distributor_received_samples:rate5m', + expr: ||| + sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) + |||, + }, + ], + }, + ], }, } From 67ae0f9d26691ef2fd4d0942252eb282230c5e5a Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sun, 19 Apr 2020 17:51:49 +0100 Subject: [PATCH 031/364] Get read dashboard working for single process. 
Signed-off-by: Tom Wilkie --- .../mimir-mixin/dashboards/reads.libsonnet | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 9d161ca42cf..8a9a19a4d68 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -8,55 +8,55 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/cortex-gw", route=~"(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)"}') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher('cortex-gw')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/cortex-gw'), utils.selector.re('route', '(api_prom_api_v1_query_range|api_prom_api_v1_query|api_prom_api_v1_label_name_values|api_prom_api_v1_series|api_prom_api_v1_labels)')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('cortex-gw') + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) ) .addRow( $.row('Query Frontend') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher('query-frontend')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend'), utils.selector.neq('route', '/frontend.Frontend/Process')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('query-frontend') + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) ) .addRow( $.row('Cache - Query Results') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/query-frontend"}') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s}' % $.jobMatcher('query-frontend')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/query-frontend')]) + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector('query-frontend')) ) ) .addRow( $.row('Querier') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier"}') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('querier') + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) ) .addRow( $.row('Ingester') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester",route!~"/cortex.Ingester/Push|metrics|ready|traces"}') + 
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester'), utils.selector.nre('route', '/cortex.Ingester/Push|metrics|ready')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('querier') + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers')]) ) ) .addRowIf( @@ -64,11 +64,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Index') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="store.index-cache-read.memcache.fetch"}') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) ) ) .addRowIf( @@ -76,11 +76,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Chunks') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",method="chunksmemcache.fetch"}') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('method', 'chunksmemcache.fetch')]) + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('method', 'chunksmemcache.fetch')]) ) ) .addRowIf( @@ -88,11 +88,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Blocks Index') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier",operation="getmulti"}') + $.qpsPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds_count{%s,operation="getmulti"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - $.latencyPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/querier", operation="getmulti"}') + $.latencyPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % $.jobMatcher('querier')) ) ) .addRowIf( @@ -101,11 +101,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Cassandra') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="SELECT"}') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - 
utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'SELECT')]) + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', 'SELECT')]) ) ) .addRowIf( @@ -114,11 +114,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('BigTable') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="/google.bigtable.v2.Bigtable/ReadRows"}') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) ), ) .addRowIf( @@ -127,11 +127,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('DynamoDB') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_dynamo_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="DynamoDB.QueryPages"}') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'DynamoDB.QueryPages')]) + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', 'DynamoDB.QueryPages')]) ), ) .addRowIf( @@ -140,11 +140,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('GCS') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/querier", operation="GET"}') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher('querier')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier'), utils.selector.eq('operation', 'GET')]) + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', 'GET')]) ) ) .addRowIf( @@ -153,24 +153,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.successFailurePanel( 'Block Loads / sec', - 'sum(rate(cortex_querier_bucket_store_block_loads_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_load_failures_total{cluster=~"$cluster"}[$__interval]))', - 'sum(rate(cortex_querier_bucket_store_block_load_failures_total{cluster=~"$cluster"}[$__interval]))' + 'sum(rate(cortex_querier_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], + 
'sum(rate(cortex_querier_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.namespaceMatcher(), ) ) .addPanel( $.successFailurePanel( 'Block Drops / sec', - 'sum(rate(cortex_querier_bucket_store_block_drops_total{cluster=~"$cluster"}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_drop_failures_total{cluster=~"$cluster"}[$__interval]))', - 'sum(rate(cortex_querier_bucket_store_block_drop_failures_total{cluster=~"$cluster"}[$__interval]))' + 'sum(rate(cortex_querier_bucket_store_block_drops_total{%s}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], + 'sum(rate(cortex_querier_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.namespaceMatcher(), ) ) .addPanel( $.panel('Per-block prepares and preloads duration') + - $.latencyPanel('cortex_querier_bucket_store_series_get_all_duration_seconds', '{cluster=~"$cluster"}'), + $.latencyPanel('cortex_querier_bucket_store_series_get_all_duration_seconds', '{%s}' % $.namespaceMatcher()), ) .addPanel( $.panel('Series merge duration') + - $.latencyPanel('cortex_querier_bucket_store_series_merge_duration_seconds', '{cluster=~"$cluster"}'), + $.latencyPanel('cortex_querier_bucket_store_series_merge_duration_seconds', '{%s}' % $.namespaceMatcher()), ) ) .addRowIf( From d6b1e4cdde02829c78c5b2ff2738f1dfd9259381 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sun, 19 Apr 2020 20:33:52 +0100 Subject: [PATCH 032/364] Apply suggestions from code review Co-Authored-By: Jacob Lisi --- jsonnet/mimir-mixin/config.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index e5c32b90bb5..49f960eded9 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -10,11 +10,11 @@ // For chunks backend, switch for chunk index type. // May contain 'bigtable', 'dynamodb' or 'cassandra'. - chunk_index_backend: ['bigtable', 'dyamodb', 'cassandra'], + chunk_index_backend: ['bigtable', 'dynamodb', 'cassandra'], // For chunks backend, switch for chunk store type. // May contain 'bigtable', 'dynamodb', 'cassandra', 's3' or 'gcs'. - chunk_store_backend: ['bigtable', 'dyamodb', 'cassandra', 's3', 'gcs'], + chunk_store_backend: ['bigtable', 'dynamodb', 'cassandra', 's3', 'gcs'], // Tags for dashboards. tags: ['cortex'], From 32765f4869ca58536bad603e1643eae0ec4a2c13 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sun, 19 Apr 2020 21:13:24 +0100 Subject: [PATCH 033/364] Update chunks and queries dashboards. 
Signed-off-by: Tom Wilkie --- .../mimir-mixin/dashboards/chunks.libsonnet | 16 +++---- .../mimir-mixin/dashboards/queries.libsonnet | 42 +++++++++---------- .../mimir-mixin/dashboards/ruler.libsonnet | 18 ++++---- 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet index cd83106b4ea..91f7d5b9c85 100644 --- a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -8,46 +8,46 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Active Series / Chunks') .addPanel( $.panel('Series') + - $.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), + $.queryPanel('sum(cortex_ingester_memory_series{%s})' % $.jobMatcher('ingester'), 'series'), ) .addPanel( $.panel('Chunks per series') + - $.queryPanel('sum(cortex_ingester_memory_chunks{cluster=~"$cluster", job=~"($namespace)/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'chunks'), + $.queryPanel('sum(cortex_ingester_memory_chunks{%s}) / sum(cortex_ingester_memory_series{%s})' % [$.jobMatcher('ingester'), $.jobMatcher('ingester')], 'chunks'), ) ) .addRow( $.row('Flush Stats') .addPanel( $.panel('Utilization') + - $.latencyPanel('cortex_ingester_chunk_utilization', '{cluster=~"$cluster", job=~"($namespace)/ingester"}', multiplier='1') + + $.latencyPanel('cortex_ingester_chunk_utilization', '{%s}' % $.jobMatcher('ingester'), multiplier='1') + + { yaxes: $.yaxes('percentunit') }, ) .addPanel( $.panel('Age') + - $.latencyPanel('cortex_ingester_chunk_age_seconds', '{cluster=~"$cluster", job=~"($namespace)/ingester"}'), + $.latencyPanel('cortex_ingester_chunk_age_seconds', '{%s}' % $.jobMatcher('ingester')), ), ) .addRow( $.row('Flush Stats') .addPanel( $.panel('Size') + - $.latencyPanel('cortex_ingester_chunk_length', '{cluster=~"$cluster", job=~"($namespace)/ingester"}', multiplier='1') + + $.latencyPanel('cortex_ingester_chunk_length', '{%s}' % $.jobMatcher('ingester'), multiplier='1') + + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Entries') + - $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{cluster=~"$cluster", job=~"($namespace)/ingester"}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{cluster=~"$cluster", job=~"($namespace)/ingester"}[5m]))', 'entries'), + $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [$.jobMatcher('ingester'), $.jobMatcher('ingester')], 'entries'), ), ) .addRow( $.row('Flush Stats') .addPanel( $.panel('Queue Length') + - $.queryPanel('cortex_ingester_flush_queue_length{cluster=~"$cluster", job=~"($namespace)/ingester"}', '{{instance}}'), + $.queryPanel('cortex_ingester_flush_queue_length{%s}' % $.jobMatcher('ingester'), '{{instance}}'), ) .addPanel( $.panel('Flush Rate') + - $.qpsPanel('cortex_ingester_chunk_age_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}'), + $.qpsPanel('cortex_ingester_chunk_age_seconds_count{%s}' % $.jobMatcher('ingester')), ), ), } diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index f57027e4d97..18257887b85 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -9,93 +9,93 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Query 
Frontend') .addPanel( $.panel('Queue Duration') + - $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/query-frontend"}'), + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher('query-frontend')), ) .addPanel( $.panel('Retries') + - $.latencyPanel('cortex_query_frontend_retries', '{cluster=~"$cluster", job=~"($namespace)/query-frontend"}', multiplier=1) + + $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher('query-frontend'), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Queue Length') + - $.queryPanel('cortex_query_frontend_queue_length{cluster=~"$cluster", job=~"($namespace)/query-frontend"}', '{{cluster}} / {{namespace}} / {{instance}}'), + $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher('query-frontend'), '{{cluster}} / {{namespace}} / {{instance}}'), ) ) .addRow( $.row('Query Frontend - Results Cache') .addPanel( $.panel('Cache Hit %') + - $.queryPanel('sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) / sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 'Hit Rate') + + $.queryPanel('sum(rate(cortex_cache_hits{%s}[1m])) / sum(rate(cortex_cache_fetched_keys{%s}[1m]))' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'Hit Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( $.panel('Cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/query-frontend"}[1m]))', 'Miss Rate'), + $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s}[1m])) - sum(rate(cortex_cache_hits{%s}[1m]))' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'Miss Rate'), ) ) .addRow( $.row('Query Frontend - Sharding/Splitting') .addPanel( $.panel('Intervals per Query') + - $.queryPanel('sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{cluster="$cluster", namespace="$namespace", method="split_by_interval"}[1m]))', 'partition rate'), + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'partition rate'), ) .addPanel( $.panel('Sharded Queries %') + - $.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_split_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) * 100', 'shard rate'), + $.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{%s}[1m])) / sum(rate(cortex_frontend_split_queries_total{%s}[1m])) * 100' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'shard rate'), ) .addPanel( $.panel('Sharding factor') + - $.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{cluster="$cluster", namespace="$namespace"}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{cluster="$cluster", namespace="$namespace"}[1m]))', 'Average'), + $.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{%s}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{%s}[1m]))' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'Average'), ) ) .addRow( $.row('Querier') .addPanel( 
$.panel('Stages') + - $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster",job=~"($namespace)/querier"}) * 1e3', '{{slice}}') + + $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",%s}) * 1e3' % $.jobMatcher('querier'), '{{slice}}') + { yaxes: $.yaxes('ms') } + $.stack, ) .addPanel( $.panel('Chunk cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{cluster=~"$cluster",job=~"($namespace)/querier",name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{cluster=~"$cluster",job=~"($namespace)/querier",name="chunksmemcache"}[1m]))', 'Hit rate'), + $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%s,name="chunksmemcache"}[1m]))' % [$.jobMatcher('querier'), $.jobMatcher('querier')], 'Hit rate'), ) .addPanel( $.panel('Chunk cache corruptions') + - $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'Corrupt chunks'), + $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher('querier'), 'Corrupt chunks'), ) ) .addRow( $.row('Querier - Index Cache') .addPanel( $.panel('Total entries') + - $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"})', 'Entries'), + $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher('querier'), $.jobMatcher('querier')], 'Entries'), ) .addPanel( $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'hit rate') + $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher('querier'), $.jobMatcher('querier'), $.jobMatcher('querier')], 'hit rate') { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( $.panel('Churn Rate') + - $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache", cluster=~"$cluster",job=~"($namespace)/querier"}[1m]))', 'churn rate'), + $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher('querier'), 'churn rate'), ) ) .addRow( $.row('Ingester') .addPanel( $.panel('Series per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_series', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector('ingester'), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', [utils.selector.re('cluster', 
'$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector('ingester'), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Samples per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/ingester')], multiplier=1) + + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector('ingester'), multiplier=1) + { yaxes: $.yaxes('short') }, ) ) @@ -103,22 +103,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Chunk Store') .addPanel( $.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector('querier'), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Series (pre-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector('querier'), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector('querier'), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/querier')], multiplier=1) + + utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector('querier'), multiplier=1) + { yaxes: $.yaxes('short') }, ) ), diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 21d5dc183a4..ae7f7c524c4 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -9,16 +9,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Rule Evaluations') .addPanel( $.panel('EPS') + - $.queryPanel('sum(rate(cortex_prometheus_rule_evaluations_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'rules processed'), + $.queryPanel('sum(rate(cortex_prometheus_rule_evaluations_total{%s}[$__interval]))' % $.jobMatcher('ruler'), 'rules processed'), ) .addPanel( $.panel('Latency') + $.queryPanel( ||| - sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{%s}[$__interval])) / - sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) - |||, 'average' + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__interval])) + ||| % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], + 'average' ), ) ) @@ -26,16 
+27,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Group Evaluations') .addPanel( $.panel('Missed Iterations') + - $.queryPanel('sum(rate(cortex_prometheus_rule_group_iterations_missed_total{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval]))', 'iterations missed'), + $.queryPanel('sum(rate(cortex_prometheus_rule_group_iterations_missed_total{%s}[$__interval]))' % $.jobMatcher('ruler'), 'iterations missed'), ) .addPanel( $.panel('Latency') + $.queryPanel( ||| - sum (rate(cortex_prometheus_rule_group_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) + sum (rate(cortex_prometheus_rule_group_duration_seconds_sum{%s}[$__interval])) / - sum (rate(cortex_prometheus_rule_group_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ruler"}[$__interval])) - |||, 'average' + sum (rate(cortex_prometheus_rule_group_duration_seconds_count{%s}[$__interval])) + ||| % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], + 'average' ), ) ), From 85f36982a694c6a4dc5d1b7a9f2f26f9fd999dd1 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Mon, 20 Apr 2020 13:38:36 +0100 Subject: [PATCH 034/364] Only add the links if addClusterSelectorTemplates is called. Signed-off-by: Tom Wilkie --- .../dashboards/dashboard-utils.libsonnet | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 6d52773b994..716343234bb 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -11,28 +11,30 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.dashboard(title) + { tags: $._config.tags, - links: [ - { - asDropdown: true, - icon: 'external link', - includeVars: true, - keepTime: true, - tags: $._config.tags, - targetBlank: false, - title: 'Cortex Dashboards', - type: 'dashboards', - }, - ], - addRowIf(condition, row):: if condition then self.addRow(row) else self, addClusterSelectorTemplates():: + local d = self { + links: [ + { + asDropdown: true, + icon: 'external link', + includeVars: true, + keepTime: true, + tags: $._config.tags, + targetBlank: false, + title: 'Cortex Dashboards', + type: 'dashboards', + }, + ], + }; + if $._config.singleBinary - then self.addMultiTemplate('job', 'cortex_build_info', 'job') - else self + then d.addMultiTemplate('job', 'cortex_build_info', 'job') + else d .addMultiTemplate('cluster', 'cortex_build_info', 'cluster') .addMultiTemplate('namespace', 'cortex_build_info', 'namespace'), }, From 9254f1841d2694d044536a7a1c275cdc8ae7935a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 20 Apr 2020 16:41:09 +0200 Subject: [PATCH 035/364] Renamed cortex_dynamo_sync_tables_seconds_count metric to cortex_table_manager_sync_duration_seconds_count Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 50a1a491f71..e2490e36111 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -80,9 +80,9 @@ // We also have a 3h grace-period for creation of tables which means the we can fail for 3h before it's an outage. 
alert: 'CortexTableSyncFailure', expr: ||| - 100 * rate(cortex_dynamo_sync_tables_seconds_count{status_code!~"2.."}[15m]) + 100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m]) / - rate(cortex_dynamo_sync_tables_seconds_count[15m]) + rate(cortex_table_manager_sync_duration_seconds_count[15m]) > 10 |||, 'for': '30m', From bb1d8c59b5efe7010c47b104222aeade822f2a3e Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 21 Apr 2020 12:46:11 +0100 Subject: [PATCH 036/364] Only tag dashboards with cluster selectors. Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 716343234bb..8df4b132e33 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -9,8 +9,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; // - some links that propagate the selectred cluster. dashboard(title):: super.dashboard(title) + { - tags: $._config.tags, - addRowIf(condition, row):: if condition then self.addRow(row) @@ -18,6 +16,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; addClusterSelectorTemplates():: local d = self { + tags: $._config.tags, links: [ { asDropdown: true, From 0e94a339a70acf3b1928039ecc211ea8ebf9e45b Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 17 Mar 2020 21:11:43 +0530 Subject: [PATCH 037/364] Alerts for WAL metrics Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/alerts.libsonnet | 76 ++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 8812c0c1e18..4deea0f2f50 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -226,6 +226,82 @@ local windows = [ |||, }, }, + { + // Alert immediately if WAL is corrupt. + alert: 'CortexWALCorruption', + expr: ||| + increase(cortex_ingester_wal_corruptions_total[5m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. + |||, + }, + }, + { + // 1 failed checkpoint creation is a warning. + alert: 'CortexCheckpointCreationFailed', + expr: ||| + increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 + |||, + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} failed to create checkpoint. + |||, + }, + }, + { + // 2 or more failed checkpoint creation in 1h means something is wrong. + alert: 'CortexCheckpointCreationFailing', + expr: ||| + increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} is failing to create checkpoint. + |||, + }, + }, + { + // 1 failed checkpoint deletion is a warning. + alert: 'CortexCheckpointDeletionFailed', + expr: ||| + increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 + |||, + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} failed to delete checkpoint. + |||, + }, + }, + { + // 2 or more failed checkpoint deletion in 2h means something is wrong. 
+ // We give this more buffer than creation as this is a less critical operation. + alert: 'CortexCheckpointDeletionFailing', + expr: ||| + increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete checkpoint. + |||, + }, + }, ], }, { From 89a385c3e61121b27f69078447869dd60bf635f9 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 May 2020 14:47:21 +0200 Subject: [PATCH 038/364] Fixed metrics for blocks storage Signed-off-by: Marco Pracucci --- .../mimir-mixin/dashboards/blocks.libsonnet | 20 ++++++------- .../mimir-mixin/dashboards/reads.libsonnet | 30 ++++++++++++------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet index 2944169dd75..23f1e380fa3 100644 --- a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet @@ -6,7 +6,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addClusterSelectorTemplates() // repeated from Cortex / Chunks .addRow( - $.row('Active Series / Chunks') + $.row('Active Series') .addPanel( $.panel('Series') + $.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), @@ -18,8 +18,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.successFailurePanel( 'Compactor Runs / second', - 'sum(rate(cortex_compactor_runs_completed_total{cluster=~"$cluster"}[$__interval]))', - 'sum(rate(cortex_compactor_runs_failed_total{cluster=~"$cluster"}[$__interval]))' + 'sum(rate(cortex_compactor_runs_completed_total{cluster=~"$cluster",job=~"($namespace)/compactor"}[$__interval]))', + 'sum(rate(cortex_compactor_runs_failed_total{cluster=~"$cluster",job=~"($namespace)/compactor"}[$__interval]))' ) ) .addPanel( @@ -45,21 +45,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Collected Blocks Rate') + - $.queryPanel('sum(rate(cortex_compactor_garbage_collected_blocks_total{cluster=~"$cluster"}[$__interval]))', 'blocks') + $.queryPanel('sum(rate(cortex_compactor_garbage_collected_blocks_total{cluster=~"$cluster",job=~"($namespace)/compactor"}[$__interval]))', 'blocks') ) ) .addRow( - $.row('Compactor - Meta Syncs') + $.row('Compactor - Metadata Fetcher') .addPanel( $.successFailurePanel( - 'Meta Syncs / sec', - 'sum(rate(cortex_compactor_sync_meta_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_sync_meta_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - 'sum(rate(cortex_compactor_sync_meta_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + 'Metadata Syncs / sec', + 'sum(rate(cortex_compactor_meta_syncs_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', + 'sum(rate(cortex_compactor_meta_sync_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', ) ) .addPanel( - $.panel('Meta Sync Durations') + - $.latencyPanel('cortex_compactor_sync_meta_duration_seconds', '{cluster=~"$cluster"}'), + g.panel('Metadata Sync Duration') + + g.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{cluster=~"$cluster",job=~"($namespace)/compactor"}'), ) ) .addRow( diff --git 
a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 8a9a19a4d68..28d25917380 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -88,11 +88,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Blocks Index') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds_count{%s,operation="getmulti"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds_count{%s,operation="getmulti"}' % $.jobMatcher('store-gateway')) ) .addPanel( $.panel('Latency') + - $.latencyPanel('cortex_querier_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % $.jobMatcher('querier')) + $.latencyPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % $.jobMatcher('store-gateway')) ) ) .addRowIf( @@ -149,33 +149,43 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Querier - Blocks Storage') + $.row('Store-gateway - Blocks') .addPanel( $.successFailurePanel( 'Block Loads / sec', - 'sum(rate(cortex_querier_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], - 'sum(rate(cortex_querier_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.namespaceMatcher(), + 'sum(rate(cortex_storegateway_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], + 'sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.namespaceMatcher(), ) ) .addPanel( $.successFailurePanel( 'Block Drops / sec', - 'sum(rate(cortex_querier_bucket_store_block_drops_total{%s}[$__interval])) - sum(rate(cortex_querier_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], - 'sum(rate(cortex_querier_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.namespaceMatcher(), + 'sum(rate(cortex_storegateway_bucket_store_block_drops_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], + 'sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.namespaceMatcher(), ) ) .addPanel( $.panel('Per-block prepares and preloads duration') + - $.latencyPanel('cortex_querier_bucket_store_series_get_all_duration_seconds', '{%s}' % $.namespaceMatcher()), + $.latencyPanel('cortex_storegateway_bucket_store_series_get_all_duration_seconds', '{%s}' % $.namespaceMatcher()), ) .addPanel( $.panel('Series merge duration') + - $.latencyPanel('cortex_querier_bucket_store_series_merge_duration_seconds', '{%s}' % $.namespaceMatcher()), + $.latencyPanel('cortex_storegateway_bucket_store_series_merge_duration_seconds', '{%s}' % $.namespaceMatcher()), ) ) + // Object store metrics for the store-gateway. 
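+ // The second argument to objectStorePanels1/2 is the metric name prefix, so 'cortex_storegateway'
+ // should select series like cortex_storegateway_thanos_objstore_bucket_operations_total (an assumption
+ // based on the '%s_thanos_objstore_bucket_operations_total' pattern in dashboard-utils.libsonnet).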
.addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.objectStorePanels1('Blocks Object Store Stats (Querier)', 'cortex_querier'), + $.objectStorePanels1('Store-gateway - Blocks Object Store', 'cortex_storegateway'), + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.objectStorePanels2('', 'cortex_storegateway'), + ) + // Object store metrics for the querier. + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.objectStorePanels1('Querier - Blocks Object Store', 'cortex_querier'), ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), From d6272ab769630ad09813bedb48e5d5ed3ea18ad7 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 May 2020 14:50:59 +0200 Subject: [PATCH 039/364] Fixed panel Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/blocks.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet index 23f1e380fa3..1e1b1a068e0 100644 --- a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet @@ -58,8 +58,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - g.panel('Metadata Sync Duration') + - g.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{cluster=~"$cluster",job=~"($namespace)/compactor"}'), + $.panel('Metadata Sync Duration') + + $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{cluster=~"$cluster",job=~"($namespace)/compactor"}'), ) ) .addRow( From 99610d173434c7c8a3a1ce237944554230c5a9ff Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 May 2020 14:51:37 +0200 Subject: [PATCH 040/364] Renamed panel Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/blocks.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet index 1e1b1a068e0..2557383e939 100644 --- a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet @@ -49,7 +49,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Compactor - Metadata Fetcher') + $.row('Compactor - Metadata Sync') .addPanel( $.successFailurePanel( 'Metadata Syncs / sec', From e534fca1356188782094ce7b9066b9bb10a7c038 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 6 May 2020 17:05:39 +0200 Subject: [PATCH 041/364] Remove duplicated utils Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboard-utils.libsonnet | 102 ------------------ 1 file changed, 102 deletions(-) delete mode 100644 jsonnet/mimir-mixin/dashboard-utils.libsonnet diff --git a/jsonnet/mimir-mixin/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboard-utils.libsonnet deleted file mode 100644 index 3ae031ea041..00000000000 --- a/jsonnet/mimir-mixin/dashboard-utils.libsonnet +++ /dev/null @@ -1,102 +0,0 @@ -(import 'grafana-builder/grafana.libsonnet') { - - // Override the dashboard constructor to add: - // - default tags, - // - some links that propagate the selectred cluster. 
- dashboard(title):: - super.dashboard(title) + { - tags: $._config.tags, - - links: [ - { - asDropdown: true, - icon: 'external link', - includeVars: true, - keepTime: true, - tags: $._config.tags, - targetBlank: false, - title: 'Cortex Dashboards', - type: 'dashboards', - }, - ], - }, - - qpsPanel(selector):: - super.qpsPanel(selector) + { - targets: [ - target { - interval: '1m', - } - for target in super.targets - ], - }, - - latencyPanel(metricName, selector, multiplier='1e3'):: - super.latencyPanel(metricName, selector, multiplier) + { - targets: [ - target { - interval: '1m', - } - for target in super.targets - ], - }, - - successFailurePanel(title, successMetric, failureMetric):: - $.panel(title) + - $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + - $.stack + { - aliasColors: { - successful: '#7EB26D', - failed: '#E24D42', - }, - }, - - objectStorePanels1(title, metricPrefix):: - local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix]; - local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix]; - local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; - local interval = '$__interval'; - super.row(title) - .addPanel( - // We use 'up{cluster=~"$cluster", job="($namespace)/.+"}' to add 0 if there are no failed operations. - self.successFailurePanel( - 'Operations/sec', - 'sum(rate(%s{cluster=~"$cluster"}[%s])) - sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotal, interval, opsTotalFailures, interval], - 'sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotalFailures, interval] - ) - ) - .addPanel( - $.panel('Op: ObjectSize') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="objectsize"}'), - ) - .addPanel( - // Cortex (Thanos) doesn't track timing for 'iter', so we use ops/sec instead. 
- $.panel('Op: Iter') + - $.queryPanel('sum(rate(%s{cluster=~"$cluster", operation="iter"}[$__interval]))' % [opsTotal], 'ops/sec') - ) - .addPanel( - $.panel('Op: Exists') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="exists"}'), - ), - - // Second row of Object Store stats - objectStorePanels2(title, metricPrefix):: - local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; - super.row(title) - .addPanel( - $.panel('Op: Get') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get"}'), - ) - .addPanel( - $.panel('Op: GetRange') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get_range"}'), - ) - .addPanel( - $.panel('Op: Upload') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="upload"}'), - ) - .addPanel( - $.panel('Op: Delete') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="delete"}'), - ), -} From 91fd9b3dd23b97de6a6f79711f85701114367aaf Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 6 May 2020 18:03:43 +0200 Subject: [PATCH 042/364] Fixed memcached QPS panel in Cortex / Reads Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 28d25917380..ee5429ada2d 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -88,10 +88,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Blocks Index') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds_count{%s,operation="getmulti"}' % $.jobMatcher('store-gateway')) + $.queryPanel('sum by(operation) (rate(cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds_count{%s}[$__interval]))' % $.jobMatcher('store-gateway'), '{{operation}}') + + $.stack, ) .addPanel( - $.panel('Latency') + + $.panel('Latency (getmulti)') + $.latencyPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % $.jobMatcher('store-gateway')) ) ) From 5d2133714bb76e297e827b4197025ee50d14722d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 6 May 2020 22:20:46 +0200 Subject: [PATCH 043/364] Added Compactor dashboard Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards.libsonnet | 2 +- .../mimir-mixin/dashboards/blocks.libsonnet | 92 ------------------- .../dashboards/compactor.libsonnet | 77 ++++++++++++++++ .../dashboards/dashboard-utils.libsonnet | 28 ++++++ 4 files changed, 106 insertions(+), 93 deletions(-) delete mode 100644 jsonnet/mimir-mixin/dashboards/blocks.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/compactor.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 73ac28fb46b..a1ca1f27e65 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -7,7 +7,7 @@ (import 'dashboards/writes.libsonnet') + (if std.setMember('tsdb', $._config.storage_engine) - then import 'dashboards/blocks.libsonnet' + then import 'dashboards/compactor.libsonnet' else {}) + (if std.setMember('chunks', $._config.storage_engine) diff --git a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet b/jsonnet/mimir-mixin/dashboards/blocks.libsonnet deleted file mode 100644 index 
2557383e939..00000000000 --- a/jsonnet/mimir-mixin/dashboards/blocks.libsonnet +++ /dev/null @@ -1,92 +0,0 @@ -local utils = import 'mixin-utils/utils.libsonnet'; - -(import 'dashboard-utils.libsonnet') { - 'cortex-blocks.json': - $.dashboard('Cortex / Blocks') - .addClusterSelectorTemplates() - // repeated from Cortex / Chunks - .addRow( - $.row('Active Series') - .addPanel( - $.panel('Series') + - $.queryPanel('sum(cortex_ingester_memory_series{cluster=~"$cluster", job=~"($namespace)/ingester"})', 'series'), - ) - // Chunks per series doesn't make sense for Blocks storage - ) - .addRow( - $.row('Compactor') - .addPanel( - $.successFailurePanel( - 'Compactor Runs / second', - 'sum(rate(cortex_compactor_runs_completed_total{cluster=~"$cluster",job=~"($namespace)/compactor"}[$__interval]))', - 'sum(rate(cortex_compactor_runs_failed_total{cluster=~"$cluster",job=~"($namespace)/compactor"}[$__interval]))' - ) - ) - .addPanel( - $.successFailurePanel( - 'Per-tenant Compaction Runs / seconds', - 'sum(rate(cortex_compactor_group_compactions_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_group_compactions_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - 'sum(rate(cortex_compactor_group_compactions_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - ) - ) - ) - .addRow( - $.row('Compactor – Blocks Garbage Collections') - .addPanel( - $.successFailurePanel( - 'Collections Rate', - 'sum(rate(cortex_compactor_garbage_collection_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_garbage_collection_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - 'sum(rate(cortex_compactor_garbage_collection_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - ) - ) - .addPanel( - $.panel('Collections Duration') + - $.latencyPanel('cortex_compactor_garbage_collection_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') - ) - .addPanel( - $.panel('Collected Blocks Rate') + - $.queryPanel('sum(rate(cortex_compactor_garbage_collected_blocks_total{cluster=~"$cluster",job=~"($namespace)/compactor"}[$__interval]))', 'blocks') - ) - ) - .addRow( - $.row('Compactor - Metadata Sync') - .addPanel( - $.successFailurePanel( - 'Metadata Syncs / sec', - 'sum(rate(cortex_compactor_meta_syncs_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - 'sum(rate(cortex_compactor_meta_sync_failures_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', - ) - ) - .addPanel( - $.panel('Metadata Sync Duration') + - $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{cluster=~"$cluster",job=~"($namespace)/compactor"}'), - ) - ) - .addRow( - $.row('Prometheus TSDB Compactions') - .addPanel( - $.panel('Compactions Rate') + - $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{cluster=~"$cluster", job=~"($namespace)/compactor"}[$__interval]))', 'rate') - ) - .addPanel( - $.panel('Compaction Duration') + - $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') - ) - .addPanel( - $.panel('Chunk Size Bytes') + - $.latencyPanel('prometheus_tsdb_compaction_chunk_size_bytes', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + - { yaxes: 
$.yaxes('bytes') } - ) - .addPanel( - $.panel('Chunk Samples') + - $.latencyPanel('prometheus_tsdb_compaction_chunk_samples', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') + - { yaxes: $.yaxes('short') } - ) - .addPanel( - $.panel('Chunk Range (seconds)') + - $.latencyPanel('prometheus_tsdb_compaction_chunk_range_seconds', '{cluster=~"$cluster", job=~"($namespace)/compactor"}') - ) - ) - .addRow($.objectStorePanels1('Object Store Stats', 'cortex_compactor')) - .addRow($.objectStorePanels2('', 'cortex_compactor')), -} diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet new file mode 100644 index 00000000000..e9b9e2a1e8b --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -0,0 +1,77 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-compactor.json': + $.dashboard('Cortex / Compactor') + .addClusterSelectorTemplates() + .addRow( + $.row('Compactions') + .addPanel( + $.startedCompletedFailedPanel( + 'Per-instance runs / sec', + 'sum(rate(cortex_compactor_runs_started_total{%s}[$__interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_runs_completed_total{%s}[$__interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__interval]))' % $.jobMatcher('compactor') + ) + + $.bars + + { yaxes: $.yaxes('ops') }, + ) + .addPanel( + $.successFailurePanel( + 'Per-tenant runs / sec', + 'sum(rate(cortex_compactor_group_compactions_total{%s}[$__interval])) - sum(rate(cortex_compactor_group_compactions_failures_total{%s}[$__interval]))' % [$.jobMatcher('compactor'), $.jobMatcher('compactor')], + 'sum(rate(cortex_compactor_group_compactions_failures_total{%s}[$__interval]))' % $.jobMatcher('compactor'), + ) + + $.bars + + { yaxes: $.yaxes('ops') }, + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Compacted blocks / sec') + + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__interval]))' % $.jobMatcher('compactor'), 'blocks') + + { yaxes: $.yaxes('ops') }, + ) + .addPanel( + $.panel('Compaction Duration') + + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher('compactor')) + ) + ) + .addRow( + $.row('Garbage Collector') + .addPanel( + $.panel('Blocks marked for deletion / sec') + + $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__interval]))' % $.jobMatcher('compactor'), 'blocks') + + { yaxes: $.yaxes('ops') }, + ) + .addPanel( + $.successFailurePanel( + 'Blocks deletions / sec', + // The cortex_compactor_blocks_cleaned_total tracks the number of successfully + // deleted blocks. + 'sum(rate(cortex_compactor_blocks_cleaned_total{%s}[$__interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_block_cleanup_failures_total{%s}[$__interval]))' % $.jobMatcher('compactor'), + ) + { yaxes: $.yaxes('ops') } + ) + ) + .addRow( + $.row('Metadata Sync') + .addPanel( + $.successFailurePanel( + 'Metadata Syncs / sec', + // The cortex_compactor_meta_syncs_total metric is incremented each time a per-tenant + // metadata sync is triggered. 
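+ // $.jobMatcher('compactor') is assumed to expand to the inline matchers these dashboards previously
+ // spelled out, e.g. cluster=~"$cluster", job=~"($namespace)/compactor"; compare the $.jobSelector()
+ // helper in dashboard-utils.libsonnet for the equivalent selector-list form.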
+ 'sum(rate(cortex_compactor_meta_syncs_total{%s}[$__interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__interval]))' % [$.jobMatcher('compactor'), $.jobMatcher('compactor')], + 'sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__interval]))' % $.jobMatcher('compactor'), + ) + { yaxes: $.yaxes('ops') } + ) + .addPanel( + $.panel('Metadata Sync Duration') + + // This metric tracks the duration of a per-tenant metadata sync. + $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher('compactor')), + ) + ) + .addRow($.objectStorePanels1('Object Store', 'cortex_compactor')) + .addRow($.objectStorePanels2('', 'cortex_compactor')), +} diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 8df4b132e33..b3a04658e0b 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -55,6 +55,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; then [utils.selector.noop('cluster'), utils.selector.re('job', '$job')] else [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($namespace)/%s' % job)], + queryPanel(queries, legends, legendLink=null):: + super.queryPanel(queries, legends, legendLink) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + }, + qpsPanel(selector):: super.qpsPanel(selector) + { targets: [ @@ -85,6 +95,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, + // Displays started, completed and failed rate. + startedCompletedFailedPanel(title, startedMetric, completedMetric, failedMetric):: + $.panel(title) + + $.queryPanel([startedMetric, completedMetric, failedMetric], ['started', 'completed', 'failed']) + + $.stack + { + aliasColors: { + started: '#34CCEB', + completed: '#7EB26D', + failed: '#E24D42', + }, + }, + + // Switches a panel from lines (default) to bars. + bars:: { + bars: true, + lines: false, + }, + objectStorePanels1(title, metricPrefix):: local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix]; local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix]; From 28f8e5aaa8993ecedcf055e4d20955d848d970ee Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 7 May 2020 08:07:49 +0200 Subject: [PATCH 044/364] Added text panel to explain some compactor metrics Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 14 +++++++++++++- .../dashboards/dashboard-utils.libsonnet | 10 ++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index e9b9e2a1e8b..e47c96442db 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -6,6 +6,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addClusterSelectorTemplates() .addRow( $.row('Compactions') + .addPanel( + $.textPanel('', ||| + - **Per-instance runs**: number of times a compactor instance triggers a compaction across all tenants its shard manage. + - **Per-tenant runs**: number of times a compactor instance triggers the compaction for a single tenant's blocks. 
+ |||), + ) .addPanel( $.startedCompletedFailedPanel( 'Per-instance runs / sec', @@ -28,13 +34,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRow( $.row('') + .addPanel( + $.textPanel('', ||| + - **Compacted blocks**: number of blocks generated as a result of a compaction operation. + - **Per-block compaction duration**: time taken to generate a single compacted block. + |||), + ) .addPanel( $.panel('Compacted blocks / sec') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__interval]))' % $.jobMatcher('compactor'), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( - $.panel('Compaction Duration') + + $.panel('Per-block compaction duration') + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher('compactor')) ) ) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index b3a04658e0b..e384a3f20e5 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -113,6 +113,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; lines: false, }, + textPanel(title, content, options={}):: { + content: content, + datasource: null, + description: '', + mode: 'markdown', + title: title, + transparent: true, + type: 'text', + } + options, + objectStorePanels1(title, metricPrefix):: local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix]; local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix]; From d2f0140c25565e1710f5580b469849454684aed2 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 7 May 2020 08:37:45 +0200 Subject: [PATCH 045/364] Fixed Ingester panel in the 'Cortex / Reads' dashboard Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index ee5429ada2d..799d2b869f4 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -52,11 +52,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher('ingester')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('querier') + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) ) .addRowIf( From 94f5b2699c4f9fb28921a8fee973789e704779bc Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 7 May 2020 11:42:58 +0200 Subject: [PATCH 046/364] Improved blocks storage support in the Reads and Queries dashboards Signed-off-by: Marco Pracucci --- .../dashboards/dashboard-utils.libsonnet | 21 +++--- 
.../mimir-mixin/dashboards/queries.libsonnet | 69 +++++++++++++++++-- .../mimir-mixin/dashboards/reads.libsonnet | 52 ++++++-------- 3 files changed, 97 insertions(+), 45 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index e384a3f20e5..e084d546040 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -127,28 +127,27 @@ local utils = import 'mixin-utils/utils.libsonnet'; local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix]; local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix]; local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; - local interval = '$__interval'; super.row(title) .addPanel( - // We use 'up{cluster=~"$cluster", job="($namespace)/.+"}' to add 0 if there are no failed operations. + // We use 'up' to add 0 if there are no failed operations. self.successFailurePanel( 'Operations/sec', - 'sum(rate(%s{cluster=~"$cluster"}[%s])) - sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotal, interval, opsTotalFailures, interval], - 'sum(rate(%s{cluster=~"$cluster"}[%s]) or (up{cluster=~"$cluster", job="($namespace)/.+"}*0))' % [opsTotalFailures, interval] + 'sum(rate(%s{%s}[$__interval])) - sum(rate(%s{%s}[$__interval]) or (up{%s}*0))' % [opsTotal, $.namespaceMatcher(), opsTotalFailures, $.namespaceMatcher(), $.namespaceMatcher()], + 'sum(rate(%s{%s}[$__interval]) or (up{%s}*0))' % [opsTotalFailures, $.namespaceMatcher(), $.namespaceMatcher()] ) ) .addPanel( $.panel('Op: ObjectSize') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="objectsize"}'), + $.latencyPanel(operationDuration, '{%s, operation="objectsize"}' % $.namespaceMatcher()), ) .addPanel( // Cortex (Thanos) doesn't track timing for 'iter', so we use ops/sec instead. 
$.panel('Op: Iter') + - $.queryPanel('sum(rate(%s{cluster=~"$cluster", operation="iter"}[$__interval]))' % [opsTotal], 'ops/sec') + $.queryPanel('sum(rate(%s{%s, operation="iter"}[$__interval]))' % [opsTotal, $.namespaceMatcher()], 'ops/sec') ) .addPanel( $.panel('Op: Exists') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="exists"}'), + $.latencyPanel(operationDuration, '{%s, operation="exists"}' % $.namespaceMatcher()), ), // Second row of Object Store stats @@ -157,18 +156,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.row(title) .addPanel( $.panel('Op: Get') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get"}'), + $.latencyPanel(operationDuration, '{%s, operation="get"}' % $.namespaceMatcher()), ) .addPanel( $.panel('Op: GetRange') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="get_range"}'), + $.latencyPanel(operationDuration, '{%s, operation="get_range"}' % $.namespaceMatcher()), ) .addPanel( $.panel('Op: Upload') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="upload"}'), + $.latencyPanel(operationDuration, '{%s, operation="upload"}' % $.namespaceMatcher()), ) .addPanel( $.panel('Op: Delete') + - $.latencyPanel(operationDuration, '{cluster=~"$cluster", operation="delete"}'), + $.latencyPanel(operationDuration, '{%s, operation="delete"}' % $.namespaceMatcher()), ), } diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 18257887b85..85c68e4429a 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -65,8 +65,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher('querier'), 'Corrupt chunks'), ) ) - .addRow( - $.row('Querier - Index Cache') + .addRowIf( + std.setMember('chunks', $._config.storage_engine), + $.row('Querier - Chunks storage - Index Cache') .addPanel( $.panel('Total entries') + $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher('querier'), $.jobMatcher('querier')], 'Entries'), @@ -99,8 +100,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('short') }, ) ) - .addRow( - $.row('Chunk Store') + .addRowIf( + std.setMember('chunks', $._config.storage_engine), + $.row('Querier - Chunks storage - Store') .addPanel( $.panel('Index Lookups per Query') + utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector('querier'), multiplier=1) + @@ -121,5 +123,64 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector('querier'), multiplier=1) + { yaxes: $.yaxes('short') }, ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Store-gateway - Blocks') + .addPanel( + $.panel('Blocks queried / sec') + + $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_blocks_queried_sum{%s}[$__interval]))' % $.jobMatcher('store-gateway'), 'blocks') + + { yaxes: $.yaxes('ops') }, + ) + .addPanel( + $.panel('Data fetched / sec') + + $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_fetched_sum{%s}[$__interval]))' % $.jobMatcher('store-gateway'), '{{data_type}}') + + $.stack + + { yaxes: $.yaxes('ops') }, + ) + .addPanel( + $.panel('Data 
touched / sec') + + $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_touched_sum{%s}[$__interval]))' % $.jobMatcher('store-gateway'), '{{data_type}}') + + $.stack + + { yaxes: $.yaxes('ops') }, + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('') + .addPanel( + $.panel('Series fetch duration (per request)') + + $.latencyPanel('cortex_storegateway_bucket_store_series_get_all_duration_seconds', '{%s}' % $.jobMatcher('store-gateway')), + ) + .addPanel( + $.panel('Series merge duration (per request)') + + $.latencyPanel('cortex_storegateway_bucket_store_series_merge_duration_seconds', '{%s}' % $.jobMatcher('store-gateway')), + ) + .addPanel( + $.panel('Series returned (per request)') + + $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_result_series_sum{%s}[$__interval])) / sum(rate(cortex_storegateway_bucket_store_series_result_series_count{%s}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], 'avg series returned'), + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('') + .addPanel( + $.panel('Blocks currently loaded') + + $.queryPanel('cortex_storegateway_bucket_store_blocks_loaded{%s}' % $.jobMatcher('store-gateway'), '{{instance}}') + ) + .addPanel( + $.successFailurePanel( + 'Blocks loaded / sec', + 'sum(rate(cortex_storegateway_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], + 'sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.jobMatcher('store-gateway'), + ) + ) + .addPanel( + $.successFailurePanel( + 'Blocks dropped / sec', + 'sum(rate(cortex_storegateway_bucket_store_block_drops_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], + 'sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.jobMatcher('store-gateway'), + ) + ) ), } diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 799d2b869f4..b67590cc566 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -59,9 +59,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Store-gateway') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher('store-gateway')) + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('store-gateway') + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) + ) + ) .addRowIf( std.setMember('chunks', $._config.storage_engine), - $.row('Memcached - Index') + $.row('Memcached - Chunks storage - Index') .addPanel( $.panel('QPS') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher('querier')) @@ -73,7 +85,7 @@ local utils = import 
'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('chunks', $._config.storage_engine), - $.row('Memcached - Chunks') + $.row('Memcached - Chunks storage - Chunks') .addPanel( $.panel('QPS') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher('querier')) @@ -85,16 +97,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Memcached - Blocks Index') + $.row('Memcached - Blocks Storage - Index header') .addPanel( $.panel('QPS') + $.queryPanel('sum by(operation) (rate(cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds_count{%s}[$__interval]))' % $.jobMatcher('store-gateway'), '{{operation}}') + - $.stack, + $.stack + + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Latency (getmulti)') + $.latencyPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % $.jobMatcher('store-gateway')) ) + .addPanel( + $.panel('Hit ratio') + + $.queryPanel('sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_hits_total{%s}[$__interval])) / sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_requests_total{%s}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], '{{item_type}}') + + { yaxes: $.yaxes('percentunit') }, + ) ) .addRowIf( std.setMember('chunks', $._config.storage_engine) && @@ -148,32 +166,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', 'GET')]) ) ) - .addRowIf( - std.setMember('tsdb', $._config.storage_engine), - $.row('Store-gateway - Blocks') - .addPanel( - $.successFailurePanel( - 'Block Loads / sec', - 'sum(rate(cortex_storegateway_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], - 'sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.namespaceMatcher(), - ) - ) - .addPanel( - $.successFailurePanel( - 'Block Drops / sec', - 'sum(rate(cortex_storegateway_bucket_store_block_drops_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], - 'sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.namespaceMatcher(), - ) - ) - .addPanel( - $.panel('Per-block prepares and preloads duration') + - $.latencyPanel('cortex_storegateway_bucket_store_series_get_all_duration_seconds', '{%s}' % $.namespaceMatcher()), - ) - .addPanel( - $.panel('Series merge duration') + - $.latencyPanel('cortex_storegateway_bucket_store_series_merge_duration_seconds', '{%s}' % $.namespaceMatcher()), - ) - ) // Object store metrics for the store-gateway. 
.addRowIf( std.setMember('tsdb', $._config.storage_engine), From 4ed6813ee0b1348fbb14f5328c712822aeb4689e Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 7 May 2020 17:12:22 +0530 Subject: [PATCH 047/364] Add dashboard for WAL metrics (https://github.com/grafana/cortex-jsonnet/pull/46) Signed-off-by: Ganesh Vernekar --- .../mimir-mixin/dashboards/chunks.libsonnet | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet index 91f7d5b9c85..e4d349323f8 100644 --- a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -50,4 +50,51 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.qpsPanel('cortex_ingester_chunk_age_seconds_count{%s}' % $.jobMatcher('ingester')), ), ), + + 'cortex-wal.json': + $.dashboard('Cortex / WAL') + .addClusterSelectorTemplates() + .addRow( + $.row('') + .addPanel( + $.panel('Bytes Logged (WAL+Checkpoint) / ingester / second') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval])) + avg(rate(cortex_ingester_checkpoint_logged_bytes_total{%(m)s}[$__interval]))' % { m: $.jobMatcher('ingester') }, 'bytes') + + { yaxes: $.yaxes('bytes') }, + ) + ) + .addRow( + $.row('WAL') + .addPanel( + $.panel('Records logged / ingester / second') + + $.queryPanel('avg(rate(cortex_ingester_wal_records_logged_total{%s}[$__interval]))' % $.jobMatcher('ingester'), 'records'), + ) + .addPanel( + $.panel('Bytes per record') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval]) / rate(cortex_ingester_wal_records_logged_total{%(m)s}[$__interval]))' % { m: $.jobMatcher('ingester') }, 'bytes') + + { yaxes: $.yaxes('bytes') }, + ) + .addPanel( + $.panel('Bytes per sample') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval]) / rate(cortex_ingester_ingested_samples_total{%(m)s}[$__interval]))' % { m: $.jobMatcher('ingester') }, 'bytes') + + { yaxes: $.yaxes('bytes') }, + ) + .addPanel( + $.panel('Min(available disk space)') + + $.queryPanel('min(kubelet_volume_stats_available_bytes{cluster=~"$cluster", namespace=~"$namespace", persistentvolumeclaim=~"ingester.*"})', 'bytes') + + { yaxes: $.yaxes('bytes') }, + ) + ) + .addRow( + $.row('Checkpoint') + .addPanel( + $.panel('Checkpoint creation/deletion / sec') + + $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__interval])' % $.jobMatcher('ingester'), '{{instance}}-creation') + + $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__interval])' % $.jobMatcher('ingester'), '{{instance}}-deletion'), + ) + .addPanel( + $.panel('Checkpoint creation/deletion failed / sec') + + $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__interval])' % $.jobMatcher('ingester'), '{{instance}}-creation') + + $.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__interval])' % $.jobMatcher('ingester'), '{{instance}}-deletion'), + ) + ), } From f47f37f1f74a0bf03c7e67f650ffb3e995f988b0 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 7 May 2020 19:13:31 +0530 Subject: [PATCH 048/364] Fix Distributor panel in Writes dashboard (https://github.com/grafana/cortex-jsonnet/pull/58) Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 00fb3d3e613..ed765770920 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -45,11 +45,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Distributor') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route="api_prom_push"}' % $.jobMatcher('distributor')) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/httpgrpc.*|api_prom_push"}' % $.jobMatcher('distributor')) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('distributor') + [utils.selector.eq('route', 'api_prom_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('distributor') + [utils.selector.re('route', '/httpgrpc.*|api_prom_push')]) ) ) .addRow( From 927254521e8adb462832acd6c851e4a450df8d69 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 8 May 2020 11:00:50 +0200 Subject: [PATCH 049/364] Updated Object Store metrics in dashboards Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards.libsonnet | 4 +- .../dashboards/compactor.libsonnet | 4 +- .../dashboards/dashboard-utils.libsonnet | 38 +++++------ .../dashboards/object-store.libsonnet | 65 +++++++++++++++++++ .../mimir-mixin/dashboards/reads.libsonnet | 8 +-- .../mimir-mixin/dashboards/writes.libsonnet | 18 ++--- 6 files changed, 97 insertions(+), 40 deletions(-) create mode 100644 jsonnet/mimir-mixin/dashboards/object-store.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index a1ca1f27e65..bf068135805 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -7,7 +7,9 @@ (import 'dashboards/writes.libsonnet') + (if std.setMember('tsdb', $._config.storage_engine) - then import 'dashboards/compactor.libsonnet' + then + (import 'dashboards/compactor.libsonnet') + + (import 'dashboards/object-store.libsonnet') else {}) + (if std.setMember('chunks', $._config.storage_engine) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index e47c96442db..2327b4d1e34 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -84,6 +84,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher('compactor')), ) ) - .addRow($.objectStorePanels1('Object Store', 'cortex_compactor')) - .addRow($.objectStorePanels2('', 'cortex_compactor')), + .addRow($.objectStorePanels1('Object Store', 'compactor')) + .addRow($.objectStorePanels2('', 'compactor')), } diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index e084d546040..dddf60c4266 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -123,51 +123,45 @@ local utils = import 'mixin-utils/utils.libsonnet'; type: 'text', } + options, - objectStorePanels1(title, metricPrefix):: - local opsTotal = '%s_thanos_objstore_bucket_operations_total' % [metricPrefix]; - local opsTotalFailures = '%s_thanos_objstore_bucket_operation_failures_total' % [metricPrefix]; - local operationDuration = 
'%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; + objectStorePanels1(title, component):: super.row(title) .addPanel( - // We use 'up' to add 0 if there are no failed operations. - self.successFailurePanel( - 'Operations/sec', - 'sum(rate(%s{%s}[$__interval])) - sum(rate(%s{%s}[$__interval]) or (up{%s}*0))' % [opsTotal, $.namespaceMatcher(), opsTotalFailures, $.namespaceMatcher(), $.namespaceMatcher()], - 'sum(rate(%s{%s}[$__interval]) or (up{%s}*0))' % [opsTotalFailures, $.namespaceMatcher(), $.namespaceMatcher()] - ) + $.panel('Operations / sec') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + + $.stack + + { yaxes: $.yaxes('rps') }, ) .addPanel( - $.panel('Op: ObjectSize') + - $.latencyPanel(operationDuration, '{%s, operation="objectsize"}' % $.namespaceMatcher()), + $.panel('Error rate') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s,component="%s"}[$__interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__interval]))' % [$.namespaceMatcher(), component, $.namespaceMatcher(), component], '{{operation}}') + + { yaxes: $.yaxes('percentunit') }, ) .addPanel( - // Cortex (Thanos) doesn't track timing for 'iter', so we use ops/sec instead. - $.panel('Op: Iter') + - $.queryPanel('sum(rate(%s{%s, operation="iter"}[$__interval]))' % [opsTotal, $.namespaceMatcher()], 'ops/sec') + $.panel('Op: ObjectSize') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="objectsize"}' % [$.namespaceMatcher(), component]), ) .addPanel( $.panel('Op: Exists') + - $.latencyPanel(operationDuration, '{%s, operation="exists"}' % $.namespaceMatcher()), + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="exists"}' % [$.namespaceMatcher(), component]), ), // Second row of Object Store stats - objectStorePanels2(title, metricPrefix):: - local operationDuration = '%s_thanos_objstore_bucket_operation_duration_seconds' % [metricPrefix]; + objectStorePanels2(title, component):: super.row(title) .addPanel( $.panel('Op: Get') + - $.latencyPanel(operationDuration, '{%s, operation="get"}' % $.namespaceMatcher()), + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get"}' % [$.namespaceMatcher(), component]), ) .addPanel( $.panel('Op: GetRange') + - $.latencyPanel(operationDuration, '{%s, operation="get_range"}' % $.namespaceMatcher()), + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get_range"}' % [$.namespaceMatcher(), component]), ) .addPanel( $.panel('Op: Upload') + - $.latencyPanel(operationDuration, '{%s, operation="upload"}' % $.namespaceMatcher()), + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="upload"}' % [$.namespaceMatcher(), component]), ) .addPanel( $.panel('Op: Delete') + - $.latencyPanel(operationDuration, '{%s, operation="delete"}' % $.namespaceMatcher()), + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), } diff --git a/jsonnet/mimir-mixin/dashboards/object-store.libsonnet b/jsonnet/mimir-mixin/dashboards/object-store.libsonnet new file mode 100644 index 00000000000..c02160236a0 --- /dev/null +++ 
b/jsonnet/mimir-mixin/dashboards/object-store.libsonnet @@ -0,0 +1,65 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-object-store.json': + $.dashboard('Cortex / Object Store') + .addClusterSelectorTemplates() + .addRow( + $.row('Components') + .addPanel( + $.panel('RPS / component') + + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__interval]))' % $.namespaceMatcher(), '{{component}}') + + $.stack + + { yaxes: $.yaxes('rps') }, + ) + .addPanel( + $.panel('Error rate / component') + + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') + + { yaxes: $.yaxes('percentunit') }, + ) + ) + .addRow( + $.row('Operations') + .addPanel( + $.panel('RPS / operation') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__interval]))' % $.namespaceMatcher(), '{{operation}}') + + $.stack + + { yaxes: $.yaxes('rps') }, + ) + .addPanel( + $.panel('Error rate / operation') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') + + { yaxes: $.yaxes('percentunit') }, + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Op: Get') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get"}' % $.namespaceMatcher()), + ) + .addPanel( + $.panel('Op: GetRange') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get_range"}' % $.namespaceMatcher()), + ) + .addPanel( + $.panel('Op: Exists') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="exists"}' % $.namespaceMatcher()), + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Op: ObjectSize') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="objectsize"}' % $.namespaceMatcher()), + ) + .addPanel( + $.panel('Op: Upload') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="upload"}' % $.namespaceMatcher()), + ) + .addPanel( + $.panel('Op: Delete') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="delete"}' % $.namespaceMatcher()), + ) + ), +} diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index b67590cc566..f0bc2d8de89 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -169,19 +169,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Object store metrics for the store-gateway. .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.objectStorePanels1('Store-gateway - Blocks Object Store', 'cortex_storegateway'), + $.objectStorePanels1('Store-gateway - Blocks Object Store', 'store-gateway'), ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.objectStorePanels2('', 'cortex_storegateway'), + $.objectStorePanels2('', 'store-gateway'), ) // Object store metrics for the querier. 
.addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.objectStorePanels1('Querier - Blocks Object Store', 'cortex_querier'), + $.objectStorePanels1('Querier - Blocks Object Store', 'querier'), ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.objectStorePanels2('', 'cortex_querier'), + $.objectStorePanels2('', 'querier'), ), } diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index ed765770920..64a6a6d5a75 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -151,21 +151,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Blocks Shipper') + $.row('Ingester - Blocks storage - Shipper') .addPanel( $.successFailurePanel( 'Uploaded blocks / sec', - 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], - 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % [$.namespaceMatcher()], + 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % [$.jobMatcher('ingester'), $.jobMatcher('ingester')], + 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % $.jobMatcher('ingester'), ), ) - ) - .addRowIf( - std.setMember('tsdb', $._config.storage_engine), - $.objectStorePanels1('Blocks Object Store Stats (Ingester)', 'cortex_ingester'), - ) - .addRowIf( - std.setMember('tsdb', $._config.storage_engine), - $.objectStorePanels2('', 'cortex_ingester'), + .addPanel( + $.panel('Upload latency') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher('ingester')), + ) ), } From 30cdd924ac6eca4ffe88f507d38347d8cc6f1297 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 8 May 2020 13:17:04 +0200 Subject: [PATCH 050/364] Moved alerts to dedicated folder Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts.libsonnet | 460 +----------------- .../mimir-mixin/alerts/alert-utils.libsonnet | 8 + jsonnet/mimir-mixin/alerts/alerts.libsonnet | 449 +++++++++++++++++ jsonnet/mimir-mixin/config.libsonnet | 3 + 4 files changed, 463 insertions(+), 457 deletions(-) create mode 100644 jsonnet/mimir-mixin/alerts/alert-utils.libsonnet create mode 100644 jsonnet/mimir-mixin/alerts/alerts.libsonnet diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index a6a29b91dc0..7d1867caf81 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -1,460 +1,6 @@ { - _config+:: { - cortex_p99_latency_threshold_seconds: 2.5, - alert_namespace_matcher: '', - }, + prometheusAlerts+:: + (import 'alerts/alerts.libsonnet') + - prometheusAlerts+:: { - local namespace_matcher(prefix='') = - if std.length($._config.alert_namespace_matcher) != 0 - then '%s namespace=~"%s"' % [prefix, $._config.alert_namespace_matcher] - else '', - groups+: [ - { - name: 'cortex_alerts', - rules: [ - { - alert: 'CortexIngesterUnhealthy', - 'for': '15m', - expr: ||| - min(cortex_ring_members{state="Unhealthy", job=~"[a-z]+/distributor" %s}) by (namespace, job) > 0 - ||| % namespace_matcher(','), - labels: { - severity: 'critical', - }, - annotations: { - message: '{{ $labels.job }} reports more than one unhealthy 
ingester.', - }, - }, - { - alert: 'CortexFlushStuck', - expr: ||| - (cortex_ingester_memory_chunks / cortex_ingester_memory_series) > 1.3 - |||, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: '{{ $labels.job }}/{{ $labels.instance }} is stuck flushing chunks.', - }, - }, - { - alert: 'CortexRequestErrors', - expr: ||| - 100 * sum(rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) - / - sum(rate(cortex_request_duration_seconds_count[1m])) by (namespace, job, route) - > 1 - |||, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, - }, - }, - { - alert: 'CortexRequestLatency', - expr: ||| - cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} - > - %(cortex_p99_latency_threshold_seconds)s - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - |||, - }, - }, - { - // We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail - // and we will never trigger the alert. - // We also have a 3h grace-period for creation of tables which means the we can fail for 3h before it's an outage. - alert: 'CortexTableSyncFailure', - expr: ||| - 100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m]) - / - rate(cortex_table_manager_sync_duration_seconds_count[15m]) - > 10 - |||, - 'for': '30m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables. - |||, - }, - }, - { - alert: 'CortexQueriesIncorrect', - expr: ||| - 100 * sum by (job, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) - / - sum by (job, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 - |||, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }} is reporting incorrect results for {{ printf "%.2f" $value }}% of queries. - |||, - }, - }, - { - alert: 'CortexBadOverrides', - expr: ||| - cortex_overrides_last_reload_successful{job!~".+/table-manager|.+/alertmanager" %s} == 0 - ||| % namespace_matcher(','), - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }} failed to reload overrides. - |||, - }, - }, - { - alert: 'CortexQuerierCapacityFull', - expr: ||| - prometheus_engine_queries_concurrent_max{job=~".+/querier"} - prometheus_engine_queries{job=~".+/querier"} == 0 - |||, - 'for': '5m', // We don't want to block for longer. - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} is at capacity processing queries. - |||, - }, - }, - { - alert: 'CortexFrontendQueriesStuck', - expr: ||| - sum by (namespace) (cortex_query_frontend_queue_length{job=~".+/query-frontend" %s}) > 1 - ||| % namespace_matcher(','), - 'for': '5m', // We don't want to block for longer. - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} has {{ $value }} queued up queries. - |||, - }, - }, - { - alert: 'CortexCacheRequestErrors', - expr: ||| - 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." 
%s}[1m])) by (namespace, job, method) - / - sum(rate(cortex_cache_request_duration_seconds_count{%s}[1m])) by (namespace, job, method) - > 1 - ||| % [namespace_matcher(','), namespace_matcher()], - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }} cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, - }, - }, - { - alert: 'CortexIngesterRestarts', - expr: ||| - rate(kube_pod_container_status_restarts_total{container="ingester" %s}[30m]) > 0 - ||| % namespace_matcher(','), - labels: { - severity: 'critical', - }, - annotations: { - message: '{{ $labels.namespace }}/{{ $labels.pod }} is restarting', - }, - }, - { - alert: 'CortexTransferFailed', - expr: ||| - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success" %s}[15m]) - ||| % namespace_matcher(','), - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - message: '{{ $labels.namespace }}/{{ $labels.instance }} transfer failed.', - }, - }, - { - alert: 'CortexOldChunkInMemory', - // Even though we should flush chunks after 6h, we see that 99p of age of flushed chunks is closer - // to 10 hours. - // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). - expr: ||| - (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) and cortex_oldest_unflushed_chunk_timestamp_seconds > 0 - |||, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} has very old unflushed chunk in memory. - |||, - }, - }, - { - // Alert immediately if WAL is corrupt. - alert: 'CortexWALCorruption', - expr: ||| - increase(cortex_ingester_wal_corruptions_total[5m]) > 0 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. - |||, - }, - }, - { - // 1 failed checkpoint creation is a warning. - alert: 'CortexCheckpointCreationFailed', - expr: ||| - increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 - |||, - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} failed to create checkpoint. - |||, - }, - }, - { - // 2 or more failed checkpoint creation in 1h means something is wrong. - alert: 'CortexCheckpointCreationFailing', - expr: ||| - increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} is failing to create checkpoint. - |||, - }, - }, - { - // 1 failed checkpoint deletion is a warning. - alert: 'CortexCheckpointDeletionFailed', - expr: ||| - increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 - |||, - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} failed to delete checkpoint. - |||, - }, - }, - { - // 2 or more failed checkpoint deletion in 2h means something is wrong. - // We give this more buffer than creation as this is a less critical operation. - alert: 'CortexCheckpointDeletionFailed', - expr: ||| - increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete checkpoint. 
- |||, - }, - }, - ], - }, - { - name: 'cortex-provisioning', - rules: [ - { - alert: 'CortexProvisioningMemcachedTooSmall', - // 4 x in-memory series size = 24hrs of data. - expr: ||| - ( - 4 * - sum by(cluster, namespace) (cortex_ingester_memory_series{job=~".+/ingester"} * cortex_ingester_chunk_size_bytes_sum{job=~".+/ingester"} / cortex_ingester_chunk_size_bytes_count{job=~".+/ingester"}) - / 1e9 - ) - > - ( - sum by (cluster, namespace) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 - ) - |||, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - Chunk memcached cluster for namespace {{ $labels.namespace }} are too small, should be at least {{ printf "%.2f" $value }}GB. - |||, - }, - }, - { - alert: 'CortexProvisioningTooManyActiveSeries', - // 1 million active series per ingester max. - expr: ||| - avg by (cluster, namespace) (cortex_ingester_memory_series{job=~".+/ingester"}) > 1.1e6 - and - sum by (cluster, namespace) (rate(cortex_ingester_received_chunks{job=~".+/ingester"}[1h])) == 0 - |||, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - Too many active series for ingesters in namespace {{ $labels.namespace }}, add more ingesters. - |||, - }, - }, - { - alert: 'CortexProvisioningTooManyWrites', - // 80k writes / s per ingester max. - expr: ||| - avg by (cluster,namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 - |||, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - Too much write QPS for ingesters in namespace {{ $labels.namespace }}, add more ingesters. - |||, - }, - }, - { - alert: 'CortexProvisioningTooMuchMemory', - expr: ||| - avg by (cluster, namespace) (container_memory_working_set_bytes{container_name="ingester" %s} / container_spec_memory_limit_bytes{container_name="ingester" %s}) > 0.7 - ||| % [namespace_matcher(','), namespace_matcher(',')], - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - Too much memory being used by ingesters in namespace {{ $labels.namespace }}, add more ingesters. - |||, - }, - }, - ], - }, - { - name: 'memcached', - rules: [ - { - alert: 'MemcachedDown', - expr: ||| - memcached_up == 0 - |||, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - Memcached Instance {{ $labels.instance }} is down for more than 15mins. - |||, - }, - }, - ], - }, - { - name: 'ruler_alerts', - rules: [ - { - alert: 'CortexRulerFailedEvaluations', - expr: ||| - sum(rate(cortex_prometheus_rule_evaluation_failures_total[1m])) by (namespace, job) - / - sum(rate(cortex_prometheus_rule_evaluations_total[1m])) by (namespace, job) - > 0.01 - |||, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, - }, - }, - { - alert: 'CortexRulerMissedEvaluations', - expr: ||| - sum(rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) by (namespace, job) - / - sum(rate(cortex_prometheus_rule_group_iterations_total[1m])) by (namespace, job) - > 0.01 - |||, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% missed iterations. 
- |||, - }, - }, - ], - }, - { - name: 'gossip_alerts', - rules: [ - { - alert: 'CortexGossipMembersMismatch', - expr: ||| - memberlist_client_cluster_members_count{%s} - != on (cluster,namespace) group_left - sum(up{job=~".+/(distributor|ingester|querier)"}) by (cluster,namespace) - ||| % namespace_matcher(), - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.', - }, - }, - ], - }, - ], - }, + { _config:: $._config }, } diff --git a/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet b/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet new file mode 100644 index 00000000000..e72d20c8b3f --- /dev/null +++ b/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet @@ -0,0 +1,8 @@ +{ + _config:: error 'must provide _config for alerts', + + namespace_matcher(prefix=''):: + if std.length($._config.alert_namespace_matcher) != 0 + then '%s namespace=~"%s"' % [prefix, $._config.alert_namespace_matcher] + else '', +} diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet new file mode 100644 index 00000000000..367c1fb969c --- /dev/null +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -0,0 +1,449 @@ +(import 'alert-utils.libsonnet') { + groups+: [ + { + name: 'cortex_alerts', + rules: [ + { + alert: 'CortexIngesterUnhealthy', + 'for': '15m', + expr: ||| + min(cortex_ring_members{state="Unhealthy", job=~"[a-z]+/distributor" %s}) by (namespace, job) > 0 + ||| % $.namespace_matcher(','), + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.job }} reports more than one unhealthy ingester.', + }, + }, + { + alert: 'CortexFlushStuck', + expr: ||| + (cortex_ingester_memory_chunks / cortex_ingester_memory_series) > 1.3 + |||, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.job }}/{{ $labels.instance }} is stuck flushing chunks.', + }, + }, + { + alert: 'CortexRequestErrors', + expr: ||| + 100 * sum(rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) + / + sum(rate(cortex_request_duration_seconds_count[1m])) by (namespace, job, route) + > 1 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + |||, + }, + }, + { + alert: 'CortexRequestLatency', + expr: ||| + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} + > + %(cortex_p99_latency_threshold_seconds)s + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + |||, + }, + }, + { + // We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail + // and we will never trigger the alert. + // We also have a 3h grace-period for creation of tables which means the we can fail for 3h before it's an outage. + alert: 'CortexTableSyncFailure', + expr: ||| + 100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m]) + / + rate(cortex_table_manager_sync_duration_seconds_count[15m]) + > 10 + |||, + 'for': '30m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables. 
+ |||, + }, + }, + { + alert: 'CortexQueriesIncorrect', + expr: ||| + 100 * sum by (job, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) + / + sum by (job, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} is reporting incorrect results for {{ printf "%.2f" $value }}% of queries. + |||, + }, + }, + { + alert: 'CortexBadOverrides', + expr: ||| + cortex_overrides_last_reload_successful{job!~".+/table-manager|.+/alertmanager" %s} == 0 + ||| % $.namespace_matcher(','), + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} failed to reload overrides. + |||, + }, + }, + { + alert: 'CortexQuerierCapacityFull', + expr: ||| + prometheus_engine_queries_concurrent_max{job=~".+/querier"} - prometheus_engine_queries{job=~".+/querier"} == 0 + |||, + 'for': '5m', // We don't want to block for longer. + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} is at capacity processing queries. + |||, + }, + }, + { + alert: 'CortexFrontendQueriesStuck', + expr: ||| + sum by (namespace) (cortex_query_frontend_queue_length{job=~".+/query-frontend" %s}) > 1 + ||| % $.namespace_matcher(','), + 'for': '5m', // We don't want to block for longer. + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} has {{ $value }} queued up queries. + |||, + }, + }, + { + alert: 'CortexCacheRequestErrors', + expr: ||| + 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." %s}[1m])) by (namespace, job, method) + / + sum(rate(cortex_cache_request_duration_seconds_count{%s}[1m])) by (namespace, job, method) + > 1 + ||| % [$.namespace_matcher(','), $.namespace_matcher()], + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. + |||, + }, + }, + { + alert: 'CortexIngesterRestarts', + expr: ||| + rate(kube_pod_container_status_restarts_total{container="ingester" %s}[30m]) > 0 + ||| % $.namespace_matcher(','), + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.namespace }}/{{ $labels.pod }} is restarting', + }, + }, + { + alert: 'CortexTransferFailed', + expr: ||| + max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success" %s}[15m]) + ||| % $.namespace_matcher(','), + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.namespace }}/{{ $labels.instance }} transfer failed.', + }, + }, + { + alert: 'CortexOldChunkInMemory', + // Even though we should flush chunks after 6h, we see that 99p of age of flushed chunks is closer + // to 10 hours. + // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). + expr: ||| + (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) and cortex_oldest_unflushed_chunk_timestamp_seconds > 0 + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} has very old unflushed chunk in memory. + |||, + }, + }, + { + // Alert immediately if WAL is corrupt. 
+ alert: 'CortexWALCorruption', + expr: ||| + increase(cortex_ingester_wal_corruptions_total[5m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. + |||, + }, + }, + { + // 1 failed checkpoint creation is a warning. + alert: 'CortexCheckpointCreationFailed', + expr: ||| + increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 + |||, + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} failed to create checkpoint. + |||, + }, + }, + { + // 2 or more failed checkpoint creation in 1h means something is wrong. + alert: 'CortexCheckpointCreationFailing', + expr: ||| + increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} is failing to create checkpoint. + |||, + }, + }, + { + // 1 failed checkpoint deletion is a warning. + alert: 'CortexCheckpointDeletionFailed', + expr: ||| + increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 + |||, + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} failed to delete checkpoint. + |||, + }, + }, + { + // 2 or more failed checkpoint deletion in 2h means something is wrong. + // We give this more buffer than creation as this is a less critical operation. + alert: 'CortexCheckpointDeletionFailed', + expr: ||| + increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete checkpoint. + |||, + }, + }, + ], + }, + { + name: 'cortex-provisioning', + rules: [ + { + alert: 'CortexProvisioningMemcachedTooSmall', + // 4 x in-memory series size = 24hrs of data. + expr: ||| + ( + 4 * + sum by(cluster, namespace) (cortex_ingester_memory_series{job=~".+/ingester"} * cortex_ingester_chunk_size_bytes_sum{job=~".+/ingester"} / cortex_ingester_chunk_size_bytes_count{job=~".+/ingester"}) + / 1e9 + ) + > + ( + sum by (cluster, namespace) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 + ) + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Chunk memcached cluster for namespace {{ $labels.namespace }} are too small, should be at least {{ printf "%.2f" $value }}GB. + |||, + }, + }, + { + alert: 'CortexProvisioningTooManyActiveSeries', + // 1 million active series per ingester max. + expr: ||| + avg by (cluster, namespace) (cortex_ingester_memory_series{job=~".+/ingester"}) > 1.1e6 + and + sum by (cluster, namespace) (rate(cortex_ingester_received_chunks{job=~".+/ingester"}[1h])) == 0 + |||, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Too many active series for ingesters in namespace {{ $labels.namespace }}, add more ingesters. + |||, + }, + }, + { + alert: 'CortexProvisioningTooManyWrites', + // 80k writes / s per ingester max. + expr: ||| + avg by (cluster,namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Too much write QPS for ingesters in namespace {{ $labels.namespace }}, add more ingesters. 
+ |||, + }, + }, + { + alert: 'CortexProvisioningTooMuchMemory', + expr: ||| + avg by (cluster, namespace) (container_memory_working_set_bytes{container_name="ingester" %s} / container_spec_memory_limit_bytes{container_name="ingester" %s}) > 0.7 + ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Too much memory being used by ingesters in namespace {{ $labels.namespace }}, add more ingesters. + |||, + }, + }, + ], + }, + { + name: 'memcached', + rules: [ + { + alert: 'MemcachedDown', + expr: ||| + memcached_up == 0 + |||, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Memcached Instance {{ $labels.instance }} is down for more than 15mins. + |||, + }, + }, + ], + }, + { + name: 'ruler_alerts', + rules: [ + { + alert: 'CortexRulerFailedEvaluations', + expr: ||| + sum(rate(cortex_prometheus_rule_evaluation_failures_total[1m])) by (namespace, job) + / + sum(rate(cortex_prometheus_rule_evaluations_total[1m])) by (namespace, job) + > 0.01 + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors. + |||, + }, + }, + { + alert: 'CortexRulerMissedEvaluations', + expr: ||| + sum(rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) by (namespace, job) + / + sum(rate(cortex_prometheus_rule_group_iterations_total[1m])) by (namespace, job) + > 0.01 + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% missed iterations. + |||, + }, + }, + ], + }, + { + name: 'gossip_alerts', + rules: [ + { + alert: 'CortexGossipMembersMismatch', + expr: ||| + memberlist_client_cluster_members_count{%s} + != on (cluster,namespace) group_left + sum(up{job=~".+/(distributor|ingester|querier)"}) by (cluster,namespace) + ||| % $.namespace_matcher(), + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.', + }, + }, + ], + }, + ], +} diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 49f960eded9..1b0e7b71103 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -22,5 +22,8 @@ // If Cortex is deployed as a single binary, set to true to // modify the job selectors in the dashboard queries. 
singleBinary: false, + + cortex_p99_latency_threshold_seconds: 2.5, + alert_namespace_matcher: '', }, } From be29cb945759d9f860a4e64dc4b198dbad47aa25 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Mon, 11 May 2020 14:04:58 -0400 Subject: [PATCH 051/364] add alert to ensure the ruler can successfully check ownership of its rule groups against the ring Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 367c1fb969c..b44e3bff7d1 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -423,6 +423,22 @@ |||, }, }, + { + alert: 'CortexRulerFailedRingCheck', + expr: ||| + sum(rate(cortex_ruler_ring_check_errors_total[5m]) by (namespace, job) + > 0 + |||, + 'for': '1m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors when checking the ring for rule group ownership. + |||, + }, + }, ], }, { From 25b33d1cd1d0add377773d7a7072760889c444c6 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Mon, 11 May 2020 14:11:29 -0400 Subject: [PATCH 052/364] add playbook Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/docs/playbooks.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 7be960930cf..4c5d3477845 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -68,4 +68,8 @@ This alert goes off when an ingester is marked as unhealthy. Check the ring web @todo ## MemcachedDown -@todo \ No newline at end of file +@todo + +## CortexRulerFailedRingCheck + +This alert occurs when a ruler is unable to validate whether or not it should claim ownership over the evaluation of a rule group. The most likely cause is that one of the rule ring entries is unhealthy. If this is the case, proceed to the ring admin HTTP page and forget the unhealthy ruler. The other possible cause would be an error returned by the ring client. If this is the case, look into debugging the ring based on the in-use backend implementation.
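As a first check, a query along these lines (reusing the metric from the alert expression above; label names such as instance may need adjusting to your environment) shows which ruler instances are reporting ring check errors:

# Sketch: per-instance breakdown of ruler ring check errors.
sum by (namespace, job, instance) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0

If a single instance dominates, inspect that ruler's ring admin page and forget any unhealthy entries; if errors are spread across all rulers, the ring backend itself is the more likely cause.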
\ No newline at end of file From e0663c3e870dca1a736faadb57c7f56c8e0ed170 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 May 2020 12:20:11 +0200 Subject: [PATCH 053/364] Added 'Writes Resources' and 'Reads Resources' dashboards Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards.libsonnet | 2 + .../dashboards/dashboard-utils.libsonnet | 38 +++++++++++ .../dashboards/reads-resources.libsonnet | 65 +++++++++++++++++++ .../dashboards/writes-resources.libsonnet | 59 +++++++++++++++++ 4 files changed, 164 insertions(+) create mode 100644 jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet create mode 100644 jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index a1ca1f27e65..ff52bb5ad0f 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -2,9 +2,11 @@ grafanaDashboards+: (import 'dashboards/queries.libsonnet') + (import 'dashboards/reads.libsonnet') + + (import 'dashboards/reads-resources.libsonnet') + (import 'dashboards/ruler.libsonnet') + (import 'dashboards/scaling.libsonnet') + (import 'dashboards/writes.libsonnet') + + (import 'dashboards/writes-resources.libsonnet') + (if std.setMember('tsdb', $._config.storage_engine) then import 'dashboards/compactor.libsonnet' diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index e084d546040..f9acb4ee6a2 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -107,6 +107,44 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, + containerCPUUsagePanel(title, containerName):: + $.panel(title) + + $.queryPanel([ + 'sum by(pod_name) (rate(container_cpu_usage_seconds_total{%s,container_name="%s"}[$__interval]))' % [$.namespaceMatcher(), containerName], + 'min(container_spec_cpu_quota{%s,container_name="%s"} / container_spec_cpu_period{%s,container_name="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], + ], ['{{pod_name}}', 'limit']) + + { + seriesOverrides: [ + { + alias: 'limit', + color: '#E02F44', + fill: 0, + }, + ], + }, + + containerMemoryWorkingSetPanel(title, containerName):: + $.panel(title) + + $.queryPanel([ + 'sum by(pod_name) (container_memory_working_set_bytes{%s,container_name="%s"})' % [$.namespaceMatcher(), containerName], + 'min(container_spec_memory_limit_bytes{%s,container_name="%s"} > 0)' % [$.namespaceMatcher(), containerName], + ], ['{{pod_name}}', 'limit']) + + { + seriesOverrides: [ + { + alias: 'limit', + color: '#E02F44', + fill: 0, + }, + ], + yaxes: $.yaxes('bytes'), + }, + + goHeapInUsePanel(title, jobName):: + $.panel(title) + + $.queryPanel('sum by(instance) (go_memstats_heap_inuse_bytes{%s})' % $.jobMatcher(jobName), '{{instance}}') + + { yaxes: $.yaxes('bytes') }, + // Switches a panel from lines (default) to bars. 
bars:: { bars: true, diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet new file mode 100644 index 00000000000..a41d86d1a77 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -0,0 +1,65 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-reads-resources.json': + $.dashboard('Cortex / Reads Resources') + .addClusterSelectorTemplates() + .addRow( + $.row('Gateway') + .addPanel( + $.containerCPUUsagePanel('CPU', 'cortex-gw'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'cortex-gw'), + ) + ) + .addRow( + $.row('Query Frontend') + .addPanel( + $.containerCPUUsagePanel('CPU', 'query-frontend'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'query-frontend'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'query-frontend'), + ) + ) + .addRow( + $.row('Querier') + .addPanel( + $.containerCPUUsagePanel('CPU', 'querier'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'querier'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'querier'), + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Store-gateway') + .addPanel( + $.containerCPUUsagePanel('CPU', 'store-gateway'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'store-gateway'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'store-gateway'), + ) + ) + { + templating+: { + list: [ + // Do not allow to include all clusters/namespaces otherwise this dashboard + // risks to explode because it shows resources per pod. 
+ l + (if (l.name == 'cluster' || l.name == 'namespace') then { includeAll: false } else {}) + for l in super.list + ], + }, + }, +} diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet new file mode 100644 index 00000000000..51c313b66e1 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -0,0 +1,59 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-writes-resources.json': + $.dashboard('Cortex / Writes Resources') + .addClusterSelectorTemplates() + .addRow( + $.row('Gateway') + .addPanel( + $.containerCPUUsagePanel('CPU', 'cortex-gw'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'cortex-gw'), + ) + ) + .addRow( + $.row('Distributor') + .addPanel( + $.containerCPUUsagePanel('CPU', 'distributor'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'distributor'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'distributor'), + ) + ) + .addRow( + $.row('Ingester') + .addPanel( + $.panel('In-memory series') + + $.queryPanel('sum by(instance) (cortex_ingester_memory_series{%s})' % $.jobMatcher('ingester'), '{{instance}}'), + ) + .addPanel( + $.containerCPUUsagePanel('CPU', 'ingester'), + ) + ) + .addRow( + $.row('') + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ingester'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), + ) + ) + { + templating+: { + list: [ + // Do not allow to include all clusters/namespaces otherwise this dashboard + // risks to explode because it shows resources per pod. + l + (if (l.name == 'cluster' || l.name == 'namespace') then { includeAll: false } else {}) + for l in super.list + ], + }, + }, +} From dc716442dd3c3ef036ee2b3f6b5f994a72f9f5c8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 May 2020 14:45:06 +0200 Subject: [PATCH 054/364] Added a config option to optionally enable resources dashboards Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/config.libsonnet | 3 +++ jsonnet/mimir-mixin/dashboards.libsonnet | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 1b0e7b71103..d13b2206da9 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -25,5 +25,8 @@ cortex_p99_latency_threshold_seconds: 2.5, alert_namespace_matcher: '', + + // Whether resources dashboards are enabled (based on cAdvisor metrics). 
+ resources_dashboards_enabled: false, }, } diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index ff52bb5ad0f..a93586f8ddd 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -2,11 +2,9 @@ grafanaDashboards+: (import 'dashboards/queries.libsonnet') + (import 'dashboards/reads.libsonnet') + - (import 'dashboards/reads-resources.libsonnet') + (import 'dashboards/ruler.libsonnet') + (import 'dashboards/scaling.libsonnet') + (import 'dashboards/writes.libsonnet') + - (import 'dashboards/writes-resources.libsonnet') + (if std.setMember('tsdb', $._config.storage_engine) then import 'dashboards/compactor.libsonnet' @@ -21,5 +19,9 @@ then import 'dashboards/comparison.libsonnet' else {}) + + (if !$._config.resources_dashboards_enabled then {} else + (import 'dashboards/reads-resources.libsonnet') + + (import 'dashboards/writes-resources.libsonnet')) + + { _config:: $._config }, } From 96ff1f8e2406f93920d67927051281d01881b11a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 May 2020 14:46:05 +0200 Subject: [PATCH 055/364] Fixed linter Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index a93586f8ddd..e43a0a0fd31 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -20,8 +20,8 @@ else {}) + (if !$._config.resources_dashboards_enabled then {} else - (import 'dashboards/reads-resources.libsonnet') + - (import 'dashboards/writes-resources.libsonnet')) + + (import 'dashboards/reads-resources.libsonnet') + + (import 'dashboards/writes-resources.libsonnet')) + { _config:: $._config }, } From b38cca5c61b4237ebafcd5fe967cf2b739a93e14 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 May 2020 15:29:26 +0200 Subject: [PATCH 056/364] Fixed ruler alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index b44e3bff7d1..d44b8bb7e1f 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -426,7 +426,7 @@ { alert: 'CortexRulerFailedRingCheck', expr: ||| - sum(rate(cortex_ruler_ring_check_errors_total[5m]) by (namespace, job) + sum(rate(cortex_ruler_ring_check_errors_total[5m])) by (namespace, job) > 0 |||, 'for': '1m', From da7d7bd1cf295ef906a8b1fcc97be47cbc8efff3 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 8 May 2020 15:54:46 +0200 Subject: [PATCH 057/364] Added blocks storage alerts Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts.libsonnet | 6 +++ jsonnet/mimir-mixin/alerts/blocks.libsonnet | 39 +++++++++++++++++++ .../mimir-mixin/alerts/compactor.libsonnet | 39 +++++++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 jsonnet/mimir-mixin/alerts/blocks.libsonnet create mode 100644 jsonnet/mimir-mixin/alerts/compactor.libsonnet diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 7d1867caf81..c6e52da8973 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -2,5 +2,11 @@ prometheusAlerts+:: (import 'alerts/alerts.libsonnet') + + (if std.setMember('tsdb', $._config.storage_engine) + then + (import 
'alerts/blocks.libsonnet') + + (import 'alerts/compactor.libsonnet') + else {}) + + { _config:: $._config }, } diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet new file mode 100644 index 00000000000..302d66e11c4 --- /dev/null +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -0,0 +1,39 @@ +(import 'alert-utils.libsonnet') { + groups+: [ + { + name: 'cortex_blocks_alerts', + rules: [ + { + // Alert if the ingester has not shipped any block in the last 4h. + alert: 'CortexIngesterHasNotShippedBlocks', + 'for': '15m', + expr: ||| + (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"%s} > 60 * 60 * 4) + and + (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"%s} > 0) + ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', + }, + }, + { + // Alert if the ingester has not shipped any block since start. + alert: 'CortexIngesterHasNotShippedBlocks', + 'for': '4h', + expr: ||| + thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"%s} == 0 + ||| % $.namespace_matcher(','), + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', + }, + }, + ] + } + ], +} diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet new file mode 100644 index 00000000000..a1ff4886e8b --- /dev/null +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -0,0 +1,39 @@ +(import 'alert-utils.libsonnet') { + groups+: [ + { + name: 'cortex_compactor_alerts', + rules: [ + { + // Alert if the compactor has not uploaded anything in the last 24h. + alert: 'CortexCompactorHasNotRun', + 'for': '15m', + expr: ||| + (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} > 60 * 60 * 24) + and + (thanos_objstore_bucket_last_successful_upload_time > 0) + ||| % $.namespace_matcher(','), + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded anything in the last 24 hours.', + }, + }, + { + // Alert if the compactor has not uploaded anything since its start. 
+ alert: 'CortexCompactorHasNotRunSinceStart', + 'for': '24h', + expr: ||| + thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} == 0 + ||| % $.namespace_matcher(','), + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded anything in the last 24 hours.', + }, + }, + ] + } + ], +} From 6dde08b982ddefd71f5bc8c4e5d06fa28d35dc05 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 8 May 2020 16:00:24 +0200 Subject: [PATCH 058/364] Fixed linter Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 4 ++-- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 302d66e11c4..5c09175f761 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -33,7 +33,7 @@ message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', }, }, - ] - } + ], + }, ], } diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index a1ff4886e8b..6bf24cc502d 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -33,7 +33,7 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded anything in the last 24 hours.', }, }, - ] - } + ], + }, ], } From 458e408693e1d314ea02e6757d24f8f9db2fa846 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 May 2020 12:46:14 +0200 Subject: [PATCH 059/364] Added more alerts Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 32 ++++++++++++++++ .../mimir-mixin/alerts/compactor.libsonnet | 38 +++++++++++++++++-- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 5c09175f761..915f08141a0 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -33,6 +33,38 @@ message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', }, }, + { + // Alert if the querier is not successfully scanning the bucket. + alert: 'CortexQuerierHasNotScanTheBucket', + 'for': '5m', + expr: ||| + (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds{%s} > 60 * 30) + and + cortex_querier_blocks_last_successful_scan_timestamp_seconds{%s} > 0 + ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.', + }, + }, + { + // Alert if the store-gateway is not successfully synching the bucket. 
+ alert: 'CortexStoreGatewayHasNotSyncTheBucket', + 'for': '5m', + expr: ||| + (time() - cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 60 * 30) + and + cortex_storegateway_blocks_last_successful_sync_timestamp_seconds > 0 + ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.', + }, + }, ], }, ], diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 6bf24cc502d..e4328133157 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -3,9 +3,39 @@ { name: 'cortex_compactor_alerts', rules: [ + { + // Alert if the compactor has not successfully completed a run in the last 24h. + alert: 'CortexCompactorHasNotSuccessfullyRun', + 'for': '15m', + expr: ||| + (time() - cortex_compactor_last_successful_run_timestamp_seconds{%s} > 60 * 60 * 24) + and + (cortex_compactor_last_successful_run_timestamp_seconds > 0) + ||| % $.namespace_matcher(','), + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully completed a run in the last 24 hours.', + }, + }, + { + // Alert if the compactor has not successfully completed a run since its start. + alert: 'CortexCompactorHasNotSuccessfullyRunSinceStart', + 'for': '24h', + expr: ||| + cortex_compactor_last_successful_run_timestamp_seconds{%s} == 0 + ||| % $.namespace_matcher(','), + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully completed a run in the last 24 hours.', + }, + }, { // Alert if the compactor has not uploaded anything in the last 24h. - alert: 'CortexCompactorHasNotRun', + alert: 'CortexCompactorHasNotUploadedBlocks', 'for': '15m', expr: ||| (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} > 60 * 60 * 24) @@ -16,12 +46,12 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded anything in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', }, }, { // Alert if the compactor has not uploaded anything since its start. 
- alert: 'CortexCompactorHasNotRunSinceStart', + alert: 'CortexCompactorHasNotUploadedBlocksSinceStart', 'for': '24h', expr: ||| thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} == 0 @@ -30,7 +60,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded anything in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', }, }, ], From 18bd96d1e070d8ecb2eacd9ba62480c3cc087c09 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 May 2020 12:48:47 +0200 Subject: [PATCH 060/364] Fixed alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 915f08141a0..978f7dc0b2b 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -56,7 +56,7 @@ expr: ||| (time() - cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 60 * 30) and - cortex_storegateway_blocks_last_successful_sync_timestamp_seconds > 0 + cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 0 ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], labels: { severity: 'critical', From 4ce207652a10325c939a86dd2fc37fb1836e798e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 May 2020 15:16:53 +0200 Subject: [PATCH 061/364] Fixed alerts Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index e4328133157..6c11f35cee9 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -10,8 +10,8 @@ expr: ||| (time() - cortex_compactor_last_successful_run_timestamp_seconds{%s} > 60 * 60 * 24) and - (cortex_compactor_last_successful_run_timestamp_seconds > 0) - ||| % $.namespace_matcher(','), + (cortex_compactor_last_successful_run_timestamp_seconds{%s} > 0) + ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], labels: { severity: 'critical', }, @@ -40,8 +40,8 @@ expr: ||| (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} > 60 * 60 * 24) and - (thanos_objstore_bucket_last_successful_upload_time > 0) - ||| % $.namespace_matcher(','), + (thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} > 0) + ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], labels: { severity: 'critical', }, From 8be3e290fb01f1728b032093e4105573bef7a3fa Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 May 2020 15:20:02 +0200 Subject: [PATCH 062/364] Fixed alerts Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 4 ++-- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 978f7dc0b2b..81dfe6b722e 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -41,7 +41,7 @@ (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds{%s} > 60 * 30) and cortex_querier_blocks_last_successful_scan_timestamp_seconds{%s} > 0 - 
||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + ||| % [$.namespace_matcher(''), $.namespace_matcher('')], labels: { severity: 'critical', }, @@ -57,7 +57,7 @@ (time() - cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 60 * 30) and cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 0 - ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + ||| % [$.namespace_matcher(''), $.namespace_matcher('')], labels: { severity: 'critical', }, diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 6c11f35cee9..2f2d1a17de8 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -11,7 +11,7 @@ (time() - cortex_compactor_last_successful_run_timestamp_seconds{%s} > 60 * 60 * 24) and (cortex_compactor_last_successful_run_timestamp_seconds{%s} > 0) - ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + ||| % [$.namespace_matcher(''), $.namespace_matcher('')], labels: { severity: 'critical', }, @@ -25,7 +25,7 @@ 'for': '24h', expr: ||| cortex_compactor_last_successful_run_timestamp_seconds{%s} == 0 - ||| % $.namespace_matcher(','), + ||| % $.namespace_matcher(''), labels: { severity: 'critical', }, From 17fbef27fe746985772facf34b047bec4ad4dd14 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 May 2020 15:47:10 +0200 Subject: [PATCH 063/364] Added memcached chunks caching (blocks storage) panel Signed-off-by: Marco Pracucci --- .../mimir-mixin/dashboards/reads.libsonnet | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index f0bc2d8de89..f422d26eb0e 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -114,6 +114,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') }, ) ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Memcached - Blocks Storage - Chunks') + .addPanel( + $.panel('QPS') + + $.queryPanel('sum by(operation) (rate(cortex_storegateway_thanos_memcached_operations_total{%s,name="chunks-cache"}[$__interval]))' % $.jobMatcher('store-gateway'), '{{operation}}') + + $.stack + + { yaxes: $.yaxes('ops') }, + ) + .addPanel( + $.panel('Latency (getmulti)') + + $.latencyPanel('cortex_storegateway_thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",name="chunks-cache"}' % $.jobMatcher('store-gateway')) + ) + .addPanel( + $.panel('Hit ratio') + + $.queryPanel('sum by(item_type) (rate(cortex_storegateway_thanos_cache_memcached_hits_total{%s,name="chunks-cache"}[$__interval])) / sum by(item_type) (rate(cortex_storegateway_thanos_cache_memcached_requests_total{%s,name="chunks-cache"}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], '{{item_type}}') + + { yaxes: $.yaxes('percentunit') }, + ) + ) .addRowIf( std.setMember('chunks', $._config.storage_engine) && std.setMember('cassandra', $._config.chunk_index_backend + $._config.chunk_store_backend), From d344f95e0a46882b4367f1198733e01a3dba6b4a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 May 2020 15:57:30 +0200 Subject: [PATCH 064/364] Fixed hit ratio query Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet 
b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index f422d26eb0e..9d410030f20 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -129,7 +129,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum by(item_type) (rate(cortex_storegateway_thanos_cache_memcached_hits_total{%s,name="chunks-cache"}[$__interval])) / sum by(item_type) (rate(cortex_storegateway_thanos_cache_memcached_requests_total{%s,name="chunks-cache"}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], '{{item_type}}') + + $.queryPanel('sum(rate(cortex_storegateway_thanos_cache_memcached_hits_total{%s,name="chunks-cache"}[$__interval])) / sum(rate(cortex_storegateway_thanos_cache_memcached_requests_total{%s,name="chunks-cache"}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], 'chunks') + { yaxes: $.yaxes('percentunit') }, ) ) From b6edf49b6b76e406eeaf98629a8df89edea9aba9 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 May 2020 16:12:23 +0200 Subject: [PATCH 065/364] Added playbooks for blocks storage alerts Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 2 +- jsonnet/mimir-mixin/docs/playbooks.md | 54 ++++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 81dfe6b722e..822a9bc38af 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -21,7 +21,7 @@ }, { // Alert if the ingester has not shipped any block since start. - alert: 'CortexIngesterHasNotShippedBlocks', + alert: 'CortexIngesterHasNotShippedBlocksSinceStart', 'for': '4h', expr: ||| thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"%s} == 0 diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 4c5d3477845..2a76525ccd3 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -72,4 +72,56 @@ This alert goes off when an ingester is marked as unhealthy. Check the ring web ## CortexRulerFailedRingCheck -This alert occurs when a ruler is unable to validate whether or not it should claim ownership over the evaluation of a rule group. The most likely cause is that one of the rule ring entries is unhealthy. If this is the case proceed to the ring admin http page and forget the unhealth ruler. The other possible cause would be an error returned the ring client. If this is the case look into debugging the ring based on the in-use backend implementation. \ No newline at end of file +This alert occurs when a ruler is unable to validate whether or not it should claim ownership over the evaluation of a rule group. The most likely cause is that one of the rule ring entries is unhealthy. If this is the case proceed to the ring admin http page and forget the unhealth ruler. The other possible cause would be an error returned the ring client. If this is the case look into debugging the ring based on the in-use backend implementation. + +## CortexIngesterHasNotShippedBlocks + +This alert fires when a Cortex ingester is not uploading any block to the long-term storage. An ingester is expected to upload a block to the storage every block range period (defaults to 2h) and if a longer time elapse since the last successful upload it means something is not working correctly. 
+ +How to investigate: +- Ensure the ingester is receiving write-path traffic (samples to ingest) +- Look for any upload error in the ingester logs (ie. networking or authentication issues) + +## CortexIngesterHasNotShippedBlocksSinceStart + +Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks). + +## CortexQuerierHasNotScanTheBucket + +This alert fixes when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new/deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time it may end up querying only a subset of blocks, thus leading to potentially partial results. + +How to investigate: +- Look for any scan error in the querier logs (ie. networking or rate limiting issues) + +## CortexStoreGatewayHasNotSyncTheBucket + +This alert fixes when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new/deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time it may end up querying only a subset of blocks, thus leading to potentially partial results. + +How to investigate: +- Look for any scan error in the store-gateway logs (ie. networking or rate limiting issues) + +## CortexCompactorHasNotSuccessfullyRun + +This alert fires when a Cortex compactor is not successfully completing a compaction run since a long time. + +How to investigate: +- Ensure the compactor is not crashing during compaction (ie. `OOMKilled`) +- Look for any error in the compactor logs + +## CortexCompactorHasNotSuccessfullyRunSinceStart + +Same as [`CortexCompactorHasNotSuccessfullyRun`](#CortexCompactorHasNotSuccessfullyRun). + +## CortexCompactorHasNotUploadedBlocks + +This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. + +How to investigate: +- If the alert `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first +- If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first +- Ensure ingesters are successfully shipping blocks to the storage +- Look for any error in the compactor logs + +## CortexCompactorHasNotUploadedBlocksSinceStart + +Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). From 195047a2eac3dfe0da9a7cd808e99a5f0bc17979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 14 May 2020 16:16:24 +0200 Subject: [PATCH 066/364] Update playbooks.md --- jsonnet/mimir-mixin/docs/playbooks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 2a76525ccd3..dd8678048ef 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -88,14 +88,14 @@ Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks ## CortexQuerierHasNotScanTheBucket -This alert fixes when a Cortex querier is not successfully scanning blocks in the storage (bucket). 
A querier is expected to periodically iterate the bucket to find new/deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time it may end up querying only a subset of blocks, thus leading to potentially partial results. +This alert fires when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new/deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time it may end up querying only a subset of blocks, thus leading to potentially partial results. How to investigate: - Look for any scan error in the querier logs (ie. networking or rate limiting issues) ## CortexStoreGatewayHasNotSyncTheBucket -This alert fixes when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new/deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time it may end up querying only a subset of blocks, thus leading to potentially partial results. +This alert fixes when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. How to investigate: - Look for any scan error in the store-gateway logs (ie. networking or rate limiting issues) From 704b11e0cd826beab4ead45d4c5517af39518517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 14 May 2020 16:17:28 +0200 Subject: [PATCH 067/364] Update playbooks.md --- jsonnet/mimir-mixin/docs/playbooks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index dd8678048ef..bac54ebe9cd 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -88,14 +88,14 @@ Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks ## CortexQuerierHasNotScanTheBucket -This alert fires when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new/deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time it may end up querying only a subset of blocks, thus leading to potentially partial results. +This alert fires when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. How to investigate: - Look for any scan error in the querier logs (ie. networking or rate limiting issues) ## CortexStoreGatewayHasNotSyncTheBucket -This alert fixes when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). 
A store-gateway is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. +This alert fires when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. How to investigate: - Look for any scan error in the store-gateway logs (ie. networking or rate limiting issues) From e67b1f773a9aca102d055764c4e4de30f9fd1c45 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 25 May 2020 11:01:48 +0200 Subject: [PATCH 068/364] Added compactor resources dashboard Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards.libsonnet | 1 + .../dashboards/compactor-resources.libsonnet | 70 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 7d2b89c0ac0..3eacaeb6017 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -9,6 +9,7 @@ (if std.setMember('tsdb', $._config.storage_engine) then (import 'dashboards/compactor.libsonnet') + + (import 'dashboards/compactor-resources.libsonnet') + (import 'dashboards/object-store.libsonnet') else {}) + diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet new file mode 100644 index 00000000000..127363013aa --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -0,0 +1,70 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-compactor-resources.json': + local filterNodeDiskByCompactor = ||| + ignoring(pod_name) group_right() (label_replace(count by(pod_name, instance, device) (container_fs_writes_bytes_total{%s,container="compactor",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ||| % $.namespaceMatcher(); + + $.dashboard('Cortex / Compactor Resources') + .addClusterSelectorTemplates() + .addRow( + $.row('CPU and Memory') + .addPanel( + $.containerCPUUsagePanel('CPU', 'compactor'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'compactor'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'compactor'), + ) + ) + .addRow( + $.row('Network') + .addPanel( + $.panel('Receive Bandwidth') + + $.queryPanel('sum by(pod_name) (rate(container_network_receive_bytes_total{%s,pod_name=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod_name}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Transmit Bandwidth') + + $.queryPanel('sum by(pod_name) (rate(container_network_transmit_bytes_total{%s,pod_name=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod_name}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + ) + .addRow( + $.row('Disk') + .addPanel( + $.panel('Writes') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod_name}} - {{device}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + 
.addPanel( + $.panel('Reads') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod_name}} - {{device}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Utilization') + + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="compactor"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + + { yaxes: $.yaxes('percentunit') }, + ) + ) + { + templating+: { + list: [ + // Do not allow to include all clusters/namespaces otherwise this dashboard + // risks to explode because it shows resources per pod. + l + (if (l.name == 'cluster' || l.name == 'namespace') then { includeAll: false } else {}) + for l in super.list + ], + }, + }, +} From b8d72acb65026ceaba608972340aac6a4fd5e27f Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 25 May 2020 11:09:50 +0200 Subject: [PATCH 069/364] Renamed panel Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index 127363013aa..043a33374d2 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -53,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Utilization') + + $.panel('Disk Space Utilization') + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="compactor"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + { yaxes: $.yaxes('percentunit') }, ) From e4d4a3e9f46db6247cace13ed53a21d294039b4c Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 26 May 2020 15:39:14 -0400 Subject: [PATCH 070/364] split the cortex recording rules into smaller groups Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/recording_rules.libsonnet | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index acdd35e4a9d..529467fa2a1 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -4,22 +4,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; prometheusRules+:: { groups+: [ { - name: 'cortex_rules', + name: 'cortex_api', rules: utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job']) + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job', 'route']) + - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']) + + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), + }, + { + name: 'cortex_cache', + rules: utils.histogramRules('cortex_memcache_request_duration_seconds', ['cluster', 'job', 'method']) + utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 
'job', 'method']) + + utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job', 'method']) + }, + { + name: 'cortex_chunk_store', + rules: utils.histogramRules('cortex_bigtable_request_duration_seconds', ['cluster', 'job', 'operation']) + utils.histogramRules('cortex_cassandra_request_duration_seconds', ['cluster', 'job', 'operation']) + utils.histogramRules('cortex_dynamo_request_duration_seconds', ['cluster', 'job', 'operation']) + - utils.histogramRules('cortex_query_frontend_retries', ['cluster', 'job']) + - utils.histogramRules('cortex_query_frontend_queue_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_series', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_chunks', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_samples', ['cluster', 'job']) + utils.histogramRules('cortex_chunk_store_index_lookups_per_query', ['cluster', 'job']) + utils.histogramRules('cortex_chunk_store_series_pre_intersection_per_query', ['cluster', 'job']) + utils.histogramRules('cortex_chunk_store_series_post_intersection_per_query', ['cluster', 'job']) + @@ -28,6 +31,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), }, + { + name: 'cortex_queries', + rules: + utils.histogramRules('cortex_query_frontend_retries', ['cluster', 'job']) + + utils.histogramRules('cortex_query_frontend_queue_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_series', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_chunks', ['cluster', 'job']) + + utils.histogramRules('cortex_ingester_queried_samples', ['cluster', 'job']) + }, { name: 'cortex_received_samples', rules: [ From b641f5f9332fddebbae4815d69123fcbf4ada175 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 26 May 2020 18:32:54 -0400 Subject: [PATCH 071/364] format jsonnet Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/recording_rules.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 529467fa2a1..34d18eed0fc 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -15,7 +15,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; rules: utils.histogramRules('cortex_memcache_request_duration_seconds', ['cluster', 'job', 'method']) + utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job', 'method']) + utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job', 'method']), }, { name: 'cortex_chunk_store', @@ -33,12 +33,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { name: 'cortex_queries', - rules: + rules: utils.histogramRules('cortex_query_frontend_retries', ['cluster', 'job']) + utils.histogramRules('cortex_query_frontend_queue_duration_seconds', ['cluster', 'job']) + utils.histogramRules('cortex_ingester_queried_series', ['cluster', 'job']) + utils.histogramRules('cortex_ingester_queried_chunks', ['cluster', 'job']) + - utils.histogramRules('cortex_ingester_queried_samples', ['cluster', 'job']) + utils.histogramRules('cortex_ingester_queried_samples', ['cluster', 'job']), }, { name: 
'cortex_received_samples', From 24d64235edb1842e482c305d66758c9f33ada9ae Mon Sep 17 00:00:00 2001 From: Weifeng Wang Date: Wed, 27 May 2020 17:09:56 +0800 Subject: [PATCH 072/364] Fix typo Signed-off-by: Weifeng Wang --- jsonnet/mimir-mixin/dashboards/chunks.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet index e4d349323f8..642ab3427a3 100644 --- a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Entries') + - $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m])' % [$.jobMatcher('ingester'), $.jobMatcher('ingester')], 'entries'), + $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [$.jobMatcher('ingester'), $.jobMatcher('ingester')], 'entries'), ), ) .addRow( From d52b2628b5dffd47b069ba4f8d6e1b5105035045 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Mon, 25 May 2020 13:22:55 -0400 Subject: [PATCH 073/364] make job names configurable Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/config.libsonnet | 10 +++ .../mimir-mixin/dashboards/chunks.libsonnet | 32 +++++----- .../mimir-mixin/dashboards/queries.libsonnet | 64 +++++++++---------- .../mimir-mixin/dashboards/reads.libsonnet | 60 ++++++++--------- .../dashboards/writes-resources.libsonnet | 2 +- .../mimir-mixin/dashboards/writes.libsonnet | 54 ++++++++-------- 6 files changed, 116 insertions(+), 106 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index d13b2206da9..73f59c54214 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -23,6 +23,16 @@ // modify the job selectors in the dashboard queries. 
singleBinary: false, + job_names: { + ingester: 'ingester', + distributor: 'distributor', + querier: 'querier', + query_frontend: 'query-frontend', + table_manager: 'table-manager', + store_gateway: 'store-gateway', + gateway: 'cortex-gw', + }, + cortex_p99_latency_threshold_seconds: 2.5, alert_namespace_matcher: '', diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet index 642ab3427a3..15b6426cf83 100644 --- a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -8,46 +8,46 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Active Series / Chunks') .addPanel( $.panel('Series') + - $.queryPanel('sum(cortex_ingester_memory_series{%s})' % $.jobMatcher('ingester'), 'series'), + $.queryPanel('sum(cortex_ingester_memory_series{%s})' % $.jobMatcher($._config.job_names.ingester), 'series'), ) .addPanel( $.panel('Chunks per series') + - $.queryPanel('sum(cortex_ingester_memory_chunks{%s}) / sum(cortex_ingester_memory_series{%s})' % [$.jobMatcher('ingester'), $.jobMatcher('ingester')], 'chunks'), + $.queryPanel('sum(cortex_ingester_memory_chunks{%s}) / sum(cortex_ingester_memory_series{%s})' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'chunks'), ) ) .addRow( $.row('Flush Stats') .addPanel( $.panel('Utilization') + - $.latencyPanel('cortex_ingester_chunk_utilization', '{%s}' % $.jobMatcher('ingester'), multiplier='1') + + $.latencyPanel('cortex_ingester_chunk_utilization', '{%s}' % $.jobMatcher($._config.job_names.ingester), multiplier='1') + { yaxes: $.yaxes('percentunit') }, ) .addPanel( $.panel('Age') + - $.latencyPanel('cortex_ingester_chunk_age_seconds', '{%s}' % $.jobMatcher('ingester')), + $.latencyPanel('cortex_ingester_chunk_age_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)), ), ) .addRow( $.row('Flush Stats') .addPanel( $.panel('Size') + - $.latencyPanel('cortex_ingester_chunk_length', '{%s}' % $.jobMatcher('ingester'), multiplier='1') + + $.latencyPanel('cortex_ingester_chunk_length', '{%s}' % $.jobMatcher($._config.job_names.ingester), multiplier='1') + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Entries') + - $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [$.jobMatcher('ingester'), $.jobMatcher('ingester')], 'entries'), + $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'entries'), ), ) .addRow( $.row('Flush Stats') .addPanel( $.panel('Queue Length') + - $.queryPanel('cortex_ingester_flush_queue_length{%s}' % $.jobMatcher('ingester'), '{{instance}}'), + $.queryPanel('cortex_ingester_flush_queue_length{%s}' % $.jobMatcher($._config.job_names.ingester), '{{instance}}'), ) .addPanel( $.panel('Flush Rate') + - $.qpsPanel('cortex_ingester_chunk_age_seconds_count{%s}' % $.jobMatcher('ingester')), + $.qpsPanel('cortex_ingester_chunk_age_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)), ), ), @@ -58,7 +58,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Bytes Logged (WAL+Checkpoint) / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval])) + 
avg(rate(cortex_ingester_checkpoint_logged_bytes_total{%(m)s}[$__interval]))' % { m: $.jobMatcher('ingester') }, 'bytes') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval])) + avg(rate(cortex_ingester_checkpoint_logged_bytes_total{%(m)s}[$__interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + { yaxes: $.yaxes('bytes') }, ) ) @@ -66,16 +66,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('WAL') .addPanel( $.panel('Records logged / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_records_logged_total{%s}[$__interval]))' % $.jobMatcher('ingester'), 'records'), + $.queryPanel('avg(rate(cortex_ingester_wal_records_logged_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), 'records'), ) .addPanel( $.panel('Bytes per record') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval]) / rate(cortex_ingester_wal_records_logged_total{%(m)s}[$__interval]))' % { m: $.jobMatcher('ingester') }, 'bytes') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval]) / rate(cortex_ingester_wal_records_logged_total{%(m)s}[$__interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + { yaxes: $.yaxes('bytes') }, ) .addPanel( $.panel('Bytes per sample') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval]) / rate(cortex_ingester_ingested_samples_total{%(m)s}[$__interval]))' % { m: $.jobMatcher('ingester') }, 'bytes') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval]) / rate(cortex_ingester_ingested_samples_total{%(m)s}[$__interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + { yaxes: $.yaxes('bytes') }, ) .addPanel( @@ -88,13 +88,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Checkpoint') .addPanel( $.panel('Checkpoint creation/deletion / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__interval])' % $.jobMatcher('ingester'), '{{instance}}-creation') + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__interval])' % $.jobMatcher('ingester'), '{{instance}}-deletion'), + $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-creation') + + $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-deletion'), ) .addPanel( $.panel('Checkpoint creation/deletion failed / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__interval])' % $.jobMatcher('ingester'), '{{instance}}-creation') + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__interval])' % $.jobMatcher('ingester'), '{{instance}}-deletion'), + $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-creation') + + $.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-deletion'), ) ), } diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 85c68e4429a..91b039f7639 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -9,60 +9,60 @@ local utils = import 'mixin-utils/utils.libsonnet'; 
$.row('Query Frontend') .addPanel( $.panel('Queue Duration') + - $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher('query-frontend')), + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_frontend)), ) .addPanel( $.panel('Retries') + - $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher('query-frontend'), multiplier=1) + + $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Queue Length') + - $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher('query-frontend'), '{{cluster}} / {{namespace}} / {{instance}}'), + $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{instance}}'), ) ) .addRow( $.row('Query Frontend - Results Cache') .addPanel( $.panel('Cache Hit %') + - $.queryPanel('sum(rate(cortex_cache_hits{%s}[1m])) / sum(rate(cortex_cache_fetched_keys{%s}[1m]))' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'Hit Rate') + + $.queryPanel('sum(rate(cortex_cache_hits{%s}[1m])) / sum(rate(cortex_cache_fetched_keys{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( $.panel('Cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s}[1m])) - sum(rate(cortex_cache_hits{%s}[1m]))' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'Miss Rate'), + $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s}[1m])) - sum(rate(cortex_cache_hits{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), ) ) .addRow( $.row('Query Frontend - Sharding/Splitting') .addPanel( $.panel('Intervals per Query') + - $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'partition rate'), + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'partition rate'), ) .addPanel( $.panel('Sharded Queries %') + - $.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{%s}[1m])) / sum(rate(cortex_frontend_split_queries_total{%s}[1m])) * 100' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'shard rate'), + $.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{%s}[1m])) / sum(rate(cortex_frontend_split_queries_total{%s}[1m])) * 100' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'shard rate'), ) .addPanel( $.panel('Sharding factor') + - $.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{%s}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{%s}[1m]))' % [$.jobMatcher('query-frontend'), $.jobMatcher('query-frontend')], 'Average'), + $.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{%s}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), 
$.jobMatcher($._config.job_names.query_frontend)], 'Average'), ) ) .addRow( $.row('Querier') .addPanel( $.panel('Stages') + - $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",%s}) * 1e3' % $.jobMatcher('querier'), '{{slice}}') + + $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",%s}) * 1e3' % $.jobMatcher($._config.job_names.querier), '{{slice}}') + { yaxes: $.yaxes('ms') } + $.stack, ) .addPanel( $.panel('Chunk cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%s,name="chunksmemcache"}[1m]))' % [$.jobMatcher('querier'), $.jobMatcher('querier')], 'Hit rate'), + $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%s,name="chunksmemcache"}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Hit rate'), ) .addPanel( $.panel('Chunk cache corruptions') + - $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher('querier'), 'Corrupt chunks'), + $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'Corrupt chunks'), ) ) .addRowIf( @@ -70,33 +70,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Querier - Chunks storage - Index Cache') .addPanel( $.panel('Total entries') + - $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher('querier'), $.jobMatcher('querier')], 'Entries'), + $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Entries'), ) .addPanel( $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher('querier'), $.jobMatcher('querier'), $.jobMatcher('querier')], 'hit rate') + $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'hit rate') { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( $.panel('Churn Rate') + - $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher('querier'), 'churn rate'), + $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'churn rate'), ) ) .addRow( $.row('Ingester') .addPanel( $.panel('Series per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector('ingester'), multiplier=1) + + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( 
$.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector('ingester'), multiplier=1) + + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Samples per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector('ingester'), multiplier=1) + + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1) + { yaxes: $.yaxes('short') }, ) ) @@ -105,22 +105,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Querier - Chunks storage - Store') .addPanel( $.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector('querier'), multiplier=1) + + utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Series (pre-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector('querier'), multiplier=1) + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector('querier'), multiplier=1) + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + { yaxes: $.yaxes('short') }, ) .addPanel( $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector('querier'), multiplier=1) + + utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + { yaxes: $.yaxes('short') }, ) ) @@ -129,18 +129,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Store-gateway - Blocks') .addPanel( $.panel('Blocks queried / sec') + - $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_blocks_queried_sum{%s}[$__interval]))' % $.jobMatcher('store-gateway'), 'blocks') + + $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_blocks_queried_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Data fetched / sec') + - $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_fetched_sum{%s}[$__interval]))' % $.jobMatcher('store-gateway'), '{{data_type}}') + + $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_fetched_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Data touched / sec') + - $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_touched_sum{%s}[$__interval]))' % $.jobMatcher('store-gateway'), '{{data_type}}') + + $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_touched_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) @@ -150,15 +150,15 @@ local utils = 
import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Series fetch duration (per request)') + - $.latencyPanel('cortex_storegateway_bucket_store_series_get_all_duration_seconds', '{%s}' % $.jobMatcher('store-gateway')), + $.latencyPanel('cortex_storegateway_bucket_store_series_get_all_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( $.panel('Series merge duration (per request)') + - $.latencyPanel('cortex_storegateway_bucket_store_series_merge_duration_seconds', '{%s}' % $.jobMatcher('store-gateway')), + $.latencyPanel('cortex_storegateway_bucket_store_series_merge_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( $.panel('Series returned (per request)') + - $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_result_series_sum{%s}[$__interval])) / sum(rate(cortex_storegateway_bucket_store_series_result_series_count{%s}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], 'avg series returned'), + $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_result_series_sum{%s}[$__interval])) / sum(rate(cortex_storegateway_bucket_store_series_result_series_count{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), ) ) .addRowIf( @@ -166,20 +166,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Blocks currently loaded') + - $.queryPanel('cortex_storegateway_bucket_store_blocks_loaded{%s}' % $.jobMatcher('store-gateway'), '{{instance}}') + $.queryPanel('cortex_storegateway_bucket_store_blocks_loaded{%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{instance}}') ) .addPanel( $.successFailurePanel( 'Blocks loaded / sec', - 'sum(rate(cortex_storegateway_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], - 'sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.jobMatcher('store-gateway'), + 'sum(rate(cortex_storegateway_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], + 'sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) .addPanel( $.successFailurePanel( 'Blocks dropped / sec', - 'sum(rate(cortex_storegateway_bucket_store_block_drops_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], - 'sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.jobMatcher('store-gateway'), + 'sum(rate(cortex_storegateway_bucket_store_block_drops_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], + 'sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) ), diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet 
b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 9d410030f20..e39ad6dc5e6 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -8,55 +8,55 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher('cortex-gw')) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('cortex-gw') + [utils.selector.re('route', 'api_prom_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) ) .addRow( $.row('Query Frontend') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher('query-frontend')) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('query-frontend') + [utils.selector.re('route', 'api_prom_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) ) .addRow( $.row('Cache - Query Results') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s}' % $.jobMatcher('query-frontend')) + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector('query-frontend')) + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend)) ) ) .addRow( $.row('Querier') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('querier') + [utils.selector.re('route', 'api_prom_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) ) .addRow( $.row('Ingester') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher('ingester')) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.re('route', 
'/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) ) .addRowIf( @@ -64,11 +64,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Store-gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher('store-gateway')) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('store-gateway') + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) ) ) .addRowIf( @@ -76,11 +76,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Chunks storage - Index') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) ) ) .addRowIf( @@ -88,11 +88,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Chunks storage - Chunks') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('method', 'chunksmemcache.fetch')]) + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('method', 'chunksmemcache.fetch')]) ) ) .addRowIf( @@ -100,17 +100,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Blocks Storage - Index header') .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds_count{%s}[$__interval]))' % $.jobMatcher('store-gateway'), '{{operation}}') + + $.queryPanel('sum by(operation) (rate(cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds_count{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Latency (getmulti)') + - 
$.latencyPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % $.jobMatcher('store-gateway')) + $.latencyPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_hits_total{%s}[$__interval])) / sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_requests_total{%s}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], '{{item_type}}') + + $.queryPanel('sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_hits_total{%s}[$__interval])) / sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_requests_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') + { yaxes: $.yaxes('percentunit') }, ) ) @@ -119,17 +119,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Blocks Storage - Chunks') .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(cortex_storegateway_thanos_memcached_operations_total{%s,name="chunks-cache"}[$__interval]))' % $.jobMatcher('store-gateway'), '{{operation}}') + + $.queryPanel('sum by(operation) (rate(cortex_storegateway_thanos_memcached_operations_total{%s,name="chunks-cache"}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Latency (getmulti)') + - $.latencyPanel('cortex_storegateway_thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",name="chunks-cache"}' % $.jobMatcher('store-gateway')) + $.latencyPanel('cortex_storegateway_thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",name="chunks-cache"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum(rate(cortex_storegateway_thanos_cache_memcached_hits_total{%s,name="chunks-cache"}[$__interval])) / sum(rate(cortex_storegateway_thanos_cache_memcached_requests_total{%s,name="chunks-cache"}[$__interval]))' % [$.jobMatcher('store-gateway'), $.jobMatcher('store-gateway')], 'chunks') + + $.queryPanel('sum(rate(cortex_storegateway_thanos_cache_memcached_hits_total{%s,name="chunks-cache"}[$__interval])) / sum(rate(cortex_storegateway_thanos_cache_memcached_requests_total{%s,name="chunks-cache"}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'chunks') + { yaxes: $.yaxes('percentunit') }, ) ) @@ -139,11 +139,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Cassandra') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', 'SELECT')]) + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'SELECT')]) ) ) .addRowIf( @@ -152,11 +152,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('BigTable') .addPanel( 
$.panel('QPS') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) ), ) .addRowIf( @@ -165,11 +165,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('DynamoDB') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', 'DynamoDB.QueryPages')]) + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'DynamoDB.QueryPages')]) ), ) .addRowIf( @@ -178,11 +178,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('GCS') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher('querier')) + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector('querier') + [utils.selector.eq('operation', 'GET')]) + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'GET')]) ) ) // Object store metrics for the store-gateway. 
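The refactor in this patch replaces hard-coded job names ('querier', 'store-gateway', ...) with lookups through $._config.job_names, so $.jobMatcher() and $.jobSelector() resolve to whatever job labels a given deployment actually scrapes. A minimal sketch of how a deployment might override them, assuming the mixin follows the usual jsonnet _config+:: override pattern; the values below are illustrative placeholders, not the mixin's defaults:

  (import 'mixin.libsonnet') + {
    _config+:: {
      job_names+: {
        // Set these to the job names your scrape config produces.
        ingester: 'ingester',
        querier: 'querier',
        store_gateway: 'store-gateway',
      },
    },
  }

Because this is an ordinary jsonnet object merge, any job not listed keeps the mixin's default.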
diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index 51c313b66e1..ccdc966f9e5 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -32,7 +32,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester') .addPanel( $.panel('In-memory series') + - $.queryPanel('sum by(instance) (cortex_ingester_memory_series{%s})' % $.jobMatcher('ingester'), '{{instance}}'), + $.queryPanel('sum by(instance) (cortex_ingester_memory_series{%s})' % $.jobMatcher($._config.job_names.ingester), '{{instance}}'), ) .addPanel( $.containerCPUUsagePanel('CPU', 'ingester'), diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 64a6a6d5a75..4b7f0508d49 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -12,7 +12,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Samples / s') + - $.statPanel('sum(cluster_namespace_job:cortex_distributor_received_samples:rate5m{%s})' % $.jobMatcher('distributor'), format='reqps') + $.statPanel('sum(cluster_namespace_job:cortex_distributor_received_samples:rate5m{%s})' % $.jobMatcher($._config.job_names.distributor), format='reqps') ) .addPanel( $.panel('Active Series') + @@ -21,68 +21,68 @@ local utils = import 'mixin-utils/utils.libsonnet'; / on(namespace) group_left max by (namespace) (cortex_distributor_replication_factor{%(distributor)s})) ||| % { - ingester: $.jobMatcher('ingester'), - distributor: $.jobMatcher('distributor'), + ingester: $.jobMatcher($._config.job_names.ingester), + distributor: $.jobMatcher($._config.job_names.distributor), }, format='short') ) .addPanel( $.panel('QPS') + - $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route="api_prom_push"}[5m]))' % $.jobMatcher('cortex-gw'), format='reqps') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route="api_prom_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route="api_prom_push"}' % $.jobMatcher('cortex-gw')) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route="api_prom_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('cortex-gw') + [utils.selector.eq('route', 'api_prom_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.eq('route', 'api_prom_push')]) ) ) .addRow( $.row('Distributor') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/httpgrpc.*|api_prom_push"}' % $.jobMatcher('distributor')) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/httpgrpc.*|api_prom_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('distributor') + [utils.selector.re('route', '/httpgrpc.*|api_prom_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/httpgrpc.*|api_prom_push')]) ) ) .addRow( $.row('KV Store (HA Dedupe)') .addPanel( 
$.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher('distributor')) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector('distributor')) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) ) ) .addRow( $.row('Ingester') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher('ingester')) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('route', '/cortex.Ingester/Push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) ) .addRow( $.row('KV Store (Ring)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher('ingester')) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector('ingester')) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) ) ) .addRowIf( @@ -90,11 +90,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher('ingester')) + $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('method', 'Memcache.Put')]) + utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('method', 'Memcache.Put')]) ) ) .addRowIf( @@ -103,11 +103,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Cassandra') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher('ingester')) + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('operation', 'INSERT')]) + utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'INSERT')]) ) ) .addRowIf( @@ -116,11 +116,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('BigTable') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher('ingester')) + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % 
$.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) + utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) ) ) .addRowIf( @@ -129,11 +129,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('DynamoDB') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher('ingester')) + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) + utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) ) ) .addRowIf( @@ -142,11 +142,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('GCS') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher('ingester')) + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector('ingester') + [utils.selector.eq('operation', 'POST')]) + utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'POST')]) ) ) .addRowIf( @@ -155,13 +155,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.successFailurePanel( 'Uploaded blocks / sec', - 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % [$.jobMatcher('ingester'), $.jobMatcher('ingester')], - 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % $.jobMatcher('ingester'), + 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), ), ) .addPanel( $.panel('Upload latency') + - $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher('ingester')), + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)), ) ), } From 9cb12440a11c19f5210beed6134cdd6975fa158e Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Wed, 27 May 2020 15:03:05 -0400 Subject: [PATCH 074/364] Change `cortex_chunk_store` to `cortex_storage` Co-authored-by: gotjosh --- jsonnet/mimir-mixin/recording_rules.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet 
b/jsonnet/mimir-mixin/recording_rules.libsonnet index 34d18eed0fc..73bdbcac8dc 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -18,7 +18,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.histogramRules('cortex_cache_request_duration_seconds', ['cluster', 'job', 'method']), }, { - name: 'cortex_chunk_store', + name: 'cortex_storage', rules: utils.histogramRules('cortex_bigtable_request_duration_seconds', ['cluster', 'job', 'operation']) + utils.histogramRules('cortex_cassandra_request_duration_seconds', ['cluster', 'job', 'operation']) + From 97028bda9e7f4d742d311adb02810cb84358c512 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 4 Jun 2020 11:37:59 +0530 Subject: [PATCH 075/364] Make alert name consistent with others (https://github.com/grafana/cortex-jsonnet/pull/48) Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index d44b8bb7e1f..473e7b8911d 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -240,7 +240,7 @@ }, { // 2 or more failed checkpoint creation in 1h means something is wrong. - alert: 'CortexCheckpointCreationFailing', + alert: 'CortexCheckpointCreationFailed', expr: ||| increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 |||, From f0340a51af2206863e4228f5e70bd11cdd037fe8 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Thu, 4 Jun 2020 09:58:26 -0400 Subject: [PATCH 076/364] adjust recording rules to use kv_name (https://github.com/grafana/cortex-jsonnet/pull/86) Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 8 ++++---- jsonnet/mimir-mixin/recording_rules.libsonnet | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 4b7f0508d49..22634d1b430 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -56,11 +56,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('KV Store (HA Dedupe)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s, kv_name="distributor-hatracker"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.eq('kv_name', 'distributor-hatracker')]) ) ) .addRow( @@ -78,11 +78,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('KV Store (Ring)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s, kv_name="ingester-lifecycler"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) + 
utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('kv_name', 'ingester-lifecycler')]) ) ) .addRowIf( diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 73bdbcac8dc..84cfc01dfa6 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -29,7 +29,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.histogramRules('cortex_chunk_store_chunks_per_query', ['cluster', 'job']) + utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) + utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + - utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), + utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job', 'kv_name']), }, { name: 'cortex_queries', From 1f5f1bb4fa88712f5b91e61823d8830eb27e3717 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 4 Jun 2020 17:11:25 +0200 Subject: [PATCH 077/364] Improved Cortex blocks compactor alerts and dashboard Signed-off-by: Marco Pracucci --- .../mimir-mixin/alerts/compactor.libsonnet | 18 ++++++++-------- .../dashboards/compactor.libsonnet | 21 ++----------------- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 2f2d1a17de8..c638e10f8d8 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -4,33 +4,33 @@ name: 'cortex_compactor_alerts', rules: [ { - // Alert if the compactor has not successfully completed a run in the last 24h. - alert: 'CortexCompactorHasNotSuccessfullyRun', + // Alert if the compactor has not successfully cleaned up blocks in the last 24h. + alert: 'CortexCompactorHasNotSuccessfullyCleanedUpBlocks', 'for': '15m', expr: ||| - (time() - cortex_compactor_last_successful_run_timestamp_seconds{%s} > 60 * 60 * 24) + (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds{%s} > 60 * 60 * 24) and - (cortex_compactor_last_successful_run_timestamp_seconds{%s} > 0) + (cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds{%s} > 0) ||| % [$.namespace_matcher(''), $.namespace_matcher('')], labels: { severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully completed a run in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.', }, }, { - // Alert if the compactor has not successfully completed a run since its start. - alert: 'CortexCompactorHasNotSuccessfullyRunSinceStart', + // Alert if the compactor has not successfully cleaned up blocks since its start. 
+ alert: 'CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart', 'for': '24h', expr: ||| - cortex_compactor_last_successful_run_timestamp_seconds{%s} == 0 + cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds{%s} == 0 ||| % $.namespace_matcher(''), labels: { severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully completed a run in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.', }, }, { diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 2327b4d1e34..6ae2f28e1fb 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -9,7 +9,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.textPanel('', ||| - **Per-instance runs**: number of times a compactor instance triggers a compaction across all tenants its shard manage. - - **Per-tenant runs**: number of times a compactor instance triggers the compaction for a single tenant's blocks. + - **Compacted blocks**: number of blocks generated as a result of a compaction operation. + - **Per-block compaction duration**: time taken to generate a single compacted block. |||), ) .addPanel( @@ -22,24 +23,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.bars + { yaxes: $.yaxes('ops') }, ) - .addPanel( - $.successFailurePanel( - 'Per-tenant runs / sec', - 'sum(rate(cortex_compactor_group_compactions_total{%s}[$__interval])) - sum(rate(cortex_compactor_group_compactions_failures_total{%s}[$__interval]))' % [$.jobMatcher('compactor'), $.jobMatcher('compactor')], - 'sum(rate(cortex_compactor_group_compactions_failures_total{%s}[$__interval]))' % $.jobMatcher('compactor'), - ) + - $.bars + - { yaxes: $.yaxes('ops') }, - ) - ) - .addRow( - $.row('') - .addPanel( - $.textPanel('', ||| - - **Compacted blocks**: number of blocks generated as a result of a compaction operation. - - **Per-block compaction duration**: time taken to generate a single compacted block. - |||), - ) .addPanel( $.panel('Compacted blocks / sec') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__interval]))' % $.jobMatcher('compactor'), 'blocks') + From 475700fe4a4a1f014e148e6c6ec7ec53096ec21c Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Thu, 4 Jun 2020 11:31:28 -0400 Subject: [PATCH 078/364] Revert "adjust recording rules to use kv_name (https://github.com/grafana/cortex-jsonnet/pull/86)" This reverts commit f0340a51af2206863e4228f5e70bd11cdd037fe8. 
--- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 8 ++++---- jsonnet/mimir-mixin/recording_rules.libsonnet | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 22634d1b430..4b7f0508d49 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -56,11 +56,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('KV Store (HA Dedupe)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s, kv_name="distributor-hatracker"}' % $.jobMatcher($._config.job_names.distributor)) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.eq('kv_name', 'distributor-hatracker')]) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) ) ) .addRow( @@ -78,11 +78,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('KV Store (Ring)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s, kv_name="ingester-lifecycler"}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('kv_name', 'ingester-lifecycler')]) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) ) ) .addRowIf( diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 84cfc01dfa6..73bdbcac8dc 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -29,7 +29,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.histogramRules('cortex_chunk_store_chunks_per_query', ['cluster', 'job']) + utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) + utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + - utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job', 'kv_name']), + utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), }, { name: 'cortex_queries', From 8da08197cad33218ffb97ed8b0112c6f30945ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Tue, 9 Jun 2020 13:12:36 +0200 Subject: [PATCH 079/364] TSDB metrics update, metadata cache metrics (https://github.com/grafana/cortex-jsonnet/pull/85) * Update TSDB metrics after recent change on Cortex master. * Fix operation name. * Added metadata cache stats (per querier and store-gateway). 
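The metadata-cache panels mentioned above are all generated by a single helper added in this patch (thanosMemcachedCache in dashboard-utils.libsonnet). For orientation, its hit-ratio panel renders to PromQL roughly like the following for the store-gateway; the cluster/namespace/job matcher shown is only a plausible expansion of $.jobMatcher, and the exact selector depends on the deployment's _config:

  sum(rate(thanos_cache_memcached_hits_total{cluster=~"$cluster", job=~"($namespace)/store-gateway", component="store-gateway", name="metadata-cache"}[$__interval]))
  /
  sum(rate(thanos_cache_memcached_requests_total{cluster=~"$cluster", job=~"($namespace)/store-gateway", component="store-gateway", name="metadata-cache"}[$__interval]))

Values near 1 mean most metadata lookups are served from memcached rather than falling through to the object store.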
--- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 4 +-- .../dashboards/dashboard-utils.libsonnet | 31 +++++++++++++++-- .../dashboards/object-store.libsonnet | 4 +-- .../mimir-mixin/dashboards/queries.libsonnet | 22 ++++++------- .../mimir-mixin/dashboards/reads.libsonnet | 33 ++++++++----------- 5 files changed, 57 insertions(+), 37 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 822a9bc38af..94300db602f 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -54,9 +54,9 @@ alert: 'CortexStoreGatewayHasNotSyncTheBucket', 'for': '5m', expr: ||| - (time() - cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 60 * 30) + (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway",%s} > 60 * 30) and - cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 0 + cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway",%s} > 0 ||| % [$.namespace_matcher(''), $.namespace_matcher('')], labels: { severity: 'critical', diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 2a9d7a44095..291ae3f5480 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -175,8 +175,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') }, ) .addPanel( - $.panel('Op: ObjectSize') + - $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="objectsize"}' % [$.namespaceMatcher(), component]), + $.panel('Op: Attributes') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]), ) .addPanel( $.panel('Op: Exists') + @@ -202,4 +202,31 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), + + thanosMemcachedCache(title, jobName, component, cacheName):: + super.row(title) + .addPanel( + $.panel('QPS') + + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="%s",name="%s"}[$__interval]))' % [$.jobMatcher(jobName), component, cacheName], '{{operation}}') + + $.stack + + { yaxes: $.yaxes('ops') }, + ) + .addPanel( + $.panel('Latency (getmulti)') + + $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="%s",name="%s"}' % [$.jobMatcher(jobName), component, cacheName]) + ) + .addPanel( + $.panel('Hit ratio') + + $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="%s",name="%s"}[$__interval])) / sum(rate(thanos_cache_memcached_requests_total{%s,component="%s",name="%s"}[$__interval]))' % + [ + $.jobMatcher(jobName), + component, + cacheName, + $.jobMatcher(jobName), + component, + cacheName, + ], 'items') + + { yaxes: $.yaxes('percentunit') }, + ), + } diff --git a/jsonnet/mimir-mixin/dashboards/object-store.libsonnet b/jsonnet/mimir-mixin/dashboards/object-store.libsonnet index c02160236a0..3263446cc73 100644 --- a/jsonnet/mimir-mixin/dashboards/object-store.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/object-store.libsonnet @@ -50,8 +50,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 
.addRow( $.row('') .addPanel( - $.panel('Op: ObjectSize') + - $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="objectsize"}' % $.namespaceMatcher()), + $.panel('Op: Attributes') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="attributes"}' % $.namespaceMatcher()), ) .addPanel( $.panel('Op: Upload') + diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 91b039f7639..1d44f8a4e22 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -129,18 +129,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Store-gateway - Blocks') .addPanel( $.panel('Blocks queried / sec') + - $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_blocks_queried_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Data fetched / sec') + - $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_fetched_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Data touched / sec') + - $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_touched_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) @@ -150,15 +150,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Series fetch duration (per request)') + - $.latencyPanel('cortex_storegateway_bucket_store_series_get_all_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), + $.latencyPanel('cortex_bucket_store_series_get_all_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( $.panel('Series merge duration (per request)') + - $.latencyPanel('cortex_storegateway_bucket_store_series_merge_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), + $.latencyPanel('cortex_bucket_store_series_merge_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( $.panel('Series returned (per request)') + - $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_result_series_sum{%s}[$__interval])) / sum(rate(cortex_storegateway_bucket_store_series_result_series_count{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), + $.queryPanel('sum(rate(cortex_bucket_store_series_result_series_sum{component="store-gateway",%s}[$__interval])) / sum(rate(cortex_bucket_store_series_result_series_count{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), 
$.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), ) ) .addRowIf( @@ -166,20 +166,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Blocks currently loaded') + - $.queryPanel('cortex_storegateway_bucket_store_blocks_loaded{%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{instance}}') + $.queryPanel('cortex_bucket_store_blocks_loaded{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{instance}}') ) .addPanel( $.successFailurePanel( 'Blocks loaded / sec', - 'sum(rate(cortex_storegateway_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], - 'sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), + 'sum(rate(cortex_bucket_store_block_loads_total{component="store-gateway",%s}[$__interval])) - sum(rate(cortex_bucket_store_block_load_failures_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], + 'sum(rate(cortex_bucket_store_block_load_failures_total{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) .addPanel( $.successFailurePanel( 'Blocks dropped / sec', - 'sum(rate(cortex_storegateway_bucket_store_block_drops_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], - 'sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), + 'sum(rate(cortex_bucket_store_block_drops_total{component="store-gateway",%s}[$__interval])) - sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], + 'sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) ), diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index e39ad6dc5e6..8da8f93480b 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -97,41 +97,34 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Memcached - Blocks Storage - Index header') + $.row('Memcached – Blocks Storage – Index header (Store-gateway)') .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds_count{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Latency (getmulti)') + - $.latencyPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % 
$.jobMatcher($._config.job_names.store_gateway)) + $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="store-gateway",name="index-cache"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_hits_total{%s}[$__interval])) / sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_requests_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') + + $.queryPanel('sum by(item_type) (rate(thanos_store_index_cache_hits_total{component="store-gateway",%s}[$__interval])) / sum by(item_type) (rate(thanos_store_index_cache_requests_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') + { yaxes: $.yaxes('percentunit') }, ) ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Memcached - Blocks Storage - Chunks') - .addPanel( - $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(cortex_storegateway_thanos_memcached_operations_total{%s,name="chunks-cache"}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + - $.stack + - { yaxes: $.yaxes('ops') }, - ) - .addPanel( - $.panel('Latency (getmulti)') + - $.latencyPanel('cortex_storegateway_thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",name="chunks-cache"}' % $.jobMatcher($._config.job_names.store_gateway)) - ) - .addPanel( - $.panel('Hit ratio') + - $.queryPanel('sum(rate(cortex_storegateway_thanos_cache_memcached_hits_total{%s,name="chunks-cache"}[$__interval])) / sum(rate(cortex_storegateway_thanos_cache_memcached_requests_total{%s,name="chunks-cache"}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'chunks') + - { yaxes: $.yaxes('percentunit') }, - ) + $.thanosMemcachedCache('Memcached – Blocks Storage – Chunks (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache') + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.thanosMemcachedCache('Memcached – Blocks Storage – Metadada (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache') + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.thanosMemcachedCache('Memcached – Blocks Storage – Metadada (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache') ) .addRowIf( std.setMember('chunks', $._config.storage_engine) && From 9e3d76ad6587e73abd315952588733e48ccf49b8 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Wed, 10 Jun 2020 14:34:54 -0400 Subject: [PATCH 080/364] remove job label for alerts to work better with single binary Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 473e7b8911d..0e45e18a330 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -7,7 +7,7 @@ alert: 'CortexIngesterUnhealthy', 'for': '15m', expr: ||| - min(cortex_ring_members{state="Unhealthy", job=~"[a-z]+/distributor" %s}) by (namespace, job) > 0 + min(cortex_ring_members{state="Unhealthy" %s}) by (namespace, job) > 0 ||| % $.namespace_matcher(','), 
labels: { severity: 'critical', @@ -120,7 +120,7 @@ { alert: 'CortexQuerierCapacityFull', expr: ||| - prometheus_engine_queries_concurrent_max{job=~".+/querier"} - prometheus_engine_queries{job=~".+/querier"} == 0 + prometheus_engine_queries_concurrent_max - prometheus_engine_queries == 0 |||, 'for': '5m', // We don't want to block for longer. labels: { @@ -135,7 +135,7 @@ { alert: 'CortexFrontendQueriesStuck', expr: ||| - sum by (namespace) (cortex_query_frontend_queue_length{job=~".+/query-frontend" %s}) > 1 + sum by (namespace) (cortex_query_frontend_queue_length{%s}) > 1 ||| % $.namespace_matcher(','), 'for': '5m', // We don't want to block for longer. labels: { @@ -295,7 +295,7 @@ expr: ||| ( 4 * - sum by(cluster, namespace) (cortex_ingester_memory_series{job=~".+/ingester"} * cortex_ingester_chunk_size_bytes_sum{job=~".+/ingester"} / cortex_ingester_chunk_size_bytes_count{job=~".+/ingester"}) + sum by(cluster, namespace) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) / 1e9 ) > @@ -317,9 +317,9 @@ alert: 'CortexProvisioningTooManyActiveSeries', // 1 million active series per ingester max. expr: ||| - avg by (cluster, namespace) (cortex_ingester_memory_series{job=~".+/ingester"}) > 1.1e6 + avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.1e6 and - sum by (cluster, namespace) (rate(cortex_ingester_received_chunks{job=~".+/ingester"}[1h])) == 0 + sum by (cluster, namespace) (rate(cortex_ingester_received_chunks[1h])) == 0 |||, 'for': '1h', labels: { @@ -449,7 +449,7 @@ expr: ||| memberlist_client_cluster_members_count{%s} != on (cluster,namespace) group_left - sum(up{job=~".+/(distributor|ingester|querier)"}) by (cluster,namespace) + sum(up{job=~".+/(distributor|ingester|querier|cortex|ruler)"}) by (cluster,namespace) ||| % $.namespace_matcher(), 'for': '5m', labels: { From f85ce253b2539993920f117d30e008ceea4923c6 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Wed, 10 Jun 2020 16:56:42 -0400 Subject: [PATCH 081/364] improve ruler dashboard Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/dashboards/ruler.libsonnet | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index ae7f7c524c4..74c18920899 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -9,7 +9,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Rule Evaluations') .addPanel( $.panel('EPS') + - $.queryPanel('sum(rate(cortex_prometheus_rule_evaluations_total{%s}[$__interval]))' % $.jobMatcher('ruler'), 'rules processed'), + $.queryPanel( + [ + ||| + sum(rate(cortex_prometheus_rule_evaluations_total{%s}[$__interval])) + - + sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval])) + ||| % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], + 'sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval]))' % $.jobMatcher('ruler'), + ], + ['sucess', 'failed'], + ), ) .addPanel( $.panel('Latency') + From 236d4a6efb4bd2d69498b42b8fcec4ef77012718 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Thu, 11 Jun 2020 09:52:14 -0400 Subject: [PATCH 082/364] named ring for ingester ring alert Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 
0e45e18a330..818f3c434bb 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -7,7 +7,7 @@ alert: 'CortexIngesterUnhealthy', 'for': '15m', expr: ||| - min(cortex_ring_members{state="Unhealthy" %s}) by (namespace, job) > 0 + min(cortex_ring_members{state="Unhealthy", name="ingester" %s}) by (namespace, job) > 0 ||| % $.namespace_matcher(','), labels: { severity: 'critical', From a6a1ae9b573518067cce962b2912c3d51ad8000c Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Thu, 11 Jun 2020 11:39:39 -0400 Subject: [PATCH 083/364] use named cache labels in dashboards Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/dashboards/queries.libsonnet | 4 ++-- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 1d44f8a4e22..78636e6574e 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -25,12 +25,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Query Frontend - Results Cache') .addPanel( $.panel('Cache Hit %') + - $.queryPanel('sum(rate(cortex_cache_hits{%s}[1m])) / sum(rate(cortex_cache_fetched_keys{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + + $.queryPanel('sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( $.panel('Cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s}[1m])) - sum(rate(cortex_cache_hits{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), + $.queryPanel('sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), ) ) .addRow( diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 8da8f93480b..b6081bd017c 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -30,11 +30,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Cache - Query Results') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_frontend)) + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend)) + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) ) ) .addRow( From f3e4b6ecefc68e6739d7cfe63ae61e847fc0920c Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Sun, 14 Jun 2020 11:12:47 -0400 Subject: [PATCH 084/364] fix labels Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 818f3c434bb..c1186c90250 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -120,7 +120,7 @@ { alert: 'CortexQuerierCapacityFull', expr: ||| - prometheus_engine_queries_concurrent_max - prometheus_engine_queries == 0 + prometheus_engine_queries_concurrent_max{job=~".+/(cortex|ruler|querier)"}, - prometheus_engine_queries{job=~".+/(cortex|ruler|querier)"}, == 0 |||, 'for': '5m', // We don't want to block for longer. labels: { From 1db2cb2d15e14e225b0dbc7791c5d3bb2c127f5e Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Tue, 16 Jun 2020 11:25:20 -0400 Subject: [PATCH 085/364] fix typo Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index c1186c90250..95f2a01caac 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -120,7 +120,7 @@ { alert: 'CortexQuerierCapacityFull', expr: ||| - prometheus_engine_queries_concurrent_max{job=~".+/(cortex|ruler|querier)"}, - prometheus_engine_queries{job=~".+/(cortex|ruler|querier)"}, == 0 + prometheus_engine_queries_concurrent_max{job=~".+/(cortex|ruler|querier)"} - prometheus_engine_queries{job=~".+/(cortex|ruler|querier)"} == 0 |||, 'for': '5m', // We don't want to block for longer. labels: { From ed122cf115fa8491ea479f2fcc5a28eb7b0baaf8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 19 Jun 2020 13:12:08 +0200 Subject: [PATCH 086/364] Added blocks storage refetch metrics and alerts Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 24 +++++++++++++++++++ .../mimir-mixin/dashboards/queries.libsonnet | 23 ++++++++++++++++-- .../mimir-mixin/dashboards/ruler.libsonnet | 2 +- jsonnet/mimir-mixin/docs/playbooks.md | 8 +++++++ 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 94300db602f..5d3ed16ae45 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -49,6 +49,30 @@ message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.', }, }, + { + // Alert if the number of queries for which we had to refetch series from different store-gateways + // (because of missing blocks) is greater than a %. + alert: 'CortexQuerierHighRefetchRate', + 'for': '10m', + expr: ||| + 100 * ( + ( + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count{%s}[5m])) + - + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0" %s}[5m])) + ) + / + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count{%s}[5m])) + ) + > 1 + ||| % [$.namespace_matcher(''), $.namespace_matcher(','), $.namespace_matcher('')], + labels: { + severity: 'warning', + }, + annotations: { + message: 'Cortex Queries in {{ $labels.namespace } are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.', + }, + }, { // Alert if the store-gateway is not successfully synching the bucket. 
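// For intuition on the CortexQuerierHighRefetchRate expression above: Prometheus
// histogram buckets are cumulative, so the le="0" bucket of the refetches-per-query
// histogram counts the queries that needed zero refetches, and subtracting it from
// _count leaves the queries that needed at least one. A standalone sketch of that
// pattern (the window and the by(namespace) grouping mirror the rule above):
local fractionWithRefetches(histogram) = |||
  (
    sum by (namespace) (rate(%(h)s_count[5m]))
    -
    sum by (namespace) (rate(%(h)s_bucket{le="0"}[5m]))
  )
  /
  sum by (namespace) (rate(%(h)s_count[5m]))
||| % { h: histogram };
{ example: fractionWithRefetches('cortex_querier_storegateway_refetches_per_query') }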
alert: 'CortexStoreGatewayHasNotSyncTheBucket', diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 78636e6574e..2bdf03c8542 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -25,12 +25,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Query Frontend - Results Cache') .addPanel( $.panel('Cache Hit %') + - $.queryPanel('sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + + $.queryPanel('sum(rate(cortex_cache_hits{%s}[1m])) / sum(rate(cortex_cache_fetched_keys{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( $.panel('Cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), + $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s}[1m])) - sum(rate(cortex_cache_hits{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), ) ) .addRow( @@ -124,6 +124,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('short') }, ) ) + .addRowIf( + std.setMember('chunks', $._config.storage_engine), + $.row('Querier - Blocks storage') + .addPanel( + $.panel('Number of store-gateways hit per Query') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Refetches of missing blocks per Query') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Consistency checks failed') + + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Failure Rate') + + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), $.row('Store-gateway - Blocks') diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 74c18920899..db60d217047 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; [ ||| sum(rate(cortex_prometheus_rule_evaluations_total{%s}[$__interval])) - - + - sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval])) ||| % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], 'sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval]))' % $.jobMatcher('ruler'), diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index bac54ebe9cd..212a7416cbd 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -93,6 
+93,14 @@ This alert fires when a Cortex querier is not successfully scanning blocks in th How to investigate: - Look for any scan error in the querier logs (ie. networking or rate limiting issues) +## CortexQuerierHighRefetchRate + +This alert fires when there's an high number of queries for which series have been refetched from a different store-gateway because of missing blocks. This could happen for a short time whenever a store-gateway ring resharding occurs (e.g. during/after an outage or while rolling out store-gateway) but store-gateways should reconcile in a short time. This alert fires if the issue persist for an unexpected long time and thus it should be investigated. + +How to investigate: +- Ensure there are no errors related to blocks scan or sync in the queriers and store-gateways +- Check store-gateway logs to see if all store-gateway have successfully completed a blocks sync + ## CortexStoreGatewayHasNotSyncTheBucket This alert fires when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. From d499421921464fec455b890330b2f4a151e55b89 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 22 Jun 2020 11:08:23 +0200 Subject: [PATCH 087/364] Reverted unintended changes Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/queries.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 2bdf03c8542..a7a6f491f75 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -25,12 +25,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Query Frontend - Results Cache') .addPanel( $.panel('Cache Hit %') + - $.queryPanel('sum(rate(cortex_cache_hits{%s}[1m])) / sum(rate(cortex_cache_fetched_keys{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + + $.queryPanel('sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( $.panel('Cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s}[1m])) - sum(rate(cortex_cache_hits{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), + $.queryPanel('sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), ) ) .addRow( From 11e79de948116e47409e6e9e783d2edcba485968 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 22 Jun 2020 11:28:41 +0200 Subject: [PATCH 088/364] Fixed alert syntax Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 
5d3ed16ae45..dfdecc5fe2e 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -70,7 +70,7 @@ severity: 'warning', }, annotations: { - message: 'Cortex Queries in {{ $labels.namespace } are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.', + message: 'Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.', }, }, { From 4d6cbdb3dc61041ef20cb9ef491e67c8dc48e492 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 23 Jun 2020 19:18:44 +0100 Subject: [PATCH 089/364] Remove memcached alert. Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 95f2a01caac..6e046c8e26b 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -364,26 +364,6 @@ }, ], }, - { - name: 'memcached', - rules: [ - { - alert: 'MemcachedDown', - expr: ||| - memcached_up == 0 - |||, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - Memcached Instance {{ $labels.instance }} is down for more than 15mins. - |||, - }, - }, - ], - }, { name: 'ruler_alerts', rules: [ From 8712ff651289a6cdf95ce82ab2003fb652489007 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 23 Jun 2020 19:44:10 +0100 Subject: [PATCH 090/364] Fix generated alert syntax. Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 6e046c8e26b..6bf975f4f99 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -136,7 +136,7 @@ alert: 'CortexFrontendQueriesStuck', expr: ||| sum by (namespace) (cortex_query_frontend_queue_length{%s}) > 1 - ||| % $.namespace_matcher(','), + ||| % $.namespace_matcher(''), 'for': '5m', // We don't want to block for longer. 
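// Why the prefix passed to namespace_matcher matters here: the helper renders
// '<prefix> namespace=~"..."' (or an empty string when no matcher is configured),
// and the result is spliced directly into a {} label selector. With nothing in
// front of it, a ',' prefix produces a selector that starts with a comma, which is
// not valid PromQL. A standalone sketch, assuming alert_namespace_matcher is set
// to "cortex":
local namespace_matcher(prefix='') =
  if std.length('cortex') != 0
  then '%s namespace=~"%s"' % [prefix, 'cortex']
  else '';
{
  // renders: cortex_query_frontend_queue_length{, namespace=~"cortex"} > 1  (broken)
  before: 'cortex_query_frontend_queue_length{%s} > 1' % [namespace_matcher(',')],
  // renders: cortex_query_frontend_queue_length{ namespace=~"cortex"} > 1
  after: 'cortex_query_frontend_queue_length{%s} > 1' % [namespace_matcher('')],
}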
labels: { severity: 'critical', From ff431b6c0ccf31a72f3a3d94da32d011e4c628f1 Mon Sep 17 00:00:00 2001 From: Annanay Date: Tue, 23 Jun 2020 19:51:19 +0530 Subject: [PATCH 091/364] Correct alerts to work with Cortex single binary Signed-off-by: Annanay --- .../mimir-mixin/alerts/alert-utils.libsonnet | 10 ++ jsonnet/mimir-mixin/alerts/alerts.libsonnet | 135 ++++++++---------- 2 files changed, 71 insertions(+), 74 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet b/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet index e72d20c8b3f..daa78782664 100644 --- a/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet @@ -5,4 +5,14 @@ if std.length($._config.alert_namespace_matcher) != 0 then '%s namespace=~"%s"' % [prefix, $._config.alert_namespace_matcher] else '', + + aggregation_labels(replace=''):: + if $._config.singleBinary == true + then 'job' + else replace, + + annotation_labels(replace='$labels.namespace'):: + if $._config.singleBinary == true + then '$labels.job' + else replace, } diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 6bf975f4f99..db3e12222d0 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -7,8 +7,8 @@ alert: 'CortexIngesterUnhealthy', 'for': '15m', expr: ||| - min(cortex_ring_members{state="Unhealthy", name="ingester" %s}) by (namespace, job) > 0 - ||| % $.namespace_matcher(','), + min(cortex_ring_members{state="Unhealthy", name="ingester" %s}) by (%s) > 0 + ||| % [$.namespace_matcher(','), $.aggregation_labels('namespace, job')], labels: { severity: 'critical', }, @@ -16,27 +16,14 @@ message: '{{ $labels.job }} reports more than one unhealthy ingester.', }, }, - { - alert: 'CortexFlushStuck', - expr: ||| - (cortex_ingester_memory_chunks / cortex_ingester_memory_series) > 1.3 - |||, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: '{{ $labels.job }}/{{ $labels.instance }} is stuck flushing chunks.', - }, - }, { alert: 'CortexRequestErrors', expr: ||| - 100 * sum(rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) + 100 * sum(rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) by (%s, route) / - sum(rate(cortex_request_duration_seconds_count[1m])) by (namespace, job, route) + sum(rate(cortex_request_duration_seconds_count[1m])) by (%s, route) > 1 - |||, + ||| % [$.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], 'for': '15m', labels: { severity: 'warning', @@ -88,10 +75,10 @@ { alert: 'CortexQueriesIncorrect', expr: ||| - 100 * sum by (job, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) + 100 * sum by (%s) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) / - sum by (job, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1 - |||, + sum by (%s) (rate(test_exporter_test_case_result_total[5m])) > 1 + ||| % [$.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], 'for': '15m', labels: { severity: 'warning', @@ -135,8 +122,8 @@ { alert: 'CortexFrontendQueriesStuck', expr: ||| - sum by (namespace) (cortex_query_frontend_queue_length{%s}) > 1 - ||| % $.namespace_matcher(''), + sum by (%s) (cortex_query_frontend_queue_length{%s}) > 1 + ||| % [$.aggregation_labels('namespace'), $.namespace_matcher('')], 'for': '5m', // We don't want to block for longer. 
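// The two helpers introduced above exist so the same rules work for microservices
// and for a single Cortex binary: with singleBinary set, grouping collapses to
// "job" and the annotations refer to $labels.job instead of $labels.namespace.
// A standalone sketch of the idea, with a hypothetical singleBinary flag passed in
// explicitly instead of read from $._config:
local aggregation_labels(singleBinary, replace='') =
  if singleBinary then 'job' else replace;
{
  microservices: 'sum by (%s) (cortex_query_frontend_queue_length) > 1' % [aggregation_labels(false, 'namespace')],
  single_binary: 'sum by (%s) (cortex_query_frontend_queue_length) > 1' % [aggregation_labels(true, 'namespace')],
}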
labels: { severity: 'critical', @@ -150,32 +137,32 @@ { alert: 'CortexCacheRequestErrors', expr: ||| - 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." %s}[1m])) by (namespace, job, method) + 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." %s}[1m])) by (%s, method) / - sum(rate(cortex_cache_request_duration_seconds_count{%s}[1m])) by (namespace, job, method) + sum(rate(cortex_cache_request_duration_seconds_count{%s}[1m])) by (%s, method) > 1 - ||| % [$.namespace_matcher(','), $.namespace_matcher()], + ||| % [$.namespace_matcher(','), $.aggregation_labels('namespace, job'), $.namespace_matcher(), $.aggregation_labels('namespace, job')], 'for': '15m', labels: { severity: 'warning', }, annotations: { message: ||| - {{ $labels.job }} cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, + {{ %s }} cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. + ||| % $.annotation_labels(), }, }, { alert: 'CortexIngesterRestarts', expr: ||| - rate(kube_pod_container_status_restarts_total{container="ingester" %s}[30m]) > 0 + rate(process_start_time_seconds{job=~".+(cortex|ingester)" %s}[30m]) > 0 ||| % $.namespace_matcher(','), labels: { severity: 'critical', }, annotations: { - message: '{{ $labels.namespace }}/{{ $labels.pod }} is restarting', - }, + message: '{{ %s }}/{{ %s }} is restarting', + } % [$.annotation_labels(), $.annotation_labels('$labels.pod')], }, { alert: 'CortexTransferFailed', @@ -187,8 +174,8 @@ severity: 'critical', }, annotations: { - message: '{{ $labels.namespace }}/{{ $labels.instance }} transfer failed.', - }, + message: '{{ %s }}/{{ $labels.instance }} transfer failed.', + } % $.annotation_labels(), }, { alert: 'CortexOldChunkInMemory', @@ -204,8 +191,8 @@ }, annotations: { message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} has very old unflushed chunk in memory. - |||, + {{ %s }}/{{ $labels.instance }} has very old unflushed chunk in memory. + ||| % $.annotation_labels(), }, }, { @@ -219,8 +206,8 @@ }, annotations: { message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. - |||, + {{ %s }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. + ||| % $.annotation_labels(), }, }, { @@ -234,8 +221,8 @@ }, annotations: { message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} failed to create checkpoint. - |||, + {{ %s }}/{{ $labels.instance }} failed to create checkpoint. + ||| % $.annotation_labels(), }, }, { @@ -249,8 +236,8 @@ }, annotations: { message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} is failing to create checkpoint. - |||, + {{ %s }}/{{ $labels.instance }} is failing to create checkpoint. + ||| % $.annotation_labels(), }, }, { @@ -264,8 +251,8 @@ }, annotations: { message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} failed to delete checkpoint. - |||, + {{ %s }}/{{ $labels.instance }} failed to delete checkpoint. + ||| % $.annotation_labels(), }, }, { @@ -280,8 +267,8 @@ }, annotations: { message: ||| - {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete checkpoint. - |||, + {{ %s }}/{{ $labels.instance }} is failing to delete checkpoint. 
+ ||| % $.annotation_labels(), }, }, ], @@ -295,71 +282,71 @@ expr: ||| ( 4 * - sum by(cluster, namespace) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) + sum by(%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) / 1e9 ) > ( - sum by (cluster, namespace) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 + sum by(%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 ) - |||, + ||| % $.aggregation_labels('cluster, namespace'), 'for': '15m', labels: { severity: 'warning', }, annotations: { message: ||| - Chunk memcached cluster for namespace {{ $labels.namespace }} are too small, should be at least {{ printf "%.2f" $value }}GB. - |||, + Chunk memcached cluster for {{ %s }} are too small, should be at least {{ printf "%.2f" $value }}GB. + ||| % $.annotation_labels(), }, }, { alert: 'CortexProvisioningTooManyActiveSeries', // 1 million active series per ingester max. expr: ||| - avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.1e6 + avg by (%s) (cortex_ingester_memory_series) > 1.1e6 and - sum by (cluster, namespace) (rate(cortex_ingester_received_chunks[1h])) == 0 - |||, + sum by (%s) (rate(cortex_ingester_received_chunks[1h])) == 0 + ||| % $.aggregation_labels('cluster, namespace'), 'for': '1h', labels: { severity: 'warning', }, annotations: { message: ||| - Too many active series for ingesters in namespace {{ $labels.namespace }}, add more ingesters. - |||, + Too many active series for ingesters in {{ %s }}, add more ingesters. + ||| % $.annotation_labels(), }, }, { alert: 'CortexProvisioningTooManyWrites', // 80k writes / s per ingester max. expr: ||| - avg by (cluster,namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 - |||, + avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 + ||| % $.aggregation_labels('cluster, namespace'), 'for': '15m', labels: { severity: 'warning', }, annotations: { message: ||| - Too much write QPS for ingesters in namespace {{ $labels.namespace }}, add more ingesters. - |||, + High QPS for ingesters in {{ %s }}, add more ingesters. + ||| % $.annotation_labels(), }, }, { alert: 'CortexProvisioningTooMuchMemory', expr: ||| - avg by (cluster, namespace) (container_memory_working_set_bytes{container_name="ingester" %s} / container_spec_memory_limit_bytes{container_name="ingester" %s}) > 0.7 - ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + avg by (%s) (container_memory_working_set_bytes{container_name="ingester" %s} / container_spec_memory_limit_bytes{container_name="ingester" %s}) > 0.7 + ||| % [$.aggregation_labels('cluster, namespace'), $.namespace_matcher(','), $.namespace_matcher(',')], 'for': '15m', labels: { severity: 'critical', }, annotations: { message: ||| - Too much memory being used by ingesters in namespace {{ $labels.namespace }}, add more ingesters. - |||, + Too much memory being used by ingesters for {{ %s }}, add more ingesters. 
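// On the memcached sizing comparison above: the left-hand side estimates the bytes
// needed to cache the chunks currently in memory (in-memory series times the average
// chunk size, i.e. the _sum / _count of cortex_ingester_chunk_size_bytes), multiplied
// by a factor of 4 and converted to GB; the right-hand side is the configured
// memcached limit in GB. A standalone sketch of the arithmetic with made-up numbers:
{
  series: 10 * 1e6,
  avg_chunk_size_bytes: 250,
  required_gb: 4 * self.series * self.avg_chunk_size_bytes / 1e9,  // 10 GB in this example
}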
+ ||| % $.annotation_labels(), }, }, ], @@ -370,11 +357,11 @@ { alert: 'CortexRulerFailedEvaluations', expr: ||| - sum(rate(cortex_prometheus_rule_evaluation_failures_total[1m])) by (namespace, job) + sum(rate(cortex_prometheus_rule_evaluation_failures_total[1m])) by (%s) / - sum(rate(cortex_prometheus_rule_evaluations_total[1m])) by (namespace, job) + sum(rate(cortex_prometheus_rule_evaluations_total[1m])) by (%s) > 0.01 - |||, + ||| % [$.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], 'for': '5m', labels: { severity: 'warning', @@ -388,11 +375,11 @@ { alert: 'CortexRulerMissedEvaluations', expr: ||| - sum(rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) by (namespace, job) + sum(rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) by (%s) / - sum(rate(cortex_prometheus_rule_group_iterations_total[1m])) by (namespace, job) + sum(rate(cortex_prometheus_rule_group_iterations_total[1m])) by (%s) > 0.01 - |||, + ||| % [$.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], 'for': '5m', labels: { severity: 'warning', @@ -406,9 +393,9 @@ { alert: 'CortexRulerFailedRingCheck', expr: ||| - sum(rate(cortex_ruler_ring_check_errors_total[5m])) by (namespace, job) + sum(rate(cortex_ruler_ring_check_errors_total[5m])) by (%s) > 0 - |||, + ||| % $.aggregation_labels('namespace, job'), 'for': '1m', labels: { severity: 'critical', @@ -428,9 +415,9 @@ alert: 'CortexGossipMembersMismatch', expr: ||| memberlist_client_cluster_members_count{%s} - != on (cluster,namespace) group_left - sum(up{job=~".+/(distributor|ingester|querier|cortex|ruler)"}) by (cluster,namespace) - ||| % $.namespace_matcher(), + != on (%s) group_left + sum(up{job=~".+/(distributor|ingester|querier|cortex|ruler)"}) by (%s) + ||| % [$.namespace_matcher(), $.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], 'for': '5m', labels: { severity: 'warning', From 530eed097fd244229235c2693f768e7a50d45393 Mon Sep 17 00:00:00 2001 From: Annanay Agarwal Date: Tue, 23 Jun 2020 20:38:35 +0530 Subject: [PATCH 092/364] Apply suggestions from code review Co-authored-by: Jacob Lisi --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index db3e12222d0..2ae9f08738f 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -289,7 +289,7 @@ ( sum by(%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 ) - ||| % $.aggregation_labels('cluster, namespace'), + ||| % [$.aggregation_labels('cluster, namespace'), $.aggregation_labels('cluster, namespace')], 'for': '15m', labels: { severity: 'warning', @@ -307,7 +307,7 @@ avg by (%s) (cortex_ingester_memory_series) > 1.1e6 and sum by (%s) (rate(cortex_ingester_received_chunks[1h])) == 0 - ||| % $.aggregation_labels('cluster, namespace'), + ||| % [$.aggregation_labels('cluster, namespace'), $.aggregation_labels('cluster, namespace')], 'for': '1h', labels: { severity: 'warning', @@ -417,7 +417,7 @@ memberlist_client_cluster_members_count{%s} != on (%s) group_left sum(up{job=~".+/(distributor|ingester|querier|cortex|ruler)"}) by (%s) - ||| % [$.namespace_matcher(), $.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], + ||| % [$.namespace_matcher(), $.aggregation_labels('cluster, namespace'), $.aggregation_labels('cluster, namespace')], 'for': '5m', labels: { severity: 
'warning', From c4ab067dbd60c588158c9522bf17ecd4cc5681ba Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 23 Jun 2020 19:18:04 +0100 Subject: [PATCH 093/364] Simplify the alert rule generation: - Don't both with a function, we just need to know a list of labels to aggregate by that will identify a single Cortex cluster (ie "cluster, namespace" or "job"). - Remove alert_namespace_matcher, its brittle and hard to maintain. - Put WAL alerts in a separate group. Signed-off-by: Tom Wilkie --- .../mimir-mixin/alerts/alert-utils.libsonnet | 10 -- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 142 ++++++++++-------- jsonnet/mimir-mixin/config.libsonnet | 4 +- 3 files changed, 81 insertions(+), 75 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet b/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet index daa78782664..ff732949a08 100644 --- a/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet @@ -1,16 +1,6 @@ { _config:: error 'must provide _config for alerts', - namespace_matcher(prefix=''):: - if std.length($._config.alert_namespace_matcher) != 0 - then '%s namespace=~"%s"' % [prefix, $._config.alert_namespace_matcher] - else '', - - aggregation_labels(replace=''):: - if $._config.singleBinary == true - then 'job' - else replace, - annotation_labels(replace='$labels.namespace'):: if $._config.singleBinary == true then '$labels.job' diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 2ae9f08738f..cef8d0e997b 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -7,30 +7,30 @@ alert: 'CortexIngesterUnhealthy', 'for': '15m', expr: ||| - min(cortex_ring_members{state="Unhealthy", name="ingester" %s}) by (%s) > 0 - ||| % [$.namespace_matcher(','), $.aggregation_labels('namespace, job')], + min by (%s) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 + ||| % $._config.alert_aggregation_labels, labels: { severity: 'critical', }, annotations: { - message: '{{ $labels.job }} reports more than one unhealthy ingester.', + message: 'There are {{ printf "%f" $value }} unhealthy ingester(s).', }, }, { alert: 'CortexRequestErrors', expr: ||| - 100 * sum(rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) by (%s, route) + 100 * sum by (%s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) / - sum(rate(cortex_request_duration_seconds_count[1m])) by (%s, route) + sum y (%s, route) (rate(cortex_request_duration_seconds_count[1m])) > 1 - ||| % [$.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], + ||| % [$._config.alert_aggregation_labels], 'for': '15m', labels: { severity: 'warning', }, annotations: { message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. |||, }, }, @@ -78,22 +78,25 @@ 100 * sum by (%s) (rate(test_exporter_test_case_result_total{result="fail"}[5m])) / sum by (%s) (rate(test_exporter_test_case_result_total[5m])) > 1 - ||| % [$.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', labels: { severity: 'warning', }, annotations: { message: ||| - {{ $labels.job }} is reporting incorrect results for {{ printf "%.2f" $value }}% of queries. 
+ Incorrect results for {{ printf "%.2f" $value }}% of queries. |||, }, }, { + // As of https://github.com/cortexproject/cortex/pull/2092, this metric is + // only exposed when it is supposed to be non-zero, so we don't need to do + // any special filtering on the job label. alert: 'CortexBadOverrides', expr: ||| - cortex_overrides_last_reload_successful{job!~".+/table-manager|.+/alertmanager" %s} == 0 - ||| % $.namespace_matcher(','), + cortex_overrides_last_reload_successful == 0 + |||, 'for': '15m', labels: { severity: 'warning', @@ -122,60 +125,60 @@ { alert: 'CortexFrontendQueriesStuck', expr: ||| - sum by (%s) (cortex_query_frontend_queue_length{%s}) > 1 - ||| % [$.aggregation_labels('namespace'), $.namespace_matcher('')], + sum by (%s) (cortex_query_frontend_queue_length) > 1 + ||| % $._config.alert_aggregation_labels, 'for': '5m', // We don't want to block for longer. labels: { severity: 'critical', }, annotations: { message: ||| - {{ $labels.job }} has {{ $value }} queued up queries. + There are {{ $value }} queued up queries. |||, }, }, { alert: 'CortexCacheRequestErrors', expr: ||| - 100 * sum(rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." %s}[1m])) by (%s, method) + 100 * sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." %s}[1m])) / - sum(rate(cortex_cache_request_duration_seconds_count{%s}[1m])) by (%s, method) + sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{%s}[1m])) > 1 - ||| % [$.namespace_matcher(','), $.aggregation_labels('namespace, job'), $.namespace_matcher(), $.aggregation_labels('namespace, job')], + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', labels: { severity: 'warning', }, annotations: { message: ||| - {{ %s }} cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. - ||| % $.annotation_labels(), + Cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. + |||, }, }, { alert: 'CortexIngesterRestarts', expr: ||| - rate(process_start_time_seconds{job=~".+(cortex|ingester)" %s}[30m]) > 0 - ||| % $.namespace_matcher(','), + rate(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 0 + |||, labels: { severity: 'critical', }, annotations: { - message: '{{ %s }}/{{ %s }} is restarting', - } % [$.annotation_labels(), $.annotation_labels('$labels.pod')], + message: '{{ $labels.job }}/{{ $labels.instance }} is restarting', + }, }, { alert: 'CortexTransferFailed', expr: ||| - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success" %s}[15m]) - ||| % $.namespace_matcher(','), + max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m]) + |||, 'for': '5m', labels: { severity: 'critical', }, annotations: { - message: '{{ %s }}/{{ $labels.instance }} transfer failed.', - } % $.annotation_labels(), + message: '{{ $labels.job }}/{{ $labels.instance }} transfer failed.', + }, }, { alert: 'CortexOldChunkInMemory', @@ -183,7 +186,9 @@ // to 10 hours. // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). 
expr: ||| - (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) and cortex_oldest_unflushed_chunk_timestamp_seconds > 0 + (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) + and + (cortex_oldest_unflushed_chunk_timestamp_seconds > 0) |||, 'for': '5m', labels: { @@ -191,10 +196,15 @@ }, annotations: { message: ||| - {{ %s }}/{{ $labels.instance }} has very old unflushed chunk in memory. - ||| % $.annotation_labels(), + {{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory. + |||, }, }, + ], + }, + { + name: 'cortex_wal_alerts', + rules: [ { // Alert immediately if WAL is corrupt. alert: 'CortexWALCorruption', @@ -206,12 +216,12 @@ }, annotations: { message: ||| - {{ %s }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. - ||| % $.annotation_labels(), + {{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. + |||, }, }, { - // 1 failed checkpoint creation is a warning. + // One or more failed checkpoint creation is a warning. alert: 'CortexCheckpointCreationFailed', expr: ||| increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 @@ -221,12 +231,12 @@ }, annotations: { message: ||| - {{ %s }}/{{ $labels.instance }} failed to create checkpoint. - ||| % $.annotation_labels(), + {{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint. + |||, }, }, { - // 2 or more failed checkpoint creation in 1h means something is wrong. + // Two or more failed checkpoint creation in 1h means something is wrong. alert: 'CortexCheckpointCreationFailed', expr: ||| increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 @@ -236,12 +246,12 @@ }, annotations: { message: ||| - {{ %s }}/{{ $labels.instance }} is failing to create checkpoint. - ||| % $.annotation_labels(), + {{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint. + |||, }, }, { - // 1 failed checkpoint deletion is a warning. + // One or more failed checkpoint deletion is a warning. alert: 'CortexCheckpointDeletionFailed', expr: ||| increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 @@ -251,12 +261,12 @@ }, annotations: { message: ||| - {{ %s }}/{{ $labels.instance }} failed to delete checkpoint. - ||| % $.annotation_labels(), + {{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint. + |||, }, }, { - // 2 or more failed checkpoint deletion in 2h means something is wrong. + // Two or more failed checkpoint deletion in 2h means something is wrong. // We give this more buffer than creation as this is a less critical operation. alert: 'CortexCheckpointDeletionFailed', expr: ||| @@ -267,8 +277,8 @@ }, annotations: { message: ||| - {{ %s }}/{{ $labels.instance }} is failing to delete checkpoint. - ||| % $.annotation_labels(), + {{ $labels.instance }} is failing to delete checkpoint. 
+ |||, }, }, ], @@ -282,14 +292,14 @@ expr: ||| ( 4 * - sum by(%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) + sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) / 1e9 ) > ( - sum by(%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 + sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 ) - ||| % [$.aggregation_labels('cluster, namespace'), $.aggregation_labels('cluster, namespace')], + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', labels: { severity: 'warning', @@ -307,7 +317,7 @@ avg by (%s) (cortex_ingester_memory_series) > 1.1e6 and sum by (%s) (rate(cortex_ingester_received_chunks[1h])) == 0 - ||| % [$.aggregation_labels('cluster, namespace'), $.aggregation_labels('cluster, namespace')], + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '1h', labels: { severity: 'warning', @@ -323,7 +333,7 @@ // 80k writes / s per ingester max. expr: ||| avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 - ||| % $.aggregation_labels('cluster, namespace'), + ||| % $._config.alert_aggregation_labels, 'for': '15m', labels: { severity: 'warning', @@ -337,16 +347,20 @@ { alert: 'CortexProvisioningTooMuchMemory', expr: ||| - avg by (%s) (container_memory_working_set_bytes{container_name="ingester" %s} / container_spec_memory_limit_bytes{container_name="ingester" %s}) > 0.7 - ||| % [$.aggregation_labels('cluster, namespace'), $.namespace_matcher(','), $.namespace_matcher(',')], + avg by (%s) ( + container_memory_working_set_bytes{container_name="ingester"} + / + container_spec_memory_limit_bytes{container_name="ingester"} + ) > 0.7 + ||| % $._config.alert_aggregation_labels, 'for': '15m', labels: { severity: 'critical', }, annotations: { message: ||| - Too much memory being used by ingesters for {{ %s }}, add more ingesters. - ||| % $.annotation_labels(), + Too much memory being used by ingesters - add more ingesters. 
+ |||, }, }, ], @@ -357,11 +371,11 @@ { alert: 'CortexRulerFailedEvaluations', expr: ||| - sum(rate(cortex_prometheus_rule_evaluation_failures_total[1m])) by (%s) + sum by (%s) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) / - sum(rate(cortex_prometheus_rule_evaluations_total[1m])) by (%s) + sum by (%s) (rate(cortex_prometheus_rule_evaluations_total[1m])) > 0.01 - ||| % [$.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', labels: { severity: 'warning', @@ -375,11 +389,11 @@ { alert: 'CortexRulerMissedEvaluations', expr: ||| - sum(rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) by (%s) + sum by (%s) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) / - sum(rate(cortex_prometheus_rule_group_iterations_total[1m])) by (%s) + sum by (%s) (rate(cortex_prometheus_rule_group_iterations_total[1m])) > 0.01 - ||| % [$.aggregation_labels('namespace, job'), $.aggregation_labels('namespace, job')], + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', labels: { severity: 'warning', @@ -393,9 +407,9 @@ { alert: 'CortexRulerFailedRingCheck', expr: ||| - sum(rate(cortex_ruler_ring_check_errors_total[5m])) by (%s) + sum by (%s) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0 - ||| % $.aggregation_labels('namespace, job'), + ||| % $._config.alert_aggregation_labels, 'for': '1m', labels: { severity: 'critical', @@ -414,10 +428,10 @@ { alert: 'CortexGossipMembersMismatch', expr: ||| - memberlist_client_cluster_members_count{%s} + memberlist_client_cluster_members_count != on (%s) group_left - sum(up{job=~".+/(distributor|ingester|querier|cortex|ruler)"}) by (%s) - ||| % [$.namespace_matcher(), $.aggregation_labels('cluster, namespace'), $.aggregation_labels('cluster, namespace')], + sum by (%s) (up{job=~".+/(distributor|ingester|querier|cortex|ruler)"}) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', labels: { severity: 'warning', diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 73f59c54214..8e02f3fcc4a 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -33,8 +33,10 @@ gateway: 'cortex-gw', }, + // Labels used to in alert aggregations - should uniquely identify + // a single Cortex cluster. + alert_aggregation_labels: 'cluster, namespace', cortex_p99_latency_threshold_seconds: 2.5, - alert_namespace_matcher: '', // Whether resources dashboards are enabled (based on cAdvisor metrics). resources_dashboards_enabled: false, From a0599d2173440acffc961e471fc9d52c70968970 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 23 Jun 2020 20:47:11 +0100 Subject: [PATCH 094/364] Get it working. 
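The alert_aggregation_labels string added to the config in the previous commit is
spliced straight into the by (...) clauses of the alert expressions, so changing how
alerts are grouped is now a one-field override rather than a code change. A minimal
sketch of such an override (the file name and the 'job' value are illustrative; only
the _config field itself comes from the mixin):

    // overrides.libsonnet (hypothetical)
    (import 'mixin.libsonnet') {
      _config+:: {
        // A single-binary / single-cluster deployment can group purely by job.
        alert_aggregation_labels: 'job',
      },
    }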
Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/alerts.jsonnet | 4 ++- .../mimir-mixin/alerts/alert-utils.libsonnet | 8 ----- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 20 ++++++------- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 30 +++++++++---------- .../mimir-mixin/alerts/compactor.libsonnet | 20 ++++++------- 5 files changed, 38 insertions(+), 44 deletions(-) delete mode 100644 jsonnet/mimir-mixin/alerts/alert-utils.libsonnet diff --git a/jsonnet/mimir-mixin/alerts.jsonnet b/jsonnet/mimir-mixin/alerts.jsonnet index 75e7c1b297a..bd44d1d999f 100644 --- a/jsonnet/mimir-mixin/alerts.jsonnet +++ b/jsonnet/mimir-mixin/alerts.jsonnet @@ -1 +1,3 @@ -std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts) +local mixin = import 'mixin.libsonnet'; + +std.manifestYamlDoc(mixin.prometheusAlerts) diff --git a/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet b/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet deleted file mode 100644 index ff732949a08..00000000000 --- a/jsonnet/mimir-mixin/alerts/alert-utils.libsonnet +++ /dev/null @@ -1,8 +0,0 @@ -{ - _config:: error 'must provide _config for alerts', - - annotation_labels(replace='$labels.namespace'):: - if $._config.singleBinary == true - then '$labels.job' - else replace, -} diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index cef8d0e997b..c22a7b07e5b 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -1,4 +1,4 @@ -(import 'alert-utils.libsonnet') { +{ groups+: [ { name: 'cortex_alerts', @@ -23,7 +23,7 @@ / sum y (%s, route) (rate(cortex_request_duration_seconds_count[1m])) > 1 - ||| % [$._config.alert_aggregation_labels], + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', labels: { severity: 'warning', @@ -140,9 +140,9 @@ { alert: 'CortexCacheRequestErrors', expr: ||| - 100 * sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{status_code=~"5.." %s}[1m])) + 100 * sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{status_code=~"5.."}[1m])) / - sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{%s}[1m])) + sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count[1m])) > 1 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', @@ -306,8 +306,8 @@ }, annotations: { message: ||| - Chunk memcached cluster for {{ %s }} are too small, should be at least {{ printf "%.2f" $value }}GB. - ||| % $.annotation_labels(), + Chunk memcached cluster is too small, should be at least {{ printf "%.2f" $value }}GB. + |||, }, }, { @@ -324,8 +324,8 @@ }, annotations: { message: ||| - Too many active series for ingesters in {{ %s }}, add more ingesters. - ||| % $.annotation_labels(), + Too many active series for ingesters, add more ingesters. + |||, }, }, { @@ -340,8 +340,8 @@ }, annotations: { message: ||| - High QPS for ingesters in {{ %s }}, add more ingesters. - ||| % $.annotation_labels(), + High QPS for ingesters, add more ingesters. 
+ |||, }, }, { diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index dfdecc5fe2e..65cd6cbe555 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -1,4 +1,4 @@ -(import 'alert-utils.libsonnet') { +{ groups+: [ { name: 'cortex_blocks_alerts', @@ -8,10 +8,10 @@ alert: 'CortexIngesterHasNotShippedBlocks', 'for': '15m', expr: ||| - (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"%s} > 60 * 60 * 4) + (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"} > 60 * 60 * 4) and - (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"%s} > 0) - ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"} > 0) + |||, labels: { severity: 'critical', }, @@ -24,8 +24,8 @@ alert: 'CortexIngesterHasNotShippedBlocksSinceStart', 'for': '4h', expr: ||| - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"%s} == 0 - ||| % $.namespace_matcher(','), + thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"} == 0 + |||, labels: { severity: 'critical', }, @@ -38,10 +38,10 @@ alert: 'CortexQuerierHasNotScanTheBucket', 'for': '5m', expr: ||| - (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds{%s} > 60 * 30) + (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30) and cortex_querier_blocks_last_successful_scan_timestamp_seconds{%s} > 0 - ||| % [$.namespace_matcher(''), $.namespace_matcher('')], + |||, labels: { severity: 'critical', }, @@ -57,15 +57,15 @@ expr: ||| 100 * ( ( - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count{%s}[5m])) + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) - - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0" %s}[5m])) + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0"}[5m])) ) / - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count{%s}[5m])) + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) ) > 1 - ||| % [$.namespace_matcher(''), $.namespace_matcher(','), $.namespace_matcher('')], + |||, labels: { severity: 'warning', }, @@ -78,10 +78,10 @@ alert: 'CortexStoreGatewayHasNotSyncTheBucket', 'for': '5m', expr: ||| - (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway",%s} > 60 * 30) + (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) and - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway",%s} > 0 - ||| % [$.namespace_matcher(''), $.namespace_matcher('')], + cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 + |||, labels: { severity: 'critical', }, diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index c638e10f8d8..32eb2842acc 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -1,4 +1,4 @@ -(import 'alert-utils.libsonnet') { +{ groups+: [ { name: 'cortex_compactor_alerts', @@ -8,10 +8,10 @@ alert: 'CortexCompactorHasNotSuccessfullyCleanedUpBlocks', 'for': '15m', expr: ||| - (time() - 
cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds{%s} > 60 * 60 * 24) + (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 24) and - (cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds{%s} > 0) - ||| % [$.namespace_matcher(''), $.namespace_matcher('')], + (cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0) + |||, labels: { severity: 'critical', }, @@ -24,8 +24,8 @@ alert: 'CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart', 'for': '24h', expr: ||| - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds{%s} == 0 - ||| % $.namespace_matcher(''), + cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds == 0 + |||, labels: { severity: 'critical', }, @@ -38,10 +38,10 @@ alert: 'CortexCompactorHasNotUploadedBlocks', 'for': '15m', expr: ||| - (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} > 60 * 60 * 24) + (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24) and (thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} > 0) - ||| % [$.namespace_matcher(','), $.namespace_matcher(',')], + |||, labels: { severity: 'critical', }, @@ -54,8 +54,8 @@ alert: 'CortexCompactorHasNotUploadedBlocksSinceStart', 'for': '24h', expr: ||| - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} == 0 - ||| % $.namespace_matcher(','), + thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} == 0 + |||, labels: { severity: 'critical', }, From 7df6fcdda2b70eb9a523a08bea6218720d9e82dd Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 23 Jun 2020 20:55:13 +0100 Subject: [PATCH 095/364] Always aggregate request error alert by job. Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index c22a7b07e5b..068f7e340f1 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -18,10 +18,12 @@ }, { alert: 'CortexRequestErrors', + // Note is alert_aggregation_labels is "job", this will repeat the label. But + // prometheus seems to tolerate that. expr: ||| - 100 * sum by (%s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) + 100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) / - sum y (%s, route) (rate(cortex_request_duration_seconds_count[1m])) + sum y (%s, job, route) (rate(cortex_request_duration_seconds_count[1m])) > 1 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', @@ -30,7 +32,7 @@ }, annotations: { message: ||| - {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. 
|||, }, }, From 80a9b42ba91a5b5bb8a82b025235a6b9671be1d5 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 23 Jun 2020 21:07:52 +0100 Subject: [PATCH 096/364] Update cortex-mixin/alerts/alerts.libsonnet Co-authored-by: Jacob Lisi --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 068f7e340f1..dfcf35aac55 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -23,7 +23,7 @@ expr: ||| 100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) / - sum y (%s, job, route) (rate(cortex_request_duration_seconds_count[1m])) + sum by (%s, job, route) (rate(cortex_request_duration_seconds_count[1m])) > 1 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', From 19701c2115e1149a31af0bda95d3f2ea6cdeab32 Mon Sep 17 00:00:00 2001 From: Annanay Date: Wed, 24 Jun 2020 12:51:23 +0530 Subject: [PATCH 097/364] Cleanup leftover namespace matchers Signed-off-by: Annanay --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 2 +- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 65cd6cbe555..e84bc6df90d 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -40,7 +40,7 @@ expr: ||| (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30) and - cortex_querier_blocks_last_successful_scan_timestamp_seconds{%s} > 0 + cortex_querier_blocks_last_successful_scan_timestamp_seconds > 0 |||, labels: { severity: 'critical', diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 32eb2842acc..b31ae337473 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -40,7 +40,7 @@ expr: ||| (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24) and - (thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"%s} > 0) + (thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 0) |||, labels: { severity: 'critical', From aa42cb59c4250d652246ecbfe7d53c97fd5a625f Mon Sep 17 00:00:00 2001 From: Annanay Date: Fri, 26 Jun 2020 16:10:52 +0530 Subject: [PATCH 098/364] Purge whitespace Signed-off-by: Annanay --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index dfcf35aac55..1f4d7781ff2 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -21,9 +21,9 @@ // Note is alert_aggregation_labels is "job", this will repeat the label. But // prometheus seems to tolerate that. 
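// The expression that follows is the usual error-ratio pattern: the rate of requests
// with a 5xx status code divided by the rate of all requests, grouped the same way on
// both sides; the 100 * turns the ratio into a percentage, so the > 1 threshold means
// "more than 1% of requests are failing". A standalone sketch of the same shape
// (the grouping labels here are illustrative):
{
  expr: |||
    100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m]))
      /
    sum by (job, route) (rate(cortex_request_duration_seconds_count[1m]))
      > 1
  |||,
}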
expr: ||| - 100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) + 100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) / - sum by (%s, job, route) (rate(cortex_request_duration_seconds_count[1m])) + sum by (%s, job, route) (rate(cortex_request_duration_seconds_count[1m])) > 1 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', @@ -142,7 +142,7 @@ { alert: 'CortexCacheRequestErrors', expr: ||| - 100 * sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{status_code=~"5.."}[1m])) + 100 * sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{status_code=~"5.."}[1m])) / sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count[1m])) > 1 @@ -350,8 +350,8 @@ alert: 'CortexProvisioningTooMuchMemory', expr: ||| avg by (%s) ( - container_memory_working_set_bytes{container_name="ingester"} - / + container_memory_working_set_bytes{container_name="ingester"} + / container_spec_memory_limit_bytes{container_name="ingester"} ) > 0.7 ||| % $._config.alert_aggregation_labels, @@ -375,7 +375,7 @@ expr: ||| sum by (%s) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) / - sum by (%s) (rate(cortex_prometheus_rule_evaluations_total[1m])) + sum by (%s) (rate(cortex_prometheus_rule_evaluations_total[1m])) > 0.01 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', @@ -409,7 +409,7 @@ { alert: 'CortexRulerFailedRingCheck', expr: ||| - sum by (%s) (rate(cortex_ruler_ring_check_errors_total[5m])) + sum by (%s) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0 ||| % $._config.alert_aggregation_labels, 'for': '1m', From acf46014f1741b447e3cef41f0637f3fc4f9d6e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 29 Jun 2020 14:09:48 +0200 Subject: [PATCH 099/364] Add ingester to Reads Resources. (https://github.com/grafana/cortex-jsonnet/pull/115) * Add ingester to Reads Resources. * Empty, trigger rerun of GitHub checks --- .../mimir-mixin/dashboards/reads-resources.libsonnet | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index a41d86d1a77..2b93cb808db 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -40,6 +40,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', 'querier'), ) ) + .addRow( + $.row('Ingester') + .addPanel( + $.containerCPUUsagePanel('CPU', 'ingester'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ingester'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), + ) + ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), $.row('Store-gateway') From 7b141f0269364833731294b60fe6f16c74a5d992 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Wed, 1 Jul 2020 13:32:26 +0100 Subject: [PATCH 100/364] Update restarts alert, fixes https://github.com/grafana/cortex-jsonnet/pull/117. 
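The restart check keys off process_start_time_seconds, a gauge whose value jumps to a
new timestamp whenever the process comes back up, so "number of restarts in the last
30 minutes" is really "how many times did that value change". changes() counts value
changes directly, whereas increase() is defined for counters, which is why a follow-up
commit further down in this series moves the rule onto changes(). A standalone sketch
of the rule in that later form:

    {
      alert: 'CortexIngesterRestarts',
      expr: 'changes(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1',
      labels: { severity: 'critical' },
      annotations: {
        message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.',
      },
    }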
Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 1f4d7781ff2..cf6d66711c0 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -160,13 +160,13 @@ { alert: 'CortexIngesterRestarts', expr: ||| - rate(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 0 + increase(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1 |||, labels: { severity: 'critical', }, annotations: { - message: '{{ $labels.job }}/{{ $labels.instance }} is restarting', + message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.', }, }, { From 2686325164c6b015c8a04f50738fb6bdf031e792 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 1 Jul 2020 15:26:09 +0200 Subject: [PATCH 101/364] Fixed Cortex compactor playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 212a7416cbd..54c2408df64 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -108,17 +108,17 @@ This alert fires when a Cortex store-gateway is not successfully scanning blocks How to investigate: - Look for any scan error in the store-gateway logs (ie. networking or rate limiting issues) -## CortexCompactorHasNotSuccessfullyRun +## CortexCompactorHasNotSuccessfullyCleanedUpBlocks -This alert fires when a Cortex compactor is not successfully completing a compaction run since a long time. +This alert fires when a Cortex compactor is not successfully deleting blocks marked for deletion since a long time. How to investigate: - Ensure the compactor is not crashing during compaction (ie. `OOMKilled`) -- Look for any error in the compactor logs +- Look for any error in the compactor logs (ie. bucket Delete API errors) -## CortexCompactorHasNotSuccessfullyRunSinceStart +## CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart -Same as [`CortexCompactorHasNotSuccessfullyRun`](#CortexCompactorHasNotSuccessfullyRun). +Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHasNotSuccessfullyCleanedUpBlocks). 
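A quick way to gauge how stale the cleanup is (assuming the compactor is running and being scraped) is to graph `time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds`: it should drop back towards zero after every successful run instead of growing steadily.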
## CortexCompactorHasNotUploadedBlocks From d953acd7d3a721a009ee4266a8cf165365023770 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 1 Jul 2020 16:15:28 +0200 Subject: [PATCH 102/364] Fix CortexIngesterRestarts alert --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index cf6d66711c0..3c946d11d4b 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -160,7 +160,7 @@ { alert: 'CortexIngesterRestarts', expr: ||| - increase(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1 + changes(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1 |||, labels: { severity: 'critical', From 9ce3f7d5d9e71189c6c8e543f58440a1cb3d645b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 2 Jul 2020 08:49:23 +0200 Subject: [PATCH 103/364] Update cortex-mixin/docs/playbooks.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marco Pracucci Co-authored-by: Peter Štibraný --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 54c2408df64..76f3630d8d7 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -110,7 +110,7 @@ How to investigate: ## CortexCompactorHasNotSuccessfullyCleanedUpBlocks -This alert fires when a Cortex compactor is not successfully deleting blocks marked for deletion since a long time. +This alert fires when a Cortex compactor is not successfully deleting blocks marked for deletion for a long time. How to investigate: - Ensure the compactor is not crashing during compaction (ie. `OOMKilled`) From 535b8926cd87e170e02b94797bb0d1d104ed6a7a Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 2 Jul 2020 19:57:35 +0100 Subject: [PATCH 104/364] Cortex Ruler alerts improvements Adds a set of improvements on the Cortex Ruler alerts. The addition of: - `CortexRulerNotConnectedToAlertmanagers` to know whenever a ruler is connected to _any_ Alertmanager. - `CortexRulerErrorSendingAlertsToAnyAlertmanager` to have a threshold for sending to at least _an Alertmanager. - `CortexRulerErrorSendingAlertsToSomeAlertmanagers` to have a threshold for sending to each alertmanager. - `CortexRulerNotificationQueueRunningFull` to know whenever the notification queue will be full. On top of that, I have modified the existing rule evaluation alerts to include in the aggregation the instance and rule group. Given that the ruler shards by rule group this makes it easier to identify the offending rules. Signed-off-by: gotjosh --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 76 +++++++++++++++++++-- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 3c946d11d4b..627b82cb2e5 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -370,12 +370,78 @@ { name: 'ruler_alerts', rules: [ + { + alert: 'CortexRulerNotConnectedToAlertmanagers', + expr: 'max_over_time(cortex_prometheus_notifications_alertmanagers_discovered[1m]) < 1', + 'for': '5m', + label: { + severity: 'warning', + }, + message: ||| + {{ $labels.instance }} is not connected to any Alertmanagers. 
+ |||, + }, + { + alert: 'CortexRulerErrorSendingAlertsToAnyAlertmanager', + expr: ||| + min without(alertmanager) ( + rate(cortex_prometheus_notifications_errors_total[1m]) + / + rate(cortex_prometheus_notifications_sent_total[1m]) + ) + * 100 + > 3 + |||, + 'for': '5m', + label: { + severity: 'warning', + }, + message: ||| + {{ printf "%.1f" $value }}% minimum errors while sending alerts from the Cortex Ruler {{$labels.instance}} to any Alertmanager. + |||, + }, + { + alert: 'CortexRulerErrorSendingAlertsToSomeAlertmanagers', + expr: ||| + ( + rate(cortex_prometheus_notifications_errors_total[1m]) + / + rate(cortex_prometheus_notifications_sent_total[1m]) + ) + * 100 + > 1 + |||, + 'for': '5m', + label: { + severity: 'warning', + }, + message: ||| + {{ printf "%.1f" $value }}% minimum errors while sending alerts from the Cortex Ruler {{$labels.instance}} to Alertmanager {{ $labels.alertmanager }}. + |||, + }, + { + alert: 'CortexRulerNotificationQueueRunningFull', + expr: ||| + ( + predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity[5m]) + ) + |||, + 'for': '5m', + label: { + severity: 'warning', + }, + message: ||| + Alert notification queue of Cortex Ruler {{$labels.instance}} is running full. + |||, + }, { alert: 'CortexRulerFailedEvaluations', expr: ||| - sum by (%s) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) + sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) / - sum by (%s) (rate(cortex_prometheus_rule_evaluations_total[1m])) + sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_evaluations_total[1m])) > 0.01 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', @@ -384,16 +450,16 @@ }, annotations: { message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors. + Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors for the rule group {{ $labels.rule_group }} |||, }, }, { alert: 'CortexRulerMissedEvaluations', expr: ||| - sum by (%s) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) / - sum by (%s) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) > 0.01 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', From cb20d6943102e1c05c9245b4e7fa3db7eacf1ca6 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 2 Jul 2020 20:08:13 +0100 Subject: [PATCH 105/364] Include the rule group and instance in the missed evaluation message --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 627b82cb2e5..6421d3da517 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -468,7 +468,7 @@ }, annotations: { message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% missed iterations. + Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. 
|||, }, }, From 7b724755274c7a477d8a07dff0e52e3e45a10b27 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 3 Jul 2020 12:01:22 +0100 Subject: [PATCH 106/364] Add a summary to new alets and more wordsmithing --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 6421d3da517..edc192e96f6 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -399,6 +399,9 @@ message: ||| {{ printf "%.1f" $value }}% minimum errors while sending alerts from the Cortex Ruler {{$labels.instance}} to any Alertmanager. |||, + annotations: { + summary: 'Cortex Ruler has encountered more than 3% errors sending alerts to a any Alertmanager.', + }, }, { alert: 'CortexRulerErrorSendingAlertsToSomeAlertmanagers', @@ -418,6 +421,9 @@ message: ||| {{ printf "%.1f" $value }}% minimum errors while sending alerts from the Cortex Ruler {{$labels.instance}} to Alertmanager {{ $labels.alertmanager }}. |||, + annotations: { + summary: 'Cortex Ruler has encountered more than 1% errors sending alerts to a specific Alertmanager.', + }, }, { alert: 'CortexRulerNotificationQueueRunningFull', @@ -433,8 +439,11 @@ severity: 'warning', }, message: ||| - Alert notification queue of Cortex Ruler {{$labels.instance}} is running full. + Alert notification queue of Cortex Ruler {{$labels.instance}} might run full, please investigate. |||, + annotations: { + summary: 'Cortex Ruler instance alert notification queue predicted to run full in 30m.', + }, }, { alert: 'CortexRulerFailedEvaluations', From 1f558204964041506403285939ddefe3610110ae Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 3 Jul 2020 16:18:33 +0100 Subject: [PATCH 107/364] Revert "Cortex Ruler alerts improvements" --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 87 ++------------------- 1 file changed, 6 insertions(+), 81 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index edc192e96f6..3c946d11d4b 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -370,87 +370,12 @@ { name: 'ruler_alerts', rules: [ - { - alert: 'CortexRulerNotConnectedToAlertmanagers', - expr: 'max_over_time(cortex_prometheus_notifications_alertmanagers_discovered[1m]) < 1', - 'for': '5m', - label: { - severity: 'warning', - }, - message: ||| - {{ $labels.instance }} is not connected to any Alertmanagers. - |||, - }, - { - alert: 'CortexRulerErrorSendingAlertsToAnyAlertmanager', - expr: ||| - min without(alertmanager) ( - rate(cortex_prometheus_notifications_errors_total[1m]) - / - rate(cortex_prometheus_notifications_sent_total[1m]) - ) - * 100 - > 3 - |||, - 'for': '5m', - label: { - severity: 'warning', - }, - message: ||| - {{ printf "%.1f" $value }}% minimum errors while sending alerts from the Cortex Ruler {{$labels.instance}} to any Alertmanager. 
- |||, - annotations: { - summary: 'Cortex Ruler has encountered more than 3% errors sending alerts to a any Alertmanager.', - }, - }, - { - alert: 'CortexRulerErrorSendingAlertsToSomeAlertmanagers', - expr: ||| - ( - rate(cortex_prometheus_notifications_errors_total[1m]) - / - rate(cortex_prometheus_notifications_sent_total[1m]) - ) - * 100 - > 1 - |||, - 'for': '5m', - label: { - severity: 'warning', - }, - message: ||| - {{ printf "%.1f" $value }}% minimum errors while sending alerts from the Cortex Ruler {{$labels.instance}} to Alertmanager {{ $labels.alertmanager }}. - |||, - annotations: { - summary: 'Cortex Ruler has encountered more than 1% errors sending alerts to a specific Alertmanager.', - }, - }, - { - alert: 'CortexRulerNotificationQueueRunningFull', - expr: ||| - ( - predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) - > - min_over_time(prometheus_notifications_queue_capacity[5m]) - ) - |||, - 'for': '5m', - label: { - severity: 'warning', - }, - message: ||| - Alert notification queue of Cortex Ruler {{$labels.instance}} might run full, please investigate. - |||, - annotations: { - summary: 'Cortex Ruler instance alert notification queue predicted to run full in 30m.', - }, - }, { alert: 'CortexRulerFailedEvaluations', expr: ||| - sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) + sum by (%s) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) / - sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_evaluations_total[1m])) + sum by (%s) (rate(cortex_prometheus_rule_evaluations_total[1m])) > 0.01 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', @@ -459,16 +384,16 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors for the rule group {{ $labels.rule_group }} + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors. |||, }, }, { alert: 'CortexRulerMissedEvaluations', expr: ||| - sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + sum by (%s) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) / - sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + sum by (%s) (rate(cortex_prometheus_rule_group_iterations_total[1m])) > 0.01 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', @@ -477,7 +402,7 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% missed iterations. |||, }, }, From 9722accce6442496d18b3922fa7e9d94cae08089 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 6 Jul 2020 14:44:32 +0100 Subject: [PATCH 108/364] Include instance and rule_group on rule evaluation alerts Modified the existing rule evaluation alerts to include in the aggregation the instance and rule group labels. Given that the ruler shards by rule group this makes it easier to identify the offending rules and its ruler instance. 
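For example, assuming `alert_aggregation_labels` is set to `cluster, namespace` (an illustrative value, not part of this change), the failed-evaluations expression below would render roughly as:

```
sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total[1m]))
  /
sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_evaluations_total[1m]))
  > 0.01
```

so the alert now fires per ruler instance and rule group rather than only per job.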
--- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 3c946d11d4b..d9a959532cd 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -373,9 +373,9 @@ { alert: 'CortexRulerFailedEvaluations', expr: ||| - sum by (%s) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) + sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) / - sum by (%s) (rate(cortex_prometheus_rule_evaluations_total[1m])) + sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_evaluations_total[1m])) > 0.01 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', @@ -384,16 +384,16 @@ }, annotations: { message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors. + Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors for the rule group {{ $labels.rule_group }}. |||, }, }, { alert: 'CortexRulerMissedEvaluations', expr: ||| - sum by (%s) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) / - sum by (%s) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) > 0.01 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', @@ -402,7 +402,7 @@ }, annotations: { message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% missed iterations. + Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. |||, }, }, From 8a9743d365c152ac8c22d5e4ef7d2f9060866fe5 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Mon, 6 Jul 2020 23:12:09 -0400 Subject: [PATCH 109/364] use single binary for job labels (https://github.com/grafana/cortex-jsonnet/pull/131) Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/config.libsonnet | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 8e02f3fcc4a..ab02ffaac46 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -23,13 +23,15 @@ // modify the job selectors in the dashboard queries. singleBinary: false, + // These are used by the dashboards and allow for the simultaneous display of + // microservice and single binary cortex clusters. 
job_names: { - ingester: 'ingester', - distributor: 'distributor', - querier: 'querier', - query_frontend: 'query-frontend', - table_manager: 'table-manager', - store_gateway: 'store-gateway', + ingester: '(ingester|cortex$)', + distributor: '(distributor|cortex$)', + querier: '(querier|cortex$)', + query_frontend: '(query-frontend|cortex$)', + table_manager: '(table-manager|cortex$)', + store_gateway: '(store-gateway|cortex$)', gateway: 'cortex-gw', }, From dabcbc842d6d0fe9c12a0195109272cc4fae10b2 Mon Sep 17 00:00:00 2001 From: Annanay Date: Tue, 7 Jul 2020 17:30:42 +0530 Subject: [PATCH 110/364] Update jsonnet-libs dependency Signed-off-by: Annanay --- jsonnet/mimir-mixin/jsonnetfile.lock.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jsonnet/mimir-mixin/jsonnetfile.lock.json b/jsonnet/mimir-mixin/jsonnetfile.lock.json index fe50a404cb0..6490fafd9e4 100644 --- a/jsonnet/mimir-mixin/jsonnetfile.lock.json +++ b/jsonnet/mimir-mixin/jsonnetfile.lock.json @@ -1,10 +1,10 @@ { + "version": 1, "dependencies": [ { - "name": "grafana-builder", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "grafana-builder" } }, @@ -12,15 +12,15 @@ "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" }, { - "name": "mixin-utils", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "mixin-utils" } }, - "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", - "sum": "J1iExBloZLjVEvdzHVjvP9AVTqDOJSfFOtBoeQ7EhKk=" + "version": "21b638f4e4922c0b6fde12120ed45d8ef803edc7", + "sum": "Je2SxBKu+1WrKEEG60zjSKaY/6TPX8uRz5bsaw0a8oA=" } - ] + ], + "legacyImports": false } From ba7bf72f2e558133724844ded3af03f2f39fd17a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 13 Jul 2020 10:27:46 +0200 Subject: [PATCH 111/364] Improved Cortex blocks storage playbooks Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 48 +++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 76f3630d8d7..ca93a0c5294 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -82,6 +82,15 @@ How to investigate: - Ensure the ingester is receiving write-path traffic (samples to ingest) - Look for any upload error in the ingester logs (ie. networking or authentication issues) +### Ingester hit the disk capacity + +If the ingester hit the disk capacity, any attempt to append samples will fail. You should: + +1. Increase the disk size and restart the ingester. If the ingester is running in Kubernetes with a Persistent Volume, please refers to [Resizing Persistent Volumes using Kubernetes](#resizing-persistent-volumes-using-kubernetes). +2. Investigate why the disk capacity has been hit + - Was the disk just too small? + - Was there an issue compacting TSDB head and the WAL is increasing indefinitely? + ## CortexIngesterHasNotShippedBlocksSinceStart Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks). 
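A minimal sketch of how the disk-capacity investigation above could be performed (the pod name is a placeholder and the `/data/tsdb` data directory is assumed, matching the paths used elsewhere in these playbooks):

```
# Check overall utilisation of the ingester's persistent volume
kubectl --namespace <namespace> exec ingester-7 -- df -h /data

# Check whether the WAL, rather than the blocks, is consuming most of the space
kubectl --namespace <namespace> exec ingester-7 -- sh -c 'du -sh /data/tsdb/*/wal'
```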
@@ -130,6 +139,45 @@ How to investigate: - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs +### Compactor is failing because of `not healthy index found` + +The compactor may fail to compact blocks due a corrupted block index found in one of the source blocks: + +``` +level=error ts=2020-07-12T17:35:05.516823471Z caller=compactor.go:339 component=compactor msg="failed to compact user blocks" user=REDACTED err="compaction: group 0@6672437747845546250: block with not healthy index found /data/compact/0@6672437747845546250/REDACTED; Compaction level 1; Labels: map[__org_id__:REDACTED]: 1/1183085 series have an average of 1.000 out-of-order chunks: 0.000 of these are exact duplicates (in terms of data and time range)" +``` + +When this happen you should: +1. Rename the block prefixing it with `corrupted-` so that it will be skipped by the compactor and queriers +2. Ensure the compactor has recovered +3. Investigate offline the root cause (eg. download the corrupted block and debug it locally) + +To rename a block stored on GCS you can use the `gsutil` CLI: + +``` +# Replace the placeholders: +# - BUCKET: bucket name +# - TENANT: tenant ID +# - BLOCK: block ID + +gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK +``` + ## CortexCompactorHasNotUploadedBlocksSinceStart Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). + +## Resizing Persistent Volumes using Kubernetes + +This is the short version of an extensive documentation on [how to resize Kubernetes Persistent Volumes](https://kubernetes.io/blog/2018/07/12/resizing-persistent-volumes-using-kubernetes/). + +**Pre-requisites**: + +- Running Kubernetes v1.11 or above +- The PV storage class has `allowVolumeExpansion: true` +- The PV is backed by a supported block storage volume (eg. GCP-PD, AWS-EBS, ...) + +**How to increase the volume**: + +1. Edit the PVC (persistent volume claim) `spec` for the volume to resize and **increase** `resources` > `requests` > `storage` +2. Restart the pod attached to the PVC for which the storage request has been increased From 7647dc2dd15cb70d79a663c53714262cc639f620 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 13 Jul 2020 11:11:29 +0200 Subject: [PATCH 112/364] Updated playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index ca93a0c5294..c658d646a12 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -148,7 +148,7 @@ level=error ts=2020-07-12T17:35:05.516823471Z caller=compactor.go:339 component= ``` When this happen you should: -1. Rename the block prefixing it with `corrupted-` so that it will be skipped by the compactor and queriers +1. Rename the block prefixing it with `corrupted-` so that it will be skipped by the compactor and queriers. Keep in mind that doing so the block will become invisible to the queriers too, so its series/samples will not be queried. It's safe to do it on a single block with compaction level 1 (because of the samples replication), but not on multiple overlapping level 1 blocks or any block with a compaction level > 1. 2. Ensure the compactor has recovered 3. Investigate offline the root cause (eg. 
download the corrupted block and debug it locally) From f8c0558bcaa49bcee26d4fe7257d4a4243dceb4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 15 Jul 2020 08:19:55 +0200 Subject: [PATCH 113/364] Added section on creating disk clone and how to access it. (https://github.com/grafana/cortex-jsonnet/pull/138) --- jsonnet/mimir-mixin/docs/playbooks.md | 64 +++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index c658d646a12..d6168827a5f 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -181,3 +181,67 @@ This is the short version of an extensive documentation on [how to resize Kubern 1. Edit the PVC (persistent volume claim) `spec` for the volume to resize and **increase** `resources` > `requests` > `storage` 2. Restart the pod attached to the PVC for which the storage request has been increased + +## How to create clone volume (Google Cloud specific) + +In some scenarios, it may be useful to preserve current volume status for inspection, but keep using the volume. +[Google Persistent Disk supports "Clone"](https://cloud.google.com/compute/docs/disks/add-persistent-disk#source-disk) operation that can be used to do that. +Newly cloned disk is independant from its original, and can be used for further investigation by attaching it to a new Machine / Pod. + +When using Kubernetes, here is YAML file that creates PV (`clone-ingester-7-pv`) pointing to the new disk clone (`clone-pvc-80cc0efa-4996-11ea-ba79-42010a96008c` in this example), +PVC (`clone-ingester-7-pvc`) pointing to PV, and finally Pod (`clone-ingester-7-dataaccess`) using the PVC to access the disk. + +```yaml +apiVersion: v1 +kind: PersistentVolume +metadata: + name: clone-ingester-7-pv +spec: + accessModes: + - ReadWriteOnce + capacity: + storage: 150Gi + gcePersistentDisk: + fsType: ext4 + pdName: clone-pvc-80cc0efa-4996-11ea-ba79-42010a96008c + persistentVolumeReclaimPolicy: Retain + storageClassName: fast + volumeMode: Filesystem +--- +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: clone-ingester-7-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 150Gi + storageClassName: fast + volumeName: clone-ingester-7-pv + volumeMode: Filesystem +--- +apiVersion: v1 +kind: Pod +metadata: + name: clone-ingester-7-dataaccess +spec: + containers: + - name: alpine + image: alpine:latest + command: ['sleep', 'infinity'] + volumeMounts: + - name: mypvc + mountPath: /data + resources: + requests: + cpu: 500m + memory: 1024Mi + volumes: + - name: mypvc + persistentVolumeClaim: + claimName: clone-ingester-7-pvc +``` + +After this preparation, one can use `kubectl exec -t -i clone-ingester-7-dataaccess /bin/sh` to inspect the disk mounted under `/data`. 
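A minimal end-to-end sketch of the clone procedure above, assuming GCP: the zone, the GKE-style source disk name and the manifest file name are illustrative, and only the clone disk name matches the example manifest.

```
# Clone the source persistent disk
gcloud compute disks create clone-pvc-80cc0efa-4996-11ea-ba79-42010a96008c \
  --source-disk=pvc-80cc0efa-4996-11ea-ba79-42010a96008c \
  --zone=us-central1-a

# Create the PV, PVC and data-access pod defined above, then inspect the volume
kubectl --namespace <namespace> apply -f clone-ingester-7-dataaccess.yaml
kubectl --namespace <namespace> exec -t -i clone-ingester-7-dataaccess -- ls -la /data
```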
From b19f709e85ad85b24edda16ec8e209d50fe1573d Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Fri, 17 Jul 2020 09:10:59 +0100 Subject: [PATCH 114/364] Support renamed runtime config alert As part of https://github.com/cortexproject/cortex/pull/2874 the metric was renamed, this PR renames the alert accordingly and supports both the old and the new metric name --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index d9a959532cd..a4a1e05b85a 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -95,8 +95,15 @@ // As of https://github.com/cortexproject/cortex/pull/2092, this metric is // only exposed when it is supposed to be non-zero, so we don't need to do // any special filtering on the job label. - alert: 'CortexBadOverrides', + // The metric itself was renamed in + // https://github.com/cortexproject/cortex/pull/2874 + // + // TODO: Remove deprecated metric name of + // cortex_overrides_last_reload_successful in the future + alert: 'CortexBadRuntimeConfig', expr: ||| + cortex_runtime_config_last_reload_successful == 0 + or cortex_overrides_last_reload_successful == 0 |||, 'for': '15m', @@ -105,7 +112,7 @@ }, annotations: { message: ||| - {{ $labels.job }} failed to reload overrides. + {{ $labels.job }} failed to reload runtime config. |||, }, }, From 5ea00dbb046f981cce283b8f01dc45a8bf0316b9 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 21 Jul 2020 10:46:08 +0200 Subject: [PATCH 115/364] Improved blocks storage ingester alerts Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 31 +++++++++++++++++---- jsonnet/mimir-mixin/docs/playbooks.md | 7 +++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index e84bc6df90d..e1da3ac97e8 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -4,13 +4,16 @@ name: 'cortex_blocks_alerts', rules: [ { - // Alert if the ingester has not shipped any block in the last 4h. + // Alert if the ingester has not shipped any block in the last 4h. It also checks cortex_ingester_ingested_samples_total + // to avoid false positives on ingesters not receiving any traffic yet (eg. a newly created cluster). alert: 'CortexIngesterHasNotShippedBlocks', 'for': '15m', expr: ||| - (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"} > 60 * 60 * 4) + (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 60 * 60 * 4) and - (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"} > 0) + (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 0) + and + (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) |||, labels: { severity: 'critical', @@ -20,11 +23,14 @@ }, }, { - // Alert if the ingester has not shipped any block since start. + // Alert if the ingester has not shipped any block since start. It also checks cortex_ingester_ingested_samples_total + // to avoid false positives on ingesters not receiving any traffic yet (eg. a newly created cluster). 
alert: 'CortexIngesterHasNotShippedBlocksSinceStart', 'for': '4h', expr: ||| - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"} == 0 + (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) == 0) + and + (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) |||, labels: { severity: 'critical', @@ -33,6 +39,21 @@ message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', }, }, + { + // Alert if the ingester is failing to compact TSDB head into a block, for any opened TSDB. This is a critical + // condition that should never happen. + alert: 'CortexIngesterTSDBHeadCompactionFailed', + 'for': '15m', + expr: ||| + rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.', + }, + }, { // Alert if the querier is not successfully scanning the bucket. alert: 'CortexQuerierHasNotScanTheBucket', diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index d6168827a5f..a6b18e3b3cb 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -95,6 +95,13 @@ If the ingester hit the disk capacity, any attempt to append samples will fail. Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks). +## CortexIngesterTSDBHeadCompactionFailed + +This alert fires when a Cortex ingester is failing to compact the TSDB head into a block. A TSDB instance is opened for each tenant writing at least 1 series to the ingester and its head contains the in-memory series not flushed to a block yet. If the TSDB head compaction fails it means it's failing to compact a block from the in-memory series for at least 1 tenant. + +How to investigate: +- Look for details in the ingester logs + ## CortexQuerierHasNotScanTheBucket This alert fires when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. From 11124fe3e43243576e92855ea5371bd3e97f406d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 21 Jul 2020 12:00:19 +0200 Subject: [PATCH 116/364] Updated alert comment and playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 5 +++-- jsonnet/mimir-mixin/docs/playbooks.md | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index e1da3ac97e8..6d435b68bb3 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -40,8 +40,9 @@ }, }, { - // Alert if the ingester is failing to compact TSDB head into a block, for any opened TSDB. This is a critical - // condition that should never happen. + // Alert if the ingester is failing to compact TSDB head into a block, for any opened TSDB. Once the TSDB head is + // compactable, the ingester will try to compact it every 1 minute. Repeatedly failing it is a critical condition + // that should never happen. 
alert: 'CortexIngesterTSDBHeadCompactionFailed', 'for': '15m', expr: ||| diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index a6b18e3b3cb..514b99ac1a9 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -97,7 +97,9 @@ Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks ## CortexIngesterTSDBHeadCompactionFailed -This alert fires when a Cortex ingester is failing to compact the TSDB head into a block. A TSDB instance is opened for each tenant writing at least 1 series to the ingester and its head contains the in-memory series not flushed to a block yet. If the TSDB head compaction fails it means it's failing to compact a block from the in-memory series for at least 1 tenant. +This alert fires when a Cortex ingester is failing to compact the TSDB head into a block. + +A TSDB instance is opened for each tenant writing at least 1 series to the ingester and its head contains the in-memory series not flushed to a block yet. Once the TSDB head is compactable, the ingester will try to compact it every 1 minute. If the TSDB head compaction repeatedly fails, it means it's failing to compact a block from the in-memory series for at least 1 tenant, and it's a critical condition that should be immediately investigated. How to investigate: - Look for details in the ingester logs From ec3b78d5dcf2205ca9fc0f9e4d0277714f94d924 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 21 Jul 2020 13:56:30 +0200 Subject: [PATCH 117/364] Added TSDB metrics to Cortex / Writes dashboard Signed-off-by: Marco Pracucci --- .../mimir-mixin/dashboards/writes.libsonnet | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 4b7f0508d49..6bc68a0beaf 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -163,5 +163,43 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Upload latency') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)), ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Ingester - Blocks storage - TSDB Compactions') + .addPanel( + $.successFailurePanel( + 'Compactions / sec', + 'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester)], + 'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + ), + ) + .addPanel( + $.panel('Compactions latency') + + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)), + ) + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.row('Ingester - Blocks storage - TSDB WAL truncation and checkpoints') + .addPanel( + $.successFailurePanel( + 'WAL truncations / sec', + 'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + ), + ) + .addPanel( + $.successFailurePanel( + 'Checkpoints created / sec', + 
'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + ), + ) + .addPanel( + $.panel('WAL truncations latency') + + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + + { yaxes: $.yaxes('s') }, + ) ), } From 7a51e8dca8aa77ae08958a594f980914d7b8889d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 21 Jul 2020 14:48:12 +0200 Subject: [PATCH 118/364] Added WAL corruptions to dashboard Signed-off-by: Marco Pracucci --- .../mimir-mixin/dashboards/writes.libsonnet | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 6bc68a0beaf..e2f9c977e6d 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -166,7 +166,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Ingester - Blocks storage - TSDB Compactions') + $.row('Ingester - Blocks storage - TSDB Head') .addPanel( $.successFailurePanel( 'Compactions / sec', @@ -181,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Ingester - Blocks storage - TSDB WAL truncation and checkpoints') + $.row('Ingester - Blocks storage - TSDB WAL') .addPanel( $.successFailurePanel( 'WAL truncations / sec', @@ -189,6 +189,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), ), ) + .addPanel( + $.panel('WAL truncations latency') + + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + + { yaxes: $.yaxes('s') }, + ) .addPanel( $.successFailurePanel( 'Checkpoints created / sec', @@ -197,9 +202,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('WAL truncations latency') + - $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + - { yaxes: $.yaxes('s') }, + $.panel('Corruptions / sec') + + $.queryPanel([ + 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + ], [ + 'WAL', + 'mmap-ed chunks', + ]) + + $.stack + { + yaxes: $.yaxes('ops'), + aliasColors: { + WAL: '#E24D42', + 'mmap-ed chunks': '#E28A42', + }, + }, ) ), } From 
576338887c185f2008b5fd5cb7f9bb1711a107b5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 21 Jul 2020 15:01:21 +0200 Subject: [PATCH 119/364] Clarified the WAL truncations latency includes checkpointing Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index e2f9c977e6d..fbe537a337a 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -189,11 +189,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), ), ) - .addPanel( - $.panel('WAL truncations latency') + - $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + - { yaxes: $.yaxes('s') }, - ) .addPanel( $.successFailurePanel( 'Checkpoints created / sec', @@ -201,6 +196,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), ), ) + .addPanel( + $.panel('WAL truncations latency (includes checkpointing)') + + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + + { yaxes: $.yaxes('s') }, + ) .addPanel( $.panel('Corruptions / sec') + $.queryPanel([ From 92df840d3396ba66820630471e33b1a4cce9b452 Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Tue, 28 Jul 2020 14:12:56 +0100 Subject: [PATCH 120/364] Add alert and dashboard using config file hashes This allow to monitor the roll out of new config file versions to the various nodes of a cluster. The metric was added as part of https://github.com/cortexproject/cortex/pull/2874. --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 15 +++++++++++ jsonnet/mimir-mixin/dashboards.libsonnet | 1 + .../mimir-mixin/dashboards/config.libsonnet | 26 +++++++++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 jsonnet/mimir-mixin/dashboards/config.libsonnet diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index a4a1e05b85a..04dc7b01ec3 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -91,6 +91,21 @@ |||, }, }, + { + alert: 'CortexInconsistentConfig', + expr: ||| + count(count by(%s, job, sha256) (cortex_config_hash)) without(sha256) > 1 + ||| % $._config.alert_aggregation_labels, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + An inconsistent config file hash is used across cluster {{ $labels.job }}. 
+ |||, + }, + }, { // As of https://github.com/cortexproject/cortex/pull/2092, this metric is // only exposed when it is supposed to be non-zero, so we don't need to do diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 3eacaeb6017..3016a56f5d5 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -1,5 +1,6 @@ { grafanaDashboards+: + (import 'dashboards/config.libsonnet') + (import 'dashboards/queries.libsonnet') + (import 'dashboards/reads.libsonnet') + (import 'dashboards/ruler.libsonnet') + diff --git a/jsonnet/mimir-mixin/dashboards/config.libsonnet b/jsonnet/mimir-mixin/dashboards/config.libsonnet new file mode 100644 index 00000000000..eedfcb4c393 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/config.libsonnet @@ -0,0 +1,26 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + + 'cortex-config.json': + $.dashboard('Cortex / Config') + .addClusterSelectorTemplates() + .addRow( + $.row('Startup config file') + .addPanel( + $.panel('Startup config file hashes') + + $.queryPanel('count(cortex_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + + $.stack + + { yaxes: $.yaxes('instances') }, + ) + ) + .addRow( + $.row('Runtime config file') + .addPanel( + $.panel('Runtime config file hashes') + + $.queryPanel('count(cortex_runtime_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + + $.stack + + { yaxes: $.yaxes('instances') }, + ) + ), +} From 0317d27c973a4533a7183f15dd9671e8e75c0ff4 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 29 Jul 2020 15:28:21 +0530 Subject: [PATCH 121/364] Bump up limits for ingester alerts (https://github.com/grafana/cortex-jsonnet/pull/150) Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 29 +++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index a4a1e05b85a..36d774ff6fa 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -321,9 +321,9 @@ }, { alert: 'CortexProvisioningTooManyActiveSeries', - // 1 million active series per ingester max. + // 1.5 million active series per ingester max. expr: ||| - avg by (%s) (cortex_ingester_memory_series) > 1.1e6 + avg by (%s) (cortex_ingester_memory_series) > 1.6e6 and sum by (%s) (rate(cortex_ingester_received_chunks[1h])) == 0 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], @@ -354,14 +354,33 @@ }, }, { - alert: 'CortexProvisioningTooMuchMemory', + alert: 'CortexAllocatingTooMuchMemory', expr: ||| - avg by (%s) ( + ( + container_memory_working_set_bytes{container_name="ingester"} + / + container_spec_memory_limit_bytes{container_name="ingester"} + ) > 0.5 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Too much memory being used by ingesters - add more ingesters. 
+ |||, + }, + }, + { + alert: 'CortexAllocatingTooMuchMemory', + expr: ||| + ( container_memory_working_set_bytes{container_name="ingester"} / container_spec_memory_limit_bytes{container_name="ingester"} ) > 0.7 - ||| % $._config.alert_aggregation_labels, + |||, 'for': '15m', labels: { severity: 'critical', From 8e1258feb3d538b56a819e5077cd12ff6435313e Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Wed, 29 Jul 2020 17:55:17 +0530 Subject: [PATCH 122/364] Update CortexAllocatingTooMuchMemory (https://github.com/grafana/cortex-jsonnet/pull/151) Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 36d774ff6fa..cfaf373385c 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -368,7 +368,7 @@ }, annotations: { message: ||| - Too much memory being used by ingesters - add more ingesters. + Too much memory being used by {{ $labels.instance }} - add more ingesters. |||, }, }, @@ -379,7 +379,7 @@ container_memory_working_set_bytes{container_name="ingester"} / container_spec_memory_limit_bytes{container_name="ingester"} - ) > 0.7 + ) > 0.8 |||, 'for': '15m', labels: { @@ -387,7 +387,7 @@ }, annotations: { message: ||| - Too much memory being used by ingesters - add more ingesters. + Too much memory being used by {{ $labels.instance }} - add more ingesters. |||, }, }, From 058f452007f99ff82007e3add86becd71bb6fd1e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 29 Jul 2020 16:46:11 +0200 Subject: [PATCH 123/364] Basic disaster recovery plan for blocks storage Signed-off-by: Marco Pracucci --- .../docs/blocks-storage-disaster-recovery.md | 90 +++++++++++++++++++ jsonnet/mimir-mixin/docs/playbooks.md | 9 +- 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 jsonnet/mimir-mixin/docs/blocks-storage-disaster-recovery.md diff --git a/jsonnet/mimir-mixin/docs/blocks-storage-disaster-recovery.md b/jsonnet/mimir-mixin/docs/blocks-storage-disaster-recovery.md new file mode 100644 index 00000000000..735d46a8a6d --- /dev/null +++ b/jsonnet/mimir-mixin/docs/blocks-storage-disaster-recovery.md @@ -0,0 +1,90 @@ +# Cortex blocks storage - Disaster recovery plan + +This document assumes that you are running a Cortex blocks storage cluster: + +1. Using this mixin config +2. Using GCS as object store (but similar procedures apply to other backends) + +## Recovering from a potential data loss incident + +The ingested series data that could be lost during an incident can be stored in two places: + +1. Ingesters (before blocks are shipped to the bucket) +2. Bucket + +There could be several root causes leading to a potential data loss. In this document we're going to share generic procedures that could be used as a guideline during an incident. + +### Halt the compactor + +The Cortex cluster continues to successfully operate even if the compactor is not running, except that over a long period (12+ hours) this will lead to query performance degrade. The compactor could potentially be the cause of data loss because: + +- It marks blocks for deletion (soft deletion) +- It permanently deletes blocks marked for deletion after `-compactor.deletion-delay` (hard deletion) +- It could generate corrupted compacted blocks (eg. 
due to a bug or if a source block is corrupted and the automatic checks can't detect it) + +**If you suspect the compactor could be the cause of data loss, halt it**. It can be restarted anytime later. + +When the compactor is **halted**: + +- No new blocks will be compacted +- No blocks will be deleted (soft and hard deletion) + +### Recover source blocks from ingesters + +Ingesters keep, on their persistent disk, the blocks compacted from TSDB head until the `-experimental.tsdb.retention-period` retention expires. The **default retention is 4 days**, in order to give cluster operators enough time to react in case of a data loss incident. + +The blocks retained in the ingesters can be used in case the compactor generates corrupted blocks and the source blocks, shipped from ingesters, have already been hard deleted from the bucket. + +How to manually blocks from ingesters to the bucket: + +1. Ensure [`gsutil`](https://cloud.google.com/storage/docs/gsutil) is installed in the Cortex pod. If not, [install it](#install-gsutil-in-the-cortex-pod) +2. Run `cd /data/tsdb && /path/to/gsutil -m rsync -r -x 'thanos.shipper.json|chunks_head|wal' . gs:///recovered/` + - `-m` enables parallel mode + - `-r` enables recursive rsync + - `-x ` excludes specific patterns from sync + +### Freeze ingesters persistent disk + +The blocks and WAL stored in the ingester persistent disk are the last fence of defence in case of an incident involving blocks not shipped to the bucket or corrupted blocks in the bucket. If the data integrity in the ingester's disk is at risk (eg. close to hit the TSDB retention period or close to reach max disk utilisation), you should freeze it taking a **disk snapshot**. + +To take a **GCP persistent disk snapshot**: + +1. Identify the Kubernetes PVC volume name (`kubectl get pvc -n `) of the volumes to snapshot +2. For each volume, [create a snapshot](https://console.cloud.google.com/compute/snapshotsAdd) from the GCP console ([documentation](https://cloud.google.com/compute/docs/disks/create-snapshots)) + +### Halt the ingesters + +Halting the ingesters should be the **very last resort** because of the side effects. To halt the ingesters, while preserving their disk and without disrupting the cluster write path, you need to: + +1. Create a second pool of ingesters + - Uses the functions `newIngesterStatefulSet()`, `newIngesterPdb()` +2. Wait until the second pool is up and running +3. Halt existing ingesters (scale down to 0 or delete their statefulset) + +However the **queries will return partial data**, due to all the ingested samples which have not been compacted to blocks yet. + + +## Appendix + +### Install `gsutil` in the Cortex pod + +1. Install python + ``` + apk add python3 py3-pip + ln -s /usr/bin/python3 /usr/bin/python + pip install google-compute-engine + ``` +2. Download `gsutil` + ``` + wget https://storage.googleapis.com/pub/gsutil.tar.gz + tar -zxvf gsutil.tar.gz + ./gsutil/gsutil --help + ``` +3. 
Create `/etc/boto.cfg` with the following content: + ``` + [GoogleCompute] + service_account = default + + [Plugin] + plugin_directory = /usr/lib/python3.8/site-packages/google_compute_engine/boto + ``` diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 514b99ac1a9..8e2026932c4 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -82,6 +82,8 @@ How to investigate: - Ensure the ingester is receiving write-path traffic (samples to ingest) - Look for any upload error in the ingester logs (ie. networking or authentication issues) +_If the alert `CortexIngesterTSDBHeadCompactionFailed` fired as well, then give priority to it because that could be the cause._ + ### Ingester hit the disk capacity If the ingester hit the disk capacity, any attempt to append samples will fail. You should: @@ -101,7 +103,12 @@ This alert fires when a Cortex ingester is failing to compact the TSDB head into A TSDB instance is opened for each tenant writing at least 1 series to the ingester and its head contains the in-memory series not flushed to a block yet. Once the TSDB head is compactable, the ingester will try to compact it every 1 minute. If the TSDB head compaction repeatedly fails, it means it's failing to compact a block from the in-memory series for at least 1 tenant, and it's a critical condition that should be immediately investigated. -How to investigate: +The cause triggering this alert could **lead to**: +- Ingesters run out of memory +- Ingesters run out of disk space +- Queries return partial results after `-querier.query-ingesters-within` time since the beginning of the incident + +How to **investigate**: - Look for details in the ingester logs ## CortexQuerierHasNotScanTheBucket From 6950c5a271be4d0fe6299a3d3e11592f3cfe03f2 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 29 Jul 2020 17:19:38 +0200 Subject: [PATCH 124/364] Merged 'disaster recover' into playbooks without calling it 'disaster recovery' Signed-off-by: Marco Pracucci --- .../docs/blocks-storage-disaster-recovery.md | 90 ----------- jsonnet/mimir-mixin/docs/playbooks.md | 150 ++++++++++++++---- 2 files changed, 120 insertions(+), 120 deletions(-) delete mode 100644 jsonnet/mimir-mixin/docs/blocks-storage-disaster-recovery.md diff --git a/jsonnet/mimir-mixin/docs/blocks-storage-disaster-recovery.md b/jsonnet/mimir-mixin/docs/blocks-storage-disaster-recovery.md deleted file mode 100644 index 735d46a8a6d..00000000000 --- a/jsonnet/mimir-mixin/docs/blocks-storage-disaster-recovery.md +++ /dev/null @@ -1,90 +0,0 @@ -# Cortex blocks storage - Disaster recovery plan - -This document assumes that you are running a Cortex blocks storage cluster: - -1. Using this mixin config -2. Using GCS as object store (but similar procedures apply to other backends) - -## Recovering from a potential data loss incident - -The ingested series data that could be lost during an incident can be stored in two places: - -1. Ingesters (before blocks are shipped to the bucket) -2. Bucket - -There could be several root causes leading to a potential data loss. In this document we're going to share generic procedures that could be used as a guideline during an incident. - -### Halt the compactor - -The Cortex cluster continues to successfully operate even if the compactor is not running, except that over a long period (12+ hours) this will lead to query performance degrade. 
The compactor could potentially be the cause of data loss because: - -- It marks blocks for deletion (soft deletion) -- It permanently deletes blocks marked for deletion after `-compactor.deletion-delay` (hard deletion) -- It could generate corrupted compacted blocks (eg. due to a bug or if a source block is corrupted and the automatic checks can't detect it) - -**If you suspect the compactor could be the cause of data loss, halt it**. It can be restarted anytime later. - -When the compactor is **halted**: - -- No new blocks will be compacted -- No blocks will be deleted (soft and hard deletion) - -### Recover source blocks from ingesters - -Ingesters keep, on their persistent disk, the blocks compacted from TSDB head until the `-experimental.tsdb.retention-period` retention expires. The **default retention is 4 days**, in order to give cluster operators enough time to react in case of a data loss incident. - -The blocks retained in the ingesters can be used in case the compactor generates corrupted blocks and the source blocks, shipped from ingesters, have already been hard deleted from the bucket. - -How to manually blocks from ingesters to the bucket: - -1. Ensure [`gsutil`](https://cloud.google.com/storage/docs/gsutil) is installed in the Cortex pod. If not, [install it](#install-gsutil-in-the-cortex-pod) -2. Run `cd /data/tsdb && /path/to/gsutil -m rsync -r -x 'thanos.shipper.json|chunks_head|wal' . gs:///recovered/` - - `-m` enables parallel mode - - `-r` enables recursive rsync - - `-x ` excludes specific patterns from sync - -### Freeze ingesters persistent disk - -The blocks and WAL stored in the ingester persistent disk are the last fence of defence in case of an incident involving blocks not shipped to the bucket or corrupted blocks in the bucket. If the data integrity in the ingester's disk is at risk (eg. close to hit the TSDB retention period or close to reach max disk utilisation), you should freeze it taking a **disk snapshot**. - -To take a **GCP persistent disk snapshot**: - -1. Identify the Kubernetes PVC volume name (`kubectl get pvc -n `) of the volumes to snapshot -2. For each volume, [create a snapshot](https://console.cloud.google.com/compute/snapshotsAdd) from the GCP console ([documentation](https://cloud.google.com/compute/docs/disks/create-snapshots)) - -### Halt the ingesters - -Halting the ingesters should be the **very last resort** because of the side effects. To halt the ingesters, while preserving their disk and without disrupting the cluster write path, you need to: - -1. Create a second pool of ingesters - - Uses the functions `newIngesterStatefulSet()`, `newIngesterPdb()` -2. Wait until the second pool is up and running -3. Halt existing ingesters (scale down to 0 or delete their statefulset) - -However the **queries will return partial data**, due to all the ingested samples which have not been compacted to blocks yet. - - -## Appendix - -### Install `gsutil` in the Cortex pod - -1. Install python - ``` - apk add python3 py3-pip - ln -s /usr/bin/python3 /usr/bin/python - pip install google-compute-engine - ``` -2. Download `gsutil` - ``` - wget https://storage.googleapis.com/pub/gsutil.tar.gz - tar -zxvf gsutil.tar.gz - ./gsutil/gsutil --help - ``` -3. 
Create `/etc/boto.cfg` with the following content: - ``` - [GoogleCompute] - service_account = default - - [Plugin] - plugin_directory = /usr/lib/python3.8/site-packages/google_compute_engine/boto - ``` diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 8e2026932c4..5090a5a5c8f 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -1,9 +1,13 @@ # Playbooks -This document contains playbooks, or at least a checklist of what to look for, for alerts in the cortex-mixin. -# Alerts +This document contains playbooks, or at least a checklist of what to look for, for alerts in the cortex-mixin. This document assumes that you are running a Cortex cluster: -## CortexIngesterRestarts +1. Using this mixin config +2. Using GCS as object store (but similar procedures apply to other backends) + +## Alerts + +### CortexIngesterRestarts First, check if the alert is for a single ingester or multiple. Even if the alert is only for one ingester, it's best to follow up by checking `kubectl get pods --namespace=` every few minutes, or looking at the query `rate(kube_pod_container_status_restarts_total{container="ingester"}[30m]) > 0` just until you're sure there isn't a larger issue causing multiple restarts. Next, check `kubectl get events`, with and without the addition of the `--namespace` flag, to look for node restarts or other related issues. Grep or something similar to filter the output can be useful here. The most common cause of this alert is a single cloud providers node restarting and causing the ingester on that node to be rescheduled somewhere else. @@ -20,17 +24,17 @@ If nothing obvious from the above, check for increased load: - If there is an increase in the number of active series and the memory provisioned is not enough, scale up the ingesters horizontally to have the same number of series as before per ingester. - If we had an outage and once Cortex is back up, the incoming traffic increases. (or) The clients have their Prometheus remote-write lagging and starts to send samples at a higher rate (again, an increase in traffic but in terms of number of samples). Scale up the ingester horizontally in this case too. -## CortexRequest Latency +### CortexRequest Latency First establish if the alert is for read or write latency. The alert should say. -### Write Latency +#### Write Latency Using the Cortex write dashboard, find the cluster which reported the high write latency and deduce where in the stack the latency is being introduced: distributor: It is quite normal for the distributor P99 latency to be 50-100ms, and for the ingesters to be ~5ms. If the distributor latency is higher than this, you may need to scale up the distributors. If there is a high error rate being introduced at the distributors (400s or 500s) this has been know to induce latency. ingesters: It is very unusual for ingester latency to be high, as they just write to memory. They probably needs scaling up, but it is worth investigating what is going on first. -### Read Latency +#### Read Latency Query performance is an known problem. When you get this alert, you need to work out if: (a) this is a operation issue / configuration (b) this is because of algorithms and inherently limited (c) this is a bug Using the Cortex read dashboard, find the cluster which reported the high read latency and deduce where in the stack the latency is being introduced. 
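If the read dashboards are not available, the same breakdown can be approximated with an ad-hoc query. This is only a sketch: it assumes the `cortex_request_duration_seconds` histogram exposed by the Cortex services and the `api_prom_api_v1_.+` route naming used elsewhere in this mixin; the `<cluster>` and `<namespace>` placeholders are hypothetical and must be replaced with real values.

```
# Per-job p99 latency on read routes, to spot where in the stack the latency is introduced
histogram_quantile(0.99,
  sum by (le, job) (
    rate(cortex_request_duration_seconds_bucket{cluster="<cluster>", namespace="<namespace>", route=~"api_prom_api_v1_.+"}[5m])
  )
)
```

Comparing the result across gateway, query-frontend, querier and ingesters usually shows which hop contributes most of the latency.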
@@ -43,38 +47,38 @@ If you think its provisioning / scaling is the problem, consult the scaling dash Right now most of the execution time will be spent in PromQL's innerEval. NB that the prepare (index and chunk fetch) are now interleaved with Eval, so you need to expand both to confirm if its flow execution of slow fetching. -## CortexTransferFailed +### CortexTransferFailed This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n describe pod ` -## CortexIngesterUnhealthy +### CortexIngesterUnhealthy This alert goes off when an ingester is marked as unhealthy. Check the ring web page to see which is marked as unhealthy. You could then check the logs to see if there are any related to that ingester ex: `kubectl logs -f ingester-01 --namespace=prod`. A simple way to resolve this may be to click the "Forgot" button on the ring page, especially if the pod doesn't exist anymore. It might not exist anymore because it was on a node that got shut down, so you could check to see if there are any logs related to the node that pod is/was on, ex: `kubectl get events --namespace=prod | grep cloud-provider-node`. -## CortexFlushStuck +### CortexFlushStuck @todo -## CortexLoadBalancerErrors +### CortexLoadBalancerErrors @todo -## CortexTableSyncFailure +### CortexTableSyncFailure @todo -## CortexQuerierCapacityFull +### CortexQuerierCapacityFull @todo -## CortexFrontendQueriesStuck +### CortexFrontendQueriesStuck @todo -## CortexProvisioningTooMuchMemory +### CortexProvisioningTooMuchMemory @todo -## MemcachedDown +### MemcachedDown @todo -## CortexRulerFailedRingCheck +### CortexRulerFailedRingCheck This alert occurs when a ruler is unable to validate whether or not it should claim ownership over the evaluation of a rule group. The most likely cause is that one of the rule ring entries is unhealthy. If this is the case proceed to the ring admin http page and forget the unhealth ruler. The other possible cause would be an error returned the ring client. If this is the case look into debugging the ring based on the in-use backend implementation. -## CortexIngesterHasNotShippedBlocks +### CortexIngesterHasNotShippedBlocks This alert fires when a Cortex ingester is not uploading any block to the long-term storage. An ingester is expected to upload a block to the storage every block range period (defaults to 2h) and if a longer time elapse since the last successful upload it means something is not working correctly. @@ -84,7 +88,7 @@ How to investigate: _If the alert `CortexIngesterTSDBHeadCompactionFailed` fired as well, then give priority to it because that could be the cause._ -### Ingester hit the disk capacity +#### Ingester hit the disk capacity If the ingester hit the disk capacity, any attempt to append samples will fail. You should: @@ -93,11 +97,11 @@ If the ingester hit the disk capacity, any attempt to append samples will fail. - Was the disk just too small? - Was there an issue compacting TSDB head and the WAL is increasing indefinitely? 
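A quick way to answer the questions above is to inspect the disk directly. This is only a sketch: it assumes the ingester stores its TSDB under `/data/tsdb` (as in the `gsutil` commands later in this document) and that pods follow an `ingester-<n>` naming; adjust both for your deployment.

```
# Overall disk utilisation of the ingester volume
kubectl --namespace <namespace> exec ingester-0 -- df -h /data

# Per-tenant TSDB usage, to spot a directory that keeps growing
kubectl --namespace <namespace> exec ingester-0 -- du -sh /data/tsdb/*
```

If a single tenant's directory dominates the usage and keeps growing, that points at a head compaction or WAL problem for that tenant rather than an undersized disk.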
-## CortexIngesterHasNotShippedBlocksSinceStart +### CortexIngesterHasNotShippedBlocksSinceStart Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks). -## CortexIngesterTSDBHeadCompactionFailed +### CortexIngesterTSDBHeadCompactionFailed This alert fires when a Cortex ingester is failing to compact the TSDB head into a block. @@ -111,14 +115,14 @@ The cause triggering this alert could **lead to**: How to **investigate**: - Look for details in the ingester logs -## CortexQuerierHasNotScanTheBucket +### CortexQuerierHasNotScanTheBucket This alert fires when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. How to investigate: - Look for any scan error in the querier logs (ie. networking or rate limiting issues) -## CortexQuerierHighRefetchRate +### CortexQuerierHighRefetchRate This alert fires when there's an high number of queries for which series have been refetched from a different store-gateway because of missing blocks. This could happen for a short time whenever a store-gateway ring resharding occurs (e.g. during/after an outage or while rolling out store-gateway) but store-gateways should reconcile in a short time. This alert fires if the issue persist for an unexpected long time and thus it should be investigated. @@ -126,14 +130,14 @@ How to investigate: - Ensure there are no errors related to blocks scan or sync in the queriers and store-gateways - Check store-gateway logs to see if all store-gateway have successfully completed a blocks sync -## CortexStoreGatewayHasNotSyncTheBucket +### CortexStoreGatewayHasNotSyncTheBucket This alert fires when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. How to investigate: - Look for any scan error in the store-gateway logs (ie. networking or rate limiting issues) -## CortexCompactorHasNotSuccessfullyCleanedUpBlocks +### CortexCompactorHasNotSuccessfullyCleanedUpBlocks This alert fires when a Cortex compactor is not successfully deleting blocks marked for deletion for a long time. @@ -141,11 +145,11 @@ How to investigate: - Ensure the compactor is not crashing during compaction (ie. `OOMKilled`) - Look for any error in the compactor logs (ie. bucket Delete API errors) -## CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart +### CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHasNotSuccessfullyCleanedUpBlocks). -## CortexCompactorHasNotUploadedBlocks +### CortexCompactorHasNotUploadedBlocks This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. 
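Before digging into logs, it can help to confirm whether compaction runs are failing outright or completing without uploads. A hedged starting point is below; the metric names are assumptions that may differ between Cortex/Thanos versions, and the `job` regex must match your naming scheme.

```
# Rate of failed compactor runs
sum by (namespace) (rate(cortex_compactor_runs_failed_total[1h]))

# Seconds since the compactor last successfully uploaded a block
time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"}
```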
@@ -155,7 +159,7 @@ How to investigate: - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs -### Compactor is failing because of `not healthy index found` +#### Compactor is failing because of `not healthy index found` The compactor may fail to compact blocks due a corrupted block index found in one of the source blocks: @@ -179,11 +183,74 @@ To rename a block stored on GCS you can use the `gsutil` CLI: gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ``` -## CortexCompactorHasNotUploadedBlocksSinceStart +### CortexCompactorHasNotUploadedBlocksSinceStart Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). -## Resizing Persistent Volumes using Kubernetes + +## Cortex blocks storage - What to do when things to wrong + +## Recovering from a potential data loss incident + +The ingested series data that could be lost during an incident can be stored in two places: + +1. Ingesters (before blocks are shipped to the bucket) +2. Bucket + +There could be several root causes leading to a potential data loss. In this document we're going to share generic procedures that could be used as a guideline during an incident. + +### Halt the compactor + +The Cortex cluster continues to successfully operate even if the compactor is not running, except that over a long period (12+ hours) this will lead to query performance degrade. The compactor could potentially be the cause of data loss because: + +- It marks blocks for deletion (soft deletion) +- It permanently deletes blocks marked for deletion after `-compactor.deletion-delay` (hard deletion) +- It could generate corrupted compacted blocks (eg. due to a bug or if a source block is corrupted and the automatic checks can't detect it) + +**If you suspect the compactor could be the cause of data loss, halt it**. It can be restarted anytime later. + +When the compactor is **halted**: + +- No new blocks will be compacted +- No blocks will be deleted (soft and hard deletion) + +### Recover source blocks from ingesters + +Ingesters keep, on their persistent disk, the blocks compacted from TSDB head until the `-experimental.tsdb.retention-period` retention expires. The **default retention is 4 days**, in order to give cluster operators enough time to react in case of a data loss incident. + +The blocks retained in the ingesters can be used in case the compactor generates corrupted blocks and the source blocks, shipped from ingesters, have already been hard deleted from the bucket. + +How to manually blocks from ingesters to the bucket: + +1. Ensure [`gsutil`](https://cloud.google.com/storage/docs/gsutil) is installed in the Cortex pod. If not, [install it](#install-gsutil-in-the-cortex-pod) +2. Run `cd /data/tsdb && /path/to/gsutil -m rsync -r -x 'thanos.shipper.json|chunks_head|wal' . gs:///recovered/` + - `-m` enables parallel mode + - `-r` enables recursive rsync + - `-x ` excludes specific patterns from sync + +### Freeze ingesters persistent disk + +The blocks and WAL stored in the ingester persistent disk are the last fence of defence in case of an incident involving blocks not shipped to the bucket or corrupted blocks in the bucket. If the data integrity in the ingester's disk is at risk (eg. close to hit the TSDB retention period or close to reach max disk utilisation), you should freeze it taking a **disk snapshot**. + +To take a **GCP persistent disk snapshot**: + +1. 
Identify the Kubernetes PVC volume name (`kubectl get pvc -n `) of the volumes to snapshot +2. For each volume, [create a snapshot](https://console.cloud.google.com/compute/snapshotsAdd) from the GCP console ([documentation](https://cloud.google.com/compute/docs/disks/create-snapshots)) + +### Halt the ingesters + +Halting the ingesters should be the **very last resort** because of the side effects. To halt the ingesters, while preserving their disk and without disrupting the cluster write path, you need to: + +1. Create a second pool of ingesters + - Uses the functions `newIngesterStatefulSet()`, `newIngesterPdb()` +2. Wait until the second pool is up and running +3. Halt existing ingesters (scale down to 0 or delete their statefulset) + +However the **queries will return partial data**, due to all the ingested samples which have not been compacted to blocks yet. + +## Manual procedures + +### Resizing Persistent Volumes using Kubernetes This is the short version of an extensive documentation on [how to resize Kubernetes Persistent Volumes](https://kubernetes.io/blog/2018/07/12/resizing-persistent-volumes-using-kubernetes/). @@ -198,7 +265,7 @@ This is the short version of an extensive documentation on [how to resize Kubern 1. Edit the PVC (persistent volume claim) `spec` for the volume to resize and **increase** `resources` > `requests` > `storage` 2. Restart the pod attached to the PVC for which the storage request has been increased -## How to create clone volume (Google Cloud specific) +### How to create clone volume (Google Cloud specific) In some scenarios, it may be useful to preserve current volume status for inspection, but keep using the volume. [Google Persistent Disk supports "Clone"](https://cloud.google.com/compute/docs/disks/add-persistent-disk#source-disk) operation that can be used to do that. @@ -261,3 +328,26 @@ spec: ``` After this preparation, one can use `kubectl exec -t -i clone-ingester-7-dataaccess /bin/sh` to inspect the disk mounted under `/data`. + +### Install `gsutil` in the Cortex pod + +1. Install python + ``` + apk add python3 py3-pip + ln -s /usr/bin/python3 /usr/bin/python + pip install google-compute-engine + ``` +2. Download `gsutil` + ``` + wget https://storage.googleapis.com/pub/gsutil.tar.gz + tar -zxvf gsutil.tar.gz + ./gsutil/gsutil --help + ``` +3. Create `/etc/boto.cfg` with the following content: + ``` + [GoogleCompute] + service_account = default + + [Plugin] + plugin_directory = /usr/lib/python3.8/site-packages/google_compute_engine/boto + ``` From f1d3944d982d5802c55b441f4feb254fdab41b0f Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 3 Aug 2020 11:26:57 +0200 Subject: [PATCH 125/364] Improved doc based on feedback received Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 5090a5a5c8f..6ed58d69382 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -201,13 +201,13 @@ There could be several root causes leading to a potential data loss. In this doc ### Halt the compactor -The Cortex cluster continues to successfully operate even if the compactor is not running, except that over a long period (12+ hours) this will lead to query performance degrade. 
The compactor could potentially be the cause of data loss because: +The Cortex cluster continues to successfully operate even if the compactor is not running, except that over a long period (12+ hours) this will lead to query performance degradation. The compactor could potentially be the cause of data loss because: -- It marks blocks for deletion (soft deletion) +- It marks blocks for deletion (soft deletion). _This doesn't lead to any immediate deletion, but blocks marked for deletion will be hard deleted once a delay expires._ - It permanently deletes blocks marked for deletion after `-compactor.deletion-delay` (hard deletion) - It could generate corrupted compacted blocks (eg. due to a bug or if a source block is corrupted and the automatic checks can't detect it) -**If you suspect the compactor could be the cause of data loss, halt it**. It can be restarted anytime later. +**If you suspect the compactor could be the cause of data loss, halt it** (delete the statefulset or scale down the replicas to 0). It can be restarted anytime later. When the compactor is **halted**: @@ -223,10 +223,12 @@ The blocks retained in the ingesters can be used in case the compactor generates How to manually blocks from ingesters to the bucket: 1. Ensure [`gsutil`](https://cloud.google.com/storage/docs/gsutil) is installed in the Cortex pod. If not, [install it](#install-gsutil-in-the-cortex-pod) -2. Run `cd /data/tsdb && /path/to/gsutil -m rsync -r -x 'thanos.shipper.json|chunks_head|wal' . gs:///recovered/` +2. Run `cd /data/tsdb && /path/to/gsutil -m rsync -n -r -x 'thanos.shipper.json|chunks_head|wal' . gs:///recovered/` + - `-n` enabled the **dry run** (remove it once you've verified the output matches your expectations) - `-m` enables parallel mode - `-r` enables recursive rsync - - `-x ` excludes specific patterns from sync + - `-x ` excludes specific patterns from sync (no WAL or shipper metadata file should be uploaded to the bucket) + - Don't use `-d` (dangerous) because it will delete from the bucket any block which is not in the local filesystem ### Freeze ingesters persistent disk From afbce48c09fe2c28ac9451269f7b5d5dbd480091 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 3 Aug 2020 12:34:11 +0200 Subject: [PATCH 126/364] Documented what happens with a StatefulSet is deleted Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 6ed58d69382..fb408b9d82b 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -353,3 +353,12 @@ After this preparation, one can use `kubectl exec -t -i clone-ingester-7-dataacc [Plugin] plugin_directory = /usr/lib/python3.8/site-packages/google_compute_engine/boto ``` + +### Deleting a StatefulSet with persistent volumes + +When you delete a Kubernetes StatefulSet whose pods have persistent volume claims (PVC), the PVCs are not automatically deleted. This means that if the StatefulSet is recreated, the pods for which there was already a PVC will get the volume mounted previously. + +A PVC can be manually deleted by an operator. 
When a PVC claim is deleted, what happens to the volume depends on its [Reclaim Policy](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming): + +- `Retain`: the volume will not be deleted until the PV resource will be manually deleted from Kubernetes +- `Delete`: the volume will be automatically deleted From 11aa1213b5abe0f911c96af519113d7118a2280e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 7 Aug 2020 10:23:04 +0200 Subject: [PATCH 127/364] Cleaned up blocks storage config Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts.libsonnet | 2 +- jsonnet/mimir-mixin/config.libsonnet | 6 +++--- jsonnet/mimir-mixin/dashboards.libsonnet | 4 ++-- .../mimir-mixin/dashboards/queries.libsonnet | 6 +++--- .../dashboards/reads-resources.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 18 +++++++++--------- .../mimir-mixin/dashboards/writes.libsonnet | 6 +++--- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index c6e52da8973..8ac22e59ee6 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -2,7 +2,7 @@ prometheusAlerts+:: (import 'alerts/alerts.libsonnet') + - (if std.setMember('tsdb', $._config.storage_engine) + (if std.setMember('blocks', $._config.storage_engine) then (import 'alerts/blocks.libsonnet') + (import 'alerts/compactor.libsonnet') diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index ab02ffaac46..92850327a45 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -4,9 +4,9 @@ _config+:: { // Switch for overall storage engine. - // May contain 'chunks', 'tsdb' or both. - // Enables chunks- or tsdb- specific panels and dashboards. - storage_engine: ['chunks', 'tsdb'], + // May contain 'chunks', 'blocks' or both. + // Enables chunks- or blocks- specific panels and dashboards. + storage_engine: ['chunks', 'blocks'], // For chunks backend, switch for chunk index type. // May contain 'bigtable', 'dynamodb' or 'cassandra'. 
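As an illustration of how this switch is meant to be consumed, a downstream installation could pin the engine list so that only the relevant dashboards and alerts are rendered. This is a hedged sketch: the `mixin.libsonnet` entry point and the `_config` field follow the conventions shown above, but the exact import path depends on how the mixin is vendored.

```
// Hypothetical override: render blocks-storage panels and alerts only.
(import 'mixin.libsonnet') + {
  _config+:: {
    storage_engine: ['blocks'],  // use ['chunks', 'blocks'] while migrating
  },
}
```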
diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 3016a56f5d5..bf4dca8276d 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -7,7 +7,7 @@ (import 'dashboards/scaling.libsonnet') + (import 'dashboards/writes.libsonnet') + - (if std.setMember('tsdb', $._config.storage_engine) + (if std.setMember('blocks', $._config.storage_engine) then (import 'dashboards/compactor.libsonnet') + (import 'dashboards/compactor-resources.libsonnet') + @@ -18,7 +18,7 @@ then import 'dashboards/chunks.libsonnet' else {}) + - (if std.setMember('tsdb', $._config.storage_engine) + (if std.setMember('blocks', $._config.storage_engine) && std.setMember('chunks', $._config.storage_engine) then import 'dashboards/comparison.libsonnet' else {}) + diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index a7a6f491f75..9cdeec00725 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -144,7 +144,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('Store-gateway - Blocks') .addPanel( $.panel('Blocks queried / sec') + @@ -165,7 +165,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('') .addPanel( $.panel('Series fetch duration (per request)') + @@ -181,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('') .addPanel( $.panel('Blocks currently loaded') + diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index 2b93cb808db..de87aeaece4 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -53,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('Store-gateway') .addPanel( $.containerCPUUsagePanel('CPU', 'store-gateway'), diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index b6081bd017c..d554baad2df 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -60,7 +60,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('Store-gateway') .addPanel( $.panel('QPS') + @@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('Memcached – Blocks Storage – Index header (Store-gateway)') .addPanel( $.panel('QPS') + @@ -115,15 +115,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.thanosMemcachedCache('Memcached – Blocks Storage – Chunks (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache') ) 
.addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.thanosMemcachedCache('Memcached – Blocks Storage – Metadada (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache') ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.thanosMemcachedCache('Memcached – Blocks Storage – Metadada (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache') ) .addRowIf( @@ -180,20 +180,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) // Object store metrics for the store-gateway. .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.objectStorePanels1('Store-gateway - Blocks Object Store', 'store-gateway'), ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.objectStorePanels2('', 'store-gateway'), ) // Object store metrics for the querier. .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.objectStorePanels1('Querier - Blocks Object Store', 'querier'), ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.objectStorePanels2('', 'querier'), ), } diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index fbe537a337a..98aff618439 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -150,7 +150,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('Ingester - Blocks storage - Shipper') .addPanel( $.successFailurePanel( @@ -165,7 +165,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('Ingester - Blocks storage - TSDB Head') .addPanel( $.successFailurePanel( @@ -180,7 +180,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('tsdb', $._config.storage_engine), + std.setMember('blocks', $._config.storage_engine), $.row('Ingester - Blocks storage - TSDB WAL') .addPanel( $.successFailurePanel( From 6a203cad440305db38c0ae1123777107bda02772 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 7 Aug 2020 11:11:56 +0200 Subject: [PATCH 128/364] Fixed std.setMember() usage Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards.libsonnet | 8 ++-- .../mimir-mixin/dashboards/queries.libsonnet | 12 +++--- .../dashboards/reads-resources.libsonnet | 2 +- .../mimir-mixin/dashboards/reads.libsonnet | 38 +++++++++---------- .../mimir-mixin/dashboards/writes.libsonnet | 24 ++++++------ 6 files changed, 43 insertions(+), 43 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 8ac22e59ee6..9369a7da57a 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -2,7 +2,7 @@ prometheusAlerts+:: (import 'alerts/alerts.libsonnet') + - (if std.setMember('blocks', $._config.storage_engine) + (if std.member($._config.storage_engine, 'blocks') then (import 'alerts/blocks.libsonnet') + (import 'alerts/compactor.libsonnet') diff --git 
a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index bf4dca8276d..06e739b776d 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -7,19 +7,19 @@ (import 'dashboards/scaling.libsonnet') + (import 'dashboards/writes.libsonnet') + - (if std.setMember('blocks', $._config.storage_engine) + (if std.member($._config.storage_engine, 'blocks') then (import 'dashboards/compactor.libsonnet') + (import 'dashboards/compactor-resources.libsonnet') + (import 'dashboards/object-store.libsonnet') else {}) + - (if std.setMember('chunks', $._config.storage_engine) + (if std.member($._config.storage_engine, 'chunks') then import 'dashboards/chunks.libsonnet' else {}) + - (if std.setMember('blocks', $._config.storage_engine) - && std.setMember('chunks', $._config.storage_engine) + (if std.member($._config.storage_engine, 'blocks') + && std.member($._config.storage_engine, 'chunks') then import 'dashboards/comparison.libsonnet' else {}) + diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 9cdeec00725..e8fb81c734c 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -66,7 +66,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine), + std.member($._config.storage_engine, 'chunks'), $.row('Querier - Chunks storage - Index Cache') .addPanel( $.panel('Total entries') + @@ -101,7 +101,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine), + std.member($._config.storage_engine, 'chunks'), $.row('Querier - Chunks storage - Store') .addPanel( $.panel('Index Lookups per Query') + @@ -125,7 +125,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine), + std.member($._config.storage_engine, 'chunks'), $.row('Querier - Blocks storage') .addPanel( $.panel('Number of store-gateways hit per Query') + @@ -144,7 +144,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway - Blocks') .addPanel( $.panel('Blocks queried / sec') + @@ -165,7 +165,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( $.panel('Series fetch duration (per request)') + @@ -181,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( $.panel('Blocks currently loaded') + diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index de87aeaece4..6dd8d0cef9d 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -53,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( $.containerCPUUsagePanel('CPU', 'store-gateway'), diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet 
b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index d554baad2df..7af070e4478 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -60,7 +60,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( $.panel('QPS') + @@ -72,7 +72,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine), + std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Index') .addPanel( $.panel('QPS') + @@ -84,7 +84,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine), + std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Chunks') .addPanel( $.panel('QPS') + @@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks Storage – Index header (Store-gateway)') .addPanel( $.panel('QPS') + @@ -115,20 +115,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache('Memcached – Blocks Storage – Chunks (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache') ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache('Memcached – Blocks Storage – Metadada (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache') ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache('Memcached – Blocks Storage – Metadada (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache') ) .addRowIf( - std.setMember('chunks', $._config.storage_engine) && - std.setMember('cassandra', $._config.chunk_index_backend + $._config.chunk_store_backend), + std.member($._config.storage_engine, 'chunks') && + std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( $.panel('QPS') + @@ -140,8 +140,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine) && - std.setMember('bigtable', $._config.chunk_index_backend + $._config.chunk_store_backend), + std.member($._config.storage_engine, 'chunks') && + std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( $.panel('QPS') + @@ -153,8 +153,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addRowIf( - std.setMember('chunks', $._config.storage_engine) && - std.setMember('dynamodb', $._config.chunk_index_backend + $._config.chunk_store_backend), + std.member($._config.storage_engine, 'chunks') && + std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( $.panel('QPS') + @@ -166,8 +166,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addRowIf( - std.setMember('chunks', $._config.storage_engine) && - std.setMember('gcs', $._config.chunk_store_backend), + std.member($._config.storage_engine, 'chunks') && + 
std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( $.panel('QPS') + @@ -180,20 +180,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) // Object store metrics for the store-gateway. .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.objectStorePanels1('Store-gateway - Blocks Object Store', 'store-gateway'), ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.objectStorePanels2('', 'store-gateway'), ) // Object store metrics for the querier. .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.objectStorePanels1('Querier - Blocks Object Store', 'querier'), ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.objectStorePanels2('', 'querier'), ), } diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 98aff618439..88978e2308c 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -86,7 +86,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine), + std.member($._config.storage_engine, 'chunks'), $.row('Memcached') .addPanel( $.panel('QPS') + @@ -98,8 +98,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine) && - std.setMember('cassandra', $._config.chunk_index_backend + $._config.chunk_store_backend), + std.member($._config.storage_engine, 'chunks') && + std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( $.panel('QPS') + @@ -111,8 +111,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine) && - std.setMember('bigtable', $._config.chunk_index_backend + $._config.chunk_store_backend), + std.member($._config.storage_engine, 'chunks') && + std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( $.panel('QPS') + @@ -124,8 +124,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine) && - std.setMember('dynamodb', $._config.chunk_index_backend + $._config.chunk_store_backend), + std.member($._config.storage_engine, 'chunks') && + std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( $.panel('QPS') + @@ -137,8 +137,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('chunks', $._config.storage_engine) && - std.setMember('gcs', $._config.chunk_store_backend), + std.member($._config.storage_engine, 'chunks') && + std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( $.panel('QPS') + @@ -150,7 +150,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - Shipper') .addPanel( $.successFailurePanel( @@ -165,7 +165,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - TSDB Head') 
.addPanel( $.successFailurePanel( @@ -180,7 +180,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.setMember('blocks', $._config.storage_engine), + std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - TSDB WAL') .addPanel( $.successFailurePanel( From 4160d4d4c320f180ed65c2f82deda9570c0bd63d Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Fri, 7 Aug 2020 11:15:36 +0100 Subject: [PATCH 129/364] Add playbook entry for sample with repeated timestamp --- jsonnet/mimir-mixin/docs/playbooks.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index fb408b9d82b..46c5a6b04b1 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -1,6 +1,6 @@ # Playbooks -This document contains playbooks, or at least a checklist of what to look for, for alerts in the cortex-mixin. This document assumes that you are running a Cortex cluster: +This document contains playbooks, or at least a checklist of what to look for, for alerts in the cortex-mixin and logs from Cortex. This document assumes that you are running a Cortex cluster: 1. Using this mixin config 2. Using GCS as object store (but similar procedures apply to other backends) @@ -362,3 +362,16 @@ A PVC can be manually deleted by an operator. When a PVC claim is deleted, what - `Retain`: the volume will not be deleted until the PV resource will be manually deleted from Kubernetes - `Delete`: the volume will be automatically deleted + + +## Log lines + +### Log line containing 'sample with repeated timestamp but different value' + +This means a sample with the same timestamp as an existing one was received with a different value. The number of occurrences is recorded in the `prometheus_target_scrapes_sample_out_of_order_total` metric. + +Possible reasons for this are: +- Multiple agents are scraping the same app without deduplication in place. Check the IP addresses mentioned in the log line for the agent that returned the deplicate sample. Change the labels of each sample generated per agent so they are unique. +- Incorrect relabelling rules can cause a label to be dropped from a sample so that multiple samples have the same labels. If these samples were collected at the same time they will cause this error. +- The exporter being scraped sets the same timestamp on every scrape. Note that exporters should generally not set timestamps. +- Prometheus scrapes at the millisecond level. If the scrapes are done very quickly the same sample could be returned. This is very unlikely. From d6cafc183fc97baeb43546297b1684514beda091 Mon Sep 17 00:00:00 2001 From: MichelHollands <42814411+MichelHollands@users.noreply.github.com> Date: Fri, 7 Aug 2020 11:59:59 +0100 Subject: [PATCH 130/364] Update cortex-mixin/docs/playbooks.md Co-authored-by: Jack Baldry --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 46c5a6b04b1..087a3b5e42c 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -371,7 +371,7 @@ A PVC can be manually deleted by an operator. When a PVC claim is deleted, what This means a sample with the same timestamp as an existing one was received with a different value. The number of occurrences is recorded in the `prometheus_target_scrapes_sample_out_of_order_total` metric. 
Possible reasons for this are: -- Multiple agents are scraping the same app without deduplication in place. Check the IP addresses mentioned in the log line for the agent that returned the deplicate sample. Change the labels of each sample generated per agent so they are unique. +- Multiple agents are scraping the same app without deduplication in place. Check the IP addresses mentioned in the log line for the agent that returned the duplicate sample. Change the labels of each sample generated per agent so they are unique. - Incorrect relabelling rules can cause a label to be dropped from a sample so that multiple samples have the same labels. If these samples were collected at the same time they will cause this error. - The exporter being scraped sets the same timestamp on every scrape. Note that exporters should generally not set timestamps. - Prometheus scrapes at the millisecond level. If the scrapes are done very quickly the same sample could be returned. This is very unlikely. From dfbd87006a675c76371bb5b1c4610f2fb16f061a Mon Sep 17 00:00:00 2001 From: MichelHollands <42814411+MichelHollands@users.noreply.github.com> Date: Mon, 10 Aug 2020 09:02:03 +0100 Subject: [PATCH 131/364] Update cortex-mixin/docs/playbooks.md Co-authored-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 087a3b5e42c..dfa0f00a53e 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -368,7 +368,7 @@ A PVC can be manually deleted by an operator. When a PVC claim is deleted, what ### Log line containing 'sample with repeated timestamp but different value' -This means a sample with the same timestamp as an existing one was received with a different value. The number of occurrences is recorded in the `prometheus_target_scrapes_sample_out_of_order_total` metric. +This means a sample with the same timestamp as the latest one was received with a different value. The number of occurrences is recorded in the `cortex_discarded_samples_total` metric with the label `reason="new-value-for-timestamp"`. Possible reasons for this are: - Multiple agents are scraping the same app without deduplication in place. Check the IP addresses mentioned in the log line for the agent that returned the duplicate sample. Change the labels of each sample generated per agent so they are unique. From f53aaee14fa25016dd311311765183c5e0edb868 Mon Sep 17 00:00:00 2001 From: MichelHollands <42814411+MichelHollands@users.noreply.github.com> Date: Mon, 10 Aug 2020 09:18:33 +0100 Subject: [PATCH 132/364] Update cortex-mixin/docs/playbooks.md Co-authored-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index dfa0f00a53e..773aa16e11c 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -371,7 +371,7 @@ A PVC can be manually deleted by an operator. When a PVC claim is deleted, what This means a sample with the same timestamp as the latest one was received with a different value. The number of occurrences is recorded in the `cortex_discarded_samples_total` metric with the label `reason="new-value-for-timestamp"`. Possible reasons for this are: -- Multiple agents are scraping the same app without deduplication in place. 
Check the IP addresses mentioned in the log line for the agent that returned the duplicate sample. Change the labels of each sample generated per agent so they are unique. +- Multiple Prometheus servers / Grafana agents are scraping the same target without deduplication in place. Check the IP addresses mentioned in the log line for the agent that returned the duplicate sample. Change the labels of each sample generated per agent so they are unique. - Incorrect relabelling rules can cause a label to be dropped from a sample so that multiple samples have the same labels. If these samples were collected at the same time they will cause this error. - The exporter being scraped sets the same timestamp on every scrape. Note that exporters should generally not set timestamps. - Prometheus scrapes at the millisecond level. If the scrapes are done very quickly the same sample could be returned. This is very unlikely. From 6591520bbffdb8f3613a4c3a92c9de77a84c8342 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Mon, 10 Aug 2020 09:26:06 +0100 Subject: [PATCH 133/364] Address review comments --- jsonnet/mimir-mixin/docs/playbooks.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 773aa16e11c..570c9e8df49 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -372,6 +372,5 @@ This means a sample with the same timestamp as the latest one was received with Possible reasons for this are: - Multiple Prometheus servers / Grafana agents are scraping the same target without deduplication in place. Check the IP addresses mentioned in the log line for the agent that returned the duplicate sample. Change the labels of each sample generated per agent so they are unique. -- Incorrect relabelling rules can cause a label to be dropped from a sample so that multiple samples have the same labels. If these samples were collected at the same time they will cause this error. +- Incorrect relabelling rules can cause a label to be dropped from a sample so that multiple samples have the same labels. If these samples were collected from the same target they will have the same timestamp. An example is dropping the `cpu` label when there are multiple cpus. - The exporter being scraped sets the same timestamp on every scrape. Note that exporters should generally not set timestamps. -- Prometheus scrapes at the millisecond level. If the scrapes are done very quickly the same sample could be returned. This is very unlikely. From 0f6c578834a4b9411342135263aeb4b66611a352 Mon Sep 17 00:00:00 2001 From: MichelHollands <42814411+MichelHollands@users.noreply.github.com> Date: Mon, 10 Aug 2020 15:11:00 +0100 Subject: [PATCH 134/364] Update cortex-mixin/docs/playbooks.md Co-authored-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 570c9e8df49..fa0f397965a 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -372,5 +372,5 @@ This means a sample with the same timestamp as the latest one was received with Possible reasons for this are: - Multiple Prometheus servers / Grafana agents are scraping the same target without deduplication in place. Check the IP addresses mentioned in the log line for the agent that returned the duplicate sample. 
Change the labels of each sample generated per agent so they are unique. -- Incorrect relabelling rules can cause a label to be dropped from a sample so that multiple samples have the same labels. If these samples were collected from the same target they will have the same timestamp. An example is dropping the `cpu` label when there are multiple cpus. +- Incorrect relabelling rules can cause a label to be dropped from a series so that multiple series have the same labels. If these series were collected from the same target they will have the same timestamp. - The exporter being scraped sets the same timestamp on every scrape. Note that exporters should generally not set timestamps. From 9b8fda85b2b29227b6545dbe07e67eab81bee2fc Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Mon, 10 Aug 2020 15:39:04 +0100 Subject: [PATCH 135/364] Remove unlikely reason for duplicate sample --- jsonnet/mimir-mixin/docs/playbooks.md | 1 - 1 file changed, 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index fa0f397965a..e5358605fe4 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -371,6 +371,5 @@ A PVC can be manually deleted by an operator. When a PVC claim is deleted, what This means a sample with the same timestamp as the latest one was received with a different value. The number of occurrences is recorded in the `cortex_discarded_samples_total` metric with the label `reason="new-value-for-timestamp"`. Possible reasons for this are: -- Multiple Prometheus servers / Grafana agents are scraping the same target without deduplication in place. Check the IP addresses mentioned in the log line for the agent that returned the duplicate sample. Change the labels of each sample generated per agent so they are unique. - Incorrect relabelling rules can cause a label to be dropped from a series so that multiple series have the same labels. If these series were collected from the same target they will have the same timestamp. - The exporter being scraped sets the same timestamp on every scrape. Note that exporters should generally not set timestamps. From 83e33cd3608b678c0cd7e3b361d5eae3aa6101b9 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Mon, 10 Aug 2020 14:32:44 -0400 Subject: [PATCH 136/364] Added per instance gateway writes panel Signed-off-by: Joe Elliott --- jsonnet/mimir-mixin/config.libsonnet | 3 +++ .../dashboards/dashboard-utils.libsonnet | 15 ++++++++++++++- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 8 ++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 92850327a45..285ee21f207 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -42,5 +42,8 @@ // Whether resources dashboards are enabled (based on cAdvisor metrics). resources_dashboards_enabled: false, + + // If supplied adds additional panels that are broken down per instance. i.e. 
'pod' in a kubernetes install + per_instance_label: '', }, } diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 291ae3f5480..52c6c567b4e 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -38,13 +38,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addMultiTemplate('namespace', 'cortex_build_info', 'namespace'), }, - // The ,ixin allow specialism of the job selector depending on if its a single binary + row(title):: + super.row(title) + { + addPanelIf(condition, panel):: + if condition + then self.addPanel(panel) + else self, + }, + + // The mixin allow specialism of the job selector depending on if its a single binary // deployment or a namespaced one. jobMatcher(job):: if $._config.singleBinary then 'job=~"$job"' else 'cluster=~"$cluster", job=~"($namespace)/%s"' % job, + jobMatcherEquality(job):: + if $._config.singleBinary + then 'job=~"$job"' + else 'cluster="$cluster", namespace="$namespace", job=~"($namespace)/%s"' % job, + namespaceMatcher():: if $._config.singleBinary then 'job=~"$job"' diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 88978e2308c..bfe920a21df 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -40,6 +40,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.eq('route', 'api_prom_push')]) ) + .addPanelIf( + $._config.per_instance_label != '', + $.panel('Per %s Latency' % $._config.per_instance_label) + + $.queryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="api_prom_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' + ) + + { yaxes: $.yaxes('s'), legend: { show: false }, fill: 0, tooltip: { sort: 2 } } + ) ) .addRow( $.row('Distributor') From 3f4db3668bcc639587a515da950b9cea4d259fa6 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Mon, 10 Aug 2020 14:38:52 -0400 Subject: [PATCH 137/364] Consolidated panel logic Signed-off-by: Joe Elliott --- .../mimir-mixin/dashboards/dashboard-utils.libsonnet | 10 ++++++++++ jsonnet/mimir-mixin/dashboards/writes.libsonnet | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 52c6c567b4e..c67e6553051 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -78,6 +78,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], }, + // hiddenLegendQueryPanel is a standard query panel designed to handle a large number of series. 
it hides the legend, doesn't fill the series and + // sorts the tooltip descending + hiddenLegendQueryPanel(queries, legends, legendLink=null):: + $.queryPanel(queries, legends, legendLink) + + { + legend: { show: false }, + fill: 0, + tooltip: { sort: 2 }, + }, + qpsPanel(selector):: super.qpsPanel(selector) + { targets: [ diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index bfe920a21df..0116efe5c4f 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -42,11 +42,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanelIf( $._config.per_instance_label != '', - $.panel('Per %s Latency' % $._config.per_instance_label) + - $.queryPanel( + $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="api_prom_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' ) + - { yaxes: $.yaxes('s'), legend: { show: false }, fill: 0, tooltip: { sort: 2 } } + { yaxes: $.yaxes('s') } ) ) .addRow( From 8ddac81ec52477e54731618a7522c5d3734b7d69 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Mon, 10 Aug 2020 14:54:24 -0400 Subject: [PATCH 138/364] Added per pod writes to distributors and ingesters Signed-off-by: Joe Elliott --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 0116efe5c4f..1021da4ec24 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -59,6 +59,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/httpgrpc.*|api_prom_push')]) ) + .addPanelIf( + $._config.per_instance_label != '', + $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_prom_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.distributor)], '' + ) + + { yaxes: $.yaxes('s') } + ) ) .addRow( $.row('KV Store (HA Dedupe)') @@ -81,6 +89,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) + .addPanelIf( + $._config.per_instance_label != '', + $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.ingester)], '' + ) + + { yaxes: $.yaxes('s') } + ) ) .addRow( $.row('KV Store (Ring)') From f75166b60e1c1fbcd563815bd9a8672950abd8c4 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Mon, 10 Aug 2020 15:07:33 -0400 Subject: [PATCH 139/364] Added per pod metrics to read dashboard Signed-off-by: Joe Elliott --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 
.../mimir-mixin/dashboards/reads.libsonnet | 40 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 285ee21f207..78851e103b5 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -44,6 +44,6 @@ resources_dashboards_enabled: false, // If supplied adds additional panels that are broken down per instance. i.e. 'pod' in a kubernetes install - per_instance_label: '', + per_instance_label: 'pod', }, } diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 7af070e4478..444e8aa31be 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -14,6 +14,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) + .addPanelIf( + $._config.per_instance_label != '', + $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' + ) + + { yaxes: $.yaxes('s') } + ) ) .addRow( $.row('Query Frontend') @@ -25,6 +33,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) + .addPanelIf( + $._config.per_instance_label != '', + $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.query_frontend)], '' + ) + + { yaxes: $.yaxes('s') } + ) ) .addRow( $.row('Cache - Query Results') @@ -47,6 +63,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) + .addPanelIf( + $._config.per_instance_label != '', + $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.querier)], '' + ) + + { yaxes: $.yaxes('s') } + ) ) .addRow( $.row('Ingester') @@ -58,6 +82,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) + .addPanelIf( + $._config.per_instance_label != '', + $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, 
route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.ingester)], '' + ) + + { yaxes: $.yaxes('s') } + ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), @@ -70,6 +102,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) ) + .addPanelIf( + $._config.per_instance_label != '', + $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.hiddenLegendQueryPanel( + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.store_gateway)], '' + ) + + { yaxes: $.yaxes('s') } + ) ) .addRowIf( std.member($._config.storage_engine, 'chunks'), From c49dc2f5f4b8c75d7f1f017aa3e160ca7fcabec6 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Mon, 10 Aug 2020 17:23:12 -0400 Subject: [PATCH 140/364] lint Signed-off-by: Joe Elliott --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index c67e6553051..0137175dfcd 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -78,10 +78,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], }, - // hiddenLegendQueryPanel is a standard query panel designed to handle a large number of series. it hides the legend, doesn't fill the series and + // hiddenLegendQueryPanel is a standard query panel designed to handle a large number of series. it hides the legend, doesn't fill the series and // sorts the tooltip descending hiddenLegendQueryPanel(queries, legends, legendLink=null):: - $.queryPanel(queries, legends, legendLink) + + $.queryPanel(queries, legends, legendLink) + { legend: { show: false }, fill: 0, From 9061500e0bea34b29d486c737582a066d9eafdda Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 11 Aug 2020 10:52:38 -0400 Subject: [PATCH 141/364] Set default per_instance_label to empty Signed-off-by: Joe Elliott --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 78851e103b5..285ee21f207 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -44,6 +44,6 @@ resources_dashboards_enabled: false, // If supplied adds additional panels that are broken down per instance. i.e. 
'pod' in a kubernetes install - per_instance_label: 'pod', + per_instance_label: '', }, } From ae533430d5394a727862820bbae7a82b5eac0c3a Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 11 Aug 2020 11:43:50 -0400 Subject: [PATCH 142/364] Removed configurability Signed-off-by: Joe Elliott --- jsonnet/mimir-mixin/config.libsonnet | 4 ++-- .../dashboards/dashboard-utils.libsonnet | 11 +++-------- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 15 +++++---------- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 9 +++------ 4 files changed, 13 insertions(+), 26 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 285ee21f207..08336b3c70a 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -43,7 +43,7 @@ // Whether resources dashboards are enabled (based on cAdvisor metrics). resources_dashboards_enabled: false, - // If supplied adds additional panels that are broken down per instance. i.e. 'pod' in a kubernetes install - per_instance_label: '', + // Used on panels that show metrics per instance. i.e. 'pod' in a kubernetes install + per_instance_label: 'pod', }, } diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 0137175dfcd..aa1e68ceb66 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -38,14 +38,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addMultiTemplate('namespace', 'cortex_build_info', 'namespace'), }, - row(title):: - super.row(title) + { - addPanelIf(condition, panel):: - if condition - then self.addPanel(panel) - else self, - }, - // The mixin allow specialism of the job selector depending on if its a single binary // deployment or a namespaced one. jobMatcher(job):: @@ -53,6 +45,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; then 'job=~"$job"' else 'cluster=~"$cluster", job=~"($namespace)/%s"' % job, + // jobMatcherEquality performs exact matches on cluster and namespace. Should be used on + // panels that are expected to return too many series to be useful when multiplier + // namespaces or clusters are selected. 
jobMatcherEquality(job):: if $._config.singleBinary then 'job=~"$job"' diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 444e8aa31be..18d95d6a7a4 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -14,8 +14,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) - .addPanelIf( - $._config.per_instance_label != '', + .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' @@ -33,8 +32,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) - .addPanelIf( - $._config.per_instance_label != '', + .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.query_frontend)], '' @@ -63,8 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) ) - .addPanelIf( - $._config.per_instance_label != '', + .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.querier)], '' @@ -82,8 +79,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) - .addPanelIf( - $._config.per_instance_label != '', + .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.ingester)], '' @@ -102,8 +98,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) ) - .addPanelIf( - $._config.per_instance_label != '', + .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.store_gateway)], '' diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 1021da4ec24..1f61778aff3 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -40,8 +40,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.eq('route', 'api_prom_push')]) ) - .addPanelIf( - $._config.per_instance_label != '', + .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="api_prom_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' @@ -59,8 +58,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/httpgrpc.*|api_prom_push')]) ) - .addPanelIf( - $._config.per_instance_label != '', + .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_prom_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.distributor)], '' @@ -89,8 +87,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) - .addPanelIf( - $._config.per_instance_label != '', + .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.ingester)], '' From eb6305da98ef83735567d558773e344d7fd6c7ac Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Tue, 25 Aug 2020 07:25:06 +0000 Subject: [PATCH 143/364] Playbook entry for CortexWALCorruption (https://github.com/grafana/cortex-jsonnet/pull/163) Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/docs/playbooks.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index e5358605fe4..fdf472c0426 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -187,6 +187,24 @@ gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). +### CortexWALCorruption + +This alert is only related to the chunks storage. This can happen because of 2 reasons: (1) Non graceful shutdown of ingesters. (2) Faulty storage or NFS. + +WAL corruptions are only detected at startups, so at this point the WAL/Checkpoint would have been repaired automatically. 
So we can only check what happened and if there was any data loss and take actions to avoid this happening in future. + +1. Check if there was any node restarts that force killed pods. If there is, then the corruption is from the non graceful shutdown of ingesters, which is generally fine. You can: + * Describe the pod to see the last state. + * Use `kube_pod_info` to check the node for the pod. `node_boot_time_seconds` to see if node just booted (which also indicates restart). + * You can use `eventrouter` logs to double check. + * Check ingester logs to check if the shutdown logs are missing at that time. +2. To confirm this, in the logs, check the WAL segment on which the corruption happened (let's say `X`) and the last checkpoint attempt number (let's say `Y`, this is the last WAL segment that was present when checkpointing started). +3. If `X > Y`, then it's most likely an abrupt restart of ingester and the corruption would be on the last few records of the last segment. To verify this, check the file timestamps of WAL segment `X` and `X - 1` if they were recent. +4. If `X < Y`, then the corruption was in some WAL segment which was not the last one. This indicates faulty disk and some data loss on that ingester. +5. In case of faulty disk corruption, if the number or ingesters that had corruption within the chunk flush age: + 1. Less than the quorum number for your replication factor: No data loss, because there is a guarantee that the data is replicated. For example, if replication factor is 3, then it's fine if corruption was on 1 ingester. + 2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. + 3. Equal or more than the replication factor: Then there is definitely some data loss. ## Cortex blocks storage - What to do when things to wrong From 723f0aab414085e2fee2d9d3549f66017887ba56 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar Date: Tue, 25 Aug 2020 14:22:24 +0530 Subject: [PATCH 144/364] Fix annotation for CortexAllocatingTooMuchMemory Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 03dcb8afcba..e155f017365 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -383,7 +383,7 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.instance }} - add more ingesters. + Too much memory being used by {{ $labels.pod }} - add more ingesters. |||, }, }, @@ -402,7 +402,7 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.instance }} - add more ingesters. + Too much memory being used by {{ $labels.pod }} - add more ingesters. 
|||, }, }, From ff404870fe4c5dfea4041816972dcea7c744d012 Mon Sep 17 00:00:00 2001 From: Sandeep Sukhani Date: Wed, 26 Aug 2020 17:20:29 +0530 Subject: [PATCH 145/364] changes suggested from PR review --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index e155f017365..acc6fa4b317 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -383,7 +383,7 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.pod }} - add more ingesters. + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - add more ingesters. |||, }, }, @@ -402,7 +402,7 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.pod }} - add more ingesters. + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - add more ingesters. |||, }, }, From e81cc1816495bc9335324676f43a68c09262b293 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Wed, 26 Aug 2020 11:26:53 -0400 Subject: [PATCH 146/364] exposes ruler queries for reuse Signed-off-by: Owen Diehl --- .../mimir-mixin/dashboards/ruler.libsonnet | 49 ++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index db60d217047..06cc2712012 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -2,6 +2,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { + rulerQueries+:: { + ruleEvaluations: { + success: + ||| + sum(rate(cortex_prometheus_rule_evaluations_total{%s}[$__interval])) + - + sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval])) + |||, + failure: 'sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval]))', + latency: + ||| + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{%s}[$__interval])) + / + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__interval])) + |||, + }, + groupEvaluations: { + missedIterations: 'sum(rate(cortex_prometheus_rule_group_iterations_missed_total{%s}[$__interval]))', + latency: + ||| + sum (rate(cortex_prometheus_rule_group_duration_seconds_sum{%s}[$__interval])) + / + sum (rate(cortex_prometheus_rule_group_duration_seconds_count{%s}[$__interval])) + |||, + }, + }, + 'ruler.json': $.dashboard('Cortex / Ruler') .addClusterSelectorTemplates() @@ -11,12 +38,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('EPS') + $.queryPanel( [ - ||| - sum(rate(cortex_prometheus_rule_evaluations_total{%s}[$__interval])) - - - sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval])) - ||| % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], - 'sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval]))' % $.jobMatcher('ruler'), + $.rulerQueries.ruleEvaluations.success % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], + $.rulerQueries.ruleEvaluations.failure % $.jobMatcher('ruler'), ], ['sucess', 'failed'], ), @@ -24,11 +47,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Latency') + $.queryPanel( - ||| - sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{%s}[$__interval])) - / - sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__interval])) - ||| % 
[$.jobMatcher('ruler'), $.jobMatcher('ruler')], + $.rulerQueries.ruleEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], 'average' ), ) @@ -37,16 +56,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Group Evaluations') .addPanel( $.panel('Missed Iterations') + - $.queryPanel('sum(rate(cortex_prometheus_rule_group_iterations_missed_total{%s}[$__interval]))' % $.jobMatcher('ruler'), 'iterations missed'), + $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher('ruler'), 'iterations missed'), ) .addPanel( $.panel('Latency') + $.queryPanel( - ||| - sum (rate(cortex_prometheus_rule_group_duration_seconds_sum{%s}[$__interval])) - / - sum (rate(cortex_prometheus_rule_group_duration_seconds_count{%s}[$__interval])) - ||| % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], + $.rulerQueries.groupEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], 'average' ), ) From 796dfe6486ae55f3da7ffa31b33de209e4eb1b23 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 22 Sep 2020 09:08:31 +0200 Subject: [PATCH 147/364] Renamed container_name and pod_name label names to container and pod respectively. Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 8 +++---- .../dashboards/compactor-resources.libsonnet | 10 ++++---- .../dashboards/comparison.libsonnet | 24 +++++++++---------- .../dashboards/dashboard-utils.libsonnet | 12 +++++----- .../mimir-mixin/dashboards/scaling.libsonnet | 4 ++-- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index acc6fa4b317..ea7020174c4 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -372,9 +372,9 @@ alert: 'CortexAllocatingTooMuchMemory', expr: ||| ( - container_memory_working_set_bytes{container_name="ingester"} + container_memory_working_set_bytes{container="ingester"} / - container_spec_memory_limit_bytes{container_name="ingester"} + container_spec_memory_limit_bytes{container="ingester"} ) > 0.5 |||, 'for': '15m', @@ -391,9 +391,9 @@ alert: 'CortexAllocatingTooMuchMemory', expr: ||| ( - container_memory_working_set_bytes{container_name="ingester"} + container_memory_working_set_bytes{container="ingester"} / - container_spec_memory_limit_bytes{container_name="ingester"} + container_spec_memory_limit_bytes{container="ingester"} ) > 0.8 |||, 'for': '15m', diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index 043a33374d2..02d9edd0b34 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-compactor-resources.json': local filterNodeDiskByCompactor = ||| - ignoring(pod_name) group_right() (label_replace(count by(pod_name, instance, device) (container_fs_writes_bytes_total{%s,container="compactor",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="compactor",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) ||| % $.namespaceMatcher(); $.dashboard('Cortex / Compactor Resources') @@ -24,13 +24,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Network') .addPanel( 
$.panel('Receive Bandwidth') + - $.queryPanel('sum by(pod_name) (rate(container_network_receive_bytes_total{%s,pod_name=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod_name}}') + + $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Transmit Bandwidth') + - $.queryPanel('sum by(pod_name) (rate(container_network_transmit_bytes_total{%s,pod_name=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod_name}}') + + $.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) @@ -39,13 +39,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Disk') .addPanel( $.panel('Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod_name}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod_name}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) diff --git a/jsonnet/mimir-mixin/dashboards/comparison.libsonnet b/jsonnet/mimir-mixin/dashboards/comparison.libsonnet index 2646fffb981..29355df9d4a 100644 --- a/jsonnet/mimir-mixin/dashboards/comparison.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/comparison.libsonnet @@ -30,13 +30,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU per sample') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__interval]))', 'chunks') ) .addPanel( $.panel('Memory per active series') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + - 
$.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + { yaxes: $.yaxes('bytes') } @@ -46,13 +46,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__interval]))', 'chunks') ) .addPanel( $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="ingester"})', 'chunks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"})', 'blocks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"})', 'chunks - working set') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + { yaxes: $.yaxes('bytes') } @@ -90,13 +90,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"}[$__interval]))', 'blocks') + + 
$.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"}[$__interval]))', 'chunks') ) .addPanel( $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container_name="querier"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container_name="querier"})', 'chunks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"})', 'blocks - working set') + + $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"})', 'chunks - working set') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/querier"})', 'blocks - heap inuse') + $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/querier"})', 'chunks - heap inuse') + { yaxes: $.yaxes('bytes') } diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index aa1e68ceb66..00210fa5efa 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -128,9 +128,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerCPUUsagePanel(title, containerName):: $.panel(title) + $.queryPanel([ - 'sum by(pod_name) (rate(container_cpu_usage_seconds_total{%s,container_name="%s"}[$__interval]))' % [$.namespaceMatcher(), containerName], - 'min(container_spec_cpu_quota{%s,container_name="%s"} / container_spec_cpu_period{%s,container_name="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], - ], ['{{pod_name}}', 'limit']) + + 'sum by(pod) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__interval]))' % [$.namespaceMatcher(), containerName], + 'min(container_spec_cpu_quota{%s,container="%s"} / container_spec_cpu_period{%s,container="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], + ], ['{{pod}}', 'limit']) + { seriesOverrides: [ { @@ -144,9 +144,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerMemoryWorkingSetPanel(title, containerName):: $.panel(title) + $.queryPanel([ - 'sum by(pod_name) (container_memory_working_set_bytes{%s,container_name="%s"})' % [$.namespaceMatcher(), containerName], - 'min(container_spec_memory_limit_bytes{%s,container_name="%s"} > 0)' % [$.namespaceMatcher(), containerName], - ], ['{{pod_name}}', 'limit']) + + 'sum by(pod) (container_memory_working_set_bytes{%s,container="%s"})' % [$.namespaceMatcher(), containerName], + 'min(container_spec_memory_limit_bytes{%s,container="%s"} > 0)' % [$.namespaceMatcher(), containerName], + ], ['{{pod}}', 'limit']) + { seriesOverrides: [ { diff --git a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet index 4f37132e154..97e9b124d35 100644 --- a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet @@ -80,7 +80,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) * - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod_name", 
"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:]) + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:]) / sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) |||, @@ -94,7 +94,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) * - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod_name", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m]) + quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m]) / sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) |||, From 6ce621b4eb70ea37bb00fedcbe6c098366eff655 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Sun, 27 Sep 2020 13:45:43 -0400 Subject: [PATCH 148/364] feat: add new api endpoint prefixes to dashboards Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 18 +++++++++--------- .../mimir-mixin/dashboards/writes.libsonnet | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 18d95d6a7a4..f4daf24b1bd 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -8,16 +8,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' ) + { yaxes: $.yaxes('s') } ) @@ -26,16 +26,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Query Frontend') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, 
route=~"api_prom_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.query_frontend)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.query_frontend)], '' ) + { yaxes: $.yaxes('s') } ) @@ -55,16 +55,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Querier') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', 'api_prom_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_prom_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.querier)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.querier)], '' ) + { yaxes: $.yaxes('s') } ) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 1f61778aff3..81deae2baed 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -27,23 +27,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('QPS') + - $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route="api_prom_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route="(api_v1|api_prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route="api_prom_push"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, 
route="(api_v1|api_prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.eq('route', 'api_prom_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.eq('route', '(api_v1|api_prom)_push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="api_prom_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="(api_v1|api_prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' ) + { yaxes: $.yaxes('s') } ) @@ -52,16 +52,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Distributor') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/httpgrpc.*|api_prom_push"}' % $.jobMatcher($._config.job_names.distributor)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/httpgrpc.*|(api_v1|api_prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/httpgrpc.*|api_prom_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/httpgrpc.*|(api_v1|api_prom)_push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_prom_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.distributor)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|(api_v1|api_prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.distributor)], '' ) + { yaxes: $.yaxes('s') } ) From 36c0eaa7f7932ccef03a21a66f091d302d50fe21 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Sun, 27 Sep 2020 13:48:20 -0400 Subject: [PATCH 149/364] simpler regex Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 81deae2baed..fd2557d7f31 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -27,23 +27,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('QPS') + - $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route="(api_v1|api_prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route="api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( $.panel('QPS') + - 
$.qpsPanel('cortex_request_duration_seconds_count{%s, route="(api_v1|api_prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route="api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.eq('route', '(api_v1|api_prom)_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.eq('route', 'api_(v1|prom)_push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="(api_v1|api_prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' ) + { yaxes: $.yaxes('s') } ) @@ -52,16 +52,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Distributor') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/httpgrpc.*|(api_v1|api_prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/httpgrpc.*|(api_v1|api_prom)_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/httpgrpc.*|api_(v1|prom)_push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|(api_v1|api_prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.distributor)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.distributor)], '' ) + { yaxes: $.yaxes('s') } ) From 5d33ce4e3946c718ab5b162c6ebb0c028d71ecb4 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 1 Oct 2020 12:50:22 +0530 Subject: [PATCH 150/364] Fix writes dashboard (https://github.com/grafana/cortex-jsonnet/pull/193) Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index fd2557d7f31..374f0610668 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -27,14 +27,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('QPS') + - $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route="api_(v1|prom)_push"}[5m]))' % 
$.jobMatcher($._config.job_names.gateway), format='reqps') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route="api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + @@ -43,7 +43,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' ) + { yaxes: $.yaxes('s') } ) From d113cfe87d49b5af4262814863ea4cf3b2ba7838 Mon Sep 17 00:00:00 2001 From: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Date: Thu, 1 Oct 2020 13:05:25 +0530 Subject: [PATCH 151/364] Fix write dashboard (https://github.com/grafana/cortex-jsonnet/pull/194) Signed-off-by: Ganesh Vernekar --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 374f0610668..db8a8415182 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -38,7 +38,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.eq('route', 'api_(v1|prom)_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + From f991c23a6bfc924d6d9b82ab32afd0f37f81ecf6 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Sun, 11 Oct 2020 15:10:13 -0400 Subject: [PATCH 152/364] feat: use querier specific API metrics Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index f4daf24b1bd..56738fd8793 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -55,16 +55,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Querier') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.querier) + 
[utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.querier)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.querier)], '' ) + { yaxes: $.yaxes('s') } ) From c879631c53ea96e06aabd04a5a69882e914feaa7 Mon Sep 17 00:00:00 2001 From: Jacob Lisi Date: Sun, 11 Oct 2020 15:14:33 -0400 Subject: [PATCH 153/364] add recording rule group Signed-off-by: Jacob Lisi --- jsonnet/mimir-mixin/recording_rules.libsonnet | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 73bdbcac8dc..54544d0a283 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -10,6 +10,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job', 'route']) + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), }, + { + name: 'cortex_querier_api', + rules: + utils.histogramRules('cortex_querier_request_duration_seconds', ['cluster', 'job']) + + utils.histogramRules('cortex_querier_request_duration_seconds', ['cluster', 'job', 'route']) + + utils.histogramRules('cortex_querier_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), + }, { name: 'cortex_cache', rules: From 49229cb35a19a9152230274f63f66b5376d1add1 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Sat, 26 Sep 2020 15:19:50 +0100 Subject: [PATCH 154/364] Remove jobMatcherEquality; empty panels look worse than slow ones. Eventually we'll use Ganesh's topk work in these panels. Signed-off-by: Tom Wilkie --- .../mimir-mixin/dashboards/dashboard-utils.libsonnet | 8 -------- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 10 +++++----- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 6 +++--- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 00210fa5efa..4429e906438 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -45,14 +45,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; then 'job=~"$job"' else 'cluster=~"$cluster", job=~"($namespace)/%s"' % job, - // jobMatcherEquality performs exact matches on cluster and namespace. Should be used on - // panels that are expected to return too many series to be useful when multiplier - // namespaces or clusters are selected. 
- jobMatcherEquality(job):: - if $._config.singleBinary - then 'job=~"$job"' - else 'cluster="$cluster", namespace="$namespace", job=~"($namespace)/%s"' % job, - namespaceMatcher():: if $._config.singleBinary then 'job=~"$job"' diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index f4daf24b1bd..bd7e9777d03 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -17,7 +17,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + { yaxes: $.yaxes('s') } ) @@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.query_frontend)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' ) + { yaxes: $.yaxes('s') } ) @@ -64,7 +64,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.querier)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' ) + { yaxes: $.yaxes('s') } ) @@ -82,7 +82,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.ingester)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + { yaxes: $.yaxes('s') } ) @@ -101,7 +101,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 
- 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.store_gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' ) + { yaxes: $.yaxes('s') } ) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index db8a8415182..d367bd0ce05 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -43,7 +43,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + { yaxes: $.yaxes('s') } ) @@ -61,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.distributor)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' ) + { yaxes: $.yaxes('s') } ) @@ -90,7 +90,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcherEquality($._config.job_names.ingester)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + { yaxes: $.yaxes('s') } ) From 93196b2b6b35c5023693e59c533202ad08d201b9 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 13 Oct 2020 11:47:00 +0100 Subject: [PATCH 155/364] Use mixtool to build and lint the mixin. 
Signed-off-by: Tom Wilkie --- jsonnet/mimir-mixin/alerts.jsonnet | 3 --- jsonnet/mimir-mixin/dashboards.jsonnet | 6 ------ jsonnet/mimir-mixin/dashboards/chunks.libsonnet | 4 ++-- .../dashboards/compactor-resources.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/comparison.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/config.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/object-store.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/queries.libsonnet | 2 +- .../mimir-mixin/dashboards/reads-resources.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/ruler.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/scaling.libsonnet | 2 +- .../mimir-mixin/dashboards/writes-resources.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 2 +- jsonnet/mimir-mixin/jsonnetfile.json | 10 +++++----- jsonnet/mimir-mixin/recording_rules.jsonnet | 1 - 17 files changed, 19 insertions(+), 29 deletions(-) delete mode 100644 jsonnet/mimir-mixin/alerts.jsonnet delete mode 100644 jsonnet/mimir-mixin/dashboards.jsonnet delete mode 100644 jsonnet/mimir-mixin/recording_rules.jsonnet diff --git a/jsonnet/mimir-mixin/alerts.jsonnet b/jsonnet/mimir-mixin/alerts.jsonnet deleted file mode 100644 index bd44d1d999f..00000000000 --- a/jsonnet/mimir-mixin/alerts.jsonnet +++ /dev/null @@ -1,3 +0,0 @@ -local mixin = import 'mixin.libsonnet'; - -std.manifestYamlDoc(mixin.prometheusAlerts) diff --git a/jsonnet/mimir-mixin/dashboards.jsonnet b/jsonnet/mimir-mixin/dashboards.jsonnet deleted file mode 100644 index c3ec625a279..00000000000 --- a/jsonnet/mimir-mixin/dashboards.jsonnet +++ /dev/null @@ -1,6 +0,0 @@ -local mixin = import 'mixin.libsonnet'; - -{ - [name]: std.manifestJsonEx(mixin.grafanaDashboards[name], ' ') - for name in std.objectFields(mixin.grafanaDashboards) -} diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet index 15b6426cf83..979ee80380d 100644 --- a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-chunks.json': - $.dashboard('Cortex / Chunks') + ($.dashboard('Cortex / Chunks') + { uid: 'a56a3fa6284064eb392a115f3acbf744' }) .addClusterSelectorTemplates() .addRow( $.row('Active Series / Chunks') @@ -52,7 +52,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), 'cortex-wal.json': - $.dashboard('Cortex / WAL') + ($.dashboard('Cortex / WAL') + { uid: 'd4fb924cdc1581cd8e870e3eb0110bda' }) .addClusterSelectorTemplates() .addRow( $.row('') diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index 02d9edd0b34..92b8fd76a38 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -6,7 +6,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="compactor",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) ||| % $.namespaceMatcher(); - $.dashboard('Cortex / Compactor Resources') + ($.dashboard('Cortex / Compactor Resources') + { uid: 'df9added6f1f4332f95848cca48ebd99' }) .addClusterSelectorTemplates() .addRow( $.row('CPU and Memory') diff --git 
a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 6ae2f28e1fb..347e0139049 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-compactor.json': - $.dashboard('Cortex / Compactor') + ($.dashboard('Cortex / Compactor') + { uid: '9c408e1d55681ecb8a22c9fab46875cc' }) .addClusterSelectorTemplates() .addRow( $.row('Compactions') diff --git a/jsonnet/mimir-mixin/dashboards/comparison.libsonnet b/jsonnet/mimir-mixin/dashboards/comparison.libsonnet index 29355df9d4a..836e9b46001 100644 --- a/jsonnet/mimir-mixin/dashboards/comparison.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/comparison.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-blocks-vs-chunks.json': - $.dashboard('Cortex / Blocks vs Chunks') + ($.dashboard('Cortex / Blocks vs Chunks') + { uid: '0e2b4dd23df9921972e3fb554c0fc483' }) .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') .addTemplate('blocks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') .addTemplate('chunks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') diff --git a/jsonnet/mimir-mixin/dashboards/config.libsonnet b/jsonnet/mimir-mixin/dashboards/config.libsonnet index eedfcb4c393..9240ef89dc7 100644 --- a/jsonnet/mimir-mixin/dashboards/config.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/config.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-config.json': - $.dashboard('Cortex / Config') + ($.dashboard('Cortex / Config') + { uid: '61bb048ced9817b2d3e07677fb1c6290' }) .addClusterSelectorTemplates() .addRow( $.row('Startup config file') diff --git a/jsonnet/mimir-mixin/dashboards/object-store.libsonnet b/jsonnet/mimir-mixin/dashboards/object-store.libsonnet index 3263446cc73..69d0492e703 100644 --- a/jsonnet/mimir-mixin/dashboards/object-store.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/object-store.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-object-store.json': - $.dashboard('Cortex / Object Store') + ($.dashboard('Cortex / Object Store') + { uid: 'd5a3a4489d57c733b5677fb55370a723' }) .addClusterSelectorTemplates() .addRow( $.row('Components') diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index e8fb81c734c..35888c4187e 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-queries.json': - $.dashboard('Cortex / Queries') + ($.dashboard('Cortex / Queries') + { uid: 'd9931b1054053c8b972d320774bb8f1d' }) .addClusterSelectorTemplates() .addRow( $.row('Query Frontend') diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index 6dd8d0cef9d..ea2115c05c0 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 
'cortex-reads-resources.json': - $.dashboard('Cortex / Reads Resources') + ($.dashboard('Cortex / Reads Resources') + { uid: '2fd2cda9eea8d8af9fbc0a5960425120' }) .addClusterSelectorTemplates() .addRow( $.row('Gateway') diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index bd7e9777d03..2332126ab16 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-reads.json': - $.dashboard('Cortex / Reads') + ($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' }) .addClusterSelectorTemplates() .addRow( $.row('Gateway') diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 06cc2712012..4ca6edd0643 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -30,7 +30,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, 'ruler.json': - $.dashboard('Cortex / Ruler') + ($.dashboard('Cortex / Ruler') + { uid: '44d12bcb1f95661c6ab6bc946dfc3473' }) .addClusterSelectorTemplates() .addRow( $.row('Rule Evaluations') diff --git a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet index 97e9b124d35..d1ff7bd31e6 100644 --- a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-scaling.json': - $.dashboard('Cortex / Scaling') + ($.dashboard('Cortex / Scaling') + { uid: '88c041017b96856c9176e07cf557bdcf' }) .addClusterSelectorTemplates() .addRow( $.row('Workload-based scaling') diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index ccdc966f9e5..6b38ddbed75 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-writes-resources.json': - $.dashboard('Cortex / Writes Resources') + ($.dashboard('Cortex / Writes Resources') + { uid: 'c0464f0d8bd026f776c9006b0591bb0b' }) .addClusterSelectorTemplates() .addRow( $.row('Gateway') diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index d367bd0ce05..3886441434e 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-writes.json': - $.dashboard('Cortex / Writes') + ($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' }) .addClusterSelectorTemplates() .addRow( ($.row('Headlines') + diff --git a/jsonnet/mimir-mixin/jsonnetfile.json b/jsonnet/mimir-mixin/jsonnetfile.json index 87e724d5ed1..3f1547aaebd 100644 --- a/jsonnet/mimir-mixin/jsonnetfile.json +++ b/jsonnet/mimir-mixin/jsonnetfile.json @@ -1,24 +1,24 @@ { + "version": 1, "dependencies": [ { - "name": "grafana-builder", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "grafana-builder" } }, "version": "master" }, { - 
"name": "mixin-utils", "source": { "git": { - "remote": "https://github.com/grafana/jsonnet-libs", + "remote": "https://github.com/grafana/jsonnet-libs.git", "subdir": "mixin-utils" } }, "version": "master" } - ] + ], + "legacyImports": true } diff --git a/jsonnet/mimir-mixin/recording_rules.jsonnet b/jsonnet/mimir-mixin/recording_rules.jsonnet deleted file mode 100644 index dbe13f417b4..00000000000 --- a/jsonnet/mimir-mixin/recording_rules.jsonnet +++ /dev/null @@ -1 +0,0 @@ -std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules) From 4e497b198768c309e6124f5dd8cd16d5496c64cf Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 14 Oct 2020 15:31:29 +0200 Subject: [PATCH 156/364] Add ingester-blocks to ingester's job label matcher Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 08336b3c70a..f8df1be11df 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -26,7 +26,7 @@ // These are used by the dashboards and allow for the simultaneous display of // microservice and single binary cortex clusters. job_names: { - ingester: '(ingester|cortex$)', + ingester: '(ingester|ingester-blocks|cortex$)', // Match also ingester-blocks, which is used during the migration from chunks to blocks. distributor: '(distributor|cortex$)', querier: '(querier|cortex$)', query_frontend: '(query-frontend|cortex$)', From 170ef439a95da02ac338154fe348f17e54157f5e Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 9 Oct 2020 13:54:13 +0100 Subject: [PATCH 157/364] Improved Ruler Dashboard Includes information about per rule group per ruler evaluation and notification delivery. 
--- .../mimir-mixin/dashboards/ruler.libsonnet | 72 +++++++++++++++++-- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 06cc2712012..e60deb23e17 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -18,13 +18,42 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__interval])) |||, }, + perUserPerGroupEvaluations: { + failure: 'sum by(rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval])) > 0', + latency: + ||| + sum by(user) (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{%s}[$__interval])) + / + sum by(user) (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__interval])) + |||, + }, groupEvaluations: { - missedIterations: 'sum(rate(cortex_prometheus_rule_group_iterations_missed_total{%s}[$__interval]))', + missedIterations: 'sum by(user) (rate(cortex_prometheus_rule_group_iterations_missed_total{%s}[$__interval])) > 0', latency: ||| - sum (rate(cortex_prometheus_rule_group_duration_seconds_sum{%s}[$__interval])) + rate(cortex_prometheus_rule_group_duration_seconds_sum{%s}[$__interval]) + / + rate(cortex_prometheus_rule_group_duration_seconds_count{%s}[$__interval]) + |||, + }, + notifications: { + failure: + ||| + sum by(user) (rate(cortex_prometheus_notifications_errors_total{%s}[$__interval])) + / + sum by(user) (rate(cortex_prometheus_notifications_sent_total{%s}[$__interval])) + > 0 + |||, + queue: + ||| + sum by(user) (rate(cortex_prometheus_notifications_queue_length{%s}[$__interval])) / - sum (rate(cortex_prometheus_rule_group_duration_seconds_count{%s}[$__interval])) + sum by(user) (rate(cortex_prometheus_notifications_queue_capacity{%s}[$__interval])) + > 0 + |||, + dropped: + ||| + sum by (user) (increase(cortex_prometheus_notifications_dropped_total{%s}[$__interval])) > 0 |||, }, }, @@ -33,7 +62,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.dashboard('Cortex / Ruler') .addClusterSelectorTemplates() .addRow( - $.row('Rule Evaluations') + $.row('Rule Evaluations Global') .addPanel( $.panel('EPS') + $.queryPanel( @@ -56,14 +85,45 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Group Evaluations') .addPanel( $.panel('Missed Iterations') + - $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher('ruler'), 'iterations missed'), + $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher('ruler'), '{{ user }}'), ) .addPanel( $.panel('Latency') + $.queryPanel( $.rulerQueries.groupEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], - 'average' + '{{ user }}' ), ) + .addPanel( + $.panel('Failures') + + $.queryPanel( + $.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher('ruler')], '{{ rule_group }}' + ) + ) + ) + .addRow( + $.row('Rule Evaluation per User') + .addPanel( + $.panel('Latency') + + $.queryPanel( + $.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], + '{{ user }}' + ) + ) + ) + .addRow( + $.row('Notifications') + .addPanel( + $.panel('Delivery Errors') + + $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') + ) + .addPanel( + $.panel('Queue Length') + + $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user 
}}') + ) + .addPanel( + $.panel('Dropped') + + $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher('ruler'), '{{ user }}') + ) ), } From 6aa50da4869408f35466da7bd618355fb289fc4c Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 15 Oct 2020 17:03:42 +0100 Subject: [PATCH 158/364] Add ruler QPS and latency panels --- jsonnet/mimir-mixin/dashboards/ruler.libsonnet | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index e60deb23e17..007457a90f3 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -81,6 +81,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) ) + .addRow( + $.row('Gateway Latency') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_rules.*|api_prom_api_v1_(rules|alerts)"}' % $.jobMatcher($._config.job_names.gateway)) + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_prom_rules.*|api_prom_api_v1_(rules|alerts)')]) + ) + ) .addRow( $.row('Group Evaluations') .addPanel( From a53dd6424fc55cec9a7f6a3f92b102094e3a485c Mon Sep 17 00:00:00 2001 From: Jack Baldry Date: Fri, 16 Oct 2020 10:22:33 +0100 Subject: [PATCH 159/364] s/Metadada/Metadata/ --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 4bece33aa6d..2d3107b5114 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -155,11 +155,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.thanosMemcachedCache('Memcached – Blocks Storage – Metadada (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache') + $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache') ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.thanosMemcachedCache('Memcached – Blocks Storage – Metadada (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache') + $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache') ) .addRowIf( std.member($._config.storage_engine, 'chunks') && From c91aabcdeefa0ea08ac4997b7e14299a3a731721 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Sat, 17 Oct 2020 19:57:57 +0100 Subject: [PATCH 160/364] Add the ruler to the write resources dashboard --- jsonnet/mimir-mixin/config.libsonnet | 1 + .../dashboards/writes-resources.libsonnet | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 08336b3c70a..1bfa3a69e8f 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -29,6 +29,7 @@ ingester: '(ingester|cortex$)', distributor: '(distributor|cortex$)', querier: '(querier|cortex$)', + ruler: '(ruler|cortex$)', query_frontend: '(query-frontend|cortex$)', table_manager: '(table-manager|cortex$)', store_gateway: '(store-gateway|cortex$)', diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet 
b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index ccdc966f9e5..a51f81b6d23 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -46,6 +46,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), ) + ) + .addRow( + $.row('Ruler') + .addPanel( + $.panel('Rules') + + $.queryPanel('sum by(instance) (cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), '{{instance}}'), + ) + .addPanel( + $.containerCPUUsagePanel('CPU', 'ruler'), + ) + ) + .addRow( + $.row('') + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'ruler'), + ) ) + { templating+: { list: [ From a77a96aa311fd4c18f9c6886ba7f282d03b760c7 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 19 Oct 2020 15:36:28 +0100 Subject: [PATCH 161/364] Add Headlines and Write/Reads QPS and latency to the ruler dahsboard --- .../mimir-mixin/dashboards/ruler.libsonnet | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 007457a90f3..21e9a26754c 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -61,6 +61,28 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'ruler.json': $.dashboard('Cortex / Ruler') .addClusterSelectorTemplates() + .addRow( + ($.row('Headlines') + { + height: '100px', + showTitle: false, + }) + .addPanel( + $.panel('Active Configurations') + + $.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher('ruler'), format='short') + ) + .addPanel( + $.panel('Total Rules') + + $.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher('ruler'), format='short') + ) + .addPanel( + $.panel('Read QPS') + + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher('ruler'), format='reqps') + ) + .addPanel( + $.panel('Write QPS') + + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher('ruler'), format='reqps') + ) + ) .addRow( $.row('Rule Evaluations Global') .addPanel( @@ -82,7 +104,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Gateway Latency') + $.row('Configuration API (gateway)') .addPanel( $.panel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_rules.*|api_prom_api_v1_(rules|alerts)"}' % $.jobMatcher($._config.job_names.gateway)) @@ -92,6 +114,28 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_prom_rules.*|api_prom_api_v1_(rules|alerts)')]) ) ) + .addRow( + $.row('Writes') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher('ruler')) + ) + .addPanel( + $.panel('Latency') + + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobSelector('ruler')) + ) + ) + .addRow( + $.row('Reads') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, 
operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler')) + ) + .addPanel( + $.panel('Latency') + + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobSelector('ruler')) + ) + ) .addRow( $.row('Group Evaluations') .addPanel( From 94f6a6c0eb6aa94934e5c1f3b5f24674c5d69b25 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 19 Oct 2020 15:57:35 +0100 Subject: [PATCH 162/364] Be specific about read/writes --- jsonnet/mimir-mixin/dashboards/ruler.libsonnet | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 21e9a26754c..b16e753a13d 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -75,11 +75,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher('ruler'), format='short') ) .addPanel( - $.panel('Read QPS') + + $.panel('Read from Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher('ruler'), format='reqps') ) .addPanel( - $.panel('Write QPS') + + $.panel('Write to Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher('ruler'), format='reqps') ) ) @@ -115,25 +115,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Writes') + $.row('Writes (Ingesters)') .addPanel( $.panel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher('ruler')) ) .addPanel( $.panel('Latency') + - $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobSelector('ruler')) + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher('ruler')) ) ) .addRow( - $.row('Reads') + $.row('Reads (Ingesters)') .addPanel( $.panel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler')) ) .addPanel( $.panel('Latency') + - $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobSelector('ruler')) + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler')) ) ) .addRow( From e422c50f8edbfa3363114c7f2575d1baddba545c Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 19 Oct 2020 16:06:04 +0100 Subject: [PATCH 163/364] Move to read resources --- .../dashboards/reads-resources.libsonnet | 19 ++++++++++++++++++ .../dashboards/writes-resources.libsonnet | 20 +------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index 6dd8d0cef9d..9291d7bccfa 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -52,6 +52,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), ) ) + .addRow( + $.row('Ruler') + .addPanel( + $.panel('Rules') + + $.queryPanel('sum by(instance) 
(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), '{{instance}}'), + ) + .addPanel( + $.containerCPUUsagePanel('CPU', 'ruler'), + ) + ) + .addRow( + $.row('') + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'ruler'), + ) + ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index a51f81b6d23..f9a31175880 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -47,25 +47,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), ) ) - .addRow( - $.row('Ruler') - .addPanel( - $.panel('Rules') + - $.queryPanel('sum by(instance) (cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), '{{instance}}'), - ) - .addPanel( - $.containerCPUUsagePanel('CPU', 'ruler'), - ) - ) - .addRow( - $.row('') - .addPanel( - $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'), - ) - .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'ruler'), - ) - ) + { + + { templating+: { list: [ // Do not allow to include all clusters/namespaces otherwise this dashboard From aa23d67a9cebc70929575a39f855f7a6be030117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 27 Oct 2020 12:53:40 +0100 Subject: [PATCH 164/364] Added alert for failed compator run. --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index b31ae337473..f11ee00cd93 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -63,6 +63,21 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', }, }, + { + // Alert if compactor fails. + alert: 'CortexCompactorRunFailed', + expr: ||| + increase(cortex_compactor_runs_failed_total[10m]) > 0 + |||, + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + {{ $labels.job }}/{{ $labels.instance }} failed to run compaction. + |||, + }, + }, ], }, ], From f8a743495414ad12375058f0d52deae926c15966 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 27 Oct 2020 14:34:57 +0100 Subject: [PATCH 165/364] Update cortex-mixin/config.libsonnet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marco Pracucci Co-authored-by: Peter Štibraný --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index f8df1be11df..44238e9c262 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -26,7 +26,7 @@ // These are used by the dashboards and allow for the simultaneous display of // microservice and single binary cortex clusters. job_names: { - ingester: '(ingester|ingester-blocks|cortex$)', // Match also ingester-blocks, which is used during the migration from chunks to blocks. + ingester: '(ingester.*|cortex$)', // Match also ingester-blocks, which is used during the migration from chunks to blocks. 
distributor: '(distributor|cortex$)', querier: '(querier|cortex$)', query_frontend: '(query-frontend|cortex$)', From 72905cfe7c6d695c6aa56d743a377edb8608258b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 27 Oct 2020 18:00:38 +0000 Subject: [PATCH 166/364] Add a dashboard for the Alertmanager Includes: - Notification - Alerts received - Configuration API / UI Signed-off-by: gotjosh --- jsonnet/mimir-mixin/dashboards.libsonnet | 1 + .../dashboards/alertmanager.libsonnet | 88 +++++++++++++++++++ .../mimir-mixin/dashboards/ruler.libsonnet | 2 +- 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 06e739b776d..6f5e44f4870 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -4,6 +4,7 @@ (import 'dashboards/queries.libsonnet') + (import 'dashboards/reads.libsonnet') + (import 'dashboards/ruler.libsonnet') + + (import 'dashboards/alertmanager.libsonnet') + (import 'dashboards/scaling.libsonnet') + (import 'dashboards/writes.libsonnet') + diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet new file mode 100644 index 00000000000..0b5c77a4c44 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -0,0 +1,88 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + + 'alertmanager.json': + $.dashboard('Cortex / Alertmanager') + .addClusterSelectorTemplates() + .addRow( + ($.row('Headlines') + { + height: '100px', + showTitle: false, + }) + .addPanel( + $.panel('Total Alerts') + + $.statPanel('sum(cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), format='short') + ) + .addPanel( + $.panel('Total Silences') + + $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short') + ) + ) + .addRow( + $.row('Alerts Received') + .addPanel( + $.panel('APS') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__interval])) + - + sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + ) + .addRow( + $.row('Alert Notifications') + .addPanel( + $.panel('NPS') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_notifications_total{%s}[$__interval])) + - + sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('NPS by integration') + + $.queryPanel( + [ + ||| + ( + sum(rate(cortex_alertmanager_notifications_total{%s}[$__interval])) by(integration) + - + sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval])) by(integration) + ) > 0 + or on () vector(0) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval])) by(integration)' % $.jobMatcher('alertmanager'), + ], + ['success - {{ integration }}', 'failed - {{ integration }}'] + ) + ) + .addPanel( + $.panel('Latency') + + 
$.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher('alertmanager')) + ) + ) + .addRow( + $.row('Configuration API (gateway) + Alertmanager UI') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_v1_alerts|alertmanager"}' % $.jobMatcher($._config.job_names.gateway)) + ) + .addPanel( + $.panel('Latency') + + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')]) + ) + ), +} diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 8fc83cfbcd4..9c81e8221f4 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -92,7 +92,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.rulerQueries.ruleEvaluations.success % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], $.rulerQueries.ruleEvaluations.failure % $.jobMatcher('ruler'), ], - ['sucess', 'failed'], + ['success', 'failed'], ), ) .addPanel( From 012fa096262c0e220058699b48ad021173635345 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 27 Oct 2020 18:10:58 +0000 Subject: [PATCH 167/364] md5 dashboard name --- jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 0b5c77a4c44..6f8d8a4c8dc 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'alertmanager.json': - $.dashboard('Cortex / Alertmanager') + $.dashboard('Cortex / Alertmanager') + { uid: 'a76bee5913c97c918d9e56a3cc88cc28' }) .addClusterSelectorTemplates() .addRow( ($.row('Headlines') + { From 590792702d97a9e8e1238fd93681c07e7c4dca40 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 2 Nov 2020 11:03:50 +0000 Subject: [PATCH 168/364] Fix wrapping --- jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 6f8d8a4c8dc..1f3bdbc47d2 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'alertmanager.json': - $.dashboard('Cortex / Alertmanager') + { uid: 'a76bee5913c97c918d9e56a3cc88cc28' }) + ($.dashboard('Cortex / Alertmanager') + { uid: 'a76bee5913c97c918d9e56a3cc88cc28' }) .addClusterSelectorTemplates() .addRow( ($.row('Headlines') + { From 4a171c1d0e5a6c0bb529d2d98f7903bb3781b334 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 10 Nov 2020 15:23:32 +0100 Subject: [PATCH 169/364] Added more critical alerts on Cortex ingester TSDB (https://github.com/grafana/cortex-jsonnet/pull/208) * Added more critical alerts on Cortex ingester TSDB Signed-off-by: Marco Pracucci * Added CHANGELOG entry Signed-off-by: Marco Pracucci * Addressed review comments Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 73 +++++++++++++++++++++ jsonnet/mimir-mixin/docs/playbooks.md | 60 +++++++++++++++-- 2 files changed, 127 insertions(+), 6 
deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 6d435b68bb3..d99b9a0fcfd 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -55,6 +55,79 @@ message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.', }, }, + { + alert: 'CortexIngesterTSDBHeadTruncationFailed', + expr: ||| + rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB head.', + }, + }, + { + alert: 'CortexIngesterTSDBCheckpointCreationFailed', + expr: ||| + rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to create TSDB checkpoint.', + }, + }, + { + alert: 'CortexIngesterTSDBCheckpointDeletionFailed', + expr: ||| + rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete TSDB checkpoint.', + }, + }, + { + alert: 'CortexIngesterTSDBWALTruncationFailed', + expr: ||| + rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 + |||, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB WAL.', + }, + }, + { + alert: 'CortexIngesterTSDBWALCorrupted', + expr: ||| + rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} got a corrupted TSDB WAL.', + }, + }, + { + alert: 'CortexIngesterTSDBWALWritesFailed', + 'for': '3m', + expr: ||| + rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to write to TSDB WAL.', + }, + }, { // Alert if the querier is not successfully scanning the bucket. alert: 'CortexQuerierHasNotScanTheBucket', diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index fdf472c0426..1bc69bded08 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -82,7 +82,7 @@ This alert occurs when a ruler is unable to validate whether or not it should cl This alert fires when a Cortex ingester is not uploading any block to the long-term storage. An ingester is expected to upload a block to the storage every block range period (defaults to 2h) and if a longer time elapse since the last successful upload it means something is not working correctly. -How to investigate: +How to **investigate**: - Ensure the ingester is receiving write-path traffic (samples to ingest) - Look for any upload error in the ingester logs (ie. networking or authentication issues) @@ -115,18 +115,66 @@ The cause triggering this alert could **lead to**: How to **investigate**: - Look for details in the ingester logs +### CortexIngesterTSDBHeadTruncationFailed + +This alert fires when a Cortex ingester fails to truncate the TSDB head. 
+ +The TSDB head is the in-memory store used to keep series and samples not compacted into a block yet. If head truncation fails for a long time, the ingester disk might get full as it won't continue to the WAL truncation stage and the subsequent ingester restart may take a long time or even go into an OOMKilled crash loop because of the huge WAL to replay. For this reason, it's important to investigate and address the issue as soon as it happen. + +How to **investigate**: +- Look for details in the ingester logs + +### CortexIngesterTSDBCheckpointCreationFailed + +This alert fires when a Cortex ingester fails to create a TSDB checkpoint. + +How to **investigate**: +- Look for details in the ingester logs +- If the checkpoint fails because of a `corruption in segment`, you can restart the ingester because at next startup TSDB will try to "repair" it. After restart, if the issue is repaired and the ingester is running, you should also get paged by `CortexIngesterTSDBWALCorrupted` to signal you the WAL was corrupted and manual investigation is required. + +### CortexIngesterTSDBCheckpointDeletionFailed + +This alert fires when a Cortex ingester fails to delete a TSDB checkpoint. + +Generally, this is not an urgent issue, but manual investigation is required to find the root cause of the issue and fix it. + +How to **investigate**: +- Look for details in the ingester logs + +### CortexIngesterTSDBWALTruncationFailed + +This alert fires when a Cortex ingester fails to truncate the TSDB WAL. + +How to **investigate**: +- Look for details in the ingester logs + +### CortexIngesterTSDBWALCorrupted + +This alert fires when a Cortex ingester finds a corrupted TSDB WAL (stored on disk) while replaying it at ingester startup or when creation of a checkpoint comes across a WAL corruption. + +If this alert fires during an **ingester startup**, the WAL should have been auto-repaired, but manual investigation is required. The WAL repair mechanism cause data loss because all WAL records after the corrupted segment are discarded and so their samples lost while replaying the WAL. If this issue happen only on 1 ingester then Cortex doesn't suffer any data loss because of the replication factor, while if it happens on multiple ingesters then some data loss is possible. + +If this alert fires during a **checkpoint creation**, you should have also been paged with `CortexIngesterTSDBCheckpointCreationFailed`, and you can follow the steps under that alert. + +### CortexIngesterTSDBWALWritesFailed + +This alert fires when a Cortex ingester is failing to log records to the TSDB WAL on disk. + +How to **investigate**: +- Look for details in the ingester logs + ### CortexQuerierHasNotScanTheBucket This alert fires when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. -How to investigate: +How to **investigate**: - Look for any scan error in the querier logs (ie. networking or rate limiting issues) ### CortexQuerierHighRefetchRate This alert fires when there's an high number of queries for which series have been refetched from a different store-gateway because of missing blocks. This could happen for a short time whenever a store-gateway ring resharding occurs (e.g. 
during/after an outage or while rolling out store-gateway) but store-gateways should reconcile in a short time. This alert fires if the issue persist for an unexpected long time and thus it should be investigated. -How to investigate: +How to **investigate**: - Ensure there are no errors related to blocks scan or sync in the queriers and store-gateways - Check store-gateway logs to see if all store-gateway have successfully completed a blocks sync @@ -134,14 +182,14 @@ How to investigate: This alert fires when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. -How to investigate: +How to **investigate**: - Look for any scan error in the store-gateway logs (ie. networking or rate limiting issues) ### CortexCompactorHasNotSuccessfullyCleanedUpBlocks This alert fires when a Cortex compactor is not successfully deleting blocks marked for deletion for a long time. -How to investigate: +How to **investigate**: - Ensure the compactor is not crashing during compaction (ie. `OOMKilled`) - Look for any error in the compactor logs (ie. bucket Delete API errors) @@ -153,7 +201,7 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. -How to investigate: +How to **investigate**: - If the alert `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first - Ensure ingesters are successfully shipping blocks to the storage From ff9607e14d643ade4cca8b01017b4e058adc726e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 10 Nov 2020 15:41:35 +0100 Subject: [PATCH 170/364] Update alert expression. --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index f11ee00cd93..5a3282690a4 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -67,7 +67,7 @@ // Alert if compactor fails. alert: 'CortexCompactorRunFailed', expr: ||| - increase(cortex_compactor_runs_failed_total[10m]) > 0 + increase(cortex_compactor_runs_failed_total[3h]) > 1 |||, labels: { severity: 'warning', From 447aefd5faa77a9375181656b7674c680a6ffe06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 10 Nov 2020 15:45:33 +0100 Subject: [PATCH 171/364] Fixed range. --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 5a3282690a4..1ed365b9fe5 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -67,7 +67,7 @@ // Alert if compactor fails. 
alert: 'CortexCompactorRunFailed', expr: ||| - increase(cortex_compactor_runs_failed_total[3h]) > 1 + increase(cortex_compactor_runs_failed_total[2h]) > 1 |||, labels: { severity: 'warning', From d45f08a4db77c48d98388ea3b321418c5e79666e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 10 Nov 2020 16:08:19 +0100 Subject: [PATCH 172/364] Fix severity for CortexCompactorRunFailed --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 1ed365b9fe5..c31b2bb4cd5 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -70,7 +70,7 @@ increase(cortex_compactor_runs_failed_total[2h]) > 1 |||, labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: ||| From eb74ca29d4789f01c381ada7b62378673c2dfc75 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 13 Nov 2020 16:19:17 +0000 Subject: [PATCH 173/364] Add Alertmanager to dashboard for writes resources --- .../dashboards/writes-resources.libsonnet | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index a7f21308b14..63ec4965a4e 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -47,6 +47,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), ) ) + .addRow( + $.row('Alertmanager') + .addPanel( + $.containerCPUUsagePanel('CPU', 'alertmanager'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'alertmanager'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'alertmanager'), + ) + ) + { templating+: { list: [ From 182f229ccc9cf26534551f77144e311a81925565 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 13 Nov 2020 17:30:32 +0000 Subject: [PATCH 174/364] Add a new dashboard --- jsonnet/mimir-mixin/dashboards.libsonnet | 3 +- .../alertmanager-resources.libsonnet | 93 +++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 6f5e44f4870..0147148dd0c 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -26,7 +26,8 @@ (if !$._config.resources_dashboards_enabled then {} else (import 'dashboards/reads-resources.libsonnet') + - (import 'dashboards/writes-resources.libsonnet')) + + (import 'dashboards/writes-resources.libsonnet') + + (import 'dashboards/alertmanager-resources.libsonnet')) + { _config:: $._config }, } diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet new file mode 100644 index 00000000000..a09cb9b122d --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -0,0 +1,93 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + local filterNodeDiskByCompactor = ||| + ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="alertmanager",device!~".*sda.*"}), "device", "$1", "device", 
"/dev/(.*)") * 0) + ||| % $.namespaceMatcher(); + 'alertmanager-resources.json': + ($.dashboard('Alertmanager') + { uid: 'df9added6f1f4332f95848cca48ebd99' }) + .addClusterSelectorTemplates() + .addRow( + $.row('Gateway') + .addPanel( + $.containerCPUUsagePanel('CPU', 'cortex-gw'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'cortex-gw'), + ) + ) + .addRow( + $.row('Alertmanager') + .addPanel( + $.containerCPUUsagePanel('CPU', 'alertmanager'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'alertmanager'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'alertmanager'), + ) + ) + .addRow( + $.row('Instance Mapper') + .addPanel( + $.containerCPUUsagePanel('CPU', 'alertmanager-im'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'alertmanager-im'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'alertmanager-im'), + ) + ) + .addRow( + $.row('Network') + .addPanel( + $.panel('Receive Bandwidth') + + $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"alertmanager.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Transmit Bandwidth') + + $.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"alertmanager.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + ) + .addRow( + $.row('Disk') + .addPanel( + $.panel('Writes') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Reads') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Disk Space Utilization') + + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="alertmanager"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + + { yaxes: $.yaxes('percentunit') }, + ) + ) + { + templating+: { + list: [ + // Do not allow to include all clusters/namespaces otherwise this dashboard + // risks to explode because it shows resources per pod. 
+ l + (if (l.name == 'cluster' || l.name == 'namespace') then { includeAll: false } else {}) + for l in super.list + ], + }, + }, +} From 7ef5530ff6614d7e3adaa0a1dbfe1aeb5510a27e Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 13 Nov 2020 17:40:54 +0000 Subject: [PATCH 175/364] Fix the linter --- jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index a09cb9b122d..e5416e68c06 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -1,10 +1,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { + 'alertmanager-resources.json': local filterNodeDiskByCompactor = ||| ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="alertmanager",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) ||| % $.namespaceMatcher(); - 'alertmanager-resources.json': ($.dashboard('Alertmanager') + { uid: 'df9added6f1f4332f95848cca48ebd99' }) .addClusterSelectorTemplates() .addRow( From a1a7aef32c5755e511bccac19aadf561f75517ea Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 13 Nov 2020 17:52:57 +0000 Subject: [PATCH 176/364] Use the right hash and dashboard name --- jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index e5416e68c06..cadef56a383 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; local filterNodeDiskByCompactor = ||| ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="alertmanager",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) ||| % $.namespaceMatcher(); - ($.dashboard('Alertmanager') + { uid: 'df9added6f1f4332f95848cca48ebd99' }) + ($.dashboard('Cortex / Alertmanager Resources') + { uid: '68b66aed90ccab448009089544a8d6c6' }) .addClusterSelectorTemplates() .addRow( $.row('Gateway') diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 1f3bdbc47d2..9de47dbc2d9 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -1,7 +1,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { - 'alertmanager.json': ($.dashboard('Cortex / Alertmanager') + { uid: 'a76bee5913c97c918d9e56a3cc88cc28' }) .addClusterSelectorTemplates() From 8c2899d942a53f0ffc2ac683227ccfe3f3d6dc4d Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 13 Nov 2020 17:56:34 +0000 Subject: [PATCH 177/364] Remove Alertmanager from write resources --- .../dashboards/writes-resources.libsonnet | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index 63ec4965a4e..a7f21308b14 100644 --- 
a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -47,18 +47,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), ) ) - .addRow( - $.row('Alertmanager') - .addPanel( - $.containerCPUUsagePanel('CPU', 'alertmanager'), - ) - .addPanel( - $.containerMemoryWorkingSetPanel('Memory (workingset)', 'alertmanager'), - ) - .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'alertmanager'), - ) - ) + { templating+: { list: [ From 0ec170582f2af35de679251ee2997405b0a22ff6 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 13 Nov 2020 17:57:35 +0000 Subject: [PATCH 178/364] No risk of dashboard exploding with AM --- .../dashboards/alertmanager-resources.libsonnet | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index cadef56a383..5e00badffb3 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -80,14 +80,5 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="alertmanager"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + { yaxes: $.yaxes('percentunit') }, ) - ) + { - templating+: { - list: [ - // Do not allow to include all clusters/namespaces otherwise this dashboard - // risks to explode because it shows resources per pod. 
- l + (if (l.name == 'cluster' || l.name == 'namespace') then { includeAll: false } else {}) - for l in super.list - ], - }, - }, + ), } From cd0df1bd6f536c63d1d6cb4fe056b51389992fdd Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 13 Nov 2020 17:58:35 +0000 Subject: [PATCH 179/364] Dont reference the compactor --- .../dashboards/alertmanager-resources.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 5e00badffb3..0f36bfb479c 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'alertmanager-resources.json': - local filterNodeDiskByCompactor = ||| + local filterNodeDiskByAlertmanager = ||| ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="alertmanager",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) ||| % $.namespaceMatcher(); ($.dashboard('Cortex / Alertmanager Resources') + { uid: '68b66aed90ccab448009089544a8d6c6' }) @@ -62,13 +62,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Disk') .addPanel( $.panel('Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByAlertmanager, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByAlertmanager, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) @@ -80,5 +80,5 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="alertmanager"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + { yaxes: $.yaxes('percentunit') }, ) - ), + ) + { } From ec5a733d48e19927e8746c90517523eef34db411 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Fri, 13 Nov 2020 18:25:36 +0000 Subject: [PATCH 180/364] [hotfix] missing comma --- jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 0f36bfb479c..9f0be99ac37 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -80,5 +80,5 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="alertmanager"})' % [$.namespaceMatcher(), $.namespaceMatcher(), 
$.namespaceMatcher()], '{{persistentvolumeclaim}}') + { yaxes: $.yaxes('percentunit') }, ) - ) + { + ), } From 7de663f61247f3ee7c4e0660df5c313180059ecc Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Fri, 4 Dec 2020 14:04:01 -0500 Subject: [PATCH 181/364] uses $__rate_interval in ruler dashboard queries Signed-off-by: Owen Diehl --- .../mimir-mixin/dashboards/ruler.libsonnet | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 9c81e8221f4..6f19d61d8b3 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -6,54 +6,53 @@ local utils = import 'mixin-utils/utils.libsonnet'; ruleEvaluations: { success: ||| - sum(rate(cortex_prometheus_rule_evaluations_total{%s}[$__interval])) + sum(rate(cortex_prometheus_rule_evaluations_total{%s}[$__rate_interval])) - - sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval])) + sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__rate_interval])) |||, - failure: 'sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval]))', + failure: 'sum(rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__rate_interval]))', latency: ||| - sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{%s}[$__interval])) + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{%s}[$__rate_interval])) / - sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__interval])) + sum (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__rate_interval])) |||, }, perUserPerGroupEvaluations: { - failure: 'sum by(rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__interval])) > 0', + failure: 'sum by(rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total{%s}[$__rate_interval])) > 0', latency: ||| - sum by(user) (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{%s}[$__interval])) + sum by(user) (rate(cortex_prometheus_rule_evaluation_duration_seconds_sum{%s}[$__rate_interval])) / - sum by(user) (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__interval])) + sum by(user) (rate(cortex_prometheus_rule_evaluation_duration_seconds_count{%s}[$__rate_interval])) |||, }, groupEvaluations: { - missedIterations: 'sum by(user) (rate(cortex_prometheus_rule_group_iterations_missed_total{%s}[$__interval])) > 0', + missedIterations: 'sum by(user) (rate(cortex_prometheus_rule_group_iterations_missed_total{%s}[$__rate_interval])) > 0', latency: ||| - rate(cortex_prometheus_rule_group_duration_seconds_sum{%s}[$__interval]) + rate(cortex_prometheus_rule_group_duration_seconds_sum{%s}[$__rate_interval]) / - rate(cortex_prometheus_rule_group_duration_seconds_count{%s}[$__interval]) + rate(cortex_prometheus_rule_group_duration_seconds_count{%s}[$__rate_interval]) |||, }, notifications: { failure: ||| - sum by(user) (rate(cortex_prometheus_notifications_errors_total{%s}[$__interval])) + sum by(user) (rate(cortex_prometheus_notifications_errors_total{%s}[$__rate_interval])) / - sum by(user) (rate(cortex_prometheus_notifications_sent_total{%s}[$__interval])) + sum by(user) (rate(cortex_prometheus_notifications_sent_total{%s}[$__rate_interval])) > 0 |||, queue: ||| - sum by(user) (rate(cortex_prometheus_notifications_queue_length{%s}[$__interval])) + sum by(user) (rate(cortex_prometheus_notifications_queue_length{%s}[$__rate_interval])) / - sum by(user) 
(rate(cortex_prometheus_notifications_queue_capacity{%s}[$__interval])) - > 0 + sum by(user) (rate(cortex_prometheus_notifications_queue_capacity{%s}[$__rate_interval])) > 0 |||, dropped: ||| - sum by (user) (increase(cortex_prometheus_notifications_dropped_total{%s}[$__interval])) > 0 + sum by (user) (increase(cortex_prometheus_notifications_dropped_total{%s}[$__rate_interval])) > 0 |||, }, }, From efbd9fc4f9ddb082df336e3ffc679dbb378d9dd9 Mon Sep 17 00:00:00 2001 From: Owen Diehl Date: Fri, 4 Dec 2020 14:20:22 -0500 Subject: [PATCH 182/364] s/interva/rate_interval/g Signed-off-by: Owen Diehl --- .../alertmanager-resources.libsonnet | 8 ++--- .../dashboards/alertmanager.libsonnet | 18 +++++------ .../mimir-mixin/dashboards/chunks.libsonnet | 16 +++++----- .../dashboards/compactor-resources.libsonnet | 8 ++--- .../dashboards/compactor.libsonnet | 18 +++++------ .../dashboards/comparison.libsonnet | 32 +++++++++---------- .../dashboards/dashboard-utils.libsonnet | 10 +++--- .../dashboards/object-store.libsonnet | 8 ++--- .../mimir-mixin/dashboards/queries.libsonnet | 16 +++++----- .../mimir-mixin/dashboards/reads.libsonnet | 14 ++++---- .../mimir-mixin/dashboards/writes.libsonnet | 28 ++++++++-------- 11 files changed, 88 insertions(+), 88 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 9f0be99ac37..9bff5fcad74 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -47,13 +47,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Network') .addPanel( $.panel('Receive Bandwidth') + - $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"alertmanager.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + + $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"alertmanager.*"}[$__rate_interval]))' % $.namespaceMatcher(), '{{pod}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Transmit Bandwidth') + - $.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"alertmanager.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + + $.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"alertmanager.*"}[$__rate_interval]))' % $.namespaceMatcher(), '{{pod}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) @@ -62,13 +62,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Disk') .addPanel( $.panel('Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByAlertmanager, '{{pod}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % filterNodeDiskByAlertmanager, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByAlertmanager, '{{pod}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % filterNodeDiskByAlertmanager, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 9de47dbc2d9..fa5c6abf6cd 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ 
b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -25,11 +25,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__interval])) + sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__rate_interval])) - - sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__interval])) + sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval])) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__interval]))' % $.jobMatcher('alertmanager'), + 'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -42,11 +42,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_notifications_total{%s}[$__interval])) + sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) - - sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval])) + sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval]))' % $.jobMatcher('alertmanager'), + 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -57,13 +57,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; [ ||| ( - sum(rate(cortex_alertmanager_notifications_total{%s}[$__interval])) by(integration) + sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) by(integration) - - sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval])) by(integration) + sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration) ) > 0 or on () vector(0) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval])) by(integration)' % $.jobMatcher('alertmanager'), + 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)' % $.jobMatcher('alertmanager'), ], ['success - {{ integration }}', 'failed - {{ integration }}'] ) diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet index 979ee80380d..0481569ab84 100644 --- a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -58,7 +58,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Bytes Logged (WAL+Checkpoint) / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval])) + avg(rate(cortex_ingester_checkpoint_logged_bytes_total{%(m)s}[$__interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval])) + avg(rate(cortex_ingester_checkpoint_logged_bytes_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + { yaxes: $.yaxes('bytes') }, ) ) @@ -66,16 +66,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('WAL') .addPanel( $.panel('Records logged / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_records_logged_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), 'records'), + 
$.queryPanel('avg(rate(cortex_ingester_wal_records_logged_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'records'), ) .addPanel( $.panel('Bytes per record') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval]) / rate(cortex_ingester_wal_records_logged_total{%(m)s}[$__interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval]) / rate(cortex_ingester_wal_records_logged_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + { yaxes: $.yaxes('bytes') }, ) .addPanel( $.panel('Bytes per sample') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__interval]) / rate(cortex_ingester_ingested_samples_total{%(m)s}[$__interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + + $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval]) / rate(cortex_ingester_ingested_samples_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + { yaxes: $.yaxes('bytes') }, ) .addPanel( @@ -88,13 +88,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Checkpoint') .addPanel( $.panel('Checkpoint creation/deletion / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-creation') + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-deletion'), + $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-creation') + + $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-deletion'), ) .addPanel( $.panel('Checkpoint creation/deletion failed / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-creation') + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-deletion'), + $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-creation') + + $.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-deletion'), ) ), } diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index 92b8fd76a38..72e3e280976 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -24,13 +24,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Network') .addPanel( $.panel('Receive Bandwidth') + - $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + + $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"compactor.*"}[$__rate_interval]))' % $.namespaceMatcher(), '{{pod}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Transmit Bandwidth') + - 
$.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"compactor.*"}[$__interval]))' % $.namespaceMatcher(), '{{pod}}') + + $.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"compactor.*"}[$__rate_interval]))' % $.namespaceMatcher(), '{{pod}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) @@ -39,13 +39,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Disk') .addPanel( $.panel('Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 347e0139049..dcd29ea12b8 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -16,16 +16,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.startedCompletedFailedPanel( 'Per-instance runs / sec', - 'sum(rate(cortex_compactor_runs_started_total{%s}[$__interval]))' % $.jobMatcher('compactor'), - 'sum(rate(cortex_compactor_runs_completed_total{%s}[$__interval]))' % $.jobMatcher('compactor'), - 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__interval]))' % $.jobMatcher('compactor') + 'sum(rate(cortex_compactor_runs_started_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_runs_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor') ) + $.bars + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Compacted blocks / sec') + - $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__interval]))' % $.jobMatcher('compactor'), 'blocks') + + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( @@ -37,7 +37,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Garbage Collector') .addPanel( $.panel('Blocks marked for deletion / sec') + - $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__interval]))' % $.jobMatcher('compactor'), 'blocks') + + $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( @@ -45,8 +45,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Blocks deletions / sec', // The cortex_compactor_blocks_cleaned_total tracks the number of successfully // deleted blocks. 
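        // A note on the interval change below (the motivation is an assumption; the
        // patch itself only swaps the variable): Grafana's $__rate_interval is always
        // at least four times the scrape interval, whereas $__interval can drop below
        // two scrape intervals on zoomed-in time ranges, where rate()/increase()
        // return no data points. Using $__rate_interval keeps these panels populated
        // at any zoom level.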
- 'sum(rate(cortex_compactor_blocks_cleaned_total{%s}[$__interval]))' % $.jobMatcher('compactor'), - 'sum(rate(cortex_compactor_block_cleanup_failures_total{%s}[$__interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_blocks_cleaned_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_block_cleanup_failures_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), ) + { yaxes: $.yaxes('ops') } ) ) @@ -57,8 +57,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Metadata Syncs / sec', // The cortex_compactor_meta_syncs_total metric is incremented each time a per-tenant // metadata sync is triggered. - 'sum(rate(cortex_compactor_meta_syncs_total{%s}[$__interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__interval]))' % [$.jobMatcher('compactor'), $.jobMatcher('compactor')], - 'sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_meta_syncs_total{%s}[$__rate_interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher('compactor'), $.jobMatcher('compactor')], + 'sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), ) + { yaxes: $.yaxes('ops') } ) .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/comparison.libsonnet b/jsonnet/mimir-mixin/dashboards/comparison.libsonnet index 836e9b46001..1716f7d4c51 100644 --- a/jsonnet/mimir-mixin/dashboards/comparison.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/comparison.libsonnet @@ -11,8 +11,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingesters') .addPanel( $.panel('Samples / sec') + - $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[$__rate_interval]))', 'blocks') + + $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[$__rate_interval]))', 'chunks') ) ) .addRow( @@ -30,8 +30,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU per sample') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__rate_interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__rate_interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__rate_interval])) / 
sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__rate_interval]))', 'chunks') ) .addPanel( $.panel('Memory per active series') + @@ -46,8 +46,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__rate_interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__rate_interval]))', 'chunks') ) .addPanel( $.panel('Memory') + @@ -62,27 +62,27 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Queriers') .addPanel( $.panel('Queries / sec (query-frontend)') + - $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$blocks_namespace/query-frontend",route!="metrics"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$chunks_namespace/query-frontend",route!="metrics"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$blocks_namespace/query-frontend",route!="metrics"}[$__rate_interval]))', 'blocks') + + $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$chunks_namespace/query-frontend",route!="metrics"}[$__rate_interval]))', 'chunks') ) .addPanel( $.panel('Queries / sec (query-tee)') + - $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval]))', 'blocks') + + $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval]))', 'chunks') ) ) .addRow( $.row('') .addPanel( $.panel('Latency 99th') + - $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval])))', 'blocks') + - $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval])))', 'chunks') + + $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval])))', 'blocks') + + $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval])))', 'chunks') + { yaxes: $.yaxes('s') } ) .addPanel( $.panel('Latency average') + - $.queryPanel('sum by(backend) 
(rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__interval]))', 'blocks') + - $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__interval]))', 'chunks') + + $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval]))', 'blocks') + + $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval]))', 'chunks') + { yaxes: $.yaxes('s') } ) ) @@ -90,8 +90,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"}[$__interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"}[$__interval]))', 'chunks') + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"}[$__rate_interval]))', 'blocks') + + $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"}[$__rate_interval]))', 'chunks') ) .addPanel( $.panel('Memory') + diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 4429e906438..4b5c2b62f0b 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -120,7 +120,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerCPUUsagePanel(title, containerName):: $.panel(title) + $.queryPanel([ - 'sum by(pod) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__interval]))' % [$.namespaceMatcher(), containerName], + 'sum by(pod) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), containerName], 'min(container_spec_cpu_quota{%s,container="%s"} / container_spec_cpu_period{%s,container="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], ], ['{{pod}}', 'limit']) + { @@ -175,13 +175,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.row(title) .addPanel( $.panel('Operations / sec') + - $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + $.stack + { yaxes: $.yaxes('rps') }, ) .addPanel( 
$.panel('Error rate') + - $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s,component="%s"}[$__interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__interval]))' % [$.namespaceMatcher(), component, $.namespaceMatcher(), component], '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s,component="%s"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component, $.namespaceMatcher(), component], '{{operation}}') + { yaxes: $.yaxes('percentunit') }, ) .addPanel( @@ -217,7 +217,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.row(title) .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="%s",name="%s"}[$__interval]))' % [$.jobMatcher(jobName), component, cacheName], '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="%s",name="%s"}[$__rate_interval]))' % [$.jobMatcher(jobName), component, cacheName], '{{operation}}') + $.stack + { yaxes: $.yaxes('ops') }, ) @@ -227,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="%s",name="%s"}[$__interval])) / sum(rate(thanos_cache_memcached_requests_total{%s,component="%s",name="%s"}[$__interval]))' % + $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="%s",name="%s"}[$__rate_interval])) / sum(rate(thanos_cache_memcached_requests_total{%s,component="%s",name="%s"}[$__rate_interval]))' % [ $.jobMatcher(jobName), component, diff --git a/jsonnet/mimir-mixin/dashboards/object-store.libsonnet b/jsonnet/mimir-mixin/dashboards/object-store.libsonnet index 69d0492e703..69e257b60dd 100644 --- a/jsonnet/mimir-mixin/dashboards/object-store.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/object-store.libsonnet @@ -8,13 +8,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Components') .addPanel( $.panel('RPS / component') + - $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__interval]))' % $.namespaceMatcher(), '{{component}}') + + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{component}}') + $.stack + { yaxes: $.yaxes('rps') }, ) .addPanel( $.panel('Error rate / component') + - $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') + + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') + { yaxes: $.yaxes('percentunit') }, ) ) @@ -22,13 +22,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Operations') .addPanel( $.panel('RPS / operation') + - $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__interval]))' % $.namespaceMatcher(), '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), 
'{{operation}}') + $.stack + { yaxes: $.yaxes('rps') }, ) .addPanel( $.panel('Error rate / operation') + - $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') + { yaxes: $.yaxes('percentunit') }, ) ) diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 35888c4187e..449edcabefa 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -148,18 +148,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Store-gateway - Blocks') .addPanel( $.panel('Blocks queried / sec') + - $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Data fetched / sec') + - $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Data touched / sec') + - $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) @@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Series returned (per request)') + - $.queryPanel('sum(rate(cortex_bucket_store_series_result_series_sum{component="store-gateway",%s}[$__interval])) / sum(rate(cortex_bucket_store_series_result_series_count{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), + $.queryPanel('sum(rate(cortex_bucket_store_series_result_series_sum{component="store-gateway",%s}[$__rate_interval])) / sum(rate(cortex_bucket_store_series_result_series_count{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), ) ) .addRowIf( @@ -190,15 +190,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.successFailurePanel( 'Blocks loaded / sec', - 'sum(rate(cortex_bucket_store_block_loads_total{component="store-gateway",%s}[$__interval])) - 
sum(rate(cortex_bucket_store_block_load_failures_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], - 'sum(rate(cortex_bucket_store_block_load_failures_total{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), + 'sum(rate(cortex_bucket_store_block_loads_total{component="store-gateway",%s}[$__rate_interval])) - sum(rate(cortex_bucket_store_block_load_failures_total{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], + 'sum(rate(cortex_bucket_store_block_load_failures_total{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) .addPanel( $.successFailurePanel( 'Blocks dropped / sec', - 'sum(rate(cortex_bucket_store_block_drops_total{component="store-gateway",%s}[$__interval])) - sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], - 'sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), + 'sum(rate(cortex_bucket_store_block_drops_total{component="store-gateway",%s}[$__rate_interval])) - sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], + 'sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) ), diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 3715d78312a..492889b2535 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -17,7 +17,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + { yaxes: $.yaxes('s') } ) @@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' ) + { yaxes: $.yaxes('s') } ) @@ -64,7 +64,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % 
$._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' ) + { yaxes: $.yaxes('s') } ) @@ -82,7 +82,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + { yaxes: $.yaxes('s') } ) @@ -101,7 +101,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' ) + { yaxes: $.yaxes('s') } ) @@ -135,7 +135,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached – Blocks Storage – Index header (Store-gateway)') .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + $.stack + { yaxes: $.yaxes('ops') }, ) @@ -145,7 +145,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum by(item_type) (rate(thanos_store_index_cache_hits_total{component="store-gateway",%s}[$__interval])) / sum by(item_type) (rate(thanos_store_index_cache_requests_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') + + $.queryPanel('sum by(item_type) (rate(thanos_store_index_cache_hits_total{component="store-gateway",%s}[$__rate_interval])) / sum by(item_type) (rate(thanos_store_index_cache_requests_total{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 
'{{item_type}}') + { yaxes: $.yaxes('percentunit') }, ) ) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 3886441434e..030fff7e852 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -43,7 +43,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + { yaxes: $.yaxes('s') } ) @@ -61,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_(v1|prom)_push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' ) + { yaxes: $.yaxes('s') } ) @@ -90,7 +90,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + { yaxes: $.yaxes('s') } ) @@ -176,8 +176,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.successFailurePanel( 'Uploaded blocks / sec', - 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], - 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ), ) .addPanel( @@ -191,8 +191,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.successFailurePanel( 'Compactions / sec', - 'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester)], - 'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + 
'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], + 'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ), ) .addPanel( @@ -206,27 +206,27 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.successFailurePanel( 'WAL truncations / sec', - 'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], - 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + 'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ), ) .addPanel( $.successFailurePanel( 'Checkpoints created / sec', - 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], - 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], + 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ), ) .addPanel( $.panel('WAL truncations latency (includes checkpointing)') + - $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + { yaxes: $.yaxes('s') }, ) .addPanel( $.panel('Corruptions / sec') + $.queryPanel([ - 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), - 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.ingester), + 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ], [ 'WAL', 'mmap-ed chunks', From 4e53eb22bce2875b7c9a9c19c94ddcd1715f750f Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 10 Dec 2020 16:38:19 +0000 Subject: [PATCH 183/364] [Ruler] Dashboard changes Adds several improvements to the visibility and experience of the Ruler 
dashboard: - [chunks] Cache information - [chunks] Index and Chunk information based on queries - [blocks] Store-gateway information based on queries - [both] Makes the group and rule evaluation panels collapsed by default --- .../mimir-mixin/dashboards/ruler.libsonnet | 94 +++++++++++++++---- 1 file changed, 77 insertions(+), 17 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 6f19d61d8b3..5383b52b713 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -135,8 +135,83 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler')) ) ) + .addRowIf( + std.member($._config.storage_engine, 'chunks'), + $.row('Ruler - Chunks storage - Index Cache') + .addPanel( + $.panel('Total entries') + + $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Entries'), + ) + .addPanel( + $.panel('Cache Hit %') + + $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + $.panel('Churn Rate') + + $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.ruler), 'churn rate'), + ) + ) + .addRowIf( + std.member($._config.storage_engine, 'chunks'), + $.row('Ruler - Chunks storage - Store') + .addPanel( + $.panel('Index Lookups per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Series (pre-intersection) per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Series (post-intersection) per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Chunks per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + ) + .addRowIf( + std.member($._config.storage_engine, 'blocks'), + $.row('Ruler - Blocks storage') + .addPanel( + $.panel('Number of store-gateways hit per Query') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Refetches of missing blocks per Query') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) 
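
As an aside, the `$.latencyPanel(metric, selector)` helper used just above builds its latency queries from a histogram's `_bucket`/`_sum`/`_count` series. The following is only a simplified sketch of that idea (the real helper differs in details such as extra percentiles, multipliers and legend formatting); it assumes the same `dashboard-utils.libsonnet` import the dashboards above use, and the selector is passed in braces exactly as in those calls:

```jsonnet
// Illustrative only: a cut-down latency helper in the style of the panels above.
(import 'dashboard-utils.libsonnet') + {
  latencyPanelSketch(metric, selector)::
    $.panel('Latency') +
    $.queryPanel([
      'histogram_quantile(0.99, sum by (le) (rate(%s_bucket%s[5m])))' % [metric, selector],
      'sum(rate(%s_sum%s[5m])) / sum(rate(%s_count%s[5m]))' % [metric, selector, metric, selector],
    ], ['99th percentile', 'average']) +
    { yaxes: $.yaxes('s') },
}
```
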
+ + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Consistency checks failed') + + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Failure Rate') + + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) .addRow( - $.row('Group Evaluations') + $.row('Notifications') + .addPanel( + $.panel('Delivery Errors') + + $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') + ) + .addPanel( + $.panel('Queue Length') + + $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') + ) + .addPanel( + $.panel('Dropped') + + $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher('ruler'), '{{ user }}') + ) + ) + .addRow( + ($.row('Group Evaluations') + { collapse: true }) .addPanel( $.panel('Missed Iterations') + $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher('ruler'), '{{ user }}'), @@ -156,7 +231,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Rule Evaluation per User') + ($.row('Rule Evaluation per User') + { collapse: true }) .addPanel( $.panel('Latency') + $.queryPanel( @@ -164,20 +239,5 @@ local utils = import 'mixin-utils/utils.libsonnet'; '{{ user }}' ) ) - ) - .addRow( - $.row('Notifications') - .addPanel( - $.panel('Delivery Errors') + - $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') - ) - .addPanel( - $.panel('Queue Length') + - $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') - ) - .addPanel( - $.panel('Dropped') + - $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher('ruler'), '{{ user }}') - ) ), } From f90ce5784dc560e6d931c184e5834ad4a6f79467 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 10 Dec 2020 18:44:17 +0000 Subject: [PATCH 184/364] missing a place for using the querier job reference --- jsonnet/mimir-mixin/dashboards/ruler.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 5383b52b713..b9347f7fd8d 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -144,7 +144,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') + $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( From 
1485efcc2866efedc27470549e523b28fdb12a6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Mon, 14 Dec 2020 09:52:49 +0100 Subject: [PATCH 185/364] Add query-scheduler to dashboards and alerts. (https://github.com/grafana/cortex-jsonnet/pull/228) --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 17 ++++++++++++++++- jsonnet/mimir-mixin/config.libsonnet | 1 + .../mimir-mixin/dashboards/queries.libsonnet | 11 +++++++++++ .../dashboards/reads-resources.libsonnet | 12 ++++++++++++ jsonnet/mimir-mixin/dashboards/reads.libsonnet | 11 +++++++++++ 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index ea7020174c4..42c1de68358 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -157,7 +157,22 @@ }, annotations: { message: ||| - There are {{ $value }} queued up queries. + There are {{ $value }} queued up queries in query-frontend. + |||, + }, + }, + { + alert: 'CortexSchedulerQueriesStuck', + expr: ||| + sum by (%s) (cortex_query_scheduler_queue_length) > 1 + ||| % $._config.alert_aggregation_labels, + 'for': '5m', // We don't want to block for longer. + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + There are {{ $value }} queued up queries in query-scheduler. |||, }, }, diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 3ee409002a6..ad89da47940 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -31,6 +31,7 @@ querier: '(querier|cortex$)', ruler: '(ruler|cortex$)', query_frontend: '(query-frontend|cortex$)', + query_scheduler: 'query-scheduler', // Not part of single-binary. 
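
Because the new `query_scheduler` entry is just another regex in `$._config.job_names`, deployments that run the scheduler under a different job name can override it when importing the mixin. A minimal sketch, where the regex value is purely an example and the import path assumes the mixin directory is on the jsonnet path:

```jsonnet
// Hypothetical downstream overlay; '(query-scheduler|scheduler.*)' is an example value, not part of this change.
(import 'mixin.libsonnet') + {
  _config+:: {
    job_names+:: {
      query_scheduler: '(query-scheduler|scheduler.*)',
    },
  },
}
```

The same pattern applies to any of the other job name regexes above.
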
table_manager: '(table-manager|cortex$)', store_gateway: '(store-gateway|cortex$)', gateway: 'cortex-gw', diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index 449edcabefa..fa92d56590e 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -21,6 +21,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{instance}}'), ) ) + .addRow( + $.row('Query Scheduler') + .addPanel( + $.panel('Queue Duration') + + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)), + ) + .addPanel( + $.panel('Queue Length') + + $.queryPanel('cortex_query_scheduler_queue_length{%s}' % $.jobMatcher($._config.job_names.query_scheduler), '{{cluster}} / {{namespace}} / {{instance}}'), + ) + ) .addRow( $.row('Query Frontend - Results Cache') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index 455e8aa0d99..6e39a384d3c 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -28,6 +28,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', 'query-frontend'), ) ) + .addRow( + $.row('Query Scheduler') + .addPanel( + $.containerCPUUsagePanel('CPU', 'query-scheduler'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'query-scheduler'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'query-scheduler'), + ) + ) .addRow( $.row('Querier') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 492889b2535..1fbdb6e9fc6 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -40,6 +40,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('s') } ) ) + .addRow( + $.row('Query Scheduler') + .addPanel( + $.panel('QPS') + + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + ) + .addPanel( + $.panel('Latency (Time in Queue)') + + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + ) + ) .addRow( $.row('Cache - Query Results') .addPanel( From 95129695bb2c1b33e711fe68e8abf95d81da8551 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 11 Dec 2020 17:45:25 +0100 Subject: [PATCH 186/364] Improved blocks storage observability Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 13 ++++++++++ .../dashboards/compactor-resources.libsonnet | 15 +++-------- .../dashboards/compactor.libsonnet | 23 +++++++++++++++-- .../dashboards/dashboard-utils.libsonnet | 5 ++++ .../mimir-mixin/dashboards/queries.libsonnet | 12 +++++++++ .../dashboards/reads-resources.libsonnet | 21 ++++++++++++++++ .../dashboards/writes-resources.libsonnet | 20 +++++++++++++++ .../mimir-mixin/dashboards/writes.libsonnet | 4 +++ jsonnet/mimir-mixin/docs/playbooks.md | 25 ++++++------------- 9 files changed, 108 insertions(+), 30 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 
42c1de68358..c2d266e8ca1 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -239,6 +239,19 @@ |||, }, }, + { + alert: 'CortexMemoryMapAreasTooHigh', + expr: ||| + process_memory_map_areas{job=~".+(cortex|ingester)"} / process_memory_map_areas_limit{job=~".+(cortex|ingester)"} > 0.8 + |||, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.job }}/{{ $labels.instance }} has a number of mmap-ed areas close to the limit.', + }, + }, ], }, { diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index 72e3e280976..6ce5f35ce5e 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -2,10 +2,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-compactor-resources.json': - local filterNodeDiskByCompactor = ||| - ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="compactor",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) - ||| % $.namespaceMatcher(); - ($.dashboard('Cortex / Compactor Resources') + { uid: 'df9added6f1f4332f95848cca48ebd99' }) .addClusterSelectorTemplates() .addRow( @@ -38,20 +34,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Disk') .addPanel( - $.panel('Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + + $.panel('Disk Writes') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('compactor'), '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( - $.panel('Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % filterNodeDiskByCompactor, '{{pod}} - {{device}}') + + $.panel('Disk Reads') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('compactor'), '{{pod}} - {{device}}') + $.stack + { yaxes: $.yaxes('Bps') }, ) - ) - .addRow( - $.row('') .addPanel( $.panel('Disk Space Utilization') + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="compactor"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index dcd29ea12b8..5d656c3f3ee 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -9,8 +9,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.textPanel('', ||| - **Per-instance runs**: number of times a compactor instance triggers a compaction across all tenants its shard manage. - - **Compacted blocks**: number of blocks generated as a result of a compaction operation. - - **Per-block compaction duration**: time taken to generate a single compacted block. + - **Tenants compaction progress**: in a multi-tenant cluster it shows the progress of tenants compacted while compaction is running. 
Reset to 0 once the compaction run is completed for all tenants in the shard. |||), ) .addPanel( @@ -23,6 +22,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.bars + { yaxes: $.yaxes('ops') }, ) + .addPanel( + $.panel('Tenants compaction progress') + + $.queryPanel(||| + ( + cortex_compactor_tenants_processing_succeeded{%s} + + cortex_compactor_tenants_processing_failed{%s} + + cortex_compactor_tenants_skipped{%s} + ) / cortex_compactor_tenants_discovered{%s} + ||| % [$.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor')], '{{instance}}') + + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + $.row('') + .addPanel( + $.textPanel('', ||| + - **Compacted blocks**: number of blocks generated as a result of a compaction operation. + - **Per-block compaction duration**: time taken to generate a single compacted block. + |||), + ) .addPanel( $.panel('Compacted blocks / sec') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), 'blocks') + diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 4b5c2b62f0b..e9824dead09 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -239,4 +239,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') }, ), + + filterNodeDiskContainer(containerName):: + ||| + ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ||| % [$.namespaceMatcher(), containerName], } diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index fa92d56590e..cad974357f9 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -212,5 +212,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) + ) + .addRowIf( + std.member($._config.storage_engine, 'blocks'), + $.row('') + .addPanel( + $.panel('Lazy loaded index-headers') + + $.queryPanel('cortex_bucket_store_indexheader_lazy_load_total{%s} - cortex_bucket_store_indexheader_lazy_unload_total{%s}' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{instance}}') + ) + .addPanel( + $.panel('Index-header lazy load duration') + + $.latencyPanel('cortex_bucket_store_indexheader_lazy_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), + ) ), } diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index 6e39a384d3c..7c3cc30719d 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -95,6 +95,27 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.goHeapInUsePanel('Memory (go heap inuse)', 'store-gateway'), ) + ) + .addRowIf( + std.member($._config.storage_engine, 'blocks'), + $.row('') + .addPanel( + $.panel('Disk Writes') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % 
$.filterNodeDiskContainer('store-gateway'), '{{pod}} - {{device}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Disk Reads') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('store-gateway'), '{{pod}} - {{device}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Disk Space Utilization') + + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="store-gateway"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + + { yaxes: $.yaxes('percentunit') }, + ) ) + { templating+: { list: [ diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index a7f21308b14..5e7391ba7f2 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -47,6 +47,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), ) ) + .addRow( + $.row('') + .addPanel( + $.panel('Disk Writes') + + $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('ingester'), '{{pod}} - {{device}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Disk Reads') + + $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('ingester'), '{{pod}} - {{device}}') + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Disk Space Utilization') + + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="ingester"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + + { yaxes: $.yaxes('percentunit') }, + ) + ) + { templating+: { list: [ diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 030fff7e852..9ab73d12e70 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -25,6 +25,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; distributor: $.jobMatcher($._config.job_names.distributor), }, format='short') ) + .addPanel( + $.panel('Tenants') + + $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') + ) .addPanel( $.panel('QPS') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 1bc69bded08..d184e7449e9 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -53,26 +53,17 @@ This alert goes off when an ingester fails to find another node to transfer its ### CortexIngesterUnhealthy This alert goes off when an ingester is marked as unhealthy. Check the ring web page to see which is marked as unhealthy. 
 You could then check the logs to see if there are any related to that ingester ex: `kubectl logs -f ingester-01 --namespace=prod`. A simple way to resolve this may be to click the "Forgot" button on the ring page, especially if the pod doesn't exist anymore. It might not exist anymore because it was on a node that got shut down, so you could check to see if there are any logs related to the node that pod is/was on, ex: `kubectl get events --namespace=prod | grep cloud-provider-node`.
 
-### CortexFlushStuck
-@todo
+### CortexMemoryMapAreasTooHigh
 
-### CortexLoadBalancerErrors
-@todo
+This alert fires when a Cortex process has a number of memory map areas close to the limit. The limit is a per-process limit imposed by the kernel and this issue is typically caused by a large number of mmap-ed files.
 
-### CortexTableSyncFailure
-@todo
+How to **fix**:
+- Increase the limit on your system: `sysctl -w vm.max_map_count=`
+- If it's caused by a store-gateway, consider enabling `-blocks-storage.bucket-store.index-header-lazy-loading-enabled=true` to lazy mmap index-headers at query time
 
-### CortexQuerierCapacityFull
-@todo
-
-### CortexFrontendQueriesStuck
-@todo
-
-### CortexProvisioningTooMuchMemory
-@todo
-
-### MemcachedDown
-@todo
+More information:
+- [Kernel doc](https://www.kernel.org/doc/Documentation/sysctl/vm.txt)
+- [Side effects when increasing `vm.max_map_count`](https://www.suse.com/support/kb/doc/?id=000016692)
 
 ### CortexRulerFailedRingCheck

From 3e507358a4a17c6def98baeeebec1f3f8f05ae60 Mon Sep 17 00:00:00 2001
From: Marco Pracucci
Date: Mon, 14 Dec 2020 09:54:15 +0100
Subject: [PATCH 187/364] Addressed feedback

Signed-off-by: Marco Pracucci
---
 jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet
index c2d266e8ca1..b6628ab39f0 100644
--- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet
+++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet
@@ -242,7 +242,7 @@
       {
         alert: 'CortexMemoryMapAreasTooHigh',
         expr: |||
-          process_memory_map_areas{job=~".+(cortex|ingester)"} / process_memory_map_areas_limit{job=~".+(cortex|ingester)"} > 0.8
+          process_memory_map_areas{job=~".+(cortex|ingester|store-gateway)"} / process_memory_map_areas_limit{job=~".+(cortex|ingester|store-gateway)"} > 0.8
         |||,
         'for': '5m',
         labels: {

From ba803b91b5e74186ff7097cc91ffb4de8da335be Mon Sep 17 00:00:00 2001
From: Marco Pracucci
Date: Mon, 14 Dec 2020 10:02:40 +0100
Subject: [PATCH 188/364] Fixed workingset memory panel while rolling out a StatefulSet

Signed-off-by: Marco Pracucci
---
 jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet
index e9824dead09..5644af3488a 100644
--- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet
+++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet
@@ -136,7 +136,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
   containerMemoryWorkingSetPanel(title, containerName)::
     $.panel(title) +
     $.queryPanel([
-      'sum by(pod) (container_memory_working_set_bytes{%s,container="%s"})' % [$.namespaceMatcher(), containerName],
+      // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up
+      // summing the memory of the old pod (whose metric will be stale for 5m) to the new pod.
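
To make the comment above concrete: during a StatefulSet rolling update the replacement pod reuses the same name (for example `ingester-0`), so until the old series goes stale (up to 5 minutes) two series can carry the same `pod` label value. `sum by(pod)` then adds the old and new samples together, while `max by(pod)` keeps only the larger, live one. A small illustration, where the literal `namespace`/`container` selectors merely stand in for `$.namespaceMatcher()` and the container name:

```jsonnet
// Illustrative queries only; selectors are placeholders for the mixin's matchers.
{
  // Double-counts memory while the replaced pod's series is still within the staleness window.
  sum_query:: 'sum by(pod) (container_memory_working_set_bytes{namespace="cortex", container="ingester"})',
  // Reports a single, sane value per pod name throughout the rollout.
  max_query:: 'max by(pod) (container_memory_working_set_bytes{namespace="cortex", container="ingester"})',
}
```
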
+ 'max by(pod) (container_memory_working_set_bytes{%s,container="%s"})' % [$.namespaceMatcher(), containerName], 'min(container_spec_memory_limit_bytes{%s,container="%s"} > 0)' % [$.namespaceMatcher(), containerName], ], ['{{pod}}', 'limit']) + { From 2d97250f15d5c98cad75ea34fcc40fd29d7ae5d4 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 14 Dec 2020 10:24:46 +0100 Subject: [PATCH 189/364] Added avg and max blocks / tenant Signed-off-by: Marco Pracucci --- .../dashboards/compactor.libsonnet | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 5d656c3f3ee..36bff1d38a7 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ($.dashboard('Cortex / Compactor') + { uid: '9c408e1d55681ecb8a22c9fab46875cc' }) .addClusterSelectorTemplates() .addRow( - $.row('Compactions') + $.row('Summary') .addPanel( $.textPanel('', ||| - **Per-instance runs**: number of times a compactor instance triggers a compaction across all tenants its shard manage. @@ -52,6 +52,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher('compactor')) ) ) + .addRow( + $.row('') + .addPanel( + $.textPanel('', ||| + - **Average blocks / tenant**: the average number of blocks per tenant. + - **Tenants with largest number of blocks**: the 10 tenants with the largest number of blocks. + |||), + ) + .addPanel( + $.panel('Average blocks / tenant') + + $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher('compactor'), 'avg'), + ) + .addPanel( + $.panel('Tenants with largest number of blocks') + + $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher('compactor'), '{{user}}'), + ) + ) .addRow( $.row('Garbage Collector') .addPanel( From 83ea7c8b54cf2fa8fb9ce29357bc2f7f05b2b66c Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 14 Dec 2020 11:53:04 +0100 Subject: [PATCH 190/364] Fixed CortexRequestErrors alert to not include ready route Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index b6628ab39f0..0d51d4ed66b 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -21,9 +21,9 @@ // Note is alert_aggregation_labels is "job", this will repeat the label. But // prometheus seems to tolerate that. 
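
For context on the note above: assuming `alert_aggregation_labels` is set to `cluster, namespace` (an assumption; the value is configurable), the templated expression below renders to roughly the following PromQL, shown here as a jsonnet string for reference:

```jsonnet
// Approximate rendered form of the CortexRequestErrors expression after this change.
{
  rendered_expr:: |||
    100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m]))
    /
    sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m]))
    > 1
  |||,
}
```

Excluding the `ready` route presumably keeps readiness-probe failures during rollouts from counting against the request error rate.
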
expr: ||| - 100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5.."}[1m])) + 100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m])) / - sum by (%s, job, route) (rate(cortex_request_duration_seconds_count[1m])) + sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m])) > 1 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', @@ -39,7 +39,7 @@ { alert: 'CortexRequestLatency', expr: ||| - cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready"} > %(cortex_p99_latency_threshold_seconds)s ||| % $._config, From 3aecb5d59cd2155a1b6ef8073e1f79c4519e9648 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 11:52:03 +0100 Subject: [PATCH 191/364] Added bucket index observability Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 27 ++++++++++++++++++ .../mimir-mixin/dashboards/queries.libsonnet | 28 +++++++++++++++++-- jsonnet/mimir-mixin/docs/playbooks.md | 22 +++++++++++++++ 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index d99b9a0fcfd..8207746ee8b 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -184,6 +184,33 @@ message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.', }, }, + { + // Alert if the bucket index has not been updated for a given user. + alert: 'CortexBucketIndexNotUpdated', + expr: ||| + min by(namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}.', + }, + }, + { + // Alert if a we consistently find partial blocks for a given tenant over a relatively large time range. 
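
When the partial-blocks alert defined below fires, a quick way to see which tenants are affected is to query the same metric the rule uses; the grouping and `topk` limit below are just one reasonable choice, not part of the alert itself:

```jsonnet
// Ad-hoc PromQL (kept as a jsonnet string) to list the tenants currently reporting partial blocks.
{
  partial_blocks_by_tenant:: 'topk(10, max by(user) (cortex_bucket_blocks_partials_count))',
}
```
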
+ alert: 'CortexTenantHasPartialBlocks', + 'for': '6h', + expr: ||| + max by(namespace, user) (cortex_bucket_blocks_partials_count) > 0 + |||, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Cortex tenant {{ $labels.user }} in {{ $labels.namespace }} has {{ $value }} partial blocks.', + }, + }, ], }, ], diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index cad974357f9..5f51425b6ed 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -136,7 +136,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRowIf( - std.member($._config.storage_engine, 'chunks'), + std.member($._config.storage_engine, 'blocks'), $.row('Querier - Blocks storage') .addPanel( $.panel('Number of store-gateways hit per Query') + @@ -156,7 +156,31 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Store-gateway - Blocks') + $.row('') + .addPanel( + $.panel('Bucket indexes loaded (per querier)') + + $.queryPanel([ + 'max(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), + 'min(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), + 'avg(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), + ], ['Max', 'Min', 'Average']) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.successFailurePanel( + 'Bucket indexes load / sec', + 'sum(rate(cortex_bucket_index_loads_total{%s}[$__rate_interval])) - sum(rate(cortex_bucket_index_load_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], + 'sum(rate(cortex_bucket_index_load_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.querier), + ) + ) + .addPanel( + $.panel('Bucket indexes load latency') + + $.latencyPanel('cortex_bucket_index_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.querier)), + ) + ) + .addRowIf( + std.member($._config.storage_engine, 'blocks'), + $.row('Store-gateway - Blocks storage') .addPanel( $.panel('Blocks queried / sec') + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index d184e7449e9..b8046409992 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -226,6 +226,28 @@ gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). +### CortexBucketIndexNotUpdated + +This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. + +How to **investigate**: +- Ensure the compactor is successfully running +- Look for any error in the compactor logs + +### CortexTenantHasPartialBlocks + +This alert fires when Cortex finds partial blocks for a given tenant. A partial block is a block missing the `meta.json` and this may usually happen in two circumstances: + +1. A block upload has been interrupted and not cleaned up or retried +2. 
A block deletion has been interrupted and `deletion-mark.json` has been deleted before `meta.json` + +How to **investigate**: +- Look for the block ID in the logs +- Find out which Cortex component operated on the block at last (eg. uploaded by ingester/compactor, or deleted by compactor) +- Investigate if was a partial upload or partial delete +- Safely manually delete the block from the bucket if was a partial delete or an upload failed by a compactor +- Further investigate if was an upload failed by an ingester but not later retried (ingesters are expected to retry uploads until succeed) + ### CortexWALCorruption This alert is only related to the chunks storage. This can happen because of 2 reasons: (1) Non graceful shutdown of ingesters. (2) Faulty storage or NFS. From 476b1e9171228f3c78b19fe528f89980cfe0ff06 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 5 Jan 2021 12:42:14 +0100 Subject: [PATCH 192/364] Honor per_instance_label in all panels Signed-off-by: Marco Pracucci --- .../dashboards/alertmanager-resources.libsonnet | 13 ++++++++----- jsonnet/mimir-mixin/dashboards/chunks.libsonnet | 10 +++++----- .../dashboards/compactor-resources.libsonnet | 10 ++++++++-- .../mimir-mixin/dashboards/compactor.libsonnet | 2 +- .../dashboards/dashboard-utils.libsonnet | 9 ++++++--- jsonnet/mimir-mixin/dashboards/queries.libsonnet | 8 ++++---- .../dashboards/reads-resources.libsonnet | 15 ++++++++++++--- .../dashboards/writes-resources.libsonnet | 15 ++++++++++++--- 8 files changed, 56 insertions(+), 26 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 9bff5fcad74..5fdd92a283a 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -2,9 +2,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'alertmanager-resources.json': - local filterNodeDiskByAlertmanager = ||| - ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="alertmanager",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) - ||| % $.namespaceMatcher(); ($.dashboard('Cortex / Alertmanager Resources') + { uid: '68b66aed90ccab448009089544a8d6c6' }) .addClusterSelectorTemplates() .addRow( @@ -62,13 +59,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Disk') .addPanel( $.panel('Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % filterNodeDiskByAlertmanager, '{{pod}} - {{device}}') + + $.queryPanel( + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('alertmanager')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % filterNodeDiskByAlertmanager, '{{pod}} - {{device}}') + + $.queryPanel( + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('alertmanager')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, ) diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet index 
0481569ab84..b82c68800db 100644 --- a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/chunks.libsonnet @@ -43,7 +43,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Flush Stats') .addPanel( $.panel('Queue Length') + - $.queryPanel('cortex_ingester_flush_queue_length{%s}' % $.jobMatcher($._config.job_names.ingester), '{{instance}}'), + $.queryPanel('cortex_ingester_flush_queue_length{%s}' % $.jobMatcher($._config.job_names.ingester), '{{%s}}' % $._config.per_instance_label), ) .addPanel( $.panel('Flush Rate') + @@ -88,13 +88,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Checkpoint') .addPanel( $.panel('Checkpoint creation/deletion / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-creation') + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-deletion'), + $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-creation' % $._config.per_instance_label) + + $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-deletion' % $._config.per_instance_label), ) .addPanel( $.panel('Checkpoint creation/deletion failed / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-creation') + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{instance}}-deletion'), + $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-creation' % $._config.per_instance_label) + + $.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-deletion' % $._config.per_instance_label), ) ), } diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index 6ce5f35ce5e..c24f600b05a 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -35,13 +35,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Disk') .addPanel( $.panel('Disk Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('compactor'), '{{pod}} - {{device}}') + + $.queryPanel( + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('compactor')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Disk Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('compactor'), '{{pod}} - {{device}}') + + $.queryPanel( + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('compactor')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, 
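
All of these per-instance panels now read their grouping label from a single knob, `$._config.per_instance_label`, so an environment where `pod` is not the right label (for example a deployment scraping plain `instance` labels) only needs one override. A minimal sketch, with an assumed value and assuming the mixin is imported as below:

```jsonnet
// Hypothetical overlay: group per-instance panels and legends by 'instance' instead of the default label.
(import 'mixin.libsonnet') + {
  _config+:: {
    per_instance_label: 'instance',
  },
}
```
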
) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 36bff1d38a7..d767e28d7d2 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -30,7 +30,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; cortex_compactor_tenants_processing_failed{%s} + cortex_compactor_tenants_skipped{%s} ) / cortex_compactor_tenants_discovered{%s} - ||| % [$.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor')], '{{instance}}') + + ||| % [$.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor')], '{{%s}}' % $._config.per_instance_label) + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) ) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 5644af3488a..f4f1ec6aeaa 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -154,7 +154,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; goHeapInUsePanel(title, jobName):: $.panel(title) + - $.queryPanel('sum by(instance) (go_memstats_heap_inuse_bytes{%s})' % $.jobMatcher(jobName), '{{instance}}') + + $.queryPanel( + 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [$._config.per_instance_label, $.jobMatcher(jobName)], + '{{%s}}' % $._config.per_instance_label + ) + { yaxes: $.yaxes('bytes') }, // Switches a panel from lines (default) to bars. @@ -244,6 +247,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; filterNodeDiskContainer(containerName):: ||| - ignoring(pod) group_right() (label_replace(count by(pod, instance, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) - ||| % [$.namespaceMatcher(), containerName], + ignoring(%s) group_right() (label_replace(count by(%s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ||| % [$._config.per_instance_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], } diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index cad974357f9..2fcbf633024 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -18,7 +18,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Queue Length') + - $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{instance}}'), + $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( @@ -29,7 +29,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Queue Length') + - $.queryPanel('cortex_query_scheduler_queue_length{%s}' % $.jobMatcher($._config.job_names.query_scheduler), '{{cluster}} / {{namespace}} / {{instance}}'), + $.queryPanel('cortex_query_scheduler_queue_length{%s}' % $.jobMatcher($._config.job_names.query_scheduler), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( @@ -196,7 +196,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Blocks currently 
loaded') + - $.queryPanel('cortex_bucket_store_blocks_loaded{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{instance}}') + $.queryPanel('cortex_bucket_store_blocks_loaded{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{%s}}' % $._config.per_instance_label) ) .addPanel( $.successFailurePanel( @@ -218,7 +218,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Lazy loaded index-headers') + - $.queryPanel('cortex_bucket_store_indexheader_lazy_load_total{%s} - cortex_bucket_store_indexheader_lazy_unload_total{%s}' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{instance}}') + $.queryPanel('cortex_bucket_store_indexheader_lazy_load_total{%s} - cortex_bucket_store_indexheader_lazy_unload_total{%s}' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{%s}}' % $._config.per_instance_label) ) .addPanel( $.panel('Index-header lazy load duration') + diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index 7c3cc30719d..715673b68da 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -68,7 +68,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ruler') .addPanel( $.panel('Rules') + - $.queryPanel('sum by(instance) (cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), '{{instance}}'), + $.queryPanel( + 'sum by(%s) (cortex_prometheus_rule_group_rules{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ruler)], + '{{%s}}' % $._config.per_instance_label + ), ) .addPanel( $.containerCPUUsagePanel('CPU', 'ruler'), @@ -101,13 +104,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Disk Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('store-gateway'), '{{pod}} - {{device}}') + + $.queryPanel( + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('store-gateway')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Disk Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('store-gateway'), '{{pod}} - {{device}}') + + $.queryPanel( + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('store-gateway')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, ) diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index 5e7391ba7f2..9d7e3e80309 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -32,7 +32,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester') .addPanel( $.panel('In-memory series') + - $.queryPanel('sum by(instance) (cortex_ingester_memory_series{%s})' % $.jobMatcher($._config.job_names.ingester), '{{instance}}'), + $.queryPanel( + 'sum by(%s) (cortex_ingester_memory_series{%s})' % 
[$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], + '{{%s}}' % $._config.per_instance_label + ), ) .addPanel( $.containerCPUUsagePanel('CPU', 'ingester'), @@ -51,13 +54,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Disk Writes') + - $.queryPanel('sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('ingester'), '{{pod}} - {{device}}') + + $.queryPanel( + 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('ingester')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, ) .addPanel( $.panel('Disk Reads') + - $.queryPanel('sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % $.filterNodeDiskContainer('ingester'), '{{pod}} - {{device}}') + + $.queryPanel( + 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('ingester')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, ) From a5c8c8ad726394b1549a112c3f3b6c2470719b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 6 Jan 2021 12:17:49 +0100 Subject: [PATCH 193/364] Ignore long-running requests on query-scheduler when checking for high latency. (https://github.com/grafana/cortex-jsonnet/pull/242) --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 0d51d4ed66b..9fe049287a5 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -39,7 +39,7 @@ { alert: 'CortexRequestLatency', expr: ||| - cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready"} + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > %(cortex_p99_latency_threshold_seconds)s ||| % $._config, From 3de40e6be7bcc87250c896adff2979d4147bd922 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 15 Jan 2021 09:41:26 +0100 Subject: [PATCH 194/364] Honor configured job_names in the 'Memory (go heap inuse)' panel Signed-off-by: Marco Pracucci --- .../dashboards/alertmanager-resources.libsonnet | 2 +- .../dashboards/reads-resources.libsonnet | 14 +++++++------- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 2 +- .../dashboards/writes-resources.libsonnet | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 5fdd92a283a..21ed7db9f0a 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'cortex-gw'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.gateway), ) ) .addRow( diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet 
b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index 715673b68da..c37948c652f 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'cortex-gw'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.gateway), ) ) .addRow( @@ -25,7 +25,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'query-frontend'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'query-frontend'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.query_frontend), ) ) .addRow( @@ -37,7 +37,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'query-scheduler'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'query-scheduler'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.query_scheduler), ) ) .addRow( @@ -49,7 +49,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'querier'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'querier'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.querier), ) ) .addRow( @@ -61,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ingester'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.ingester), ) ) .addRow( @@ -83,7 +83,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'ruler'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.ruler), ) ) .addRowIf( @@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'store-gateway'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'store-gateway'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.store_gateway), ) ) .addRowIf( diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 1fbdb6e9fc6..9f98308c9d8 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -143,7 +143,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Memcached – Blocks Storage – Index header (Store-gateway)') + $.row('Memcached – Blocks Storage – Block Index (Store-gateway)') .addPanel( $.panel('QPS') + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index 9d7e3e80309..ab5b92a61c3 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 
$.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'cortex-gw'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.gateway), ) ) .addRow( @@ -25,7 +25,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'distributor'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'distributor'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.distributor), ) ) .addRow( @@ -47,7 +47,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ingester'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.ingester), ) ) .addRow( From 7d7897da6d0457bea76a5a3ced04879b8ed81669 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 18 Jan 2021 10:29:24 +0100 Subject: [PATCH 195/364] Fixed ingester "Disk Space Utilization" to include any ingester.* PV Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index ab5b92a61c3..62a750069e1 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -72,7 +72,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Disk Space Utilization') + - $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="ingester"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name=~"ingester.*"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + { yaxes: $.yaxes('percentunit') }, ) ) From e5c978bae69b8222851588e40a13a01eac23b529 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Mon, 18 Jan 2021 17:10:34 +0100 Subject: [PATCH 196/364] Alert quicker for broken runtime config Signed-off-by: Goutham Veeramachaneni --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 9fe049287a5..6cf9f2d59e6 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -121,7 +121,8 @@ or cortex_overrides_last_reload_successful == 0 |||, - 'for': '15m', + // Alert quicker for human errors. 
+ 'for': '5m', labels: { severity: 'warning', }, From 6a0998dec8acfe2d56ef38bfd4ee23628d1a1d00 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 19 Jan 2021 09:57:36 +0100 Subject: [PATCH 197/364] Fixed ingester alerts to include any ingester.* job Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 6 +++--- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 6cf9f2d59e6..b2cff6b8b3d 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -198,7 +198,7 @@ { alert: 'CortexIngesterRestarts', expr: ||| - changes(process_start_time_seconds{job=~".+(cortex|ingester)"}[30m]) > 1 + changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) > 1 |||, labels: { severity: 'critical', @@ -243,7 +243,7 @@ { alert: 'CortexMemoryMapAreasTooHigh', expr: ||| - process_memory_map_areas{job=~".+(cortex|ingester|store-gateway)"} / process_memory_map_areas_limit{job=~".+(cortex|ingester|store-gateway)"} > 0.8 + process_memory_map_areas{job=~".+(cortex|ingester.*|store-gateway)"} / process_memory_map_areas_limit{job=~".+(cortex|ingester.*|store-gateway)"} > 0.8 |||, 'for': '5m', labels: { @@ -502,7 +502,7 @@ expr: ||| memberlist_client_cluster_members_count != on (%s) group_left - sum by (%s) (up{job=~".+/(distributor|ingester|querier|cortex|ruler)"}) + sum by (%s) (up{job=~".+/(distributor|ingester.*|querier|cortex|ruler)"}) ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', labels: { diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 8207746ee8b..12e9160d881 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -9,9 +9,9 @@ alert: 'CortexIngesterHasNotShippedBlocks', 'for': '15m', expr: ||| - (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 60 * 60 * 4) + (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4) and - (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) > 0) + (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0) and (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) |||, @@ -28,7 +28,7 @@ alert: 'CortexIngesterHasNotShippedBlocksSinceStart', 'for': '4h', expr: ||| - (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester"}) == 0) + (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0) and (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) |||, From 2ed56b1badee6e090c21c1b099f15aa208e5ae3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 20 Jan 2021 12:22:47 +0100 Subject: [PATCH 198/364] Disable multi-selection for variables in resources dashboards. (https://github.com/grafana/cortex-jsonnet/pull/251) * Disable multi-selection for variables in resources dashboards. 
* CHANGELOG.md --- .../alertmanager-resources.libsonnet | 2 +- .../dashboards/dashboard-utils.libsonnet | 20 +++++++++++++------ .../dashboards/reads-resources.libsonnet | 2 +- .../dashboards/writes-resources.libsonnet | 2 +- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 21ed7db9f0a..7ca50c5db2b 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'alertmanager-resources.json': ($.dashboard('Cortex / Alertmanager Resources') + { uid: '68b66aed90ccab448009089544a8d6c6' }) - .addClusterSelectorTemplates() + .addClusterSelectorTemplates(false) .addRow( $.row('Gateway') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index f4f1ec6aeaa..2e48fd7a050 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -14,7 +14,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; then self.addRow(row) else self, - addClusterSelectorTemplates():: + addClusterSelectorTemplates(multi=true):: local d = self { tags: $._config.tags, links: [ @@ -31,11 +31,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], }; - if $._config.singleBinary - then d.addMultiTemplate('job', 'cortex_build_info', 'job') - else d - .addMultiTemplate('cluster', 'cortex_build_info', 'cluster') - .addMultiTemplate('namespace', 'cortex_build_info', 'namespace'), + if multi then + if $._config.singleBinary + then d.addMultiTemplate('job', 'cortex_build_info', 'job') + else d + .addMultiTemplate('cluster', 'cortex_build_info', 'cluster') + .addMultiTemplate('namespace', 'cortex_build_info', 'namespace') + else + if $._config.singleBinary + then d.addTemplate('job', 'cortex_build_info', 'job') + else d + .addTemplate('cluster', 'cortex_build_info', 'cluster') + .addTemplate('namespace', 'cortex_build_info', 'namespace'), + }, // The mixin allow specialism of the job selector depending on if its a single binary diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index c37948c652f..d1fd0466e15 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-reads-resources.json': ($.dashboard('Cortex / Reads Resources') + { uid: '2fd2cda9eea8d8af9fbc0a5960425120' }) - .addClusterSelectorTemplates() + .addClusterSelectorTemplates(false) .addRow( $.row('Gateway') .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index 62a750069e1..68570111256 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { 'cortex-writes-resources.json': ($.dashboard('Cortex / Writes Resources') + { uid: 'c0464f0d8bd026f776c9006b0591bb0b' }) - .addClusterSelectorTemplates() + .addClusterSelectorTemplates(false) .addRow( 
$.row('Gateway') .addPanel( From 23a762ff10f3b5ab3b0f3edc20ae6f14ef2d86f1 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 25 Jan 2021 14:15:00 +0100 Subject: [PATCH 199/364] Added alert CortexIngesterHasUnshippedBlocks Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 18 ++++++++++++++++++ jsonnet/mimir-mixin/docs/playbooks.md | 7 +++++++ 2 files changed, 25 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index 12e9160d881..beeb62408a5 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -39,6 +39,24 @@ message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', }, }, + { + // Alert if the ingester has compacted some blocks that haven't been successfully uploaded to the storage yet since + // more than 1 hour. The metric tracks the time of the oldest unshipped block, measured as the time when the + // TSDB head has been compacted to a block. The metric is 0 if all blocks have been shipped. + alert: 'CortexIngesterHasUnshippedBlocks', + 'for': '15m', + expr: ||| + (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) + and + (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: "Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.", + }, + }, { // Alert if the ingester is failing to compact TSDB head into a block, for any opened TSDB. Once the TSDB head is // compactable, the ingester will try to compact it every 1 minute. Repeatedly failing it is a critical condition diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index b8046409992..6ce4e7a3dcb 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -92,6 +92,13 @@ If the ingester hit the disk capacity, any attempt to append samples will fail. Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks). +### CortexIngesterHasUnshippedBlocks + +This alert fires when a Cortex ingester has compacted some blocks but such blocks haven't been successfully uploaded to the storage yet. + +How to **investigate**: +- Look for details in the ingester logs + ### CortexIngesterTSDBHeadCompactionFailed This alert fires when a Cortex ingester is failing to compact the TSDB head into a block. 
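For the `CortexIngesterHasUnshippedBlocks` alert added above, a useful first step before reading the ingester logs is to run a per-instance variant of the alert expression and see how old the oldest unshipped block is on each ingester. This is only a sketch built from the same metric the alert uses (`cortex_ingester_oldest_unshipped_block_timestamp_seconds`) and the same `namespace`/`instance` labels as the alert expression; adjust the labels if your scrape config differs.

```
# Age (seconds) of the oldest compacted-but-unshipped block, per ingester.
# The alert fires once this stays above 3600s (1h) for 15m.
max by (namespace, instance) (
  (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds)
  and
  (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0)
)
```

Instances whose value keeps growing here are the ones to check for failed block uploads to the object store.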
From fbab4f2ea9bbf2a80d5bda33443fca267ffcac3f Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 27 Jan 2021 13:05:37 +0100 Subject: [PATCH 200/364] Increased CortexAllocatingTooMuchMemory alert threshold Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index b2cff6b8b3d..14b4e04b791 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -404,7 +404,7 @@ container_memory_working_set_bytes{container="ingester"} / container_spec_memory_limit_bytes{container="ingester"} - ) > 0.5 + ) > 0.65 |||, 'for': '15m', labels: { From bbc5c3483db43ee454019aa9f9f86303e7662aeb Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Thu, 25 Feb 2021 10:54:53 +0100 Subject: [PATCH 201/364] Add alert for etcd memory limits close Signed-off-by: Goutham Veeramachaneni --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 43 +++++++++++++++++++++ jsonnet/mimir-mixin/docs/playbooks.md | 18 +++++++++ 2 files changed, 61 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 14b4e04b791..7a57c9eab7a 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -514,5 +514,48 @@ }, ], }, + { + name: 'etcd_alerts', + rules: [ + { + alert: 'EtcdAllocatingTooMuchMemory', + expr: ||| + ( + container_memory_working_set_bytes{container="etcd"} + / + container_spec_memory_limit_bytes{container="etcd"} + ) > 0.65 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. + |||, + }, + }, + { + alert: 'EtcdAllocatingTooMuchMemory', + expr: ||| + ( + container_memory_working_set_bytes{container="etcd"} + / + container_spec_memory_limit_bytes{container="etcd"} + ) > 0.8 + |||, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. + |||, + }, + }, + ], + }, ], } diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 6ce4e7a3dcb..cc656eb2139 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -274,6 +274,24 @@ WAL corruptions are only detected at startups, so at this point the WAL/Checkpoi 2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. 3. Equal or more than the replication factor: Then there is definitely some data loss. +### EtcdAllocatingTooMuchMemory + +This can be triggered if there are too many HA dedupe keys in etcd. We saw this when one of our clusters hit 20K tenants that were using HA dedupe config. 
Raise the etcd limits via: + +``` + etcd+: { + spec+: { + pod+: { + resources+: { + limits: { + memory: '2Gi', + }, + }, + }, + }, + }, +``` + ## Cortex blocks storage - What to do when things to wrong ## Recovering from a potential data loss incident From d5ad43aa43f2650394f095b20a02f731436751ae Mon Sep 17 00:00:00 2001 From: Mauro Stettler Date: Fri, 5 Mar 2021 04:21:23 -0300 Subject: [PATCH 202/364] the distributor now supports push via GRPC (https://github.com/grafana/cortex-jsonnet/pull/266) Signed-off-by: Mauro Stettler --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 9ab73d12e70..eb3fcbc3a74 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -56,16 +56,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Distributor') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/httpgrpc.*|api_(v1|prom)_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' ) + { yaxes: $.yaxes('s') } ) From adb82b6d3a6846306fb9da0488e363f12297bb1d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 9 Mar 2021 18:09:02 +0100 Subject: [PATCH 203/364] Fixed CortexQuerierHighRefetchRate alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index beeb62408a5..f416f30a231 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -172,7 +172,7 @@ ( sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) - - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0"}[5m])) + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0(\\.0)?"}[5m])) ) / sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) From 667ca36ee87888be29596fda5d527d2250d83bdf Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 10 Mar 2021 08:59:47 +0100 Subject: [PATCH 204/364] Fixed label matcher 
Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index f416f30a231..ab1f15fb8e2 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -172,7 +172,7 @@ ( sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) - - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0(\\.0)?"}[5m])) + sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) ) / sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) From 5c75158411ec43ab260a840b0a145e4b2dcc7ac8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 12 Mar 2021 09:55:54 +0100 Subject: [PATCH 205/364] Sort legend descending in the CPU/memory panels Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 7 ++++++- jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 2e48fd7a050..47ff4919a7d 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -139,6 +139,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; fill: 0, }, ], + tooltip: { sort: 2 }, // Sort descending. }, containerMemoryWorkingSetPanel(title, containerName):: @@ -158,6 +159,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ], yaxes: $.yaxes('bytes'), + tooltip: { sort: 2 }, // Sort descending. }, goHeapInUsePanel(title, jobName):: @@ -166,7 +168,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [$._config.per_instance_label, $.jobMatcher(jobName)], '{{%s}}' % $._config.per_instance_label ) + - { yaxes: $.yaxes('bytes') }, + { + yaxes: $.yaxes('bytes'), + tooltip: { sort: 2 }, // Sort descending. + }, // Switches a panel from lines (default) to bars. bars:: { diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index 68570111256..95f7a056e8f 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -35,7 +35,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( 'sum by(%s) (cortex_ingester_memory_series{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '{{%s}}' % $._config.per_instance_label - ), + ) + + { + tooltip: { sort: 2 }, // Sort descending. 
+ }, ) .addPanel( $.containerCPUUsagePanel('CPU', 'ingester'), From a7bdd2e89c8220a067a96ecf45823a77f963ea2e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 16 Mar 2021 12:23:01 +0100 Subject: [PATCH 206/364] Add slow queries dashboard Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards.libsonnet | 1 + .../dashboards/slow-queries.libsonnet | 184 ++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index 0147148dd0c..c064ed18858 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -7,6 +7,7 @@ (import 'dashboards/alertmanager.libsonnet') + (import 'dashboards/scaling.libsonnet') + (import 'dashboards/writes.libsonnet') + + (import 'dashboards/slow-queries.libsonnet') + (if std.member($._config.storage_engine, 'blocks') then diff --git a/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet b/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet new file mode 100644 index 00000000000..96dfb1fa247 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet @@ -0,0 +1,184 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + 'cortex-slow-queries.json': + ($.dashboard('Cortex / Slow Queries') + { uid: 'e6f3091e29d2636e3b8393447e925668' }) + .addClusterSelectorTemplates(false) + .addRow( + $.row('') + .addPanel( + { + title: 'Slow queries', + type: 'table', + datasource: '${lokidatasource}', + + // Query logs from Loki. + targets: [ + { + // Filter out the remote read endpoint. + expr: '{cluster=~"$cluster",namespace=~"$namespace",name="query-frontend"} |= "query stats" != "/api/v1/read" | logfmt | org_id=~"${tenant_id}" | response_time > ${min_duration}', + instant: false, + legendFormat: '', + range: true, + refId: 'A', + }, + ], + + // Use Grafana transformations to display fields in a table. + transformations: [ + { + // Convert labels to fields. + id: 'labelsToFields', + options: {}, + }, + { + // Compute the query time range. + id: 'calculateField', + options: { + alias: 'Time range', + mode: 'binary', + binary: { + left: 'param_end', + operator: '-', + reducer: 'sum', + right: 'param_start', + }, + reduce: { reducer: 'sum' }, + replaceFields: false, + }, + }, + { + id: 'organize', + options: { + // Hide fields we don't care. + local hiddenFields = ['caller', 'cluster', 'container', 'host', 'id', 'job', 'level', 'line', 'method', 'msg', 'name', 'namespace', 'org_id', 'param_end', 'param_start', 'param_time', 'path', 'pod', 'pod_template_hash', 'query_wall_time_seconds', 'stream', 'traceID', 'tsNs'], + + excludeByName: { + [field]: true + for field in hiddenFields + }, + + // Order fields. + local orderedFields = ['ts', 'param_query', 'Time range', 'param_step', 'response_time'], + + indexByName: { + [orderedFields[i]]: i + for i in std.range(0, std.length(orderedFields) - 1) + }, + + // Rename fields. + renameByName: { + param_query: 'Query', + param_step: 'Step', + response_time: 'Duration', + }, + }, + }, + ], + + fieldConfig: { + // Configure overrides to nicely format field values. 
+ overrides: [ + { + matcher: { id: 'byName', options: 'Time range' }, + properties: [ + { + id: 'mappings', + value: [ + { + from: '', + id: 1, + text: 'Instant query', + to: '', + type: 1, + value: '0', + }, + ], + }, + { id: 'unit', value: 's' }, + ], + }, + { + matcher: { id: 'byName', options: 'Step' }, + properties: [{ id: 'unit', value: 's' }], + }, + ], + }, + }, + ) + ) + + { + templating+: { + list+: [ + // Add the Loki datasource. + { + type: 'datasource', + name: 'lokidatasource', + label: 'Logs datasource', + query: 'loki', + hide: 0, + includeAll: false, + multi: false, + }, + // Add a variable to configure the min duration. + { + local defaultValue = '5s', + + type: 'textbox', + name: 'min_duration', + label: 'Min duration', + hide: 0, + options: [ + { + selected: true, + text: defaultValue, + value: defaultValue, + }, + ], + current: { + // Default value. + selected: true, + text: defaultValue, + value: defaultValue, + }, + query: defaultValue, + }, + // Add a variable to configure the tenant to filter on. + { + local defaultValue = '.*', + + type: 'textbox', + name: 'tenant_id', + label: 'Tenant ID', + hide: 0, + options: [ + { + selected: true, + text: defaultValue, + value: defaultValue, + }, + ], + current: { + // Default value. + selected: true, + text: defaultValue, + value: defaultValue, + }, + query: defaultValue, + }, + ], + }, + } + { + templating+: { + list: [ + // Do not allow to include all clusters/namespaces otherwise this dashboard + // risks to explode because it shows resources per pod. + l + (if (l.name == 'cluster' || l.name == 'namespace') then { includeAll: false } else {}) + for l in super.list + ], + }, + } + { + // No auto-refresh by default. + refresh: '', + }, +} From 419eaba7211eb60e5679da5ce30124b9f568d6a8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 16 Mar 2021 12:26:53 +0100 Subject: [PATCH 207/364] Added tenant ID field to the table Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet b/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet index 96dfb1fa247..a732388a067 100644 --- a/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet @@ -51,7 +51,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; id: 'organize', options: { // Hide fields we don't care. - local hiddenFields = ['caller', 'cluster', 'container', 'host', 'id', 'job', 'level', 'line', 'method', 'msg', 'name', 'namespace', 'org_id', 'param_end', 'param_start', 'param_time', 'path', 'pod', 'pod_template_hash', 'query_wall_time_seconds', 'stream', 'traceID', 'tsNs'], + local hiddenFields = ['caller', 'cluster', 'container', 'host', 'id', 'job', 'level', 'line', 'method', 'msg', 'name', 'namespace', 'param_end', 'param_start', 'param_time', 'path', 'pod', 'pod_template_hash', 'query_wall_time_seconds', 'stream', 'traceID', 'tsNs'], excludeByName: { [field]: true @@ -59,7 +59,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, // Order fields. - local orderedFields = ['ts', 'param_query', 'Time range', 'param_step', 'response_time'], + local orderedFields = ['ts', 'org_id', 'param_query', 'Time range', 'param_step', 'response_time'], indexByName: { [orderedFields[i]]: i @@ -68,6 +68,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Rename fields. 
renameByName: { + org_id: 'Tenant ID', param_query: 'Query', param_step: 'Step', response_time: 'Duration', From 077487024b55330384b9b5575f1c4b2886255ddd Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 18 Mar 2021 12:58:12 +0000 Subject: [PATCH 208/364] Add recording rules to calculate Cortex scaling - Update dashboard so it only shows under provisioned services and why - Add sizing rules based on limits. - Add some docs to the dashboard. Signed-off-by: Tom Wilkie --- .../mimir-mixin/dashboards/scaling.libsonnet | 125 ++++------- jsonnet/mimir-mixin/recording_rules.libsonnet | 201 ++++++++++++++++++ 2 files changed, 238 insertions(+), 88 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet index d1ff7bd31e6..11e1f7950ce 100644 --- a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet @@ -6,105 +6,54 @@ local utils = import 'mixin-utils/utils.libsonnet'; ($.dashboard('Cortex / Scaling') + { uid: '88c041017b96856c9176e07cf557bdcf' }) .addClusterSelectorTemplates() .addRow( - $.row('Workload-based scaling') - .addPanel( - $.panel('Workload-based scaling') + { sort: { col: 1, desc: false } } + - $.tablePanel([ - ||| - sum by (cluster, namespace, deployment) ( - kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"} - or - label_replace( - kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}, - "deployment", "$1", "statefulset", "(.*)" - ) - ) - |||, - ||| - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:]) - * 3 / 80e3 - |||, - ||| - label_replace( - sum by(cluster, namespace) ( - cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"} - ) / 1e+6, - "deployment", "ingester", "cluster", ".*" - ) - or - label_replace( - sum by (cluster, namespace) ( - 4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} - * - cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} - / - cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"} - ) - / - avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}), - "deployment", "memcached", "namespace", ".*" - ) + ($.row('Cortex Service Scaling') + { height: '200px' }) + .addPanel({ + type: 'text', + title: '', + options: { + content: ||| + This dashboards shows any services which are not scaled correctly. + The table below gives the required number of replicas and the reason why. + We only show services without enough replicas. + + Reasons: + - **sample_rate**: There are not enough replicas to handle the + sample rate. Applies to distributor and ingesters. + - **active_series**: There are not enough replicas + to handle the number of active series. Applies to ingesters. + - **cpu_usage**: There are not enough replicas + based on the CPU usage of the jobs vs the resource requests. + Applies to all jobs. + - **memory_usage**: There are not enough replicas based on the memory + usage vs the resource requests. Applies to all jobs. 
+ - **active_series_limits**: There are not enough replicas to hold 60% of the + sum of all the per tenant series limits. + - **sample_rate_limits**: There are not enough replicas to handle 60% of the + sum of all the per tenant rate limits. |||, - ], { - cluster: { alias: 'Cluster' }, - namespace: { alias: 'Namespace' }, - deployment: { alias: 'Deployment' }, - 'Value #A': { alias: 'Current Replicas', decimals: 0 }, - 'Value #B': { alias: 'Required Replicas, by ingestion rate', decimals: 0 }, - 'Value #C': { alias: 'Required Replicas, by active series', decimals: 0 }, - }) - ) + mode: 'markdown', + }, + }) ) .addRow( - ($.row('Resource-based scaling') + { height: '500px' }) + ($.row('Scaling') + { height: '400px' }) .addPanel( - $.panel('Resource-based scaling') + { sort: { col: 1, desc: false } } + + $.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } + $.tablePanel([ ||| - sum by (cluster, namespace, deployment) ( - kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} - or - label_replace( - kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, - "deployment", "$1", "statefulset", "(.*)" - ) - ) - |||, - ||| - sum by (cluster, namespace, deployment) ( - kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} - or - label_replace( - kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, - "deployment", "$1", "statefulset", "(.*)" - ) - ) - * - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:]) - / - sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) - |||, - ||| - sum by (cluster, namespace, deployment) ( - kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"} - or - label_replace( - kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"}, - "deployment", "$1", "statefulset", "(.*)" - ) + sort_desc( + cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"} + > ignoring(reason) group_left + cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"} ) - * - quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m]) - / - sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))")) |||, ], { + '__name__': { alias: 'Cluster', type: 'hidden' }, cluster: { alias: 'Cluster' }, namespace: { alias: 'Namespace' }, - deployment: { alias: 'Deployment' }, - 'Value #A': { alias: 'Current Replicas', decimals: 0 }, - 'Value #B': { alias: 'Required Replicas, by CPU usage', decimals: 0 }, - 'Value #C': { alias: 'Required Replicas, by Memory usage', decimals: 0 }, + deployment: { alias: 'Service' }, + reason: { alias: 'Reason' }, + 'Value': { alias: 'Required Replicas', decimals: 0 }, }) ) ), diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet 
b/jsonnet/mimir-mixin/recording_rules.libsonnet index 54544d0a283..85ad6d39bb1 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -58,6 +58,207 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ], }, + { + local _config = { + max_series_per_ingester: 1.5e6, + max_samples_per_sec_per_ingester: 80e3, + max_samples_per_sec_per_distributor: 240e3, + limit_utilisation_target: 0.6, + }, + name: 'cortex_scaling_rules', + rules: [ + { + // Convenience rule to get the number of replicas for both a deployment and a statefulset. + record: 'cluster_namespace_deployment:actual_replicas:count', + expr: ||| + sum by (cluster, namespace, deployment) (kube_deployment_spec_replicas) + or + sum by (cluster, namespace, deployment) ( + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)") + ) + |||, + }, + { + // Distributors should be able to deal with 240k samples/s. + record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + deployment: 'distributor', + reason: 'sample_rate', + }, + expr: ||| + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + / %(max_samples_per_sec_per_distributor)s + ) + ||| % _config, + }, + { + // We should be about to cover 80% of our limits, + // and ingester can have 80k samples/s. + record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + deployment: 'distributor', + reason: 'sample_rate_limits', + }, + expr: ||| + ceil( + sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"}) + * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s + ) + ||| % _config, + }, + { + // We want ingesters each ingester to deal with 80k samples/s. + // NB we measure this at the distributors and multiple by RF (3). + record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + deployment: 'ingester', + reason: 'sample_rate', + }, + expr: ||| + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + * 3 / %(max_samples_per_sec_per_ingester)s + ) + ||| % _config, + }, + { + // Ingester should have 1.5M series in memory + record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + deployment: 'ingester', + reason: 'active_series', + }, + expr: ||| + ceil( + quantile_over_time(0.99, + sum by(cluster, namespace) ( + cortex_ingester_memory_series + )[24h:] + ) + / %(max_series_per_ingester)s + ) + ||| % _config, + }, + { + // We should be about to cover 60% of our limits, + // and ingester can have 1.5M series in memory + record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + deployment: 'ingester', + reason: 'active_series_limits', + }, + expr: ||| + ceil( + sum by (cluster, namespace) (cortex_overrides{limit_name="max_global_series_per_user"}) + * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s + ) + ||| % _config, + }, + { + // We should be about to cover 60% of our limits, + // and ingester can have 80k samples/s. 
+ record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + deployment: 'ingester', + reason: 'sample_rate_limits', + }, + expr: ||| + ceil( + sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"}) + * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s + ) + ||| % _config, + }, + { + // Ingesters store 96h of data on disk - we want memcached to store 1/4 of that. + record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + deployment: 'memcached', + reason: 'active_series', + }, + expr: ||| + ceil( + (sum by (cluster, namespace) ( + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester"} + ) / 4) + / + avg by (cluster, namespace) ( + memcached_limit_bytes{job=~".+/memcached"} + ) + ) + |||, + }, + { + // Jobs should be sized to their CPU usage. + // We do this by comparing 99th percentile usage over the last 24hrs to + // their current provisioned #replicas and resource requests. + record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + reason: 'cpu_usage', + }, + expr: ||| + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, + sum by (cluster, namespace, deployment) ( + label_replace( + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + )[24h:5m] + ) + / + sum by (cluster, namespace, deployment) ( + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + ) + ) + |||, + }, + { + // Jobs should be sized to their Memory usage. + // We do this by comparing 99th percentile usage over the last 24hrs to + // their current provisioned #replicas and resource requests. + record: 'cluster_namespace_deployment_reason:required_replicas:count', + labels: { + reason: 'memory_usage', + }, + expr: ||| + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, + sum by (cluster, namespace, deployment) ( + label_replace( + container_memory_usage_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + )[24h:5m] + ) + / + sum by (cluster, namespace, deployment) ( + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + ) + ) + |||, + }, + ], + }, ], }, } From 84867770842916450e29886afe25b00d91ac04f0 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 23 Mar 2021 11:47:39 +0100 Subject: [PATCH 209/364] Increased CortexRequestErrors alert severity Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 7a57c9eab7a..cad56a1b979 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -18,7 +18,7 @@ }, { alert: 'CortexRequestErrors', - // Note is alert_aggregation_labels is "job", this will repeat the label. But + // Note if alert_aggregation_labels is "job", this will repeat the label. But // prometheus seems to tolerate that. 
expr: ||| 100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m])) @@ -28,7 +28,7 @@ ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '15m', labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: ||| From 1c4dec677adf5f9adb3eda538a2298b9f0793337 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 23 Mar 2021 18:29:57 +0100 Subject: [PATCH 210/364] Fixed "Disk Writes" and "Disk Reads" panels Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/config.libsonnet | 5 ++++- .../mimir-mixin/dashboards/alertmanager-resources.libsonnet | 4 ++-- jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet | 4 ++-- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 5 ++--- jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet | 4 ++-- jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet | 4 ++-- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index ad89da47940..fbac5ef15d0 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -45,7 +45,10 @@ // Whether resources dashboards are enabled (based on cAdvisor metrics). resources_dashboards_enabled: false, - // Used on panels that show metrics per instance. i.e. 'pod' in a kubernetes install + // The label used to differentiate between different application instances (i.e. 'pod' in a kubernetes install). per_instance_label: 'pod', + + // The label used to differentiate between different nodes (i.e. servers). + per_node_label: 'instance', }, } diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 7ca50c5db2b..7a9e721afed 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -60,7 +60,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Writes') + $.queryPanel( - 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('alertmanager')], + 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('alertmanager')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.stack + @@ -69,7 +69,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Reads') + $.queryPanel( - 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('alertmanager')], + 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('alertmanager')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.stack + diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index c24f600b05a..275e6bd0b81 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Disk Writes') + $.queryPanel( - 'sum by(%s, device) 
(rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('compactor')], + 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('compactor')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.stack + @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Disk Reads') + $.queryPanel( - 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('compactor')], + 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('compactor')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.stack + diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 47ff4919a7d..7326f22f1e1 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -257,9 +257,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') }, ), - filterNodeDiskContainer(containerName):: ||| - ignoring(%s) group_right() (label_replace(count by(%s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) - ||| % [$._config.per_instance_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], + ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], } diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index d1fd0466e15..697a7fd49da 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -105,7 +105,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Disk Writes') + $.queryPanel( - 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('store-gateway')], + 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('store-gateway')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.stack + @@ -114,7 +114,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Disk Reads') + $.queryPanel( - 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('store-gateway')], + 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('store-gateway')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.stack + diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index 95f7a056e8f..f833e406629 100644 --- 
a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -58,7 +58,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Disk Writes') + $.queryPanel( - 'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('ingester')], + 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('ingester')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.stack + @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Disk Reads') + $.queryPanel( - 'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_instance_label, $.filterNodeDiskContainer('ingester')], + 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('ingester')], '{{%s}} - {{device}}' % $._config.per_instance_label ) + $.stack + From 4ff33a34f3ee48e2370c84db4e445160510f8157 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 30 Mar 2021 08:27:46 +0200 Subject: [PATCH 211/364] Pre-compute aggregations to optimize scaling recording rules Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/recording_rules.libsonnet | 78 ++++++++++++------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 85ad6d39bb1..5ef55769e96 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -197,6 +197,30 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) |||, }, + { + // Convenience rule to get the CPU utilization for both a deployment and a statefulset. + record: 'cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate', + expr: ||| + sum by (cluster, namespace, deployment) ( + label_replace( + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + ) + |||, + }, + { + // Convenience rule to get the CPU request for both a deployment and a statefulset. + record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum', + expr: ||| + sum by (cluster, namespace, deployment) ( + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + ) + |||, + }, { // Jobs should be sized to their CPU usage. 
// We do this by comparing 99th percentile usage over the last 24hrs to @@ -209,20 +233,32 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( cluster_namespace_deployment:actual_replicas:count * - quantile_over_time(0.99, - sum by (cluster, namespace, deployment) ( - label_replace( - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ) - )[24h:5m] - ) + quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h:5m]) / - sum by (cluster, namespace, deployment) ( - label_replace( - kube_pod_container_resource_requests_cpu_cores, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ) + cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + ) + |||, + }, + { + // Convenience rule to get the Memory utilization for both a deployment and a statefulset. + record: 'cluster_namespace_deployment:container_memory_usage_bytes:sum', + expr: ||| + sum by (cluster, namespace, deployment) ( + label_replace( + container_memory_usage_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + ) + |||, + }, + { + // Convenience rule to get the Memory request for both a deployment and a statefulset. + record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum', + expr: ||| + sum by (cluster, namespace, deployment) ( + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ) ) |||, @@ -239,21 +275,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( cluster_namespace_deployment:actual_replicas:count * - quantile_over_time(0.99, - sum by (cluster, namespace, deployment) ( - label_replace( - container_memory_usage_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ) - )[24h:5m] - ) + quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h:5m]) / - sum by (cluster, namespace, deployment) ( - label_replace( - kube_pod_container_resource_requests_memory_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ) - ) + cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum ) |||, }, From 7ba8424f3cf8db7cb17fcb4060a8046c87fd3549 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 31 Mar 2021 10:59:31 +0200 Subject: [PATCH 212/364] Removed 5m step from subquery Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/recording_rules.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 5ef55769e96..f980474453b 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -233,7 +233,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( cluster_namespace_deployment:actual_replicas:count * - quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h:5m]) + quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) / cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum ) @@ -275,7 +275,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( cluster_namespace_deployment:actual_replicas:count * - quantile_over_time(0.99, 
cluster_namespace_deployment:container_memory_usage_bytes:sum[24h:5m]) + quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) / cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum ) From 5c36a63ad074bc1449c070ad83c1f9b6b886354d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Apr 2021 09:48:30 +0200 Subject: [PATCH 213/364] Add function to customize compactor statefulset Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/config.libsonnet | 1 + .../dashboards/compactor-resources.libsonnet | 2 +- .../dashboards/compactor.libsonnet | 28 +++++++++---------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index fbac5ef15d0..94c9845cb51 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -35,6 +35,7 @@ table_manager: '(table-manager|cortex$)', store_gateway: '(store-gateway|cortex$)', gateway: 'cortex-gw', + compactor: 'compactor.*', // Match also custom compactor deployments. }, // Labels used to in alert aggregations - should uniquely identify diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index 275e6bd0b81..b0ddf170322 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerMemoryWorkingSetPanel('Memory (workingset)', 'compactor'), ) .addPanel( - $.goHeapInUsePanel('Memory (go heap inuse)', 'compactor'), + $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.compactor), ) ) .addRow( diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index d767e28d7d2..657cfce7a32 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -15,9 +15,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.startedCompletedFailedPanel( 'Per-instance runs / sec', - 'sum(rate(cortex_compactor_runs_started_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), - 'sum(rate(cortex_compactor_runs_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), - 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor') + 'sum(rate(cortex_compactor_runs_started_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), + 'sum(rate(cortex_compactor_runs_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), + 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor) ) + $.bars + { yaxes: $.yaxes('ops') }, @@ -30,7 +30,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; cortex_compactor_tenants_processing_failed{%s} + cortex_compactor_tenants_skipped{%s} ) / cortex_compactor_tenants_discovered{%s} - ||| % [$.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor'), $.jobMatcher('compactor')], '{{%s}}' % $._config.per_instance_label) + + ||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], '{{%s}}' % $._config.per_instance_label) + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) 
) @@ -44,12 +44,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Compacted blocks / sec') + - $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), 'blocks') + + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Per-block compaction duration') + - $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher('compactor')) + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) ) ) .addRow( @@ -62,18 +62,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Average blocks / tenant') + - $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher('compactor'), 'avg'), + $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'), ) .addPanel( $.panel('Tenants with largest number of blocks') + - $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher('compactor'), '{{user}}'), + $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}'), ) ) .addRow( $.row('Garbage Collector') .addPanel( $.panel('Blocks marked for deletion / sec') + - $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), 'blocks') + + $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + { yaxes: $.yaxes('ops') }, ) .addPanel( @@ -81,8 +81,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Blocks deletions / sec', // The cortex_compactor_blocks_cleaned_total tracks the number of successfully // deleted blocks. - 'sum(rate(cortex_compactor_blocks_cleaned_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), - 'sum(rate(cortex_compactor_block_cleanup_failures_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_blocks_cleaned_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), + 'sum(rate(cortex_compactor_block_cleanup_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), ) + { yaxes: $.yaxes('ops') } ) ) @@ -93,14 +93,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Metadata Syncs / sec', // The cortex_compactor_meta_syncs_total metric is incremented each time a per-tenant // metadata sync is triggered. - 'sum(rate(cortex_compactor_meta_syncs_total{%s}[$__rate_interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher('compactor'), $.jobMatcher('compactor')], - 'sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor'), + 'sum(rate(cortex_compactor_meta_syncs_total{%s}[$__rate_interval])) - sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], + 'sum(rate(cortex_compactor_meta_sync_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), ) + { yaxes: $.yaxes('ops') } ) .addPanel( $.panel('Metadata Sync Duration') + // This metric tracks the duration of a per-tenant metadata sync. 
- $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher('compactor')), + $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)), ) ) .addRow($.objectStorePanels1('Object Store', 'compactor')) From d7fbc234109663998b858ef7dd2d8f52c1ca7168 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Apr 2021 09:58:54 +0200 Subject: [PATCH 214/364] Use the job name in compactor alerts too Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index c31b2bb4cd5..a2615e04cf3 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -38,10 +38,10 @@ alert: 'CortexCompactorHasNotUploadedBlocks', 'for': '15m', expr: ||| - (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 60 * 60 * 24) + (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} > 60 * 60 * 24) and - (thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} > 0) - |||, + (thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} > 0) + ||| % $._config.job_names, labels: { severity: 'critical', }, @@ -54,8 +54,8 @@ alert: 'CortexCompactorHasNotUploadedBlocksSinceStart', 'for': '24h', expr: ||| - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"} == 0 - |||, + thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} == 0 + ||| % $._config.job_names, labels: { severity: 'critical', }, From 73c6770e01c9874c4ebeda541aac6ea49b1addea Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Apr 2021 10:54:50 +0200 Subject: [PATCH 215/364] Fixed CortexCompactorRunFailed threshold Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index a2615e04cf3..c8ac434a27c 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -67,7 +67,7 @@ // Alert if compactor fails. 
alert: 'CortexCompactorRunFailed', expr: ||| - increase(cortex_compactor_runs_failed_total[2h]) > 1 + increase(cortex_compactor_runs_failed_total[2h]) >= 2 |||, labels: { severity: 'critical', From a103a95e0f0ef6462600e0b21284a455a0eb7b34 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Apr 2021 20:24:13 +0200 Subject: [PATCH 216/364] Added Cortex Rollout progress dashboard Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards.libsonnet | 1 + .../dashboards/dashboard-utils.libsonnet | 66 ++++ .../dashboards/rollout-progress.libsonnet | 284 ++++++++++++++++++ 3 files changed, 351 insertions(+) create mode 100644 jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index c064ed18858..baf800b3128 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -8,6 +8,7 @@ (import 'dashboards/scaling.libsonnet') + (import 'dashboards/writes.libsonnet') + (import 'dashboards/slow-queries.libsonnet') + + (import 'dashboards/rollout-progress.libsonnet') + (if std.member($._config.storage_engine, 'blocks') then diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 7326f22f1e1..25a9c43e788 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -173,6 +173,72 @@ local utils = import 'mixin-utils/utils.libsonnet'; tooltip: { sort: 2 }, // Sort descending. }, + newStatPanel(queries, legends='', unit='percentunit', thresholds=[], instant=false, novalue=''):: + super.queryPanel(queries, legends) + { + type: 'stat', + targets: [ + target { + instant: instant, + interval: '', + + // Reset defaults from queryPanel(). + format: null, + intervalFactor: null, + step: null, + } + for target in super.targets + ], + fieldConfig: { + defaults: { + color: { mode: 'thresholds' }, + decimals: 1, + thresholds: { + mode: 'absolute', + steps: thresholds, + }, + noValue: novalue, + unit: unit, + }, + overrides: [], + }, + }, + + barGauge(queries, legends='', thresholds=[], unit='short', min=null, max=null):: + super.queryPanel(queries, legends) + { + type: 'bargauge', + targets: [ + target { + // Reset defaults from queryPanel(). + format: null, + intervalFactor: null, + step: null, + } + for target in super.targets + ], + fieldConfig: { + defaults: { + color: { mode: 'thresholds' }, + mappings: [], + max: max, + min: min, + thresholds: { + mode: 'absolute', + steps: thresholds, + }, + unit: unit, + }, + }, + options: { + displayMode: 'basic', + orientation: 'horizontal', + reduceOptions: { + calcs: ['lastNotNull'], + fields: '', + values: false, + }, + }, + }, + // Switches a panel from lines (default) to bars. 
bars:: { bars: true, diff --git a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet new file mode 100644 index 00000000000..45ee51bba39 --- /dev/null +++ b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet @@ -0,0 +1,284 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + local config = { + namespace_matcher: $.namespaceMatcher(), + gateway_job_matcher: $.jobMatcher($._config.job_names.gateway), + gateway_write_routes_regex: 'api_(v1|prom)_push', + gateway_read_routes_regex: '(prometheus|api_prom)_api_v1_.+', + all_services_regex: std.join('|', ['cortex-gw', 'distributor', 'ingester', 'query-frontend', 'querier', 'compactor', 'store-gateway', 'ruler', 'alertmanager']), + }, + + 'cortex-rollout-progress.json': + ($.dashboard('Cortex / Rollout progress') + { uid: '7544a3a62b1be6ffd919fc990ab8ba8f' }) + .addClusterSelectorTemplates(false) + { + // This dashboard uses the new grid system in order to place panels (using gridPos). + // Because of this we can't use the mixin's addRow() and addPanel(). + schemaVersion: 27, + rows: null, + panels: [ + // + // Rollout progress + // + $.panel('Rollout progress') + + $.barGauge([ + ||| + ( + kube_statefulset_status_replicas_updated{%(namespace_matcher)s,statefulset=~"%(all_services_regex)s"} + / + kube_statefulset_replicas{%(namespace_matcher)s} + ) and ( + kube_statefulset_replicas{%(namespace_matcher)s} + > 0 + ) + ||| % config, + ||| + ( + kube_deployment_status_replicas_updated{%(namespace_matcher)s,deployment=~"%(all_services_regex)s"} + / + kube_deployment_spec_replicas{%(namespace_matcher)s} + ) and ( + kube_deployment_spec_replicas{%(namespace_matcher)s} + > 0 + ) + ||| % config, + ], legends=[ + '{{statefulset}}', + '{{deployment}}', + ], thresholds=[ + { color: 'yellow', value: null }, + { color: 'yellow', value: 0.999 }, + { color: 'green', value: 1 }, + ], unit='percentunit', min=0, max=1) + { + id: 1, + gridPos: { h: 8, w: 10, x: 0, y: 0 }, + }, + + // + // Writes + // + $.panel('Writes - 2xx') + + $.newStatPanel(||| + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) + ||| % config, thresholds=[ + { color: 'green', value: null }, + ]) + { + id: 2, + gridPos: { h: 4, w: 2, x: 10, y: 0 }, + }, + + $.panel('Writes - 4xx') + + $.newStatPanel(||| + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) + ||| % config, thresholds=[ + { color: 'green', value: null }, + { color: 'orange', value: 0.2 }, + { color: 'red', value: 0.5 }, + ]) + { + id: 3, + gridPos: { h: 4, w: 2, x: 12, y: 0 }, + }, + + $.panel('Writes - 5xx') + + $.newStatPanel(||| + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) + ||| % config, thresholds=[ + { color: 'green', value: null }, + { color: 'red', value: 0.01 }, + ]) + { + id: 4, + gridPos: { h: 
4, w: 2, x: 14, y: 0 }, + }, + + $.panel('Writes 99th Latency') + + $.newStatPanel(||| + histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) + ||| % config, unit='s', thresholds=[ + { color: 'green', value: null }, + { color: 'orange', value: 0.2 }, + { color: 'red', value: 0.5 }, + ]) + { + id: 5, + gridPos: { h: 4, w: 8, x: 16, y: 0 }, + }, + + // + // Reads + // + $.panel('Reads - 2xx') + + $.newStatPanel(||| + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) + ||| % config, thresholds=[ + { color: 'green', value: null }, + ]) + { + id: 6, + gridPos: { h: 4, w: 2, x: 10, y: 4 }, + }, + + $.panel('Reads - 4xx') + + $.newStatPanel(||| + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) + ||| % config, thresholds=[ + { color: 'green', value: null }, + { color: 'orange', value: 0.01 }, + { color: 'red', value: 0.05 }, + ]) + { + id: 7, + gridPos: { h: 4, w: 2, x: 12, y: 4 }, + }, + + $.panel('Reads - 5xx') + + $.newStatPanel(||| + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) + ||| % config, thresholds=[ + { color: 'green', value: null }, + { color: 'red', value: 0.01 }, + ]) + { + id: 8, + gridPos: { h: 4, w: 2, x: 14, y: 4 }, + }, + + $.panel('Reads 99th Latency') + + $.newStatPanel(||| + histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) + ||| % config, unit='s', thresholds=[ + { color: 'green', value: null }, + { color: 'orange', value: 1 }, + { color: 'red', value: 2.5 }, + ]) + { + id: 9, + gridPos: { h: 4, w: 8, x: 16, y: 4 }, + }, + + // + // Unhealthy pods + // + $.panel('Unhealthy pods') + + $.newStatPanel([ + ||| + kube_deployment_status_replicas_unavailable{%(namespace_matcher)s, deployment=~"%(all_services_regex)s"} + > 0 + ||| % config, + ||| + kube_statefulset_status_replicas_current{%(namespace_matcher)s, statefulset=~"%(all_services_regex)s"} - + kube_statefulset_status_replicas_ready {%(namespace_matcher)s, statefulset=~"%(all_services_regex)s"} + > 0 + ||| % config, + ], legends=[ + '{{deployment}}', + '{{statefulset}}', + ], thresholds=[ + { color: 'green', value: null }, + { color: 'orange', value: 1 }, + { color: 'red', value: 2 }, + ], instant=true, novalue='All healthy') + { + options: { + text: { + // Small font size since we may have many entries during a rollout. 
+ titleSize: 14, + valueSize: 14, + }, + }, + id: 10, + gridPos: { h: 8, w: 10, x: 0, y: 8 }, + }, + + // + // Versions + // + { + title: 'Pods count per Version', + type: 'table', + datasource: '$datasource', + + targets: [ + { + expr: ||| + count by(container, version) ( + label_replace( + kube_pod_container_info{%(namespace_matcher)s,container=~"%(all_services_regex)s"}, + "version", "$1", "image", ".*:(.+)-.*" + ) + ) + ||| % config, + instant: true, + legendFormat: '', + refId: 'A', + }, + ], + + fieldConfig: { + overrides: [ + { + // Center align the version. + matcher: { id: 'byRegexp', options: 'r.*' }, + properties: [{ id: 'custom.align', value: 'center' }], + }, + ], + }, + + transformations: [ + { + // Transform the version label to a field. + id: 'labelsToFields', + options: { valueLabel: 'version' }, + }, + { + // Hide time. + id: 'organize', + options: { excludeByName: { Time: true } }, + }, + { + // Sort by container. + id: 'sortBy', + options: { fields: {}, sort: [{ field: 'container' }] }, + }, + ], + + id: 11, + gridPos: { h: 8, w: 6, x: 10, y: 8 }, + }, + + // + // Performance comparison with 24h ago + // + $.panel('Latency vs 24h ago') + + $.queryPanel([||| + 1 - ( + avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) + / + avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))[1h:]) + ) + ||| % config, ||| + 1 - ( + avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"} offset 24h))[1h:]) + / + avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))[1h:]) + ) + ||| % config], ['writes', 'reads']) + { + yaxes: $.yaxes({ + format: 'percentunit', + min: null, // Can be negative. + }), + + id: 12, + gridPos: { h: 8, w: 8, x: 16, y: 8 }, + }, + ], + + templating+: { + list: [ + // Do not allow to include all clusters/namespaces cause this dashboard is designed to show + // 1 cluster at a time. + l + (if (l.name == 'cluster' || l.name == 'namespace') then { includeAll: false } else {}) + for l in super.list + ], + }, + }, +} From efa84d49a38d555c52a8ac5840cc025007a53b73 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 7 Apr 2021 09:45:27 +0200 Subject: [PATCH 217/364] Fix 'Unhealthy pods' in Cortex Rollout dashboard Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 4 ++-- jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 25a9c43e788..f79c27af029 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -173,7 +173,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; tooltip: { sort: 2 }, // Sort descending. 
}, - newStatPanel(queries, legends='', unit='percentunit', thresholds=[], instant=false, novalue=''):: + newStatPanel(queries, legends='', unit='percentunit', decimals=1, thresholds=[], instant=false, novalue=''):: super.queryPanel(queries, legends) + { type: 'stat', targets: [ @@ -191,7 +191,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; fieldConfig: { defaults: { color: { mode: 'thresholds' }, - decimals: 1, + decimals: decimals, thresholds: { mode: 'absolute', steps: thresholds, diff --git a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet index 45ee51bba39..83a5abb7a2f 100644 --- a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet @@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { color: 'green', value: null }, { color: 'orange', value: 1 }, { color: 'red', value: 2 }, - ], instant=true, novalue='All healthy') + { + ], instant=true, novalue='All healthy', unit='short', decimals=0) + { options: { text: { // Small font size since we may have many entries during a rollout. From 6f9fbc097f9115714729f7460fd33530c2e81cbe Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Tue, 20 Apr 2021 17:02:49 +0200 Subject: [PATCH 218/364] Simplify compactor alerts We should simply alert on things not having run since X. Signed-off-by: Goutham Veeramachaneni --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index c8ac434a27c..13ab593a7f9 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -6,11 +6,9 @@ { // Alert if the compactor has not successfully cleaned up blocks in the last 24h. alert: 'CortexCompactorHasNotSuccessfullyCleanedUpBlocks', - 'for': '15m', + 'for': '1h', expr: ||| (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 24) - and - (cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0) |||, labels: { severity: 'critical', @@ -20,17 +18,17 @@ }, }, { - // Alert if the compactor has not successfully cleaned up blocks since its start. - alert: 'CortexCompactorHasNotSuccessfullyCleanedUpBlocksSinceStart', - 'for': '24h', + // Alert if the compactor has not successfully run compaction in the last 6h. 
+ alert: 'CortexCompactorHasNotSuccessfullyRunCompaction', + 'for': '1h', expr: ||| - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds == 0 + time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6 |||, labels: { severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 6 hours.', }, }, { From 7b9dc6b34b5465ebd1472458d9f438f2d007ab69 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Tue, 20 Apr 2021 17:15:24 +0200 Subject: [PATCH 219/364] Use the right metric Signed-off-by: Goutham Veeramachaneni --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 13ab593a7f9..8490b205142 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -22,7 +22,7 @@ alert: 'CortexCompactorHasNotSuccessfullyRunCompaction', 'for': '1h', expr: ||| - time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6 + time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 6 |||, labels: { severity: 'critical', From c11d9e679a53f53a15bceee447dab32d5c49b12b Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Tue, 20 Apr 2021 18:13:15 +0200 Subject: [PATCH 220/364] Apply suggestions from code review Co-authored-by: Marco Pracucci Signed-off-by: Goutham Veeramachaneni --- jsonnet/mimir-mixin/alerts/compactor.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 8490b205142..7a858c56bb8 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -4,17 +4,17 @@ name: 'cortex_compactor_alerts', rules: [ { - // Alert if the compactor has not successfully cleaned up blocks in the last 24h. + // Alert if the compactor has not successfully cleaned up blocks in the last 6h. 
alert: 'CortexCompactorHasNotSuccessfullyCleanedUpBlocks', 'for': '1h', expr: ||| - (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 24) + (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) |||, labels: { severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 6 hours.', }, }, { From 7bbb1722a1f75dc6875b3c0a1cfc31c79853722b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 21 Apr 2021 09:40:54 +0200 Subject: [PATCH 221/364] Fix CortexCompactorHasNotSuccessfullyRunCompaction to avoid false positives Signed-off-by: Marco Pracucci --- .../mimir-mixin/alerts/compactor.libsonnet | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 7a858c56bb8..be3de8c0c2c 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -18,17 +18,33 @@ }, }, { - // Alert if the compactor has not successfully run compaction in the last 6h. + // Alert if the compactor has not successfully run compaction in the last 24h. alert: 'CortexCompactorHasNotSuccessfullyRunCompaction', 'for': '1h', expr: ||| - time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 6 + (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) + and + (cortex_compactor_last_successful_run_timestamp_seconds > 0) + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', + }, + }, + { + // Alert if the compactor has not successfully run compaction in the last 24h since startup. + alert: 'CortexCompactorHasNotSuccessfullyRunCompaction', + 'for': '24h', + expr: ||| + cortex_compactor_last_successful_run_timestamp_seconds == 0 |||, labels: { severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 6 hours.', + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', }, }, { From 8c30820d7d4591667740ae7206670ca033d6be56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Thu, 22 Apr 2021 11:18:06 +0200 Subject: [PATCH 222/364] Introduce ingester instance limits to configuration, and add alerts. (https://github.com/grafana/cortex-jsonnet/pull/296) * Introduce ingester instance limits to configuration, and add alerts. * CHANGELOG.md * Address (internal) review feedback. 
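The limits referenced by the new alerts are read from the `cortex_ingester_instance_limits` metric, and the expressions only fire when the corresponding limit is set to a non-zero value. As a minimal sketch of turning the limits on (assuming the Cortex jsonnet deployment convention of passing ingester flags via `ingester_args`; the values are purely illustrative):

```jsonnet
{
  ingester_args+:: {
    // Illustrative values only; size them for your environment. Once set, the
    // ingester exports cortex_ingester_instance_limits{limit="max_series"} and
    // {limit="max_tenants"}, which the alerts below compare against.
    'ingester.instance-limits.max-series': 1500000,
    'ingester.instance-limits.max-tenants': 1000,
  },
}
```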
--- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 81 +++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index cad56a1b979..c5211ca65c5 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -255,6 +255,87 @@ }, ], }, + { + name: 'cortex_ingester_instance_alerts', + rules: [ + { + alert: 'CortexIngesterReachingSeriesLimit', + expr: ||| + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.7 + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. + |||, + }, + }, + { + alert: 'CortexIngesterReachingSeriesLimit', + expr: ||| + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.8 + |||, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. + |||, + }, + }, + { + alert: 'CortexIngesterReachingTenantsLimit', + expr: ||| + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.7 + |||, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. + |||, + }, + }, + { + alert: 'CortexIngesterReachingTenantsLimit', + expr: ||| + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.8 + |||, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. + |||, + }, + }, + ], + }, { name: 'cortex_wal_alerts', rules: [ From 67ee413a567d9af86de887edfc772a5e018e8f2d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 4 May 2021 12:35:59 +0200 Subject: [PATCH 223/364] Improve CortexRulerFailedRingCheck alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index c5211ca65c5..7568b4fd177 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -560,16 +560,16 @@ { alert: 'CortexRulerFailedRingCheck', expr: ||| - sum by (%s) (rate(cortex_ruler_ring_check_errors_total[5m])) + sum by (%s, job) (rate(cortex_ruler_ring_check_errors_total[1m])) > 0 ||| % $._config.alert_aggregation_labels, - 'for': '1m', + 'for': '5m', labels: { severity: 'critical', }, annotations: { message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors when checking the ring for rule group ownership. 
+ Cortex Rulers {{ $labels.job }} are experiencing errors when checking the ring for rule group ownership. |||, }, }, From f745408b658f6f3af6032e64a639cbe87e8a4c13 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 4 May 2021 12:52:31 +0200 Subject: [PATCH 224/364] Added example Loki query to CortexTenantHasPartialBlocks playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 6ce4e7a3dcb..f581493933b 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -249,7 +249,10 @@ This alert fires when Cortex finds partial blocks for a given tenant. A partial 2. A block deletion has been interrupted and `deletion-mark.json` has been deleted before `meta.json` How to **investigate**: -- Look for the block ID in the logs +- Look for the block ID in the logs. Example Loki query: + ``` + {cluster="",namespace="",container="compactor"} |= "skipped partial block" + ``` - Find out which Cortex component operated on the block at last (eg. uploaded by ingester/compactor, or deleted by compactor) - Investigate if was a partial upload or partial delete - Safely manually delete the block from the bucket if was a partial delete or an upload failed by a compactor From e2113d71cb1b54437880fdeb30cd9ba8a8d618b3 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 11 May 2021 19:09:56 +0200 Subject: [PATCH 225/364] Default dashboards to Cortex blocks storage only Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 94c9845cb51..b30f7dc4126 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -6,7 +6,7 @@ // Switch for overall storage engine. // May contain 'chunks', 'blocks' or both. // Enables chunks- or blocks- specific panels and dashboards. - storage_engine: ['chunks', 'blocks'], + storage_engine: ['blocks'], // For chunks backend, switch for chunk index type. // May contain 'bigtable', 'dynamodb' or 'cassandra'. From 3e0ea6b3742bdb200399a55ba86906f4ff024f90 Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Thu, 13 May 2021 09:42:53 +0100 Subject: [PATCH 226/364] Add missing memberlist components to alerts This adds the admin-api, compactor and store-gateway components to the memberlist alert. 
Signed-off-by: Christian Simon --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 7568b4fd177..38c9d00ee26 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -583,7 +583,7 @@ expr: ||| memberlist_client_cluster_members_count != on (%s) group_left - sum by (%s) (up{job=~".+/(distributor|ingester.*|querier|cortex|ruler)"}) + sum by (%s) (up{job=~".+/(admin-api|compactor|store-gateway|distributor|ingester.*|querier|cortex|ruler)"}) ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', labels: { From 2c33b1fef6c4786ba22dd9c12c1d9ca3b12ddc69 Mon Sep 17 00:00:00 2001 From: Alex Martin Date: Wed, 19 May 2021 00:15:15 -0500 Subject: [PATCH 227/364] mixin: Add gateway to valid job names (for GEM) --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index b30f7dc4126..364e7a6ee84 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -34,7 +34,7 @@ query_scheduler: 'query-scheduler', // Not part of single-binary. table_manager: '(table-manager|cortex$)', store_gateway: '(store-gateway|cortex$)', - gateway: 'cortex-gw', + gateway: '(gateway|cortex-gw)', compactor: 'compactor.*', // Match also custom compactor deployments. }, From d9e5b316b72f5483fa0e4901eaedf1a5d3c1fb1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20=C5=A0tibran=C3=BD?= Date: Wed, 19 May 2021 14:19:03 +0200 Subject: [PATCH 228/364] Only show namespaces from selected cluster. "All" works thanks to using regex matcher. (https://github.com/grafana/cortex-jsonnet/pull/311) * Only show namespaces from selected cluster. "All" works thanks to using regex matcher. 
* CHANGELOG.md
---
 jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet
index 25a9c43e788..1ba99603a61 100644
--- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet
+++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet
@@ -36,13 +36,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
       then d.addMultiTemplate('job', 'cortex_build_info', 'job')
       else d
            .addMultiTemplate('cluster', 'cortex_build_info', 'cluster')
-           .addMultiTemplate('namespace', 'cortex_build_info', 'namespace')
+           .addMultiTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace')
     else if $._config.singleBinary
     then d.addTemplate('job', 'cortex_build_info', 'job')
     else d
          .addTemplate('cluster', 'cortex_build_info', 'cluster')
-         .addTemplate('namespace', 'cortex_build_info', 'namespace'),
+         .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'),
 
   },
 
From d29b27ede26e7a773621d3018e11f9aa200b86b3 Mon Sep 17 00:00:00 2001
From: Marco Pracucci
Date: Mon, 24 May 2021 18:47:01 +0200
Subject: [PATCH 229/364] Fixed CortexIngesterHasNotShippedBlocks alert false
 positive

Signed-off-by: Marco Pracucci
---
 jsonnet/mimir-mixin/alerts/blocks.libsonnet | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet
index ab1f15fb8e2..d1157f38438 100644
--- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet
+++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet
@@ -13,7 +13,14 @@
         and
         (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
         and
+        # Only if the ingester has ingested samples over the last 4h.
         (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
+        and
+        # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester instance
+        # had ingested samples in the past, then no traffic was received for a long period and then it starts
+        # receiving samples again. Without this check, the alert would fire as soon as it gets back to receiving
+        # samples, while a block shipping is expected within the next 4h.
+ (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0) |||, labels: { severity: 'critical', From 37b2881463fd392800de7069d39516f59535b4ec Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 26 May 2021 14:51:05 +0200 Subject: [PATCH 230/364] Fixed mixin linter Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/scaling.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet index 11e1f7950ce..a01a7db304e 100644 --- a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/scaling.libsonnet @@ -48,12 +48,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) |||, ], { - '__name__': { alias: 'Cluster', type: 'hidden' }, + __name__: { alias: 'Cluster', type: 'hidden' }, cluster: { alias: 'Cluster' }, namespace: { alias: 'Namespace' }, deployment: { alias: 'Service' }, reason: { alias: 'Reason' }, - 'Value': { alias: 'Required Replicas', decimals: 0 }, + Value: { alias: 'Required Replicas', decimals: 0 }, }) ) ), From b96046931d931accffdbe94bdcfc61e400fa1637 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 26 May 2021 14:57:08 +0200 Subject: [PATCH 231/364] Add placeholders to make the linter pass Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 98 ++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 3dc7d152c90..16511382e96 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -24,7 +24,15 @@ If nothing obvious from the above, check for increased load: - If there is an increase in the number of active series and the memory provisioned is not enough, scale up the ingesters horizontally to have the same number of series as before per ingester. - If we had an outage and once Cortex is back up, the incoming traffic increases. (or) The clients have their Prometheus remote-write lagging and starts to send samples at a higher rate (again, an increase in traffic but in terms of number of samples). Scale up the ingester horizontally in this case too. -### CortexRequest Latency +### CortexIngesterReachingSeriesLimit + +_TODO: this playbook has not been written yet._ + +### CortexIngesterReachingTenantsLimit + +_TODO: this playbook has not been written yet._ + +### CortexRequestLatency First establish if the alert is for read or write latency. The alert should say. #### Write Latency @@ -47,6 +55,10 @@ If you think its provisioning / scaling is the problem, consult the scaling dash Right now most of the execution time will be spent in PromQL's innerEval. NB that the prepare (index and chunk fetch) are now interleaved with Eval, so you need to expand both to confirm if its flow execution of slow fetching. +### CortexRequestErrors + +_TODO: this playbook has not been written yet._ + ### CortexTransferFailed This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. 
You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n describe pod ` @@ -69,6 +81,14 @@ More information: This alert occurs when a ruler is unable to validate whether or not it should claim ownership over the evaluation of a rule group. The most likely cause is that one of the rule ring entries is unhealthy. If this is the case proceed to the ring admin http page and forget the unhealth ruler. The other possible cause would be an error returned the ring client. If this is the case look into debugging the ring based on the in-use backend implementation. +### CortexRulerFailedEvaluations + +_TODO: this playbook has not been written yet._ + +### CortexRulerMissedEvaluations + +_TODO: this playbook has not been written yet._ + ### CortexIngesterHasNotShippedBlocks This alert fires when a Cortex ingester is not uploading any block to the long-term storage. An ingester is expected to upload a block to the storage every block range period (defaults to 2h) and if a longer time elapse since the last successful upload it means something is not working correctly. @@ -233,6 +253,14 @@ gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). +### CortexCompactorHasNotSuccessfullyRunCompaction + +_TODO: this playbook has not been written yet._ + +### CortexCompactorRunFailed + +_TODO: this playbook has not been written yet._ + ### CortexBucketIndexNotUpdated This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. @@ -277,6 +305,74 @@ WAL corruptions are only detected at startups, so at this point the WAL/Checkpoi 2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. 3. Equal or more than the replication factor: Then there is definitely some data loss. 
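When triaging, it helps to first establish how many ingesters actually hit a WAL corruption in the relevant window. A query along these lines can scope the impact (a sketch assuming the chunks-storage WAL metric `cortex_ingester_wal_corruptions_total` is exposed by the running version):

```
# Ingesters, per cluster/namespace, reporting at least one WAL corruption in the last hour.
count by (cluster, namespace) (
  increase(cortex_ingester_wal_corruptions_total[1h]) > 0
)
```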
+### CortexRequestErrors + +_TODO: this playbook has not been written yet._ + +### CortexTableSyncFailure + +_TODO: this playbook has not been written yet._ + +### CortexQueriesIncorrect + +_TODO: this playbook has not been written yet._ + +### CortexInconsistentConfig + +_TODO: this playbook has not been written yet._ + +### CortexBadRuntimeConfig + +_TODO: this playbook has not been written yet._ + +### CortexQuerierCapacityFull + +_TODO: this playbook has not been written yet._ + +### CortexFrontendQueriesStuck + +_TODO: this playbook has not been written yet._ + +### CortexSchedulerQueriesStuck + +_TODO: this playbook has not been written yet._ + +### CortexCacheRequestErrors + +_TODO: this playbook has not been written yet._ + +### CortexOldChunkInMemory + +_TODO: this playbook has not been written yet._ + +### CortexCheckpointCreationFailed + +_TODO: this playbook has not been written yet._ + +### CortexCheckpointDeletionFailed + +_TODO: this playbook has not been written yet._ + +### CortexProvisioningMemcachedTooSmall + +_TODO: this playbook has not been written yet._ + +### CortexProvisioningTooManyActiveSeries + +_TODO: this playbook has not been written yet._ + +### CortexProvisioningTooManyWrites + +_TODO: this playbook has not been written yet._ + +### CortexAllocatingTooMuchMemory + +_TODO: this playbook has not been written yet._ + +### CortexGossipMembersMismatch + +_TODO: this playbook has not been written yet._ + ### EtcdAllocatingTooMuchMemory This can be triggered if there are too many HA dedupe keys in etcd. We saw this when one of our clusters hit 20K tenants that were using HA dedupe config. Raise the etcd limits via: From 193597feb1bffa5918f40d716831cb2c76200857 Mon Sep 17 00:00:00 2001 From: Javier Palomo Date: Mon, 31 May 2021 14:11:14 +0200 Subject: [PATCH 232/364] cortex-mixin: Use kube_pod_container_resource_{requests,limits} metrics This updates the recording rules to make them compatible with kube-state-metrics v2.0.0 which introduces some breaking changes in some metric names. 
With kube-state-metrics v2.0.0: - `kube_pod_container_resource_requests_cpu_cores` becomes `kube_pod_container_resource_requests{resource="cpu"}` - `kube_pod_container_resource_requests_memory_bytes` becomes `kube_pod_container_resource_requests{resource="memory"}` --- jsonnet/mimir-mixin/recording_rules.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index f980474453b..7abec55d218 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -215,7 +215,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; expr: ||| sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests_cpu_cores, + kube_pod_container_resource_requests{resource="cpu"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ) ) @@ -257,7 +257,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; expr: ||| sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests_memory_bytes, + kube_pod_container_resource_requests{resource="memory"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ) ) From 4a5f52a9a36b5471620e698dfda7170ebbd2c1a6 Mon Sep 17 00:00:00 2001 From: Javier Palomo Date: Mon, 31 May 2021 17:12:07 +0200 Subject: [PATCH 233/364] cortex-mixin: Make the recording rules backwards compatible --- jsonnet/mimir-mixin/recording_rules.libsonnet | 56 ++++++++++++++++--- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 7abec55d218..4a50eae2e32 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -213,10 +213,30 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Convenience rule to get the CPU request for both a deployment and a statefulset. record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum', expr: ||| - sum by (cluster, namespace, deployment) ( - label_replace( - kube_pod_container_resource_requests{resource="cpu"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 + # that remove resource metrics, ref: + # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 + # - https://github.com/kubernetes/kube-state-metrics/pull/1004 + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_cpu_cores was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + kube_pod_container_resource_requests{resource="cpu"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) ) ) |||, @@ -255,10 +275,30 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Convenience rule to get the Memory request for both a deployment and a statefulset. 
record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum', expr: ||| - sum by (cluster, namespace, deployment) ( - label_replace( - kube_pod_container_resource_requests{resource="memory"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 + # that remove resource metrics, ref: + # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 + # - https://github.com/kubernetes/kube-state-metrics/pull/1004 + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_memory_bytes was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + kube_pod_container_resource_requests{resource="memory"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ) ) ) |||, From f9f8cc2b2c8636152ad655c35ad3d78ea8a4953f Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Mon, 31 May 2021 22:37:41 -0400 Subject: [PATCH 234/364] refactor: functions to reduce code duplication - improve overrideability - making more use of `per_instance_label` from _config - added containerNetworkPanel functions for dashboards to use --- jsonnet/mimir-mixin/config.libsonnet | 6 ++++ .../alertmanager-resources.libsonnet | 10 ++----- .../dashboards/compactor-resources.libsonnet | 10 ++----- .../dashboards/dashboard-utils.libsonnet | 28 +++++++++++++++---- 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 364e7a6ee84..3d0fd02ea70 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -49,6 +49,12 @@ // The label used to differentiate between different application instances (i.e. 'pod' in a kubernetes install). per_instance_label: 'pod', + // Name selectors for different application instances, using the "per_instance_label". + instance_names: { + compactor: 'compactor.*', + alertmanager: 'alertmanager.*', + }, + // The label used to differentiate between different nodes (i.e. servers). 
per_node_label: 'instance', }, diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 7a9e721afed..4c67c1615a0 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -43,16 +43,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Network') .addPanel( - $.panel('Receive Bandwidth') + - $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"alertmanager.*"}[$__rate_interval]))' % $.namespaceMatcher(), '{{pod}}') + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerNetworkReceiveBytesPanel($._config.instance_names.alertmanager), ) .addPanel( - $.panel('Transmit Bandwidth') + - $.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"alertmanager.*"}[$__rate_interval]))' % $.namespaceMatcher(), '{{pod}}') + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerNetworkTransmitBytesPanel($._config.instance_names.alertmanager), ) ) .addRow( diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index b0ddf170322..79a7ac03fa8 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -19,16 +19,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Network') .addPanel( - $.panel('Receive Bandwidth') + - $.queryPanel('sum by(pod) (rate(container_network_receive_bytes_total{%s,pod=~"compactor.*"}[$__rate_interval]))' % $.namespaceMatcher(), '{{pod}}') + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerNetworkReceiveBytesPanel($._config.instance_names.compactor), ) .addPanel( - $.panel('Transmit Bandwidth') + - $.queryPanel('sum by(pod) (rate(container_network_transmit_bytes_total{%s,pod=~"compactor.*"}[$__rate_interval]))' % $.namespaceMatcher(), '{{pod}}') + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerNetworkTransmitBytesPanel($._config.instance_names.compactor), ) ) .addRow( diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 1ba99603a61..511826f7b7b 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -128,9 +128,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerCPUUsagePanel(title, containerName):: $.panel(title) + $.queryPanel([ - 'sum by(pod) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), containerName], + 'sum by(%s) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__rate_interval]))' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], 'min(container_spec_cpu_quota{%s,container="%s"} / container_spec_cpu_period{%s,container="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], - ], ['{{pod}}', 'limit']) + + ], ['{{%s}}' % $._config.per_instance_label, 'limit']) + { seriesOverrides: [ { @@ -146,10 +146,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel(title) + $.queryPanel([ // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up - // summing the memory of the old pod (whose metric will be stale for 5m) to the new pod. 
- 'max by(pod) (container_memory_working_set_bytes{%s,container="%s"})' % [$.namespaceMatcher(), containerName], + // summing the memory of the old instance/pod (whose metric will be stale for 5m) to the new instance/pod. + 'max by(%s) (container_memory_working_set_bytes{%s,container="%s"})' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], 'min(container_spec_memory_limit_bytes{%s,container="%s"} > 0)' % [$.namespaceMatcher(), containerName], - ], ['{{pod}}', 'limit']) + + ], ['{{%s}}' % $._config.per_instance_label, 'limit']) + { seriesOverrides: [ { @@ -162,6 +162,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; tooltip: { sort: 2 }, // Sort descending. }, + containerNetworkPanel(title, metric, instanceName):: + $.panel(title) + + $.queryPanel( + 'sum by(%(instance)s) (rate(%(metric)s{%(namespace)s,%(instance)s=~"%(instanceName)s"}[$__rate_interval]))' % { + namespace: $.namespaceMatcher(), + metric: metric, + instance: $._config.per_instance_label, + instanceName: instanceName, + }, '{{%s}}' % $._config.per_instance_label) + + $.stack + + { yaxes: $.yaxes('Bps') }, + + containerNetworkReceiveBytesPanel(instanceName):: + $.containerNetworkPanel('Receive Bandwidth', 'container_network_receive_bytes_total', instanceName), + + containerNetworkTransmitBytesPanel(instanceName):: + $.containerNetworkPanel('Transmit Bandwidth', 'container_network_transmit_bytes_total', instanceName), + goHeapInUsePanel(title, jobName):: $.panel(title) + $.queryPanel( From c99479d18b7087fb03aa286197727f79b77b32d9 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Mon, 31 May 2021 23:03:08 -0400 Subject: [PATCH 235/364] fix: lint --- .../mimir-mixin/dashboards/dashboard-utils.libsonnet | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 511826f7b7b..ded63ddc7e4 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -162,15 +162,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; tooltip: { sort: 2 }, // Sort descending. }, - containerNetworkPanel(title, metric, instanceName):: + containerNetworkPanel(title, metric, instanceName):: $.panel(title) + $.queryPanel( 'sum by(%(instance)s) (rate(%(metric)s{%(namespace)s,%(instance)s=~"%(instanceName)s"}[$__rate_interval]))' % { - namespace: $.namespaceMatcher(), + namespace: $.namespaceMatcher(), metric: metric, - instance: $._config.per_instance_label, - instanceName: instanceName, - }, '{{%s}}' % $._config.per_instance_label) + + instance: $._config.per_instance_label, + instanceName: instanceName, + }, '{{%s}}' % $._config.per_instance_label + ) + $.stack + { yaxes: $.yaxes('Bps') }, From 9d9c73939eab73ec85be7a8d9c966373817819b7 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Wed, 2 Jun 2021 14:48:11 -0400 Subject: [PATCH 236/364] refactor: config for job aggregation strings - to make it easier to override, define "cluster_namespace_job" in $._config as `job_aggregation_prefix`. - added some `job_aggregation_labels_*` as well The resulting output does not change (unless config is overridden). 
--- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- jsonnet/mimir-mixin/config.libsonnet | 5 +++++ jsonnet/mimir-mixin/dashboards/writes.libsonnet | 13 ++++++++++--- jsonnet/mimir-mixin/recording_rules.libsonnet | 11 ++++++----- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 7568b4fd177..d34777b212b 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -39,7 +39,7 @@ { alert: 'CortexRequestLatency', expr: ||| - cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} + %(job_aggregation_prefix)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > %(cortex_p99_latency_threshold_seconds)s ||| % $._config, diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 364e7a6ee84..7172ec181e1 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -38,6 +38,11 @@ compactor: 'compactor.*', // Match also custom compactor deployments. }, + // Aggregation strings related to "jobs" + job_aggregation_prefix: 'cluster_namespace_job', + job_aggregation_labels_recording_rules: 'cluster, namespace, job', + job_aggregation_labels_active_series: 'namespace', + // Labels used to in alert aggregations - should uniquely identify // a single Cortex cluster. alert_aggregation_labels: 'cluster, namespace', diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index eb3fcbc3a74..dc79edf7585 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -12,17 +12,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Samples / s') + - $.statPanel('sum(cluster_namespace_job:cortex_distributor_received_samples:rate5m{%s})' % $.jobMatcher($._config.job_names.distributor), format='reqps') + $.statPanel( + 'sum(%(jobAggregationPrefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % { + job: $.jobMatcher($._config.job_names.distributor), + jobAggregationPrefix: $._config.job_aggregation_prefix + }, + format='reqps' + ) ) .addPanel( $.panel('Active Series') + $.statPanel(||| sum(cortex_ingester_memory_series{%(ingester)s} - / on(namespace) group_left - max by (namespace) (cortex_distributor_replication_factor{%(distributor)s})) + / on(%(labels)s) group_left + max by (%(labels)s) (cortex_distributor_replication_factor{%(distributor)s})) ||| % { ingester: $.jobMatcher($._config.job_names.ingester), distributor: $.jobMatcher($._config.job_names.distributor), + labels: $._config.job_aggregation_labels_active_series }, format='short') ) .addPanel( diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index f980474453b..3cb21191829 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -51,10 +51,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; name: 'cortex_received_samples', rules: [ { - record: 'cluster_namespace_job:cortex_distributor_received_samples:rate5m', + record: 
'%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m' % $._config, expr: ||| - sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) - |||, + sum by (%(job_aggregation_labels_recording_rules)s) (rate(cortex_distributor_received_samples_total[5m])) + ||| % $._config, }, ], }, @@ -64,6 +64,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; max_samples_per_sec_per_ingester: 80e3, max_samples_per_sec_per_distributor: 240e3, limit_utilisation_target: 0.6, + job_aggregation_prefix: $._config.job_aggregation_prefix, }, name: 'cortex_scaling_rules', rules: [ @@ -89,7 +90,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m + %(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m )[24h:] ) / %(max_samples_per_sec_per_distributor)s @@ -123,7 +124,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m + %(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m )[24h:] ) * 3 / %(max_samples_per_sec_per_ingester)s From ec05ad6228f485355953c9630e49a4080ae8e805 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Wed, 2 Jun 2021 14:54:55 -0400 Subject: [PATCH 237/364] lint --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index dc79edf7585..6debc021205 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -13,10 +13,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Samples / s') + $.statPanel( - 'sum(%(jobAggregationPrefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % { + 'sum(%(jobAggregationPrefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % { job: $.jobMatcher($._config.job_names.distributor), - jobAggregationPrefix: $._config.job_aggregation_prefix - }, + jobAggregationPrefix: $._config.job_aggregation_prefix, + }, format='reqps' ) ) @@ -29,7 +29,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % { ingester: $.jobMatcher($._config.job_names.ingester), distributor: $.jobMatcher($._config.job_names.distributor), - labels: $._config.job_aggregation_labels_active_series + labels: $._config.job_aggregation_labels_active_series, }, format='short') ) .addPanel( From 46b2c1128e6ad6d17d0300fc3044aea61796cfa2 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Thu, 3 Jun 2021 10:24:22 -0400 Subject: [PATCH 238/364] Update cortex-mixin/dashboards/writes.libsonnet simplify mapping by extending $._config Co-authored-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 6debc021205..e490db608c6 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -13,9 +13,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Samples / s') + $.statPanel( - 'sum(%(jobAggregationPrefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % { + 
'sum(%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % $._config + { job: $.jobMatcher($._config.job_names.distributor), - jobAggregationPrefix: $._config.job_aggregation_prefix, }, format='reqps' ) From 1671e384c0570cfd2fd6e4bb407726416fef8fb4 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Thu, 3 Jun 2021 18:15:29 -0400 Subject: [PATCH 239/364] fix: syntax --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index e490db608c6..dd524e5d8c7 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -13,9 +13,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Samples / s') + $.statPanel( - 'sum(%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % $._config + { - job: $.jobMatcher($._config.job_names.distributor), - }, + 'sum(%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( + $._config { + job: $.jobMatcher($._config.job_names.distributor), + } + ), format='reqps' ) ) From 3299e55d2c432c5dd0d5b8993f25902032e3b2f2 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Fri, 4 Jun 2021 16:52:42 -0400 Subject: [PATCH 240/364] refactor: added a group_config defines group-related strings based off of array-based parameters in _config. deprecated _config.alert_aggregation_labels with a std.trace warning, while maintaining (temporary?) backward compatibility. --- jsonnet/mimir-mixin/alerts.libsonnet | 2 +- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- jsonnet/mimir-mixin/config.libsonnet | 12 ++--- jsonnet/mimir-mixin/dashboards.libsonnet | 2 +- .../mimir-mixin/dashboards/writes.libsonnet | 9 ++-- jsonnet/mimir-mixin/groups.libsonnet | 45 +++++++++++++++++++ jsonnet/mimir-mixin/mixin.libsonnet | 1 + jsonnet/mimir-mixin/recording_rules.libsonnet | 23 +++++----- 8 files changed, 68 insertions(+), 28 deletions(-) create mode 100644 jsonnet/mimir-mixin/groups.libsonnet diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 9369a7da57a..771c62c89d5 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -8,5 +8,5 @@ (import 'alerts/compactor.libsonnet') else {}) + - { _config:: $._config }, + { _config:: $._config + $._group_config }, } diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index d34777b212b..3605702dc58 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -39,7 +39,7 @@ { alert: 'CortexRequestLatency', expr: ||| - %(job_aggregation_prefix)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} + %(group_prefix_jobs)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > %(cortex_p99_latency_threshold_seconds)s ||| % $._config, diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 7172ec181e1..6322945c540 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -38,14 +38,10 @@ compactor: 
'compactor.*', // Match also custom compactor deployments. }, - // Aggregation strings related to "jobs" - job_aggregation_prefix: 'cluster_namespace_job', - job_aggregation_labels_recording_rules: 'cluster, namespace, job', - job_aggregation_labels_active_series: 'namespace', - - // Labels used to in alert aggregations - should uniquely identify - // a single Cortex cluster. - alert_aggregation_labels: 'cluster, namespace', + // Grouping labels, to uniquely identify and group by {jobs, clusters} + job_labels: ['cluster', 'namespace', 'job'], + cluster_labels: ['cluster', 'namespace'], + cortex_p99_latency_threshold_seconds: 2.5, // Whether resources dashboards are enabled (based on cAdvisor metrics). diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/jsonnet/mimir-mixin/dashboards.libsonnet index baf800b3128..9e7f71c28d7 100644 --- a/jsonnet/mimir-mixin/dashboards.libsonnet +++ b/jsonnet/mimir-mixin/dashboards.libsonnet @@ -31,5 +31,5 @@ (import 'dashboards/writes-resources.libsonnet') + (import 'dashboards/alertmanager-resources.libsonnet')) + - { _config:: $._config }, + { _config:: $._config + $._group_config }, } diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index dd524e5d8c7..ea2ce3c3bd6 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Samples / s') + $.statPanel( - 'sum(%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( + 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { job: $.jobMatcher($._config.job_names.distributor), } @@ -25,12 +25,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Active Series') + $.statPanel(||| sum(cortex_ingester_memory_series{%(ingester)s} - / on(%(labels)s) group_left - max by (%(labels)s) (cortex_distributor_replication_factor{%(distributor)s})) - ||| % { + / on(%(group_by_cluster)s) group_left + max by (%(group_by_cluster)s) (cortex_distributor_replication_factor{%(distributor)s})) + ||| % ($._config) { ingester: $.jobMatcher($._config.job_names.ingester), distributor: $.jobMatcher($._config.job_names.distributor), - labels: $._config.job_aggregation_labels_active_series, }, format='short') ) .addPanel( diff --git a/jsonnet/mimir-mixin/groups.libsonnet b/jsonnet/mimir-mixin/groups.libsonnet new file mode 100644 index 00000000000..6781364510b --- /dev/null +++ b/jsonnet/mimir-mixin/groups.libsonnet @@ -0,0 +1,45 @@ +{ + local makePrefix(groups) = std.join('_', groups), + local makeGroupBy(groups) = std.join(', ', groups), + + local group_by_cluster = makeGroupBy($._config.cluster_labels), + + _group_config+:: { + // Each group prefix is composed of `_`-separated labels + group_prefix_jobs: makePrefix($._config.job_labels), + group_prefix_clusters: makePrefix($._config.cluster_labels), + + // Each group-by label list is `, `-separated and unique identifies + group_by_job: makeGroupBy($._config.job_labels), + group_by_cluster: group_by_cluster, + }, + + // The following works around the deprecation of `$._config.alert_aggregation_labels` + // - If an override of that value is detected, a warning will be printed + // - If no override was detected, it will be set to the `group_by_cluster` value, + // which will replace it altogether in the future. 
+ local alert_aggregation_labels_override = ( + { + alert_aggregation_labels: null, + } + super._config + ).alert_aggregation_labels, + + _config+:: { + alert_aggregation_labels: + if alert_aggregation_labels_override != null + then std.trace( + ||| + Deprecated: _config.alert_aggregation_labels + This field has been explicitly overridden to "%s". + Instead, express the override in terms of _config.cluster_labels. + E.g., cluster_labels: %s will automatically convert to "%s". + ||| % [ + alert_aggregation_labels_override, + $._config.cluster_labels, + group_by_cluster, + ], + alert_aggregation_labels_override + ) + else std.trace('All good with group by cluster', group_by_cluster), + }, +} diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/jsonnet/mimir-mixin/mixin.libsonnet index ed281b5b664..bc04944c8da 100644 --- a/jsonnet/mimir-mixin/mixin.libsonnet +++ b/jsonnet/mimir-mixin/mixin.libsonnet @@ -1,4 +1,5 @@ (import 'config.libsonnet') + +(import 'groups.libsonnet') + (import 'dashboards.libsonnet') + (import 'alerts.libsonnet') + (import 'recording_rules.libsonnet') diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 3cb21191829..6461ff47478 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -1,6 +1,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; { + local _config = { + max_series_per_ingester: 1.5e6, + max_samples_per_sec_per_ingester: 80e3, + max_samples_per_sec_per_distributor: 240e3, + limit_utilisation_target: 0.6, + } + $._config + $._group_config, prometheusRules+:: { groups+: [ { @@ -51,21 +57,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; name: 'cortex_received_samples', rules: [ { - record: '%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m' % $._config, + record: '%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m' % _config, expr: ||| - sum by (%(job_aggregation_labels_recording_rules)s) (rate(cortex_distributor_received_samples_total[5m])) - ||| % $._config, + sum by (%(group_by_job)s) (rate(cortex_distributor_received_samples_total[5m])) + ||| % _config, }, ], }, { - local _config = { - max_series_per_ingester: 1.5e6, - max_samples_per_sec_per_ingester: 80e3, - max_samples_per_sec_per_distributor: 240e3, - limit_utilisation_target: 0.6, - job_aggregation_prefix: $._config.job_aggregation_prefix, - }, name: 'cortex_scaling_rules', rules: [ { @@ -90,7 +89,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( - %(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m + %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m )[24h:] ) / %(max_samples_per_sec_per_distributor)s @@ -124,7 +123,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( - %(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m + %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m )[24h:] ) * 3 / %(max_samples_per_sec_per_ingester)s From 4a5b372003fb415bd1ef0fec8fe09f42f6ca9fd2 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Fri, 4 Jun 2021 16:53:44 -0400 Subject: [PATCH 241/364] refactor: added a group_config defines group-related strings based off of array-based parameters in _config. deprecated _config.alert_aggregation_labels with a std.trace warning, while maintaining (temporary?) backward compatibility. 
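A minimal sketch of the new derivation, assuming the defaults from
config.libsonnet (job_labels: ['cluster', 'namespace', 'job'],
cluster_labels: ['cluster', 'namespace']); the override shown is purely
hypothetical:

    // What the groups.libsonnet helpers produce for the defaults:
    //   makePrefix(['cluster', 'namespace', 'job']) => 'cluster_namespace_job'
    //   makeGroupBy(['cluster', 'namespace'])       => 'cluster, namespace'

    // Hypothetical downstream override, now expressed through cluster_labels:
    (import 'mixin.libsonnet') + {
      _config+:: { cluster_labels: ['region', 'cluster', 'namespace'] },
    }
    // group_by_cluster (and the alert_aggregation_labels derived from it)
    // becomes 'region, cluster, namespace'. Overriding
    // alert_aggregation_labels directly still works, but now prints the
    // std.trace deprecation warning.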
--- jsonnet/mimir-mixin/groups.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/groups.libsonnet b/jsonnet/mimir-mixin/groups.libsonnet index 6781364510b..736c0962b98 100644 --- a/jsonnet/mimir-mixin/groups.libsonnet +++ b/jsonnet/mimir-mixin/groups.libsonnet @@ -40,6 +40,6 @@ ], alert_aggregation_labels_override ) - else std.trace('All good with group by cluster', group_by_cluster), + group_by_cluster, }, } From 3b6693d7d14b50a2894388164dea5014be3ebf65 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Fri, 4 Jun 2021 16:56:48 -0400 Subject: [PATCH 242/364] refactor: added a group_config defines group-related strings based off of array-based parameters in _config. deprecated _config.alert_aggregation_labels with a std.trace warning, while maintaining (temporary?) backward compatibility. --- jsonnet/mimir-mixin/groups.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/groups.libsonnet b/jsonnet/mimir-mixin/groups.libsonnet index 736c0962b98..630766722f4 100644 --- a/jsonnet/mimir-mixin/groups.libsonnet +++ b/jsonnet/mimir-mixin/groups.libsonnet @@ -40,6 +40,6 @@ ], alert_aggregation_labels_override ) - group_by_cluster, + else group_by_cluster, }, } From df6a7608fba4fad7b0717421133102eec718f4a8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 7 Jun 2021 08:53:23 +0200 Subject: [PATCH 243/364] Lower CortexIngesterRestarts severity Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 7568b4fd177..276e8842709 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -198,10 +198,13 @@ { alert: 'CortexIngesterRestarts', expr: ||| - changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) > 1 + changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) >= 2 |||, labels: { - severity: 'critical', + // This alert is on a cause not symptom. A couple of ingesters restarts may be suspicious but + // not necessarily an issue (eg. may happen because of the K8S node autoscaler), so we're + // keeping the alert as warning as a signal in case of an outage. + severity: 'warning', }, annotations: { message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.', From 2df257a995843108ebe9dc05dc01651cc6fe694e Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 8 Jun 2021 15:57:47 -0400 Subject: [PATCH 244/364] feature: add some text boxes and descriptions Focussing on the reads and writes dashboards, added some info panels and hover-over descriptions for some of the panels. Some common code used by the compactor also received additional text content. New functions: - addRows - addRowsIf ...to add a list of rows to a dashboard. The `thanosMemcachedCache` function has had some of its query text sprawled out for easier reading and comparison with similar dashboard queries. 
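For context, a rough sketch of how the new helpers are used (fragments taken
from the dashboard changes below; the surrounding dashboard chain is omitted):

    // compactor.libsonnet: getObjectStoreRows() returns a list of rows that
    // is appended in a single call:
    .addRows($.getObjectStoreRows('Object Store', 'compactor'))

    // reads.libsonnet: the same list can also be appended conditionally:
    .addRowsIf(
      std.member($._config.storage_engine, 'blocks'),
      $.getObjectStoreRows('Querier - Blocks Object Store', 'querier')
    )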
--- .../dashboards/compactor.libsonnet | 3 +- .../dashboards/dashboard-utils.libsonnet | 184 +++++++-- .../mimir-mixin/dashboards/reads.libsonnet | 357 ++++++++++++++++-- .../mimir-mixin/dashboards/writes.libsonnet | 203 +++++++++- 4 files changed, 658 insertions(+), 89 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 657cfce7a32..a1d5ea90433 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -103,6 +103,5 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)), ) ) - .addRow($.objectStorePanels1('Object Store', 'compactor')) - .addRow($.objectStorePanels2('', 'compactor')), + .addRows($.getObjectStoreRows('Object Store', 'compactor')), } diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index ded63ddc7e4..57ae82b3a3c 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -14,6 +14,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; then self.addRow(row) else self, + addRowsIf(condition, rows):: + if condition + then + local reduceRows(dashboard, remainingRows) = + if (std.length(remainingRows) == 0) + then dashboard + else + reduceRows( + dashboard.addRow(remainingRows[0]), + std.slice(remainingRows, 1, std.length(remainingRows), 1) + ) + ; + reduceRows(self, rows) + else self, + + addRows(rows):: + addRowsIf(true, rows), + addClusterSelectorTemplates(multi=true):: local d = self { tags: $._config.tags, @@ -43,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; else d .addTemplate('cluster', 'cortex_build_info', 'cluster') .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'), - + editable: true, }, // The mixin allow specialism of the job selector depending on if its a single binary @@ -274,8 +292,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; type: 'text', } + options, - objectStorePanels1(title, component):: - super.row(title) + + getObjectStoreRows(title, component):: [ + ($.row(title) { height: '25px' }) + .addPanel( + $.textPanel( + '', + ||| + - The panels below summarize the rate of requests issued by %s + to object storage, separated by operation type. + - It also includes the average, median, and 99th percentile latency + of each operation and the error rate of each operation. 
+ ||| % component + ) + ), + $.row('') .addPanel( $.panel('Operations / sec') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + @@ -288,62 +319,163 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') }, ) .addPanel( - $.panel('Op: Attributes') + + $.panel('Latency of Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Op: Exists') + + $.panel('Latency of Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="exists"}' % [$.namespaceMatcher(), component]), ), - - // Second row of Object Store stats - objectStorePanels2(title, component):: - super.row(title) + $.row('') .addPanel( - $.panel('Op: Get') + + $.panel('Latency of Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Op: GetRange') + + $.panel('Latency of Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get_range"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Op: Upload') + + $.panel('Latency of Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="upload"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Op: Delete') + + $.panel('Latency of Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), + ], thanosMemcachedCache(title, jobName, component, cacheName):: + local config = { + jobMatcher: $.jobMatcher(jobName), + component: component, + cacheName: cacheName, + cacheNameReadable: std.strReplace(cacheName, '-', ' '), + }; + local panelText = { + 'metadata-cache': + ||| + The metadata cache + is an optional component that the + store-gateway and querier + will check before going to object storage. + This set of panels focuses on the + %s’s use of the metadata cache. + ||| % component, + 'chunks-cache': + ||| + The chunks cache + is an optional component that the + store-gateway + will check before going to object storage. + This helps reduce calls to the object store. + |||, + }[cacheName]; + super.row(title) + .addPanel( + $.textPanel( + '', panelText + ) + ) .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="%s",name="%s"}[$__rate_interval]))' % [$.jobMatcher(jobName), component, cacheName], '{{operation}}') + + $.queryPanel( + ||| + sum by(operation) ( + rate( + thanos_memcached_operations_total{ + %(jobMatcher)s, + component="%(component)s", + name="%(cacheName)s" + }[$__rate_interval] + ) + ) + ||| % config, + '{{operation}}' + ) + $.stack + - { yaxes: $.yaxes('ops') }, + { yaxes: $.yaxes('ops') } + + $.panelDescription( + 'Requests Per Second', + ||| + Requests per second made to + the %(cacheNameReadable)s + from the %(component)s, + separated into request type. 
+ ||| % config + ), ) .addPanel( $.panel('Latency (getmulti)') + - $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="%s",name="%s"}' % [$.jobMatcher(jobName), component, cacheName]) + $.latencyPanel( + 'thanos_memcached_operation_duration_seconds', + ||| + { + %(jobMatcher)s, + operation="getmulti", + component="%(component)s", + name="%(cacheName)s" + } + ||| % config + ) + + $.panelDescription( + 'Latency (getmulti)', + ||| + The average, median (50th percentile) and 99th percentile + time to satisfy a “getmulti” request + made by the %(component)s, + which retrieves multiple items from the cache. + ||| % config + ) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="%s",name="%s"}[$__rate_interval])) / sum(rate(thanos_cache_memcached_requests_total{%s,component="%s",name="%s"}[$__rate_interval]))' % - [ - $.jobMatcher(jobName), - component, - cacheName, - $.jobMatcher(jobName), - component, - cacheName, - ], 'items') + - { yaxes: $.yaxes('percentunit') }, + $.queryPanel( + ||| + sum( + rate( + thanos_cache_memcached_hits_total{ + %(jobMatcher)s, + component="%(component)s", + name="%(cacheName)s" + }[$__rate_interval] + ) + ) + / + sum( + rate( + thanos_cache_memcached_requests_total{ + %(jobMatcher)s, + component="%(component)s", + name="%(cacheName)s" + }[$__rate_interval] + ) + ) + ||| % config, + 'items' + ) + + { yaxes: $.yaxes('percentunit') } + + $.panelDescription( + 'Hit Ratio', + ||| + The fraction of %(component)s requests to the + %(cacheNameReadable)s that successfully return data. + Requests that miss the cache must go to + object storage for the underlying data. + ||| % config + ), ), filterNodeDiskContainer(containerName):: ||| ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], + + panelDescription(title, description):: { + description: ||| + ### %s + %s + ||| % [title, description], + }, } diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 9f98308c9d8..f741b18833d 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -4,117 +4,270 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-reads.json': ($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' }) .addClusterSelectorTemplates() + .addRow( + ($.row('Reads Summary') { height: '175px', showTitle: false }) + .addPanel( + $.textPanel('', ||| +

+ This dashboard shows various health metrics for the Cortex read path. + It is broken into sections for each service on the read path, and organized by the order in which the read request flows. +
+ Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the age of the query). +

+

+ The dashboard shows metrics for the 4 optional caches that can be deployed with Cortex: + the query results cache, the metadata cache, the chunks cache, and the index cache. +
+ These panels will show “no data” if the caches are not deployed. +

+

+ Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage. +

+ |||), + ) + ) + .addRow( + ($.row('Headlines') + + { + height: '100px', + showTitle: false, + }) + .addPanel( + $.panel('Instant Queries / s') + + $.statPanel(||| + sum( + rate( + cortex_request_duration_seconds_count{ + %(queryFrontend)s, + route=~"(prometheus|api_prom)_api_v1_query" + }[1h] + ) + ) + + sum( + rate( + cortex_prometheus_rule_evaluations_total{ + %(ruler)s + }[1h] + ) + ) + ||| % { + queryFrontend: $.jobMatcher($._config.job_names.query_frontend), + ruler: $.jobMatcher($._config.job_names.ruler), + }, format='reqps') + + $.panelDescription( + 'Instant Queries Per Second', + ||| + Rate of instant queries per second being made to the system. + Includes both queries made to the /prometheus API as + well as queries from the ruler. + ||| + ), + ) + .addPanel( + $.panel('Range Queries / s') + + $.statPanel(||| + sum( + rate( + cortex_request_duration_seconds_count{ + %(queryFrontend)s, + route=~"(prometheus|api_prom)_api_v1_query_range" + }[1h] + ) + ) + ||| % { + queryFrontend: $.jobMatcher($._config.job_names.query_frontend), + }, format='reqps') + + $.panelDescription( + 'Range Queries Per Second', + ||| + Rate of range queries per second being made to + Cortex via the /prometheus API. + (The ruler does not issue range queries). + ||| + ), + ) + ) .addRow( $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) + + $.panelDescriptionRps('gateway') ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + + $.panelDescriptionLatency('gateway') ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } + + $.panelDescriptionP99Latency('gateway') ) ) .addRow( $.row('Query Frontend') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) + + $.panelDescriptionRps('query frontend') ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + + $.panelDescriptionLatency('query frontend') ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) 
(rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } + + $.panelDescriptionP99Latency('query frontend') ) ) .addRow( $.row('Query Scheduler') + .addPanel( + $.textPanel( + '', + ||| +

+ The query scheduler is an optional service that moves + the internal queue from the query frontend into a + separate component. + If this service is not deployed, + these panels will show "No Data." +

+ ||| + ) + ) .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + + $.panelDescriptionRps('query scheduler') ) .addPanel( $.panel('Latency (Time in Queue)') + - $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + + $.panelDescriptionLatency('query scheduler') ) ) .addRow( $.row('Cache - Query Results') + .addPanel( + $.textPanel('', ||| +

+ The query results is an optional service is one of 4 + optional caches that can be deployed as part of a Cortex + cluster to improve query performance. + It is used by the query-frontend to cache entire results + of queries. +

+ |||) + ) .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) + + $.panelDescriptionRps('query results') ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) + + $.panelDescriptionLatency('query results') ) ) .addRow( $.row('Querier') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) + + $.panelDescriptionRps( + 'querier' + ) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + + $.panelDescriptionLatency('querier') ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } + + $.panelDescriptionP99Latency('querier') ) ) .addRow( $.row('Ingester') + .addPanel( + $.textPanel( + '', + ||| +

+ For short term queries, queriers go + to the ingester to fetch the data. +

+ ||| + ) + ) .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) + + $.panelDescriptionRps('ingester') ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) + + $.panelDescriptionLatency('ingester') ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } + + $.panelDescriptionP99Latency('ingester') ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') + .addPanel( + $.textPanel( + '', + ||| +

+ For longer term queries, queriers go to the store-gateways to + fetch the data. + Store-gateways are responsible for fetching the data from object + storage. +

+ ||| + ) + ) .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) + + $.panelDescriptionRps('store gateway') ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) + + $.panelDescriptionLatency('store gateway') ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } + + $.panelDescriptionP99Latency('store gateway') ) ) .addRowIf( @@ -143,34 +296,134 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Memcached – Blocks Storage – Block Index (Store-gateway)') + $.row('Memcached – Blocks Storage – Block Index (Store-gateway)') // Resembles thanosMemcachedCache + .addPanel( + $.textPanel( + '', + ||| +

+ The block index cache is an optional component that the + store-gateway will check before going to object storage. + This helps reduce calls to the object store. +

+ ||| + ) + ) .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + + $.queryPanel( + ||| + sum by(operation) ( + rate( + thanos_memcached_operations_total{ + component="store-gateway", + name="index-cache", + %s + }[$__rate_interval] + ) + ) + ||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}' + ) + $.stack + - { yaxes: $.yaxes('ops') }, + { yaxes: $.yaxes('ops') } + + $.panelDescription( + 'Requests Per Second', + ||| + Requests per second made to + the block index cache + from the store-gateway, + separated into request type. + ||| + ), ) .addPanel( $.panel('Latency (getmulti)') + - $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="store-gateway",name="index-cache"}' % $.jobMatcher($._config.job_names.store_gateway)) + $.latencyPanel( + 'thanos_memcached_operation_duration_seconds', + ||| + { + %s, + operation="getmulti", + component="store-gateway", + name="index-cache" + } + ||| % $.jobMatcher($._config.job_names.store_gateway) + ) + + $.panelDescription( + 'Latency (getmulti)', + ||| + The average, median (50th percentile) and 99th percentile + time to satisfy a “getmulti” request + from the store-gateway, + which retrieves multiple items from the cache. + ||| + ) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum by(item_type) (rate(thanos_store_index_cache_hits_total{component="store-gateway",%s}[$__rate_interval])) / sum by(item_type) (rate(thanos_store_index_cache_requests_total{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') + - { yaxes: $.yaxes('percentunit') }, + $.queryPanel( + ||| + sum by(item_type) ( + rate( + thanos_store_index_cache_hits_total{ + component="store-gateway", + %s + }[$__rate_interval] + ) + ) + / + sum by(item_type) ( + rate( + thanos_store_index_cache_requests_total{ + component="store-gateway", + %s + }[$__rate_interval] + ) + ) + ||| % [ + $.jobMatcher($._config.job_names.store_gateway), + $.jobMatcher($._config.job_names.store_gateway), + ], + '{{item_type}}' + ) + + { yaxes: $.yaxes('percentunit') } + + $.panelDescription( + 'Hit Ratio', + ||| + The fraction of requests to the + block index cache that successfully return data. + Requests that miss the cache must go to + object storage for the underlying data. 
+ ||| + ), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.thanosMemcachedCache('Memcached – Blocks Storage – Chunks (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache') + $.thanosMemcachedCache( + 'Memcached – Blocks Storage – Chunks (Store-gateway)', + $._config.job_names.store_gateway, + 'store-gateway', + 'chunks-cache' + ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache') + $.thanosMemcachedCache( + 'Memcached – Blocks Storage – Metadata (Store-gateway)', + $._config.job_names.store_gateway, + 'store-gateway', + 'metadata-cache' + ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache') + $.thanosMemcachedCache( + 'Memcached – Blocks Storage – Metadata (Querier)', + $._config.job_names.querier, + 'querier', + 'metadata-cache' + ) ) .addRowIf( std.member($._config.storage_engine, 'chunks') && @@ -225,21 +478,43 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) // Object store metrics for the store-gateway. - .addRowIf( + .addRowsIf( std.member($._config.storage_engine, 'blocks'), - $.objectStorePanels1('Store-gateway - Blocks Object Store', 'store-gateway'), - ) - .addRowIf( - std.member($._config.storage_engine, 'blocks'), - $.objectStorePanels2('', 'store-gateway'), + $.getObjectStoreRows('Store-gateway - Blocks Object Store', 'store-gateway') ) // Object store metrics for the querier. - .addRowIf( + .addRowsIf( std.member($._config.storage_engine, 'blocks'), - $.objectStorePanels1('Querier - Blocks Object Store', 'querier'), - ) - .addRowIf( - std.member($._config.storage_engine, 'blocks'), - $.objectStorePanels2('', 'querier'), + $.getObjectStoreRows('Querier - Blocks Object Store', 'querier') ), -} +} + +( + { + panelDescriptionRps(service):: + $.panelDescription( + 'Requests Per Second', + ||| + Read requests per second made to the %s(s). + ||| % service + ), + + panelDescriptionLatency(service):: + $.panelDescription( + 'Latency', + ||| + Across all %s instances, the average, median + (50th percentile), and 99th percentile time to respond + to a request. + ||| % service + ), + + panelDescriptionP99Latency(service):: + $.panelDescription( + 'Per Instance P99 Latency', + ||| + The 99th percentile latency for each individual + instance of the %s service. + ||| % service + ), + } +) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index ea2ce3c3bd6..cdf442acd1c 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -5,6 +5,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; ($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' }) .addClusterSelectorTemplates() .addRow( + ($.row('Writes Summary') { height: '125px', showTitle: false }) + .addPanel( + $.textPanel('', ||| +

+ This dashboard shows various health metrics for the Cortex write path. + It is broken into sections for each service on the write path, + and organized by the order in which the write request flows. +
+ Incoming metrics data travels from the gateway → distributor → ingester. +

+

+ It also includes metrics for the key-value (KV) stores used to manage + the High Availability Tracker and the Ingesters. +

+ |||), + ) + ).addRow( ($.row('Headlines') + { height: '100px', @@ -18,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; job: $.jobMatcher($._config.job_names.distributor), } ), - format='reqps' + format='short' ) ) .addPanel( @@ -37,7 +54,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') ) .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) @@ -45,76 +62,89 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Gateway') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) + + $.panelDescriptionRps('gateway') ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) + + $.panelDescriptionLatency('gateway') ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } + + $.panelDescriptionP99Latency('gateway') ) ) .addRow( $.row('Distributor') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) + + $.panelDescriptionRps('distributor') ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) + + $.panelDescriptionLatency('distributor') ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } + + $.panelDescriptionP99Latency('distributor') ) ) .addRow( $.row('KV Store (HA Dedupe)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) + 
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) + + $.panelDescriptionRpsKvStoreDedupe() ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) + + $.panelDescriptionLatencyKvStore() ) ) .addRow( $.row('Ingester') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) + + $.panelDescriptionRps('ingester') ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) + + $.panelDescriptionLatency('ingester') ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } + + $.panelDescriptionP99Latency('ingester') ) ) .addRow( $.row('KV Store (Ring)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) + + $.panelDescriptionRpsKvStoreRing() ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) + + $.panelDescriptionLatencyKvStore() ) ) .addRowIf( @@ -189,36 +219,91 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Uploaded blocks / sec', 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + ) + + $.panelDescription( + 'Uploaded blocks / sec', + ||| + The rate of blocks being uploaded from the ingesters + to the long term storage/object store. + ||| ), ) .addPanel( $.panel('Upload latency') + - $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)), + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + + $.panelDescription( + 'Upload latency', + ||| + The average, median (50th percentile), and 99th percentile time + the ingester takes to upload blocks to the long term storage/object store. 
+ ||| + ), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - TSDB Head') + .addPanel( + $.textPanel('', ||| +

+ The ingester(s) maintain a local TSDB per-tenant on disk. + These panels contain metrics specific to the rate of + compaction of data on the ingesters’ local TSDBs. +

+ |||), + ) .addPanel( $.successFailurePanel( 'Compactions / sec', 'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + ) + + $.panelDescription( + 'Compactions / sec', + ||| + This is the rate of compaction operations local to the ingesters, + where every 2 hours by default, a new TSDB block is created + by compacting the head block. + ||| ), ) .addPanel( $.panel('Compactions latency') + - $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)), + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) + + $.panelDescription( + 'Compaction Latency', + ||| + The average, median (50th percentile), and 99th percentile time + the ingester takes to compact the head block into a new TSDB block + on its local filesystem. + ||| + ), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - TSDB WAL') + .addPanel( + $.textPanel('', ||| +

+ These panels contain metrics for the optional write-ahead-log (WAL) + that can be enabled for the local TSDBs on the ingesters. +

+ |||), + ) .addPanel( $.successFailurePanel( 'WAL truncations / sec', 'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + ) + + $.panelDescription( + 'WAL Truncations / sec', + ||| + The WAL is truncated each time a new TSDB block is written + (by default this is every 2h). This panel measures the rate of + truncations. + ||| ), ) .addPanel( @@ -226,12 +311,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Checkpoints created / sec', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), + ) + + $.panelDescription( + 'Checkpoints created / sec', + ||| + Checkpoints are created as part of the WAL truncation process. + This metric measures the rate of checkpoint creation. + ||| ), ) .addPanel( $.panel('WAL truncations latency (includes checkpointing)') + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + - { yaxes: $.yaxes('s') }, + { yaxes: $.yaxes('s') } + + $.panelDescription( + 'WAL Truncations Latency (including checkpointing)', + ||| + Average time taken to perform a full WAL truncation, + including the time taken for the checkpointing to complete. + ||| + ), ) .addPanel( $.panel('Corruptions / sec') + @@ -248,7 +347,71 @@ local utils = import 'mixin-utils/utils.libsonnet'; WAL: '#E24D42', 'mmap-ed chunks': '#E28A42', }, - }, + } + + $.panelDescription( + 'Corruptions / sec', + ||| + Rate of corrupted WAL and mmap-ed chunks. + ||| + ), ) ), -} +} + +( + { + panelDescriptionRps(service):: + $.panelDescription( + 'Requests Per Second', + ||| + Write requests per second made to the %s(s). + ||| % service + ), + + panelDescriptionRpsKvStoreDedupe():: + $.panelDescription( + 'Requests Per Second', + ||| + Requests per second made to the key-value store + that manages high-availability deduplication. + ||| + ), + + panelDescriptionRpsKvStoreRing():: + $.panelDescription( + 'Requests Per Second', + ||| + Requests per second made to the key-value store + used to manage which ingesters own which metrics series. + ||| + ), + + + panelDescriptionLatency(service):: + $.panelDescription( + 'Latency', + ||| + Across all %s instances, the average, median + (50th percentile), and 99th percentile time to respond + to a request. + ||| % service + ), + + panelDescriptionLatencyKvStore():: + $.panelDescription( + 'Latency', + ||| + The average, median (50th percentile), and 99th percentile time + the KV store takes to respond to a request. + ||| + ), + + panelDescriptionP99Latency(service):: + $.panelDescription( + 'Per Instance P99 Latency', + ||| + The 99th percentile latency for each individual + instance of the %s service. 
+ ||| % service + ), + } +) From 143eb0122393c6cf3c19aef2d96ea50f4c811802 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Wed, 9 Jun 2021 00:28:41 -0400 Subject: [PATCH 245/364] fix: text replacements, repair addRows --- .../dashboards/dashboard-utils.libsonnet | 4 +-- .../mimir-mixin/dashboards/reads.libsonnet | 36 +++++++++---------- .../mimir-mixin/dashboards/writes.libsonnet | 20 +++++------ 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 57ae82b3a3c..ddff873beec 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -30,7 +30,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; else self, addRows(rows):: - addRowsIf(true, rows), + self.addRowsIf(true, rows), addClusterSelectorTemplates(multi=true):: local d = self { @@ -379,7 +379,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.queryPanel( ||| sum by(operation) ( diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index f741b18833d..927086f4aed 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -90,7 +90,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) + $.panelDescriptionRps('gateway') ) @@ -111,7 +111,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) + $.panelDescriptionRps('query frontend') ) @@ -146,7 +146,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + $.panelDescriptionRps('query scheduler') ) @@ -161,16 +161,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.textPanel('', |||

- The query results is an optional service is one of 4 - optional caches that can be deployed as part of a Cortex - cluster to improve query performance. - It is used by the query-frontend to cache entire results - of queries. + The query results cache is one of 4 optional caches + that can be deployed as part of a Cortex cluster to improve query performance. + It is used by the query-frontend to cache entire results of queries.

|||) ) .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) + $.panelDescriptionRps('query results') ) @@ -183,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) + $.panelDescriptionRps( 'querier' @@ -217,7 +215,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescriptionRps('ingester') ) @@ -252,7 +250,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) + $.panelDescriptionRps('store gateway') ) @@ -274,7 +272,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Index') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -286,7 +284,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Chunks') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -310,7 +308,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.queryPanel( ||| sum by(operation) ( @@ -430,7 +428,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -443,7 +441,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -456,7 +454,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -469,7 +467,7 @@ local utils = import 
'mixin-utils/utils.libsonnet'; std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index cdf442acd1c..84c9ab6cae9 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -61,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) + $.panelDescriptionRps('gateway') ) @@ -82,7 +82,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Distributor') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) + $.panelDescriptionRps('distributor') ) @@ -103,7 +103,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('KV Store (HA Dedupe)') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) + $.panelDescriptionRpsKvStoreDedupe() ) @@ -116,7 +116,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescriptionRps('ingester') ) @@ -137,7 +137,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('KV Store (Ring)') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescriptionRpsKvStoreRing() ) @@ -151,7 +151,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -164,7 +164,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -190,7 +190,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), 
$.row('DynamoDB') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -203,7 +203,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( - $.panel('QPS') + + $.panel('Requests Per Second') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( From 6d04a834425e4e011b2f3f0b7540d46c5c96075e Mon Sep 17 00:00:00 2001 From: Jennifer Villa Date: Sun, 13 Jun 2021 14:23:58 -0500 Subject: [PATCH 246/364] Changing copy to add 'latency' as well. --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index ddff873beec..7d5f8047b9b 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -299,7 +299,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.textPanel( '', ||| - - The panels below summarize the rate of requests issued by %s + - The panels below summarize the latency and rate of requests issued by %s to object storage, separated by operation type. - It also includes the average, median, and 99th percentile latency of each operation and the error rate of each operation. From 98dfc2da1e5dc79ed3386806d39ca08f8027790e Mon Sep 17 00:00:00 2001 From: Jennifer Villa Date: Sun, 13 Jun 2021 17:04:01 -0500 Subject: [PATCH 247/364] Cut down on text from initial PR. Tucked existing text from the compactor dashboard under tooltips, rather than making them text boxes. --- .../dashboards/compactor.libsonnet | 59 +++--- .../dashboards/dashboard-utils.libsonnet | 84 +------- .../mimir-mixin/dashboards/reads.libsonnet | 185 ++++-------------- .../mimir-mixin/dashboards/writes.libsonnet | 155 +++------------ 4 files changed, 113 insertions(+), 370 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index a1d5ea90433..4be906a78bc 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -6,12 +6,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addClusterSelectorTemplates() .addRow( $.row('Summary') - .addPanel( - $.textPanel('', ||| - - **Per-instance runs**: number of times a compactor instance triggers a compaction across all tenants its shard manage. - - **Tenants compaction progress**: in a multi-tenant cluster it shows the progress of tenants compacted while compaction is running. Reset to 0 once the compaction run is completed for all tenants in the shard. - |||), - ) .addPanel( $.startedCompletedFailedPanel( 'Per-instance runs / sec', @@ -20,7 +14,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor) ) + $.bars + - { yaxes: $.yaxes('ops') }, + { yaxes: $.yaxes('ops') } + + $.panelDescription( + 'Per-instance runs', + ||| + Number of times a compactor instance triggers a compaction across all tenants its shard manage. 
+ ||| + ), ) .addPanel( $.panel('Tenants compaction progress') + @@ -31,42 +31,55 @@ local utils = import 'mixin-utils/utils.libsonnet'; cortex_compactor_tenants_skipped{%s} ) / cortex_compactor_tenants_discovered{%s} ||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], '{{%s}}' % $._config.per_instance_label) + - { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) } + + $.panelDescription( + 'Tenants compaction progress', + ||| + In a multi-tenant cluster this shows the progress of tenants compacted while compaction is running. + Reset to 0 once the compaction run is completed for all tenants in the shard. + ||| + ), ) ) .addRow( $.row('') - .addPanel( - $.textPanel('', ||| - - **Compacted blocks**: number of blocks generated as a result of a compaction operation. - - **Per-block compaction duration**: time taken to generate a single compacted block. - |||), - ) .addPanel( $.panel('Compacted blocks / sec') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') }, + { yaxes: $.yaxes('ops') } + + $.panelDescription( + 'Compacted Blocks / Sec', + ||| + Time taken to generate a single compacted block + ||| + ), ) .addPanel( $.panel('Per-block compaction duration') + - $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + + $.panelDescription( + 'Per-block compaction duration', + ||| + Rate of blocks generated as a result of a compaction operation + ||| + ), ) ) .addRow( $.row('') - .addPanel( - $.textPanel('', ||| - - **Average blocks / tenant**: the average number of blocks per tenant. - - **Tenants with largest number of blocks**: the 10 tenants with the largest number of blocks. - |||), - ) .addPanel( $.panel('Average blocks / tenant') + $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'), ) .addPanel( $.panel('Tenants with largest number of blocks') + - $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}'), + $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}') + + $.panelDescription( + 'Tenants with largest number of blocks', + ||| + The 10 tenants with the largest number of blocks + ||| + ), ) ) .addRow( diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 7d5f8047b9b..8641bf3ddb1 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -294,19 +294,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; getObjectStoreRows(title, component):: [ - ($.row(title) { height: '25px' }) - .addPanel( - $.textPanel( - '', - ||| - - The panels below summarize the latency and rate of requests issued by %s - to object storage, separated by operation type. - - It also includes the average, median, and 99th percentile latency - of each operation and the error rate of each operation. 
- ||| % component - ) - ), - $.row('') + ($.row(title)) .addPanel( $.panel('Operations / sec') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + @@ -346,39 +334,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], thanosMemcachedCache(title, jobName, component, cacheName):: - local config = { - jobMatcher: $.jobMatcher(jobName), - component: component, - cacheName: cacheName, - cacheNameReadable: std.strReplace(cacheName, '-', ' '), - }; - local panelText = { - 'metadata-cache': - ||| - The metadata cache - is an optional component that the - store-gateway and querier - will check before going to object storage. - This set of panels focuses on the - %s’s use of the metadata cache. - ||| % component, - 'chunks-cache': - ||| - The chunks cache - is an optional component that the - store-gateway - will check before going to object storage. - This helps reduce calls to the object store. - |||, - }[cacheName]; - + local config = { + jobMatcher: $.jobMatcher(jobName), + component: component, + cacheName: cacheName, + cacheNameReadable: std.strReplace(cacheName, '-', ' '), + }; super.row(title) - .addPanel( - $.textPanel( - '', panelText - ) - ) - .addPanel( + .addPanel( $.panel('Requests Per Second') + $.queryPanel( ||| @@ -395,16 +358,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; '{{operation}}' ) + $.stack + - { yaxes: $.yaxes('ops') } + - $.panelDescription( - 'Requests Per Second', - ||| - Requests per second made to - the %(cacheNameReadable)s - from the %(component)s, - separated into request type. - ||| % config - ), + { yaxes: $.yaxes('ops') } ) .addPanel( $.panel('Latency (getmulti)') + @@ -418,15 +372,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; name="%(cacheName)s" } ||| % config - ) + - $.panelDescription( - 'Latency (getmulti)', - ||| - The average, median (50th percentile) and 99th percentile - time to satisfy a “getmulti” request - made by the %(component)s, - which retrieves multiple items from the cache. - ||| % config ) ) .addPanel( @@ -455,16 +400,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % config, 'items' ) + - { yaxes: $.yaxes('percentunit') } + - $.panelDescription( - 'Hit Ratio', - ||| - The fraction of %(component)s requests to the - %(cacheNameReadable)s that successfully return data. - Requests that miss the cache must go to - object storage for the underlying data. - ||| % config - ), + { yaxes: $.yaxes('percentunit') } ), filterNodeDiskContainer(containerName):: diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 927086f4aed..af384f4f7e3 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -5,17 +5,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; ($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' }) .addClusterSelectorTemplates() .addRow( - ($.row('Reads Summary') { height: '175px', showTitle: false }) + ($.row('Reads Dashboard Description') { height: '175px', showTitle: false }) .addPanel( $.textPanel('', |||

- This dashboard shows various health metrics for the Cortex read path. + This dashboard shows health metrics for the Cortex read path. It is broken into sections for each service on the read path, and organized by the order in which the read request flows.
Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the age of the query). +
+ For each service, there are 3 panels showing (1) requests per second to that service, (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service.

- The dashboard shows metrics for the 4 optional caches that can be deployed with Cortex: + The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex: the query results cache, the metadata cache, the chunks cache, and the index cache.
These panels will show “no data” if the caches are not deployed. @@ -82,7 +84,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| Rate of range queries per second being made to Cortex via the /prometheus API. - (The ruler does not issue range queries). ||| ), ) @@ -91,42 +92,36 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Gateway') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) + - $.panelDescriptionRps('gateway') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + - $.panelDescriptionLatency('gateway') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + - { yaxes: $.yaxes('s') } + - $.panelDescriptionP99Latency('gateway') + { yaxes: $.yaxes('s') } ) ) .addRow( $.row('Query Frontend') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) + - $.panelDescriptionRps('query frontend') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + - $.panelDescriptionLatency('query frontend') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' ) + - { yaxes: $.yaxes('s') } + - $.panelDescriptionP99Latency('query frontend') + { yaxes: $.yaxes('s') } ) ) .addRow( @@ -147,125 +142,77 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + - $.panelDescriptionRps('query scheduler') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( $.panel('Latency (Time in Queue)') + - $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + - 
$.panelDescriptionLatency('query scheduler') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) ) .addRow( $.row('Cache - Query Results') - .addPanel( - $.textPanel('', ||| -

- The query results cache is one of 4 optional caches - that can be deployed as part of a Cortex cluster to improve query performance. - It is used by the query-frontend to cache entire results of queries. -

- |||) - ) .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) + - $.panelDescriptionRps('query results') + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) + - $.panelDescriptionLatency('query results') + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) ) ) .addRow( $.row('Querier') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) + - $.panelDescriptionRps( - 'querier' + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) - ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + - $.panelDescriptionLatency('querier') + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' ) + - { yaxes: $.yaxes('s') } + - $.panelDescriptionP99Latency('querier') + { yaxes: $.yaxes('s') } ) ) .addRow( $.row('Ingester') - .addPanel( - $.textPanel( - '', - ||| -

- For short term queries, queriers go - to the ingester to fetch the data. -

- ||| - ) - ) .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) + - $.panelDescriptionRps('ingester') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) + - $.panelDescriptionLatency('ingester') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + - { yaxes: $.yaxes('s') } + - $.panelDescriptionP99Latency('ingester') + { yaxes: $.yaxes('s') } ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') - .addPanel( - $.textPanel( - '', - ||| -

- For longer term queries, queriers go to the store-gateways to - fetch the data. - Store-gateways are responsible for fetching the data from object - storage. -

- ||| - ) - ) .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) + - $.panelDescriptionRps('store gateway') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) + - $.panelDescriptionLatency('store gateway') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' ) + - { yaxes: $.yaxes('s') } + - $.panelDescriptionP99Latency('store gateway') + { yaxes: $.yaxes('s') } ) ) .addRowIf( @@ -294,19 +241,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Memcached – Blocks Storage – Block Index (Store-gateway)') // Resembles thanosMemcachedCache - .addPanel( - $.textPanel( - '', - ||| -

- The block index cache is an optional component that the - store-gateway will check before going to object storage. - This helps reduce calls to the object store. -

- ||| - ) - ) + $.row('Memcached – Blocks Storage – Block Index Cache (Store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( $.panel('Requests Per Second') + $.queryPanel( @@ -323,16 +258,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}' ) + $.stack + - { yaxes: $.yaxes('ops') } + - $.panelDescription( - 'Requests Per Second', - ||| - Requests per second made to - the block index cache - from the store-gateway, - separated into request type. - ||| - ), + { yaxes: $.yaxes('ops') } ) .addPanel( $.panel('Latency (getmulti)') + @@ -346,15 +272,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; name="index-cache" } ||| % $.jobMatcher($._config.job_names.store_gateway) - ) + - $.panelDescription( - 'Latency (getmulti)', - ||| - The average, median (50th percentile) and 99th percentile - time to satisfy a “getmulti” request - from the store-gateway, - which retrieves multiple items from the cache. - ||| ) ) .addPanel( @@ -384,14 +301,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], '{{item_type}}' ) + - { yaxes: $.yaxes('percentunit') } + + { yaxes: $.yaxes('percentunit') } + $.panelDescription( 'Hit Ratio', ||| - The fraction of requests to the - block index cache that successfully return data. - Requests that miss the cache must go to - object storage for the underlying data. + Even if you do not set up memcached for the blocks index cache, you will still see data in this panel because Cortex by default has an + in-memory blocks index cache. ||| ), ) @@ -399,7 +314,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – Blocks Storage – Chunks (Store-gateway)', + 'Memcached – Blocks Storage – Chunks Cache (Store-gateway accesses)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache' @@ -408,7 +323,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – Blocks Storage – Metadata (Store-gateway)', + 'Memcached – Blocks Storage – Metadata Cache (Store-gateway accesses)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache' @@ -417,7 +332,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – Blocks Storage – Metadata (Querier)', + 'Memcached – Blocks Storage – Metadata Cache (Querier accesses)', $._config.job_names.querier, 'querier', 'metadata-cache' @@ -478,41 +393,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Object store metrics for the store-gateway. .addRowsIf( std.member($._config.storage_engine, 'blocks'), - $.getObjectStoreRows('Store-gateway - Blocks Object Store', 'store-gateway') + $.getObjectStoreRows('Blocks Object Store (Store-gateway accesses)', 'store-gateway') ) // Object store metrics for the querier. .addRowsIf( std.member($._config.storage_engine, 'blocks'), - $.getObjectStoreRows('Querier - Blocks Object Store', 'querier') + $.getObjectStoreRows('Blocks Object Store (Querier accesses)', 'querier') ), -} + -( - { - panelDescriptionRps(service):: - $.panelDescription( - 'Requests Per Second', - ||| - Read requests per second made to the %s(s). 
- ||| % service - ), - - panelDescriptionLatency(service):: - $.panelDescription( - 'Latency', - ||| - Across all %s instances, the average, median - (50th percentile), and 99th percentile time to respond - to a request. - ||| % service - ), - - panelDescriptionP99Latency(service):: - $.panelDescription( - 'Per Instance P99 Latency', - ||| - The 99th percentile latency for each individual - instance of the %s service. - ||| % service - ), - } -) +} \ No newline at end of file diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 84c9ab6cae9..5486fd87193 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' }) .addClusterSelectorTemplates() .addRow( - ($.row('Writes Summary') { height: '125px', showTitle: false }) + ($.row('Writes Dashboard Description') { height: '125px', showTitle: false }) .addPanel( $.textPanel('', |||

@@ -14,6 +14,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; and organized by the order in which the write request flows.
Incoming metrics data travels from the gateway → distributor → ingester. +
+ For each service, there are 3 panels showing + (1) requests per second to that service, + (2) average, median, and p99 latency of requests to that service, and + (3) p99 latency of requests to each instance of that service.

It also includes metrics for the key-value (KV) stores used to manage @@ -62,89 +67,76 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Gateway') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) + - $.panelDescriptionRps('gateway') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) + - $.panelDescriptionLatency('gateway') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' ) + - { yaxes: $.yaxes('s') } + - $.panelDescriptionP99Latency('gateway') + { yaxes: $.yaxes('s') } ) ) .addRow( $.row('Distributor') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) + - $.panelDescriptionRps('distributor') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) + - $.panelDescriptionLatency('distributor') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' ) + - { yaxes: $.yaxes('s') } + - $.panelDescriptionP99Latency('distributor') + { yaxes: $.yaxes('s') } ) ) .addRow( - $.row('KV Store (HA Dedupe)') + $.row('Key-Value store for high-availability (HA) deduplication') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) + - $.panelDescriptionRpsKvStoreDedupe() + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) + - $.panelDescriptionLatencyKvStore() + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) ) ) .addRow( $.row('Ingester') .addPanel( 
$.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) + - $.panelDescriptionRps('ingester') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) + - $.panelDescriptionLatency('ingester') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' ) + - { yaxes: $.yaxes('s') } + - $.panelDescriptionP99Latency('ingester') + { yaxes: $.yaxes('s') } ) ) .addRow( - $.row('KV Store (Ring)') + $.row('Key-Value store for the ingester ring') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) + - $.panelDescriptionRpsKvStoreRing() + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) + - $.panelDescriptionLatencyKvStore() + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) ) ) .addRowIf( @@ -224,7 +216,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Uploaded blocks / sec', ||| The rate of blocks being uploaded from the ingesters - to the long term storage/object store. + to object storage. ||| ), ) @@ -235,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Upload latency', ||| The average, median (50th percentile), and 99th percentile time - the ingester takes to upload blocks to the long term storage/object store. + the ingester takes to upload blocks to object storage. ||| ), ) @@ -243,15 +235,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - TSDB Head') - .addPanel( - $.textPanel('', ||| -

- The ingester(s) maintain a local TSDB per-tenant on disk. - These panels contain metrics specific to the rate of - compaction of data on the ingesters’ local TSDBs. -

- |||), - ) .addPanel( $.successFailurePanel( 'Compactions / sec', @@ -261,9 +244,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Compactions / sec', ||| - This is the rate of compaction operations local to the ingesters, - where every 2 hours by default, a new TSDB block is created - by compacting the head block. + Ingesters maintain a local TSDB per-tenant on disk. Each TSDB maintains a head block for each + active time series; these blocks get periodically compacted (by default, every 2h). + This panel shows the rate of compaction operations across all TSDBs on all ingesters. ||| ), ) @@ -273,24 +256,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Compaction Latency', ||| - The average, median (50th percentile), and 99th percentile time - the ingester takes to compact the head block into a new TSDB block - on its local filesystem. + The average, median (50th percentile), and 99th percentile time ingesters take to compact head blocks + on the local filesystem. ||| ), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Ingester - Blocks storage - TSDB WAL') - .addPanel( - $.textPanel('', ||| -

- These panels contain metrics for the optional write-ahead-log (WAL) - that can be enabled for the local TSDBs on the ingesters. -

- |||), - ) + $.row('Ingester - Blocks storage - TSDB Write Ahead Log (WAL)') .addPanel( $.successFailurePanel( 'WAL truncations / sec', @@ -300,8 +274,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'WAL Truncations / sec', ||| - The WAL is truncated each time a new TSDB block is written - (by default this is every 2h). This panel measures the rate of + The WAL is truncated each time a new TSDB block is written. This panel measures the rate of truncations. ||| ), @@ -347,71 +320,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; WAL: '#E24D42', 'mmap-ed chunks': '#E28A42', }, - } + - $.panelDescription( - 'Corruptions / sec', - ||| - Rate of corrupted WAL and mmap-ed chunks. - ||| - ), + }, ) ), -} + -( - { - panelDescriptionRps(service):: - $.panelDescription( - 'Requests Per Second', - ||| - Write requests per second made to the %s(s). - ||| % service - ), - - panelDescriptionRpsKvStoreDedupe():: - $.panelDescription( - 'Requests Per Second', - ||| - Requests per second made to the key-value store - that manages high-availability deduplication. - ||| - ), - - panelDescriptionRpsKvStoreRing():: - $.panelDescription( - 'Requests Per Second', - ||| - Requests per second made to the key-value store - used to manage which ingesters own which metrics series. - ||| - ), - - - panelDescriptionLatency(service):: - $.panelDescription( - 'Latency', - ||| - Across all %s instances, the average, median - (50th percentile), and 99th percentile time to respond - to a request. - ||| % service - ), - - panelDescriptionLatencyKvStore():: - $.panelDescription( - 'Latency', - ||| - The average, median (50th percentile), and 99th percentile time - the KV store takes to respond to a request. - ||| - ), - - panelDescriptionP99Latency(service):: - $.panelDescription( - 'Per Instance P99 Latency', - ||| - The 99th percentile latency for each individual - instance of the %s service. - ||| % service - ), - } -) +} \ No newline at end of file From 1d5daac3b5afa40faf2635ed667784ead1301b02 Mon Sep 17 00:00:00 2001 From: Jennifer Villa Date: Sun, 13 Jun 2021 17:18:20 -0500 Subject: [PATCH 248/364] Getting rid of a few space/comma errors. 
--- .../dashboards/dashboard-utils.libsonnet | 3 +-- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 14 +++++++------- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 8641bf3ddb1..6a6845e7150 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -292,9 +292,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; type: 'text', } + options, - getObjectStoreRows(title, component):: [ - ($.row(title)) + super.row(title) .addPanel( $.panel('Operations / sec') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index af384f4f7e3..d606eb61d57 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + @@ -110,7 +110,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Query Frontend') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( $.panel('Latency') + @@ -121,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' ) + - { yaxes: $.yaxes('s') } + { yaxes: $.yaxes('s') } ) ) .addRow( @@ -146,7 +146,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Latency (Time in Queue)') + - $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) ) .addRow( @@ -165,7 +165,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Requests Per Second') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) - ) + ) .addPanel( $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) @@ -258,7 +258,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % 
$.jobMatcher($._config.job_names.store_gateway), '{{operation}}' ) + $.stack + - { yaxes: $.yaxes('ops') } + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Latency (getmulti)') + @@ -400,4 +400,4 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.getObjectStoreRows('Blocks Object Store (Querier accesses)', 'querier') ), -} \ No newline at end of file +} diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 5486fd87193..2815da0d61a 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -323,4 +323,4 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ) ), -} \ No newline at end of file +} From 48e81683b77ec1cb357850272b2825ae22ca36af Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:29:38 -0400 Subject: [PATCH 249/364] Update cortex-mixin/dashboards/compactor.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 4be906a78bc..28c55a892b9 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Tenants compaction progress', ||| - In a multi-tenant cluster this shows the progress of tenants compacted while compaction is running. + In a multi-tenant cluster, display the progress of tenants that are compacted while compaction is running. Reset to 0 once the compaction run is completed for all tenants in the shard. ||| ), From 427d787a798ccc96e5d44ec0a99a8db0be07cc42 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:29:54 -0400 Subject: [PATCH 250/364] Update cortex-mixin/dashboards/compactor.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 28c55a892b9..1867f45f70e 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Tenants compaction progress', ||| In a multi-tenant cluster, display the progress of tenants that are compacted while compaction is running. - Reset to 0 once the compaction run is completed for all tenants in the shard. + Reset to `0` after the compaction run is completed for all tenants in the shard. 
||| ), ) From 2a4cfd23f9e56286cb16fecdf9cf2e474417ce6c Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:30:04 -0400 Subject: [PATCH 251/364] Update cortex-mixin/dashboards/compactor.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 1867f45f70e..7e5b6c23a38 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -48,7 +48,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + { yaxes: $.yaxes('ops') } + $.panelDescription( - 'Compacted Blocks / Sec', + 'Compacted blocks / sec', ||| Time taken to generate a single compacted block ||| From 290ea243bbddf3fc7d126a3530af2a818c5cc68c Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:30:48 -0400 Subject: [PATCH 252/364] Update cortex-mixin/dashboards/compactor.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 7e5b6c23a38..027297cff7e 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -50,7 +50,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Compacted blocks / sec', ||| - Time taken to generate a single compacted block + Display the amount of time that it’s taken to generate a single compacted block. ||| ), ) From 08f2f32d9604b4e908231eee0c2a96b7bb0dc7a1 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:30:59 -0400 Subject: [PATCH 253/364] Update cortex-mixin/dashboards/compactor.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 027297cff7e..98df4965d90 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -60,7 +60,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Per-block compaction duration', ||| - Rate of blocks generated as a result of a compaction operation + Rate of blocks that are generated as a result of a compaction operation. 
||| ), ) From 2c3a117cd207b3716778a82a331916cad7b19aae Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:31:09 -0400 Subject: [PATCH 254/364] Update cortex-mixin/dashboards/compactor.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 98df4965d90..c4b77074157 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -77,7 +77,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Tenants with largest number of blocks', ||| - The 10 tenants with the largest number of blocks + The 10 tenants with the largest number of blocks. ||| ), ) From da98abcefbbfdba8a76b852e4c49ad28d0cc90f7 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 15 Jun 2021 13:33:41 -0400 Subject: [PATCH 255/364] fix: formatting - limit to 4 panels per row --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 84c9ab6cae9..e766ad64c1e 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -282,7 +282,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Ingester - Blocks storage - TSDB WAL') + ($.row('Ingester - Blocks storage - TSDB WAL') {height: "32px"}) .addPanel( $.textPanel('', |||
@@ -291,6 +291,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
|||), ) + ) + .addRowIf( + std.member($._config.storage_engine, 'blocks'), + ($.row('') {showTitle: false}) .addPanel( $.successFailurePanel( 'WAL truncations / sec', From 35f9e73ae2d864381c9ef42ca4ba64ea87ae62b9 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 15 Jun 2021 13:39:28 -0400 Subject: [PATCH 256/364] fmt --- .../dashboards/dashboard-utils.libsonnet | 14 +++++++------- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 6a6845e7150..4433e28a8a8 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -333,14 +333,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], thanosMemcachedCache(title, jobName, component, cacheName):: - local config = { - jobMatcher: $.jobMatcher(jobName), - component: component, - cacheName: cacheName, - cacheNameReadable: std.strReplace(cacheName, '-', ' '), - }; + local config = { + jobMatcher: $.jobMatcher(jobName), + component: component, + cacheName: cacheName, + cacheNameReadable: std.strReplace(cacheName, '-', ' '), + }; super.row(title) - .addPanel( + .addPanel( $.panel('Requests Per Second') + $.queryPanel( ||| diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index d606eb61d57..c6004e04a2f 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -92,7 +92,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Gateway') .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( $.panel('Latency') + @@ -114,7 +114,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + @@ -142,7 +142,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Requests Per Second') + - $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( $.panel('Latency (Time in Queue)') + @@ -301,7 +301,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], '{{item_type}}' ) + - { yaxes: $.yaxes('percentunit') } + + { yaxes: $.yaxes('percentunit') } + $.panelDescription( 'Hit Ratio', ||| From 877e06f6afdeadf20c4bc2c7ab138c53059f4f06 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 15 Jun 2021 13:40:49 -0400 Subject: [PATCH 257/364] fix: remove accidental line --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 1 - 1 file changed, 1 deletion(-) diff --git 
a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 4433e28a8a8..5627522ffc9 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -61,7 +61,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; else d .addTemplate('cluster', 'cortex_build_info', 'cluster') .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'), - editable: true, }, // The mixin allow specialism of the job selector depending on if its a single binary From b46277a4d56d38de044d22f1f3556c6e942d5da3 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:43:11 -0400 Subject: [PATCH 258/364] Update cortex-mixin/dashboards/dashboard-utils.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 5627522ffc9..a1c25996778 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -340,7 +340,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }; super.row(title) .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.queryPanel( ||| sum by(operation) ( From 2395da8f3d1bdf43aec9bda8758e1314afe28eb0 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:47:24 -0400 Subject: [PATCH 259/364] Update cortex-mixin/dashboards/reads.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index c6004e04a2f..b44a4501490 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' }) .addClusterSelectorTemplates() .addRow( - ($.row('Reads Dashboard Description') { height: '175px', showTitle: false }) + ($.row('Reads dashboard description') { height: '175px', showTitle: false }) .addPanel( $.textPanel('', |||
From 302c8aea3a2fa42bdab48bec1f538ac069649151 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:47:56 -0400 Subject: [PATCH 260/364] Update cortex-mixin/dashboards/reads.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index b44a4501490..9d95cb8c479 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Instant Queries / s') + + $.panel('Instant queries / sec') + $.statPanel(||| sum( rate( From da1744f663bffdf872ebd002871dbfe0fe4e9249 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:48:13 -0400 Subject: [PATCH 261/364] Update cortex-mixin/dashboards/writes.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 2815da0d61a..eae9436f203 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -298,7 +298,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + { yaxes: $.yaxes('s') } + $.panelDescription( - 'WAL Truncations Latency (including checkpointing)', + 'WAL truncations latency (including checkpointing)', ||| Average time taken to perform a full WAL truncation, including the time taken for the checkpointing to complete. From 58dc25b900fdb083064cf689461718ba24ff268c Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:49:04 -0400 Subject: [PATCH 262/364] Update cortex-mixin/dashboards/writes.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index eae9436f203..d45a6eb489b 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -272,7 +272,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + $.panelDescription( - 'WAL Truncations / sec', + 'WAL truncations / sec', ||| The WAL is truncated each time a new TSDB block is written. This panel measures the rate of truncations. 
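Note for readers skimming these panel-title renames: as a sketch of what the 'WAL truncations latency' query template above evaluates to once `$.jobMatcher($._config.job_names.ingester)` is substituted, the query below is purely illustrative — the `cluster`/`job` selectors are an assumption about a typical deployment and are not defined in this patch.

```
# Average time of a full WAL truncation, including checkpointing (selectors are illustrative).
sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{cluster=~"$cluster", job=~"($namespace)/ingester"}[$__rate_interval]))
/
sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{cluster=~"$cluster", job=~"($namespace)/ingester"}[$__rate_interval]))
```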
From 99c0b8af7b945db87781f09464869a31f167b043 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:49:15 -0400 Subject: [PATCH 263/364] Update cortex-mixin/dashboards/writes.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index d45a6eb489b..df7af24011d 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -264,7 +264,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Ingester - Blocks storage - TSDB Write Ahead Log (WAL)') + $.row('Ingester - blocks storage - TSDB write ahead log (WAL)') .addPanel( $.successFailurePanel( 'WAL truncations / sec', From 9e8b0a9a9f99c6d91e11b49dbb1182516f52a1bb Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:49:21 -0400 Subject: [PATCH 264/364] Update cortex-mixin/dashboards/writes.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index df7af24011d..dc62e4c5766 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -254,7 +254,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Compactions latency') + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( - 'Compaction Latency', + 'Compaction latency', ||| The average, median (50th percentile), and 99th percentile time ingesters take to compact head blocks on the local filesystem. 
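The 'Compaction latency' panel renamed above wraps `cortex_ingester_tsdb_compaction_duration_seconds` via `$.latencyPanel`; the exact query that helper emits is not shown in this patch, but a typical percentile query over such a histogram looks roughly like the following, with placeholder label matchers.

```
# Illustrative 99th percentile of per-ingester head-block compaction duration; matchers are placeholders.
histogram_quantile(
  0.99,
  sum by (le) (
    rate(cortex_ingester_tsdb_compaction_duration_seconds_bucket{cluster=~"$cluster", job=~"($namespace)/ingester"}[$__rate_interval])
  )
)
```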
From c3c4c68b8bb8d997893ac011bd46d7350245a4d7 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:49:28 -0400 Subject: [PATCH 265/364] Update cortex-mixin/dashboards/writes.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index dc62e4c5766..3c87f193ffd 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -195,7 +195,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( From 3bbcb8af2b2a0974e3e19df3c8cd9efebb1f8788 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:50:13 -0400 Subject: [PATCH 266/364] Update cortex-mixin/dashboards/reads.libsonnet Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 9d95cb8c479..8ef201c5ad5 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -57,7 +57,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ruler: $.jobMatcher($._config.job_names.ruler), }, format='reqps') + $.panelDescription( - 'Instant Queries Per Second', + 'Instant Queries per second', ||| Rate of instant queries per second being made to the system. 
Includes both queries made to the /prometheus API as From 4160279ba3ac9ecf41affcbca4dbabbb36b96f40 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 15 Jun 2021 13:51:45 -0400 Subject: [PATCH 267/364] fix: Requests per second --- .../mimir-mixin/dashboards/reads.libsonnet | 28 +++++++++---------- .../mimir-mixin/dashboards/writes.libsonnet | 20 ++++++------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 8ef201c5ad5..d58172226d8 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -91,7 +91,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( @@ -109,7 +109,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( @@ -141,7 +141,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( @@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Cache - Query Results') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( @@ -163,7 +163,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -181,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -200,7 +200,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( @@ -219,7 +219,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Index') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) 
.addPanel( @@ -231,7 +231,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Chunks') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -243,7 +243,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks Storage – Block Index Cache (Store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.queryPanel( ||| sum by(operation) ( @@ -343,7 +343,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -356,7 +356,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -369,7 +369,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -382,7 +382,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 3c87f193ffd..01f116a40f4 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -59,14 +59,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') ) .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( @@ -84,7 +84,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Distributor') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_request_duration_seconds_count{%s, 
route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( @@ -102,7 +102,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Key-Value store for high-availability (HA) deduplication') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( @@ -113,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -131,7 +131,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Key-Value store for the ingester ring') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -143,7 +143,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -156,7 +156,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -169,7 +169,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -182,7 +182,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( - $.panel('Requests Per Second') + + $.panel('Requests per second') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( From 3f131f4b837e8a07023b4ce3f95b71681c1eca75 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 15 Jun 2021 13:53:19 -0400 Subject: [PATCH 268/364] fix: text --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index d58172226d8..971c51df138 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -66,7 +66,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Range Queries / s') + + $.panel('Range queries / s') + $.statPanel(||| sum( rate( From aee69f1b74344aaab812db8abc321acb5aa6e35c Mon Sep 17 00:00:00 2001 From: Darren Janeczek 
<38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 13:58:33 -0400 Subject: [PATCH 269/364] Apply suggestions from code review as per @osg-grafana Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 4 ++-- jsonnet/mimir-mixin/dashboards/writes.libsonnet | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 971c51df138..e73afc223e5 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -80,7 +80,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; queryFrontend: $.jobMatcher($._config.job_names.query_frontend), }, format='reqps') + $.panelDescription( - 'Range Queries Per Second', + 'Range queries per second', ||| Rate of range queries per second being made to Cortex via the /prometheus API. @@ -135,7 +135,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; the internal queue from the query frontend into a separate component. If this service is not deployed, - these panels will show "No Data." + these panels will show "No data."
||| ) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 01f116a40f4..ae36445b5e4 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' }) .addClusterSelectorTemplates() .addRow( - ($.row('Writes Dashboard Description') { height: '125px', showTitle: false }) + ($.row('Writes dashboard description') { height: '125px', showTitle: false }) .addPanel( $.textPanel('', |||
@@ -22,7 +22,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
It also includes metrics for the key-value (KV) stores used to manage - the High Availability Tracker and the Ingesters. + the high-availability tracker and the ingesters.
|||), ) @@ -100,7 +100,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Key-Value store for high-availability (HA) deduplication') + $.row('Key-value store for high-availability (HA) deduplication') .addPanel( $.panel('Requests per second') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) @@ -129,7 +129,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Key-Value store for the ingester ring') + $.row('Key-value store for the ingester ring') .addPanel( $.panel('Requests per second') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) From 299099fd2d5d930691252d82ea517d3fe08b6258 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 15 Jun 2021 16:03:56 -0400 Subject: [PATCH 270/364] fix: clarity --- jsonnet/mimir-mixin/dashboards/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index c4b77074157..18065547df8 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -18,7 +18,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Per-instance runs', ||| - Number of times a compactor instance triggers a compaction across all tenants its shard manage. + Number of times a compactor instance triggers a compaction across all tenants that it manages. ||| ), ) From 7d5a0e14f06df1071b08108f5924e869d6381fb0 Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Tue, 15 Jun 2021 16:06:48 -0400 Subject: [PATCH 271/364] Apply suggestions from code review as per @osg-grafana Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com> --- jsonnet/mimir-mixin/dashboards/reads.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index e73afc223e5..84026a8802d 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -241,7 +241,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Memcached – Blocks Storage – Block Index Cache (Store-gateway accesses)') // Resembles thanosMemcachedCache + $.row('Memcached – blocks storage – block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( $.panel('Requests per second') + $.queryPanel( @@ -314,7 +314,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – Blocks Storage – Chunks Cache (Store-gateway accesses)', + 'Memcached – blocks storage – chunks cache (store-gateway accesses)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache' @@ -323,7 +323,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – Blocks Storage – Metadata Cache (Store-gateway accesses)', + 'Memcached – blocks storage – metadata cache (store-gateway accesses)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache' @@ -332,7 +332,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( 
std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – Blocks Storage – Metadata Cache (Querier accesses)', + 'Memcached – blocks storage – metadata cache (querier accesses)', $._config.job_names.querier, 'querier', 'metadata-cache' From 1c214d311eb8ed72bc546f0c35669f3d6b2e1f33 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Wed, 16 Jun 2021 15:30:08 -0700 Subject: [PATCH 272/364] Add a simple playbook for ingester series limit alert. Signed-off-by: Callum Styan --- jsonnet/mimir-mixin/docs/playbooks.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 16511382e96..5f0734a124b 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -26,7 +26,9 @@ If nothing obvious from the above, check for increased load: ### CortexIngesterReachingSeriesLimit -_TODO: this playbook has not been written yet._ +First check the writes resources dashboard and scaling dashboard. The usual target is 1.5M active series per ingester and a max of 2.5M. + +Scaling up the ingesters will help, but it won't resolve the alert immediately, as series are active until the next TSDB Head compaction (every 2h or so). You may also want to temporarily increase the per ingester series limit (this is a runtime reloadable config option) until that next Head compaction occurs. ### CortexIngesterReachingTenantsLimit From d062e94008e761628f99416b13eeadcc6b0fad1c Mon Sep 17 00:00:00 2001 From: Johanna Ratliff Date: Thu, 17 Jun 2021 09:11:23 -0600 Subject: [PATCH 273/364] Add cortex-gw-internal to watched gateway metrics (https://github.com/grafana/cortex-jsonnet/pull/328) * Add cortex-gw-internal to watched gateway metrics * Update CHANGELOG.md Co-authored-by: Marco Pracucci --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index d07c61be22e..95ddc0df6fe 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -34,7 +34,7 @@ query_scheduler: 'query-scheduler', // Not part of single-binary. table_manager: '(table-manager|cortex$)', store_gateway: '(store-gateway|cortex$)', - gateway: '(gateway|cortex-gw)', + gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. 
}, From 1ed03d637b66c382f7236feb7de307e3e302a3e2 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Thu, 17 Jun 2021 13:34:32 -0400 Subject: [PATCH 274/364] fix: query formatting to aid in merge --- .../dashboards/dashboard-utils.libsonnet | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index a1c25996778..ea0f15923f5 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -403,7 +403,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; filterNodeDiskContainer(containerName):: ||| - ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ignoring(%s) group_right() ( + label_replace( + count by( + %s, + %s, + device + ) + ( + container_fs_writes_bytes_total{ + %s, + container="%s", + device!~".*sda.*" + } + ), + "device", + "$1", + "device", + "/dev/(.*)" + ) * 0 + ) ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], panelDescription(title, description):: { From c24a79a7cac91c70b0a0540e732f080cf4dc29b7 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Thu, 17 Jun 2021 13:36:43 -0400 Subject: [PATCH 275/364] fix: query formatting to aid in merge --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index ea0f15923f5..254619d432d 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -423,7 +423,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; "/dev/(.*)" ) * 0 ) - ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], + ||| % [ + $._config.per_instance_label, + $._config.per_node_label, + $._config.per_instance_label, + $.namespaceMatcher(), + containerName, + ], panelDescription(title, description):: { description: ||| From 677b9c437baa877c64c58efd7fd29f657b92cc64 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Thu, 17 Jun 2021 15:07:29 -0400 Subject: [PATCH 276/364] fix: consistent labelling --- .../mimir-mixin/dashboards/reads.libsonnet | 6 +++--- .../mimir-mixin/dashboards/writes.libsonnet | 20 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 84026a8802d..aa7d1b26ede 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Instant queries / sec') + + $.panel('Instant queries per second') + $.statPanel(||| sum( rate( @@ -57,7 +57,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ruler: $.jobMatcher($._config.job_names.ruler), }, format='reqps') + $.panelDescription( - 'Instant Queries per second', + 'Instant queries per second', ||| Rate of instant queries per second being made to the system. 
Includes both queries made to the /prometheus API as @@ -66,7 +66,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Range queries / s') + + $.panel('Range queries per second') + $.statPanel(||| sum( rate( diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index ae36445b5e4..799fcc4f7b2 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -33,7 +33,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Samples / s') + + $.panel('Samples per second') + $.statPanel( 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { @@ -208,12 +208,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester - Blocks storage - Shipper') .addPanel( $.successFailurePanel( - 'Uploaded blocks / sec', + 'Uploaded blocks per second', 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + $.panelDescription( - 'Uploaded blocks / sec', + 'Uploaded blocks per second', ||| The rate of blocks being uploaded from the ingesters to object storage. @@ -237,12 +237,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester - Blocks storage - TSDB Head') .addPanel( $.successFailurePanel( - 'Compactions / sec', + 'Compactions per second', 'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + $.panelDescription( - 'Compactions / sec', + 'Compactions per second', ||| Ingesters maintain a local TSDB per-tenant on disk. Each TSDB maintains a head block for each active time series; these blocks get periodically compacted (by default, every 2h). @@ -267,12 +267,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester - blocks storage - TSDB write ahead log (WAL)') .addPanel( $.successFailurePanel( - 'WAL truncations / sec', + 'WAL truncations per second', 'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + $.panelDescription( - 'WAL truncations / sec', + 'WAL truncations per second', ||| The WAL is truncated each time a new TSDB block is written. This panel measures the rate of truncations. 
@@ -281,12 +281,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.successFailurePanel( - 'Checkpoints created / sec', + 'Checkpoints created per second', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + $.panelDescription( - 'Checkpoints created / sec', + 'Checkpoints created per second', ||| Checkpoints are created as part of the WAL truncation process. This metric measures the rate of checkpoint creation. @@ -306,7 +306,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Corruptions / sec') + + $.panel('Corruptions per second') + $.queryPanel([ 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), From e56f0e1a7c212d1e91a0df8276bfea76d0f0b753 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Thu, 17 Jun 2021 15:22:50 -0400 Subject: [PATCH 277/364] fix: ensure panel titles are consistent - Most existing "per second" panel titles in `main` are written "/ sec", corrected recent commits to match. --- .../mimir-mixin/dashboards/reads.libsonnet | 32 ++++++++--------- .../mimir-mixin/dashboards/writes.libsonnet | 36 +++++++++---------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index aa7d1b26ede..cb2411d448c 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Instant queries per second') + + $.panel('Instant queries / sec') + $.statPanel(||| sum( rate( @@ -66,7 +66,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Range queries per second') + + $.panel('Range queries / sec') + $.statPanel(||| sum( rate( @@ -91,7 +91,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( @@ -109,7 +109,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( @@ -141,7 +141,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( @@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Cache - Query Results') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + 
$.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( @@ -163,7 +163,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -181,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -200,7 +200,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( @@ -219,7 +219,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Index') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -231,7 +231,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached - Chunks storage - Chunks') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -243,7 +243,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Memcached – blocks storage – block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.queryPanel( ||| sum by(operation) ( @@ -343,7 +343,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -356,7 +356,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -369,7 +369,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + 
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( @@ -382,7 +382,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 799fcc4f7b2..cf49e0dab56 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -33,7 +33,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Samples per second') + + $.panel('Samples / sec') + $.statPanel( 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { @@ -59,14 +59,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') ) .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( @@ -84,7 +84,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Distributor') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( @@ -102,7 +102,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Key-value store for high-availability (HA) deduplication') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( @@ -113,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -131,7 +131,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Key-value store for the ingester ring') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -143,7 +143,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'chunks'), $.row('Memcached') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -156,7 +156,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + 
$._config.chunk_store_backend, 'cassandra'), $.row('Cassandra') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -169,7 +169,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), $.row('BigTable') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -182,7 +182,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), $.row('DynamoDB') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -195,7 +195,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.chunk_store_backend, 'gcs'), $.row('GCS') .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( @@ -208,12 +208,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester - Blocks storage - Shipper') .addPanel( $.successFailurePanel( - 'Uploaded blocks per second', + 'Uploaded blocks / sec', 'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + $.panelDescription( - 'Uploaded blocks per second', + 'Uploaded blocks / sec', ||| The rate of blocks being uploaded from the ingesters to object storage. 
@@ -237,7 +237,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester - Blocks storage - TSDB Head') .addPanel( $.successFailurePanel( - 'Compactions per second', + 'Compactions / sec', 'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + @@ -267,7 +267,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester - blocks storage - TSDB write ahead log (WAL)') .addPanel( $.successFailurePanel( - 'WAL truncations per second', + 'WAL truncations / sec', 'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + @@ -281,7 +281,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.successFailurePanel( - 'Checkpoints created per second', + 'Checkpoints created / sec', 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), ) + @@ -306,7 +306,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Corruptions per second') + + $.panel('Corruptions / sec') + $.queryPanel([ 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), From c857e0173e2f03f7e232f1e43d08a0e61c68c697 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 12:05:15 +0200 Subject: [PATCH 278/364] Improved CortexIngesterReachingSeriesLimit playbook and added CortexIngesterReachingTenantsLimit playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 42 +++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 5f0734a124b..163a09df0d4 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -26,13 +26,49 @@ If nothing obvious from the above, check for increased load: ### CortexIngesterReachingSeriesLimit -First check the writes resources dashboard and scaling dashboard. The usual target is 1.5M active series per ingester and a max of 2.5M. +This alert fires when the `max_series` per ingester instance limit is enabled and the actual number of in-memory series in a ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new series, while appending samples to existing ones will continue to succeed. -Scaling up the ingesters will help, but it won't resolve the alert immediately, as series are active until the next TSDB Head compaction (every 2h or so). 
You may also want to temporarily increase the per ingester series limit (this is a runtime reloadable config option) until that next Head compaction occurs. +In case of **emergency**: +- If the actual number of series is very close or already hit the limit, then you can increase the limit via runtime config to gain some time +- Increasing the limit will increase the ingesters memory utilization. Please monitor the ingesters memory utilization via the `Cortex / Writes Resources` dashboard + +How the limit is **configured**: +- The limit can be configured either on CLI (`-ingester.instance-limits.max-series`) or in the runtime config: + ``` + ingester_limits: + max_series: + ``` +- The mixin configures the limit in the runtime config and can be fine-tuned via `_config+:: { ingester_instance_limits+:: { ... } }` +- When configured in the runtime config, changes are applied live without requiring an ingester restart +- The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_series"}` + +How to **fix**: +1. **Scale up ingesters**
+ Scaling up ingesters will lower the number of series per ingester. However, the effect of this change will take up to 4h, because after the scale up we need to wait until all stale series are dropped from memory as the effect of TSDB head compaction, which could take up to 4h (with the default config, TSDB keeps in-memory series up to 3h old and it gets compacted every 2h). +2. **Temporarily increase the limit**
+ If the actual number of series is very close or already hit the limit, or if you foresee the ingester will hit the limit before dropping the stale series as effect of the scale up, you should also temporarily increase the limit. ### CortexIngesterReachingTenantsLimit -_TODO: this playbook has not been written yet._ +This alert fires when the `max_tenants` per ingester instance limit is enabled and the actual number of tenants in a ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new tenants, while they will continue to succeed for previously existing ones. + +In case of **emergency**: +- If the actual number of tenants is very close or already hit the limit, then you can increase the limit via runtime config to gain some time +- Increasing the limit will increase the ingesters memory utilization. Please monitor the ingesters memory utilization via the `Cortex / Writes Resources` dashboard + +How the limit is **configured**: +- The limit can be configured either on CLI (`-ingester.instance-limits.max-tenants`) or in the runtime config: + ``` + ingester_limits: + max_tenants: + ``` +- The mixin configures the limit in the runtime config and can be fine-tuned via `_config+:: { ingester_instance_limits+:: { ... } }` +- When configured in the runtime config, changes are applied live without requiring an ingester restart +- The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_tenants"}` + +How to **fix**: +1. Ensure shuffle-sharding is enabled in the Cortex cluster +1. Assuming shuffle-sharding is enabled, scaling up ingesters will lower the number of tenants per ingester. However, the effect of this change will be visible only after `-blocks-storage.tsdb.close-idle-tsdb-timeout` period so you may have to temporarily increase the limit ### CortexRequestLatency First establish if the alert is for read or write latency. The alert should say. From 70d0bf69589fe62382b3f9fe3b77a1cca7ddaa7d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 12:06:51 +0200 Subject: [PATCH 279/364] Better formatting for ingester_instance_limits+ example Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 163a09df0d4..3c6525139ff 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -38,7 +38,14 @@ How the limit is **configured**: ingester_limits: max_series: ``` -- The mixin configures the limit in the runtime config and can be fine-tuned via `_config+:: { ingester_instance_limits+:: { ... } }` +- The mixin configures the limit in the runtime config and can be fine-tuned via: + ``` + _config+:: { + ingester_instance_limits+:: { + max_series: + } + } + ``` - When configured in the runtime config, changes are applied live without requiring an ingester restart - The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_series"}` @@ -62,7 +69,14 @@ How the limit is **configured**: ingester_limits: max_tenants: ``` -- The mixin configures the limit in the runtime config and can be fine-tuned via `_config+:: { ingester_instance_limits+:: { ... 
} }` +- The mixin configures the limit in the runtime config and can be fine-tuned via: + ``` + _config+:: { + ingester_instance_limits+:: { + max_tenants: + } + } + ``` - When configured in the runtime config, changes are applied live without requiring an ingester restart - The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_tenants"}` From b3fe9d565d15f311d4d9969814eb7c927c2b708e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 12:14:45 +0200 Subject: [PATCH 280/364] Clarify which alerts apply to chunks storage only Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 5f0734a124b..5fc93472c5f 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -349,15 +349,15 @@ _TODO: this playbook has not been written yet._ ### CortexCheckpointCreationFailed -_TODO: this playbook has not been written yet._ +_This alert applies to Cortex chunks storage only._ ### CortexCheckpointDeletionFailed -_TODO: this playbook has not been written yet._ +_This alert applies to Cortex chunks storage only._ ### CortexProvisioningMemcachedTooSmall -_TODO: this playbook has not been written yet._ +_This alert applies to Cortex chunks storage only._ ### CortexProvisioningTooManyActiveSeries From 4e75e0f4a0488a1dea72cb9c9894cc6d7aea0e22 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 12:45:01 +0200 Subject: [PATCH 281/364] Improve compactor alerts and playbooks Signed-off-by: Marco Pracucci --- .../mimir-mixin/alerts/compactor.libsonnet | 30 +++++++++---------- jsonnet/mimir-mixin/docs/playbooks.md | 24 +++++++-------- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index be3de8c0c2c..1f28a7e54a1 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -47,6 +47,19 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', }, }, + { + // Alert if compactor failed to run 2 consecutive compactions. + alert: 'CortexCompactorHasNotSuccessfullyRunCompaction', + expr: ||| + increase(cortex_compactor_runs_failed_total[2h]) >= 2 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} failed to run 2 consecutive compactions.', + }, + }, { // Alert if the compactor has not uploaded anything in the last 24h. alert: 'CortexCompactorHasNotUploadedBlocks', @@ -65,7 +78,7 @@ }, { // Alert if the compactor has not uploaded anything since its start. - alert: 'CortexCompactorHasNotUploadedBlocksSinceStart', + alert: 'CortexCompactorHasNotUploadedBlocks', 'for': '24h', expr: ||| thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} == 0 @@ -77,21 +90,6 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', }, }, - { - // Alert if compactor fails. - alert: 'CortexCompactorRunFailed', - expr: ||| - increase(cortex_compactor_runs_failed_total[2h]) >= 2 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to run compaction. 
- |||, - }, - }, ], }, ], diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index cc3a3ad9283..daf5eaed95d 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -272,11 +272,21 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. How to **investigate**: -- If the alert `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first +- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` have fired as well, then investigate that issue first - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs +### CortexCompactorHasNotSuccessfullyRunCompaction + +This alert fires if the compactor is not able to successfully run a full compaction. + +When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, other blocks compaction is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block. + +How to **investigate**: +- Look for any error in the compactor logs + - Corruption: [`not healthy index found`](#compactor-is-failing-because-of-not-healthy-index-found) + #### Compactor is failing because of `not healthy index found` The compactor may fail to compact blocks due a corrupted block index found in one of the source blocks: @@ -301,18 +311,6 @@ To rename a block stored on GCS you can use the `gsutil` CLI: gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ``` -### CortexCompactorHasNotUploadedBlocksSinceStart - -Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). - -### CortexCompactorHasNotSuccessfullyRunCompaction - -_TODO: this playbook has not been written yet._ - -### CortexCompactorRunFailed - -_TODO: this playbook has not been written yet._ - ### CortexBucketIndexNotUpdated This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. From 02eaf9202c98ac439dd4df8294f4e6fad766ee83 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 15:10:24 +0200 Subject: [PATCH 282/364] Addressed review comments Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index daf5eaed95d..1ad67bfe397 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -272,14 +272,14 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. 
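+
+As a quick first check, the following query sketch shows how many hours have passed since each compactor replica last uploaded a block (the `job` matcher is an assumption — adjust it to how the compactor job is labelled in your cluster):
+
+```
+(time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor"}) / 3600
+```
+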
How to **investigate**: -- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` have fired as well, then investigate that issue first +- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` has fired as well, then investigate that issue first - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs ### CortexCompactorHasNotSuccessfullyRunCompaction -This alert fires if the compactor is not able to successfully run a full compaction. +This alert fires if the compactor is not able to successfully compact all discovered compactable blocks. When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, other blocks compaction is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block. From 7b96c2276ffcfcd26d859ce87223101f7e346c2e Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 15:54:21 +0200 Subject: [PATCH 283/364] Update cortex-mixin/docs/playbooks.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marco Pracucci Co-authored-by: Peter Štibraný --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 1ad67bfe397..704b649282b 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -279,7 +279,7 @@ How to **investigate**: ### CortexCompactorHasNotSuccessfullyRunCompaction -This alert fires if the compactor is not able to successfully compact all discovered compactable blocks. +This alert fires if the compactor is not able to successfully compact all discovered compactable blocks (across all tenants). When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, other blocks compaction is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block. 
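+
+To confirm which compactor replicas are affected, a sketch of a query mirroring the alert expression can help (the `job` matcher is an assumption — adapt it to your deployment; drop the `>= 2` threshold to also see occasional failures):
+
+```
+increase(cortex_compactor_runs_failed_total{job=~".+/compactor"}[2h]) >= 2
+```
+
+Once you know the affected replica, grep its logs for the failing tenant and block ID, as described in the investigation steps above.
+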
From 6f2542d8b366cfede089a025dddf3ad2c37d08f2 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 15:50:27 +0200 Subject: [PATCH 284/364] Fixed and improved runtime config alerts and playbooks Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 21 +++++------------ jsonnet/mimir-mixin/docs/playbooks.md | 26 ++++++++++++++++++--- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 5498fbfd75f..a93ffe05f55 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -92,39 +92,30 @@ }, }, { - alert: 'CortexInconsistentConfig', + alert: 'CortexInconsistentRuntimeConfig', expr: ||| - count(count by(%s, job, sha256) (cortex_config_hash)) without(sha256) > 1 + count(count by(%s, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 ||| % $._config.alert_aggregation_labels, 'for': '1h', labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: ||| - An inconsistent config file hash is used across cluster {{ $labels.job }}. + An inconsistent runtime config file is used across cluster {{ $labels.job }}. |||, }, }, { - // As of https://github.com/cortexproject/cortex/pull/2092, this metric is - // only exposed when it is supposed to be non-zero, so we don't need to do - // any special filtering on the job label. - // The metric itself was renamed in - // https://github.com/cortexproject/cortex/pull/2874 - // - // TODO: Remove deprecated metric name of - // cortex_overrides_last_reload_successful in the future alert: 'CortexBadRuntimeConfig', expr: ||| + # The metric value is reset to 0 on error while reloading the config at runtime. cortex_runtime_config_last_reload_successful == 0 - or - cortex_overrides_last_reload_successful == 0 |||, // Alert quicker for human errors. 'for': '5m', labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: ||| diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 704b649282b..90bf32c5024 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -367,13 +367,33 @@ _TODO: this playbook has not been written yet._ _TODO: this playbook has not been written yet._ -### CortexInconsistentConfig +### CortexInconsistentRuntimeConfig -_TODO: this playbook has not been written yet._ +This alert fires if multiple replicas of the same Cortex service are loading a different runtime config. + +The Cortex runtime config is a config file which gets live reloaded by Cortex at runtime. In order for Cortex to work properly, the loaded config is expected to be the exact same across multiple replicas of the same Cortex service (eg. distributors, ingesters, ...). When the config changes, there may be short periods of time during which some replicas have loaded the new config and others are still running on the previous one, but it shouldn't last for more than few minutes. 
+ +How to **investigate**: +- Check how many different config file versions (hashes) are reported + ``` + count by (sha256) (cortex_runtime_config_hash{namespace=""}) + ``` +- Check which replicas are running a different version + ``` + cortex_runtime_config_hash{namespace="",sha256=""} + ``` +- Check if the runtime config has been updated on the affected replicas' filesystem +- Check the affected replicas logs and look for any error loading the runtime config ### CortexBadRuntimeConfig -_TODO: this playbook has not been written yet._ +This alert fires if Cortex is unable to reload the runtime config. + +This typically means an invalid runtime config was deployed. Cortex keeps running with the previous (valid) version of the runtime config; running Cortex replicas and the system availability shouldn't be affected, but new replicas won't be able to startup until the runtime config is fixed. + +How to **investigate**: +- Check the latest runtime config update (it's likely to be broken) +- Check Cortex logs to get more details about what's wrong with the config ### CortexQuerierCapacityFull From 8984245de749f8d2d4639fccfab203b0ff78b9d1 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Mon, 21 Jun 2021 10:02:15 -0400 Subject: [PATCH 285/364] fix: resolve review feedback --- jsonnet/mimir-mixin/config.libsonnet | 6 ++++++ .../dashboards/compactor.libsonnet | 6 +++--- .../dashboards/dashboard-utils.libsonnet | 3 +-- .../mimir-mixin/dashboards/reads.libsonnet | 21 ++++++++++--------- .../mimir-mixin/dashboards/writes.libsonnet | 11 +++++----- 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index d07c61be22e..3e884292298 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -58,5 +58,11 @@ // The label used to differentiate between different nodes (i.e. servers). per_node_label: 'instance', + + // Whether certain dashboard description headers should be shown + show_dashboard_descriptions: { + writes: true, + reads: true, + }, }, } diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet index 18065547df8..aeb644919f3 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor.libsonnet @@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Tenants compaction progress', ||| In a multi-tenant cluster, display the progress of tenants that are compacted while compaction is running. - Reset to `0` after the compaction run is completed for all tenants in the shard. + Reset to 0 after the compaction run is completed for all tenants in the shard. ||| ), ) @@ -50,7 +50,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Compacted blocks / sec', ||| - Display the amount of time that it’s taken to generate a single compacted block. + Rate of blocks that are generated as a result of a compaction operation. ||| ), ) @@ -60,7 +60,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Per-block compaction duration', ||| - Rate of blocks that are generated as a result of a compaction operation. + Display the amount of time that it has taken to generate a single compacted block. 
||| ), ) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 254619d432d..c965b2656cf 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -336,11 +336,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; jobMatcher: $.jobMatcher(jobName), component: component, cacheName: cacheName, - cacheNameReadable: std.strReplace(cacheName, '-', ' '), }; super.row(title) .addPanel( - $.panel('Requests per second') + + $.panel('Requests / sec') + $.queryPanel( ||| sum by(operation) ( diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index cb2411d448c..965e0e76615 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-reads.json': ($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' }) .addClusterSelectorTemplates() - .addRow( + .addRowIf( + $._config.show_dashboard_descriptions.reads, ($.row('Reads dashboard description') { height: '175px', showTitle: false }) .addPanel( $.textPanel('', ||| @@ -12,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; This dashboard shows health metrics for the Cortex read path. It is broken into sections for each service on the read path, and organized by the order in which the read request flows.
- Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the age of the query). + Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the time range of the query).
For each service, there are 3 panels showing (1) requests per second to that service, (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service.

@@ -42,14 +43,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; cortex_request_duration_seconds_count{ %(queryFrontend)s, route=~"(prometheus|api_prom)_api_v1_query" - }[1h] + }[$__rate_interval] ) ) + sum( rate( cortex_prometheus_rule_evaluations_total{ %(ruler)s - }[1h] + }[$__rate_interval] ) ) ||| % { @@ -73,7 +74,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; cortex_request_duration_seconds_count{ %(queryFrontend)s, route=~"(prometheus|api_prom)_api_v1_query_range" - }[1h] + }[$__rate_interval] ) ) ||| % { @@ -132,7 +133,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; |||

The query scheduler is an optional service that moves - the internal queue from the query frontend into a + the internal queue from the query-frontend into a separate component. If this service is not deployed, these panels will show "No data." @@ -241,7 +242,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Memcached – blocks storage – block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache + $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( $.panel('Requests / sec') + $.queryPanel( @@ -314,7 +315,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – blocks storage – chunks cache (store-gateway accesses)', + 'Memcached – Blocks storage – Chunks cache (store-gateway accesses)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache' @@ -323,7 +324,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – blocks storage – metadata cache (store-gateway accesses)', + 'Memcached – Blocks storage – Metadata cache (store-gateway accesses)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache' @@ -332,7 +333,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRowIf( std.member($._config.storage_engine, 'blocks'), $.thanosMemcachedCache( - 'Memcached – blocks storage – metadata cache (querier accesses)', + 'Memcached – Blocks storage – Metadata cache (querier accesses)', $._config.job_names.querier, 'querier', 'metadata-cache' diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index cf49e0dab56..8a77be1c16d 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-writes.json': ($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' }) .addClusterSelectorTemplates() - .addRow( + .addRowIf( + $._config.show_dashboard_descriptions.writes, ($.row('Writes dashboard description') { height: '125px', showTitle: false }) .addPanel( $.textPanel('', ||| @@ -129,7 +130,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Key-value store for the ingester ring') + $.row('Key-value store for the ingesters ring') .addPanel( $.panel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) @@ -227,7 +228,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Upload latency', ||| The average, median (50th percentile), and 99th percentile time - the ingester takes to upload blocks to object storage. + the ingesters take to upload blocks to object storage. ||| ), ) @@ -256,7 +257,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Compaction latency', ||| - The average, median (50th percentile), and 99th percentile time ingesters take to compact head blocks + The average, median (50th percentile), and 99th percentile time ingesters take to compact TSDB head blocks on the local filesystem. 
||| ), @@ -264,7 +265,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.member($._config.storage_engine, 'blocks'), - $.row('Ingester - blocks storage - TSDB write ahead log (WAL)') + $.row('Ingester - Blocks storage - TSDB write ahead log (WAL)') .addPanel( $.successFailurePanel( 'WAL truncations / sec', From 7a75f25f5446201a1a637d4a83f628880e89cd02 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 22 Jun 2021 11:46:17 +0200 Subject: [PATCH 286/364] Update cortex-mixin/docs/playbooks.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marco Pracucci Co-authored-by: Peter Štibraný --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 90bf32c5024..021538d7bfb 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -369,7 +369,7 @@ _TODO: this playbook has not been written yet._ ### CortexInconsistentRuntimeConfig -This alert fires if multiple replicas of the same Cortex service are loading a different runtime config. +This alert fires if multiple replicas of the same Cortex service are using a different runtime config for a longer period of time. The Cortex runtime config is a config file which gets live reloaded by Cortex at runtime. In order for Cortex to work properly, the loaded config is expected to be the exact same across multiple replicas of the same Cortex service (eg. distributors, ingesters, ...). When the config changes, there may be short periods of time during which some replicas have loaded the new config and others are still running on the previous one, but it shouldn't last for more than few minutes. From 585582d2f83efb16ee556703dda57240ae51b2d4 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 22 Jun 2021 11:46:34 +0200 Subject: [PATCH 287/364] Update cortex-mixin/docs/playbooks.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marco Pracucci Co-authored-by: Peter Štibraný --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 021538d7bfb..2dc2e26b493 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -382,7 +382,7 @@ How to **investigate**: ``` cortex_runtime_config_hash{namespace="",sha256=""} ``` -- Check if the runtime config has been updated on the affected replicas' filesystem +- Check if the runtime config has been updated on the affected replicas' filesystem. Check `-runtime-config.file` command line argument to find the location of the file. 
- Check the affected replicas logs and look for any error loading the runtime config ### CortexBadRuntimeConfig From 6fe9763c8a95325802537daeb976f306c593ab33 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 22 Jun 2021 11:50:42 +0200 Subject: [PATCH 288/364] MarkCortexTableSyncFailure and CortexOldChunkInMemory alerts as chunks storage only Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 2dc2e26b493..3f9228d8af0 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -361,7 +361,7 @@ _TODO: this playbook has not been written yet._ ### CortexTableSyncFailure -_TODO: this playbook has not been written yet._ +_This alert applies to Cortex chunks storage only._ ### CortexQueriesIncorrect @@ -413,7 +413,7 @@ _TODO: this playbook has not been written yet._ ### CortexOldChunkInMemory -_TODO: this playbook has not been written yet._ +_This alert applies to Cortex chunks storage only._ ### CortexCheckpointCreationFailed From c891992752c0119eb11a1779e649c8ed55ae1562 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 22 Jun 2021 12:24:46 +0200 Subject: [PATCH 289/364] Fixed whitespace noise Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- .../dashboards/dashboard-utils.libsonnet | 18 ++++++------- .../mimir-mixin/dashboards/reads.libsonnet | 26 +++++++++---------- .../mimir-mixin/dashboards/writes.libsonnet | 20 +++++++------- jsonnet/mimir-mixin/docs/playbooks.md | 6 ++--- jsonnet/mimir-mixin/groups.libsonnet | 2 +- 6 files changed, 37 insertions(+), 37 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index a93ffe05f55..582aba4b60e 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -599,7 +599,7 @@ container_memory_working_set_bytes{container="etcd"} / container_spec_memory_limit_bytes{container="etcd"} - ) > 0.65 + ) > 0.65 |||, 'for': '15m', labels: { diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index c965b2656cf..c54ae6590a5 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -383,8 +383,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; name="%(cacheName)s" }[$__rate_interval] ) - ) - / + ) + / sum( rate( thanos_cache_memcached_requests_total{ @@ -405,20 +405,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; ignoring(%s) group_right() ( label_replace( count by( - %s, - %s, + %s, + %s, device - ) + ) ( container_fs_writes_bytes_total{ %s, container="%s", device!~".*sda.*" } - ), - "device", - "$1", - "device", + ), + "device", + "$1", + "device", "/dev/(.*)" ) * 0 ) diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/jsonnet/mimir-mixin/dashboards/reads.libsonnet index 965e0e76615..9bc9b7d6b31 100644 --- a/jsonnet/mimir-mixin/dashboards/reads.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads.libsonnet @@ -16,15 +16,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the time range of the query).
For each service, there are 3 panels showing (1) requests per second to that service, (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service. -

+

- The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex: - the query results cache, the metadata cache, the chunks cache, and the index cache. + The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex: + the query results cache, the metadata cache, the chunks cache, and the index cache.
- These panels will show “no data” if the caches are not deployed. + These panels will show “no data” if the caches are not deployed.

- Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage. + Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage.

|||), ) @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; route=~"(prometheus|api_prom)_api_v1_query" }[$__rate_interval] ) - ) + + ) + sum( rate( cortex_prometheus_rule_evaluations_total{ @@ -61,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Instant queries per second', ||| Rate of instant queries per second being made to the system. - Includes both queries made to the /prometheus API as + Includes both queries made to the /prometheus API as well as queries from the ruler. ||| ), @@ -83,8 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Range queries per second', ||| - Rate of range queries per second being made to - Cortex via the /prometheus API. + Rate of range queries per second being made to + Cortex via the /prometheus API. ||| ), ) @@ -135,7 +135,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; The query scheduler is an optional service that moves the internal queue from the query-frontend into a separate component. - If this service is not deployed, + If this service is not deployed, these panels will show "No data."

||| @@ -286,8 +286,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; %s }[$__rate_interval] ) - ) - / + ) + / sum by(item_type) ( rate( thanos_store_index_cache_requests_total{ @@ -307,7 +307,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Hit Ratio', ||| Even if you do not set up memcached for the blocks index cache, you will still see data in this panel because Cortex by default has an - in-memory blocks index cache. + in-memory blocks index cache. ||| ), ) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 8a77be1c16d..e99faee4c4e 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -11,16 +11,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.textPanel('', |||

This dashboard shows various health metrics for the Cortex write path. - It is broken into sections for each service on the write path, + It is broken into sections for each service on the write path, and organized by the order in which the write request flows.
Incoming metrics data travels from the gateway → distributor → ingester.
For each service, there are 3 panels showing - (1) requests per second to that service, - (2) average, median, and p99 latency of requests to that service, and + (1) requests per second to that service, + (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service. -

+

It also includes metrics for the key-value (KV) stores used to manage the high-availability tracker and the ingesters. @@ -216,7 +216,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Uploaded blocks / sec', ||| - The rate of blocks being uploaded from the ingesters + The rate of blocks being uploaded from the ingesters to object storage. ||| ), @@ -227,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Upload latency', ||| - The average, median (50th percentile), and 99th percentile time + The average, median (50th percentile), and 99th percentile time the ingesters take to upload blocks to object storage. ||| ), @@ -247,7 +247,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| Ingesters maintain a local TSDB per-tenant on disk. Each TSDB maintains a head block for each active time series; these blocks get periodically compacted (by default, every 2h). - This panel shows the rate of compaction operations across all TSDBs on all ingesters. + This panel shows the rate of compaction operations across all TSDBs on all ingesters. ||| ), ) @@ -275,7 +275,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'WAL truncations per second', ||| - The WAL is truncated each time a new TSDB block is written. This panel measures the rate of + The WAL is truncated each time a new TSDB block is written. This panel measures the rate of truncations. ||| ), @@ -289,7 +289,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Checkpoints created per second', ||| - Checkpoints are created as part of the WAL truncation process. + Checkpoints are created as part of the WAL truncation process. This metric measures the rate of checkpoint creation. ||| ), @@ -301,7 +301,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'WAL truncations latency (including checkpointing)', ||| - Average time taken to perform a full WAL truncation, + Average time taken to perform a full WAL truncation, including the time taken for the checkpointing to complete. ||| ), diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 2dc2e26b493..292d932af13 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -16,7 +16,7 @@ In events you're looking for things like: ``` 57m Normal NodeControllerEviction Pod Marking for deletion Pod ingester-01 from Node cloud-provider-node-01 37m Normal SuccessfulDelete ReplicaSet (combined from similar events): Deleted pod: ingester-01 -32m Normal NodeNotReady Node Node cloud-provider-node-01 status is now: NodeNotReady +32m Normal NodeNotReady Node Node cloud-provider-node-01 status is now: NodeNotReady 28m Normal DeletingAllPods Node Node cloud-provider-node-01 event: Deleting all Pods from Node cloud-provider-node-01. ``` @@ -313,7 +313,7 @@ gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ### CortexBucketIndexNotUpdated -This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. +This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. 
How to **investigate**: - Ensure the compactor is successfully running @@ -557,7 +557,7 @@ metadata: spec: accessModes: - ReadWriteOnce - capacity: + capacity: storage: 150Gi gcePersistentDisk: fsType: ext4 diff --git a/jsonnet/mimir-mixin/groups.libsonnet b/jsonnet/mimir-mixin/groups.libsonnet index 630766722f4..6d33ea3661d 100644 --- a/jsonnet/mimir-mixin/groups.libsonnet +++ b/jsonnet/mimir-mixin/groups.libsonnet @@ -29,7 +29,7 @@ if alert_aggregation_labels_override != null then std.trace( ||| - Deprecated: _config.alert_aggregation_labels + Deprecated: _config.alert_aggregation_labels This field has been explicitly overridden to "%s". Instead, express the override in terms of _config.cluster_labels. E.g., cluster_labels: %s will automatically convert to "%s". From 447bc0a2b0f141c365493c88f8352cf917dabb7c Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Wed, 16 Jun 2021 19:01:36 -0400 Subject: [PATCH 290/364] refactor: resources dashboard comtainer functions added: - containerDiskWritesPanel - containerDiskReadsPanel - containerDiskSpaceUtilization --- .../alertmanager-resources.libsonnet | 20 +--- .../dashboards/compactor-resources.libsonnet | 28 ++---- .../dashboards/dashboard-utils.libsonnet | 96 +++++++++++++------ .../dashboards/reads-resources.libsonnet | 20 +--- .../dashboards/writes-resources.libsonnet | 20 +--- 5 files changed, 85 insertions(+), 99 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 4c67c1615a0..8a719d52f25 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -52,30 +52,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Disk') .addPanel( - $.panel('Writes') + - $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('alertmanager')], - '{{%s}} - {{device}}' % $._config.per_instance_label - ) + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerDiskWritesPanel('Writes', 'alertmanager'), ) .addPanel( - $.panel('Reads') + - $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('alertmanager')], - '{{%s}} - {{device}}' % $._config.per_instance_label - ) + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerDiskReadsPanel('Reads', 'alertmanager'), ) ) .addRow( $.row('') .addPanel( - $.panel('Disk Space Utilization') + - $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="alertmanager"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + - { yaxes: $.yaxes('percentunit') }, + $.containerDiskSpaceUtilization('Disk Space Utilization', 'alertmanager'), ) ), } diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet index 79a7ac03fa8..82a6bce4f07 100644 --- a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet @@ -28,27 +28,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Disk') .addPanel( - 
$.panel('Disk Writes') + - $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('compactor')], - '{{%s}} - {{device}}' % $._config.per_instance_label - ) + - $.stack + - { yaxes: $.yaxes('Bps') }, - ) - .addPanel( - $.panel('Disk Reads') + - $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('compactor')], - '{{%s}} - {{device}}' % $._config.per_instance_label - ) + - $.stack + - { yaxes: $.yaxes('Bps') }, - ) - .addPanel( - $.panel('Disk Space Utilization') + - $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="compactor"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + - { yaxes: $.yaxes('percentunit') }, + $.containerDiskWritesPanel('Disk Writes', 'compactor'), + ) + .addPanel( + $.containerDiskReadsPanel('Disk Reads', 'compactor'), + ) + .addPanel( + $.containerDiskSpaceUtilization('Disk Space Utilization', 'compactor'), ) ) + { templating+: { diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index c54ae6590a5..0b11db1c336 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -198,6 +198,73 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerNetworkTransmitBytesPanel(instanceName):: $.containerNetworkPanel('Transmit Bandwidth', 'container_network_transmit_bytes_total', instanceName), + containerDiskWritesPanel(title, containerName):: + $.panel(title) + + $.queryPanel( + ||| + sum by(%s, %s, device) ( + rate( + node_disk_written_bytes_total[$__rate_interval] + ) + ) + + + %s + ||| % [ + $._config.per_node_label, + $._config.per_instance_label, + $.filterNodeDiskContainer(containerName), + ], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + + $.stack + + { yaxes: $.yaxes('Bps') }, + + containerDiskReadsPanel(title, containerName):: + $.panel(title) + + $.queryPanel( + ||| + sum by(%s, %s, device) ( + rate( + node_disk_read_bytes_total[$__rate_interval] + ) + ) + %s + ||| % [ + $._config.per_node_label, + $._config.per_instance_label, + $.filterNodeDiskContainer(containerName), + ], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + + $.stack + + { yaxes: $.yaxes('Bps') }, + + containerDiskSpaceUtilization(title, containerName):: + $.panel(title) + + $.queryPanel( + ||| + max by(persistentvolumeclaim) ( + kubelet_volume_stats_used_bytes{%(namespace)s} / + kubelet_volume_stats_capacity_bytes{%(namespace)s} + ) + and + count by(persistentvolumeclaim) ( + kube_persistentvolumeclaim_labels{ + %(namespace)s, + %(label)s + } + ) + ||| % { + namespace: $.namespaceMatcher(), + label: $.containerLabelMatcher(containerName), + }, '{{persistentvolumeclaim}}' + ) + + { yaxes: $.yaxes('percentunit') }, + + containerLabelMatcher(containerName):: + if containerName == 'ingester' + then 'label_name=~"ingester.*"' + else 'label_name="%s"' % containerName, + goHeapInUsePanel(title, jobName):: $.panel(title) + $.queryPanel( @@ -402,33 +469,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; filterNodeDiskContainer(containerName):: ||| - 
ignoring(%s) group_right() ( - label_replace( - count by( - %s, - %s, - device - ) - ( - container_fs_writes_bytes_total{ - %s, - container="%s", - device!~".*sda.*" - } - ), - "device", - "$1", - "device", - "/dev/(.*)" - ) * 0 - ) - ||| % [ - $._config.per_instance_label, - $._config.per_node_label, - $._config.per_instance_label, - $.namespaceMatcher(), - containerName, - ], + ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], panelDescription(title, description):: { description: ||| diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index 697a7fd49da..a1b36272a38 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -103,27 +103,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Disk Writes') + - $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('store-gateway')], - '{{%s}} - {{device}}' % $._config.per_instance_label - ) + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerDiskWritesPanel('Disk Writes', 'store-gateway'), ) .addPanel( - $.panel('Disk Reads') + - $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('store-gateway')], - '{{%s}} - {{device}}' % $._config.per_instance_label - ) + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerDiskReadsPanel('Disk Reads', 'store-gateway'), ) .addPanel( - $.panel('Disk Space Utilization') + - $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name="store-gateway"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + - { yaxes: $.yaxes('percentunit') }, + $.containerDiskSpaceUtilization('Disk Space Utilization', 'store-gateway'), ) ) + { templating+: { diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index f833e406629..85d7f4c48b5 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -56,27 +56,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Disk Writes') + - $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('ingester')], - '{{%s}} - {{device}}' % $._config.per_instance_label - ) + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerDiskWritesPanel('Disk Writes', 'ingester') ) .addPanel( - $.panel('Disk Reads') + - $.queryPanel( - 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('ingester')], - '{{%s}} 
- {{device}}' % $._config.per_instance_label - ) + - $.stack + - { yaxes: $.yaxes('Bps') }, + $.containerDiskReadsPanel('Disk Reads', 'ingester') ) .addPanel( - $.panel('Disk Space Utilization') + - $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name=~"ingester.*"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + - { yaxes: $.yaxes('percentunit') }, + $.containerDiskSpaceUtilization('Disk Space Utilization', 'ingester'), ) ) + { From a77706592555a92fa4bc2225d4f8910e9add3116 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 22 Jun 2021 10:18:06 -0400 Subject: [PATCH 291/364] revert: matching spacing format of main --- .../dashboards/dashboard-utils.libsonnet | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 0b11db1c336..61fc674280f 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -469,8 +469,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; filterNodeDiskContainer(containerName):: ||| - ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) - ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], + ignoring(%s) group_right() ( + label_replace( + count by( + %s, + %s, + device + ) + ( + container_fs_writes_bytes_total{ + %s, + container="%s", + device!~".*sda.*" + } + ), + "device", + "$1", + "device", + "/dev/(.*)" + ) * 0 + ) + ||| % [ + $._config.per_instance_label, + $._config.per_node_label, + $._config.per_instance_label, + $.namespaceMatcher(), + containerName, + ], panelDescription(title, description):: { description: ||| From 4399d9b5e2d0f45525e2d90090d08fb3cc213722 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 22 Jun 2021 10:21:55 -0400 Subject: [PATCH 292/364] lint: white noise --- .../mimir-mixin/dashboards/dashboard-utils.libsonnet | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index 61fc674280f..f8e8cfbe564 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -206,8 +206,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; rate( node_disk_written_bytes_total[$__rate_interval] ) - ) - + + ) + + %s ||| % [ $._config.per_node_label, @@ -243,10 +243,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( ||| max by(persistentvolumeclaim) ( - kubelet_volume_stats_used_bytes{%(namespace)s} / + kubelet_volume_stats_used_bytes{%(namespace)s} / kubelet_volume_stats_capacity_bytes{%(namespace)s} - ) - and + ) + and count by(persistentvolumeclaim) ( kube_persistentvolumeclaim_labels{ %(namespace)s, From a3e9b281c97830df03bac4cba1feca8333a3d8b6 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 23 Jun 2021 10:27:29 +0200 Subject: [PATCH 293/364] Add playbook for CortexRequestErrors and config option to exclude specific routes Signed-off-by: Marco Pracucci --- 
jsonnet/mimir-mixin/alerts/alerts.libsonnet | 21 ++++++++++++++++----- jsonnet/mimir-mixin/config.libsonnet | 3 +++ jsonnet/mimir-mixin/docs/playbooks.md | 17 ++++++++++++----- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 582aba4b60e..11640f7a52f 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -21,11 +21,14 @@ // Note if alert_aggregation_labels is "job", this will repeat the label. But // prometheus seems to tolerate that. expr: ||| - 100 * sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m])) + 100 * sum by (%(group_by)s, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[1m])) / - sum by (%s, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m])) + sum by (%(group_by)s, job, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[1m])) > 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % { + group_by: $._config.alert_aggregation_labels, + excluded_routes: std.join('|', ['ready'] + $._config.alert_excluded_routes), + }, 'for': '15m', labels: { severity: 'critical', @@ -39,10 +42,18 @@ { alert: 'CortexRequestLatency', expr: ||| - %(group_prefix_jobs)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} + %(group_prefix_jobs)s_route:cortex_request_duration_seconds:99quantile{route!~"%(excluded_routes)s"} > %(cortex_p99_latency_threshold_seconds)s - ||| % $._config, + ||| % $._config { + excluded_routes: std.join('|', [ + 'metrics', + '/frontend.Frontend/Process', + 'ready', + '/schedulerpb.SchedulerForFrontend/FrontendLoop', + '/schedulerpb.SchedulerForQuerier/QuerierLoop', + ] + $._config.alert_excluded_routes), + }, 'for': '15m', labels: { severity: 'warning', diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index dacd06ea3df..917ffd51599 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -64,5 +64,8 @@ writes: true, reads: true, }, + + // The routes to exclude from alerts. + alert_excluded_routes: [], }, } diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 9114f564481..4373bad8ac7 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -109,7 +109,18 @@ Right now most of the execution time will be spent in PromQL's innerEval. NB tha ### CortexRequestErrors -_TODO: this playbook has not been written yet._ +This alert fires when the rate of 5xx errors of a specific route is > 1% for some time. + +This alert typically acts as a last resort to detect issues / outages. SLO alerts are expected to trigger earlier: if an **SLO alert** has triggered as well for the same read/write path, then you can ignore this alert and focus on the SLO one. + +How to **investigate**: +- Check for which route the alert fired + - Write path: open the `Cortex / Writes` dashboard + - Read path: open the `Cortex / Reads` dashboard +- Looking at the dashboard you should see in which Cortex service the error originates + - The panels in the dashboard are vertically sorted by the network path (eg. 
on the write path: cortex-gw -> distributor -> ingester) +- If the failing service is going OOM (`OOMKilled`): scale up or increase the memory +- If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there ### CortexTransferFailed This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n describe pod ` @@ -355,10 +366,6 @@ WAL corruptions are only detected at startups, so at this point the WAL/Checkpoi 2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. 3. Equal or more than the replication factor: Then there is definitely some data loss. -### CortexRequestErrors - -_TODO: this playbook has not been written yet._ - ### CortexTableSyncFailure _This alert applies to Cortex chunks storage only._ From c090874efc8548dd9e785c43a3c5500145a9b8fe Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Fri, 25 Jun 2021 17:02:13 +0000 Subject: [PATCH 294/364] Change min-step to 15s to show better detail. $__rate_interval will be floored at 4x this quantity, so 15s lets us see faster transients than the previous value of 1m. Signed-off-by: Bryan Boreham --- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index f8e8cfbe564..fe467f5b9c6 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -84,7 +84,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.queryPanel(queries, legends, legendLink) + { targets: [ target { - interval: '1m', + interval: '15s', } for target in super.targets ], @@ -104,7 +104,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.qpsPanel(selector) + { targets: [ target { - interval: '1m', + interval: '15s', } for target in super.targets ], @@ -114,7 +114,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; super.latencyPanel(metricName, selector, multiplier) + { targets: [ target { - interval: '1m', + interval: '15s', } for target in super.targets ], From 18c3cb96d086a7e68a527e8c87839f06ec36f2d9 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 1 Jul 2021 16:05:04 +0200 Subject: [PATCH 295/364] Added playbook for CortexFrontendQueriesStuck and CortexSchedulerQueriesStuck Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 4373bad8ac7..99afc0a53cd 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -408,11 +408,31 @@ _TODO: this playbook has not been written yet._ ### CortexFrontendQueriesStuck -_TODO: this playbook has not been written yet._ +This alert fires if Cortex is running without query-scheduler and queries are piling up in 
the query-frontend queue. + +The procedure to investigate it is the same as the one for [`CortexSchedulerQueriesStuck`](#CortexSchedulerQueriesStuck): please see the other playbook for more details. ### CortexSchedulerQueriesStuck -_TODO: this playbook has not been written yet._ +This alert fires if Cortex is queries are piling up in the query-scheduler. + +How it **works**: +- A query-frontend API endpoint is called to execute a query +- The query-frontend enqueues the request to the query-scheduler +- The query-scheduler is responsible to dispatch enqueued queries to idle querier workers +- The querier runs the query, sends the response back directly to the query-frontend and notifies the query-scheduler + +How to **investigate**: +- Are queriers in a crash loop (eg. OOMKilled)? + - `OOMKilled`: temporarily increase queriers memory request/limit + - `panic`: look for the stack trace in the logs and investigate from there +- Is QPS increased? + - Scale up queriers to satisfy the increased workload +- Is query latency increased? + - An increased latency reduces the number of queries we can run / sec: once all workers are busy, new queries will pile up in the queue + - Temporarily scale up queriers to try to stop the bleed + - Check the `Cortex / Slow Queries` dashboard to see if a specific tenant is running heavy queries + - If it's a multi-tenant Cortex cluster and shuffle-sharing is disabled for queriers, you may consider to enable it only for that specific tenant to reduce its blast radius. To enable queriers shuffle-sharding for a single tenant you need to set the `max_queriers_per_tenant` limit override for the specific tenant (the value should be set to the number of queriers assigned to the tenant). ### CortexCacheRequestErrors From 38aabca1c61f0ec658ebe5e09a50196276328841 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 1 Jul 2021 16:39:54 +0200 Subject: [PATCH 296/364] Remove CortexQuerierCapacityFull alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 15 --------------- jsonnet/mimir-mixin/docs/playbooks.md | 4 ---- 2 files changed, 19 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 11640f7a52f..0175fd16509 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -134,21 +134,6 @@ |||, }, }, - { - alert: 'CortexQuerierCapacityFull', - expr: ||| - prometheus_engine_queries_concurrent_max{job=~".+/(cortex|ruler|querier)"} - prometheus_engine_queries{job=~".+/(cortex|ruler|querier)"} == 0 - |||, - 'for': '5m', // We don't want to block for longer. - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} is at capacity processing queries. 
- |||, - }, - }, { alert: 'CortexFrontendQueriesStuck', expr: ||| diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 4373bad8ac7..a3d52981dd8 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -402,10 +402,6 @@ How to **investigate**: - Check the latest runtime config update (it's likely to be broken) - Check Cortex logs to get more details about what's wrong with the config -### CortexQuerierCapacityFull - -_TODO: this playbook has not been written yet._ - ### CortexFrontendQueriesStuck _TODO: this playbook has not been written yet._ From 53adf94a16228b32e32b9176314c16ef0fba0b16 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 1 Jul 2021 17:38:55 +0200 Subject: [PATCH 297/364] Added playbook for CortexProvisioningTooManyWrites Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 4373bad8ac7..825fa1c67dc 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -440,7 +440,14 @@ _TODO: this playbook has not been written yet._ ### CortexProvisioningTooManyWrites -_TODO: this playbook has not been written yet._ +This alert fires if the average number of samples ingested / sec in ingesters is above our target. + +How to **fix**: +- Scale up ingesters + - To compute the desired number of ingesters to satisfy the average samples rate you can run the following query, replacing `` with the namespace to analyse and `` with the target number of samples/sec per ingester (check out the alert threshold to see the current target): + ``` + sum(rate(cortex_ingester_ingested_samples_total{namespace=""}[$__rate_interval])) / ( * 0.9) + ``` ### CortexAllocatingTooMuchMemory From bc50863de1178f6811853b61c4a8512820a431d5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 12:20:19 +0200 Subject: [PATCH 298/364] Added playbook for CortexAllocatingTooMuchMemory Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 6 +++--- jsonnet/mimir-mixin/docs/playbooks.md | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 11640f7a52f..befc83bf86c 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -479,7 +479,7 @@ }, annotations: { message: ||| - High QPS for ingesters, add more ingesters. + Ingesters in {{ $labels.namespace }} have an high samples/sec rate. |||, }, }, @@ -498,7 +498,7 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - add more ingesters. + Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory. |||, }, }, @@ -517,7 +517,7 @@ }, annotations: { message: ||| - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - add more ingesters. + Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory. 
|||, }, }, diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 825fa1c67dc..21f143fd72e 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -451,7 +451,25 @@ How to **fix**: ### CortexAllocatingTooMuchMemory -_TODO: this playbook has not been written yet._ +This alert fires when an ingester memory utilization is getting closer to the limit. + +How it **works**: +- Cortex ingesters are a stateful service +- Having 2+ ingesters `OOMKilled` may cause a cluster outage +- Ingester memory baseline usage is primarily influenced by memory allocated by the process (mostly go heap) and mmap-ed files (used by TSDB) +- Ingester memory short spikes are primarily influenced by queries +- A pod gets `OOMKilled` once it's working set memory reaches the configured limit, so it's important to prevent ingesters memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) + +How to **fix**: +- Check if the issue occurs only for few ingesters. If so: + - Restart affected ingesters 1 by 1 (proceed with the next one once the previous pod has restarted and it's Ready) + ``` + kubectl -n delete pod ingester-XXX + ``` + - Restarting an ingester typically reduces the memory allocated by mmap-ed files. Such memory could be reallocated again, but may let you gain more time while working on a longer term solution +- Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (1.5M). If so: + - Scale up ingesters + - Memory is expected to be reclaimed at the next TSDB head compaction (occurring every 2h) ### CortexGossipMembersMismatch From b592c8b456bf68e9cddbf215923e38e315ddddfd Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 13:34:13 +0200 Subject: [PATCH 299/364] Address review feedback Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- jsonnet/mimir-mixin/docs/playbooks.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index befc83bf86c..71655505973 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -479,7 +479,7 @@ }, annotations: { message: ||| - Ingesters in {{ $labels.namespace }} have an high samples/sec rate. + Ingesters in {{ $labels.namespace }} ingest too many samples per second. 
|||, }, }, diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 21f143fd72e..dc5058529b0 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -457,8 +457,8 @@ How it **works**: - Cortex ingesters are a stateful service - Having 2+ ingesters `OOMKilled` may cause a cluster outage - Ingester memory baseline usage is primarily influenced by memory allocated by the process (mostly go heap) and mmap-ed files (used by TSDB) -- Ingester memory short spikes are primarily influenced by queries -- A pod gets `OOMKilled` once it's working set memory reaches the configured limit, so it's important to prevent ingesters memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) +- Ingester memory short spikes are primarily influenced by queries and TSDB head compaction into new blocks (occurring every 2h) +- A pod gets `OOMKilled` once its working set memory reaches the configured limit, so it's important to prevent ingesters memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) How to **fix**: - Check if the issue occurs only for few ingesters. If so: @@ -466,7 +466,7 @@ How to **fix**: ``` kubectl -n delete pod ingester-XXX ``` - - Restarting an ingester typically reduces the memory allocated by mmap-ed files. Such memory could be reallocated again, but may let you gain more time while working on a longer term solution + - Restarting an ingester typically reduces the memory allocated by mmap-ed files. After the restart, ingester may allocate this memory again over time, but it may give more time while working on a longer term solution - Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (1.5M). If so: - Scale up ingesters - Memory is expected to be reclaimed at the next TSDB head compaction (occurring every 2h) From 9d80934b3f380cf3adc8e69dbd3592ca7b371dbd Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 14:07:37 +0200 Subject: [PATCH 300/364] Replaced CortexCacheRequestErrors with CortexMemcachedRequestErrors Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 14 +++++------ jsonnet/mimir-mixin/docs/playbooks.md | 28 +++++++++++++++++++-- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 71655505973..ad24ac8ebf1 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -180,20 +180,20 @@ }, }, { - alert: 'CortexCacheRequestErrors', + alert: 'CortexMemcachedRequestErrors', expr: ||| - 100 * sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{status_code=~"5.."}[1m])) - / - sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count[1m])) - > 1 + ( + sum by(%s, name, operation) (rate(thanos_memcached_operation_failures_total[1m])) / + sum by(%s, name, operation) (rate(thanos_memcached_operations_total[1m])) + ) * 100 > 5 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '15m', + 'for': '5m', labels: { severity: 'warning', }, annotations: { message: ||| - Cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. 
+ Memcached {{ $labels.name }} used by Cortex in {{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. |||, }, }, diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index dc5058529b0..5c4cbd4336d 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -414,9 +414,33 @@ _TODO: this playbook has not been written yet._ _TODO: this playbook has not been written yet._ -### CortexCacheRequestErrors +### CortexMemcachedRequestErrors -_TODO: this playbook has not been written yet._ +This alert fires if Cortex memcached client is experiencing an high error rate for a specific cache and operation. + +How to **investigate**: +- The alert reports which cache is experiencing issue + - `metadata-cache`: object store metadata cache + - `index-cache`: TSDB index cache + - `chunks-cache`: TSDB chunks cache +- Check which specific error is occurring + - Run the following query to find out the reason (replace `` with the actual Cortex cluster namespace) + ``` + sum by(name, operation, reason) (rate(thanos_memcached_operation_failures_total{namespace=""}[1m])) > 0 + ``` +- Based on the **`reason`**: + - `timeout` + - Scale up the memcached replicas + - `server-error` + - Check both Cortex and memcached logs to find more details + - `network-error` + - Check Cortex logs to find more details + - `malformed-key` + - The key is too long or contains invalid characters + - Check Cortex logs to find the offending key + - Fixing this will require changes to the application code + - `other` + - Check both Cortex and memcached logs to find more details ### CortexOldChunkInMemory From f08854b7b3c3577dada15cb3e15a35e8bba42896 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 2 Jul 2021 14:14:05 +0200 Subject: [PATCH 301/364] Replace ruler alerts, and add playbooks. --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 30 +++++++++++++++++---- jsonnet/mimir-mixin/docs/playbooks.md | 21 +++++++++++++-- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 71655505973..cb79eb377be 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -527,12 +527,32 @@ name: 'ruler_alerts', rules: [ { - alert: 'CortexRulerFailedEvaluations', + alert: 'CortexRulerTooManyFailedPushes', expr: ||| - sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_evaluation_failures_total[1m])) + 100 * ( + sum by (%s, instance) (rate(cortex_ruler_write_requests_failed_total[1m])) / - sum by (%s, instance, rule_group) (rate(cortex_prometheus_rule_evaluations_total[1m])) - > 0.01 + sum by (%s, instance) (rate(cortex_ruler_write_requests_total[1m])) + ) > 1 + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write errors. 
+ |||, + }, + }, + { + alert: 'CortexRulerTooManyFailedQueries', + expr: ||| + 100 * ( + sum by (%s, instance) (rate(cortex_ruler_queries_failed_total[1m])) + / + sum by (%s, instance) (rate(cortex_ruler_queries_total[1m])) + ) > 1 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', labels: { @@ -540,7 +560,7 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors for the rule group {{ $labels.rule_group }}. + Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write errors. |||, }, }, diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index dc5058529b0..4d5ea87489d 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -144,9 +144,26 @@ More information: This alert occurs when a ruler is unable to validate whether or not it should claim ownership over the evaluation of a rule group. The most likely cause is that one of the rule ring entries is unhealthy. If this is the case proceed to the ring admin http page and forget the unhealth ruler. The other possible cause would be an error returned the ring client. If this is the case look into debugging the ring based on the in-use backend implementation. -### CortexRulerFailedEvaluations +### CortexRulerTooManyFailedPushes -_TODO: this playbook has not been written yet._ +This alert fires when rulers cannot push new samples (result of rule evaluation) to ingesters. + +In general, pushing samples can fail due to problems with Cortex operations (eg. too many ingesters have crashed, and ruler cannot write samples to them), or due to problems with resulting data (eg. user hitting limit for number of series, out of order samples, etc.). +This alert fires only for first kind of problems, and not for problems caused by limits or invalid rules. + +How to **fix**: +- Investigate the ruler logs to find out the reason why ruler cannot write samples. + +### CortexRulerTooManyFailedQueries + +This alert fires when rulers fail to evaluate rule queries. + +Each rule evaluation may fail due to many reasons, eg. due to invalid PromQL expression, or query hits limits on number of chunks. These are "user errors", and this alert ignores them. + +There is a category of errors that is more important: errors due to failure to read data from store-gateways or ingesters. These errors would result in 500 when run from querier. This alert fires if there is too many of such failures. + +How to **fix**: +- Investigate the ruler logs to find out the reason why ruler cannot evaluate queries. Note that rule logs rule evaluation errors even for "user errors", but those are not causing the alert to fire. Focus on problems with ingesters or store-gateways. 
### CortexRulerMissedEvaluations From 1ba6047c98fad2a80139dfc0111ac5b52a51a3e2 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 14:16:04 +0200 Subject: [PATCH 302/364] Addressed review comments Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 99afc0a53cd..acfc1ff443d 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -414,13 +414,13 @@ The procedure to investigate it is the same as the one for [`CortexSchedulerQuer ### CortexSchedulerQueriesStuck -This alert fires if Cortex is queries are piling up in the query-scheduler. +This alert fires if queries are piling up in the query-scheduler. How it **works**: - A query-frontend API endpoint is called to execute a query - The query-frontend enqueues the request to the query-scheduler -- The query-scheduler is responsible to dispatch enqueued queries to idle querier workers -- The querier runs the query, sends the response back directly to the query-frontend and notifies the query-scheduler +- The query-scheduler is responsible for dispatching enqueued queries to idle querier workers +- The querier runs the query, sends the response back directly to the query-frontend and notifies the query-scheduler that it can process another query How to **investigate**: - Are queriers in a crash loop (eg. OOMKilled)? @@ -431,8 +431,11 @@ How to **investigate**: - Is query latency increased? - An increased latency reduces the number of queries we can run / sec: once all workers are busy, new queries will pile up in the queue - Temporarily scale up queriers to try to stop the bleed - - Check the `Cortex / Slow Queries` dashboard to see if a specific tenant is running heavy queries - - If it's a multi-tenant Cortex cluster and shuffle-sharing is disabled for queriers, you may consider to enable it only for that specific tenant to reduce its blast radius. To enable queriers shuffle-sharding for a single tenant you need to set the `max_queriers_per_tenant` limit override for the specific tenant (the value should be set to the number of queriers assigned to the tenant). + - Check if a specific tenant is running heavy queries + - Run `sum by (user) (cortex_query_scheduler_queue_length{namespace=""}) > 0` to find tenants with enqueued queries + - Check the `Cortex / Slow Queries` dashboard to find slow queries + - On multi-tenant Cortex cluster with **shuffle-sharing for queriers disabled**, you may consider to enable it for that specific tenant to reduce its blast radius. To enable queriers shuffle-sharding for a single tenant you need to set the `max_queriers_per_tenant` limit override for the specific tenant (the value should be set to the number of queriers assigned to the tenant). + - On multi-tenant Cortex cluster with **shuffle-sharding for queriers enabled**, you may consider to temporarily increase the shard size for affected tenants: be aware that this could affect other tenants too, reducing resources available to run other tenant queries. Alternatively, you may choose to do nothing and let Cortex return errors for that given user once the per-tenant queue is full. ### CortexCacheRequestErrors From 357876b4538c4f33d2572040aff13aad030ba07c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 2 Jul 2021 14:25:55 +0200 Subject: [PATCH 303/364] Fix white space. 
--- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 4d5ea87489d..2135d21f0c8 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -163,7 +163,7 @@ Each rule evaluation may fail due to many reasons, eg. due to invalid PromQL exp There is a category of errors that is more important: errors due to failure to read data from store-gateways or ingesters. These errors would result in 500 when run from querier. This alert fires if there is too many of such failures. How to **fix**: -- Investigate the ruler logs to find out the reason why ruler cannot evaluate queries. Note that rule logs rule evaluation errors even for "user errors", but those are not causing the alert to fire. Focus on problems with ingesters or store-gateways. +- Investigate the ruler logs to find out the reason why ruler cannot evaluate queries. Note that rule logs rule evaluation errors even for "user errors", but those are not causing the alert to fire. Focus on problems with ingesters or store-gateways. ### CortexRulerMissedEvaluations From a1465fb99ded0a854d53bac024e4b5c3466acbc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 2 Jul 2021 14:27:28 +0200 Subject: [PATCH 304/364] Better alert messages. --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index cb79eb377be..97f1f5948fa 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -541,7 +541,7 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write errors. + Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. |||, }, }, @@ -560,7 +560,7 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write errors. + Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. |||, }, }, From 027e65424c5d0334c769655d0e24ec4f66beb21b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 14:46:37 +0200 Subject: [PATCH 305/364] Improve CortexIngesterReachingSeriesLimit playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 41b5354b947..96a1be15eaa 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -50,10 +50,12 @@ How the limit is **configured**: - The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_series"}` How to **fix**: +1. **Temporarily increase the limit**
+   If the actual number of series is very close or already hit the limit, or if you foresee the ingester will hit the limit before dropping the stale series as effect of the scale up, you should also temporarily increase the limit.
+1. **Check if shuffle-sharding shard size is correct**
+   When shuffle-sharding is enabled, we target to 100K series / tenant / ingester. You can run `avg by (user) (cortex_ingester_memory_series_created_total{namespace=""} - cortex_ingester_memory_series_removed_total{namespace=""}) > 100000` to find out tenants with > 100K series / ingester. You may want to increase the shard size for these tenants.
 1. **Scale up ingesters**
    Scaling up ingesters will lower the number of series per ingester. However, the effect of this change will take up to 4h, because after the scale up we need to wait until all stale series are dropped from memory as the effect of TSDB head compaction, which could take up to 4h (with the default config, TSDB keeps in-memory series up to 3h old and it gets compacted every 2h).
-2. **Temporarily increase the limit**
- If the actual number of series is very close or already hit the limit, or if you foresee the ingester will hit the limit before dropping the stale series as effect of the scale up, you should also temporarily increase the limit. ### CortexIngesterReachingTenantsLimit From fb98b9a812549de4bd8263e47e74dea6478548aa Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 15:19:58 +0200 Subject: [PATCH 306/364] Add playbook for CortexProvisioningTooManyActiveSeries Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 10 +++++----- jsonnet/mimir-mixin/docs/playbooks.md | 11 ++++++++++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 1ae6f40b9ce..a491cc138df 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -436,19 +436,19 @@ }, { alert: 'CortexProvisioningTooManyActiveSeries', - // 1.5 million active series per ingester max. + // We target each ingester to 1.5M in-memory series. This alert fires if the average + // number of series / ingester in a Cortex cluster is > 1.6M for 2h (we compact + // the TSDB head every 2h). expr: ||| avg by (%s) (cortex_ingester_memory_series) > 1.6e6 - and - sum by (%s) (rate(cortex_ingester_received_chunks[1h])) == 0 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '1h', + 'for': '2h', labels: { severity: 'warning', }, annotations: { message: ||| - Too many active series for ingesters, add more ingesters. + The number of in-memory series per ingester in {{ $labels.namespace }} is too high. |||, }, }, diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 96a1be15eaa..fd244d6d72d 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -457,7 +457,16 @@ _This alert applies to Cortex chunks storage only._ ### CortexProvisioningTooManyActiveSeries -_TODO: this playbook has not been written yet._ +This alert fires if the average number of in-memory series per ingester is above our target (1.5M). + +How to **fix**: +- Scale up ingesters + - To find out the Cortex clusters where ingesters should be scaled up and how many minimum replicas are expected: + ``` + ceil(sum by(cluster, namespace) (cortex_ingester_memory_series) / 1.5e6) > + count by(cluster, namespace) (cortex_ingester_memory_series) + ``` +- After the scale up, the in-memory series are expected to be reduced at the next TSDB head compaction (occurring every 2h) ### CortexProvisioningTooManyWrites From 7f33efb199ae7c6db6bdf730ec215fa3f6f0e0c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Fri, 2 Jul 2021 15:26:45 +0200 Subject: [PATCH 307/364] Improve messaging. --- jsonnet/mimir-mixin/docs/playbooks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 2135d21f0c8..c9e61d4f19d 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -152,7 +152,7 @@ In general, pushing samples can fail due to problems with Cortex operations (eg. This alert fires only for first kind of problems, and not for problems caused by limits or invalid rules. How to **fix**: -- Investigate the ruler logs to find out the reason why ruler cannot write samples. 
+- Investigate the ruler logs to find out the reason why ruler cannot write samples. Note that ruler logs all push errors, including "user errors", but those are not causing the alert to fire. Focus on problems with ingesters. ### CortexRulerTooManyFailedQueries @@ -163,7 +163,7 @@ Each rule evaluation may fail due to many reasons, eg. due to invalid PromQL exp There is a category of errors that is more important: errors due to failure to read data from store-gateways or ingesters. These errors would result in 500 when run from querier. This alert fires if there is too many of such failures. How to **fix**: -- Investigate the ruler logs to find out the reason why ruler cannot evaluate queries. Note that rule logs rule evaluation errors even for "user errors", but those are not causing the alert to fire. Focus on problems with ingesters or store-gateways. +- Investigate the ruler logs to find out the reason why ruler cannot evaluate queries. Note that ruler logs rule evaluation errors even for "user errors", but those are not causing the alert to fire. Focus on problems with ingesters or store-gateways. ### CortexRulerMissedEvaluations From 9de29642659d161ab6f04725ea2d3d3d94104a0a Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 15:31:06 +0200 Subject: [PATCH 308/364] Fixed formatting Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index a491cc138df..4c8fad8643d 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -441,7 +441,7 @@ // the TSDB head every 2h). expr: ||| avg by (%s) (cortex_ingester_memory_series) > 1.6e6 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + ||| % [$._config.alert_aggregation_labels], 'for': '2h', labels: { severity: 'warning', From 66e36d835c2768502648d8d7dca64538fc7c573c Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 16:58:38 +0200 Subject: [PATCH 309/364] Improved alert messages with Cortex cluster Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 64 +++++++++---------- jsonnet/mimir-mixin/alerts/blocks.libsonnet | 62 +++++++++--------- .../mimir-mixin/alerts/compactor.libsonnet | 12 ++-- jsonnet/mimir-mixin/groups.libsonnet | 17 +++++ 4 files changed, 86 insertions(+), 69 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index a0e286ccea8..9eefe7f822e 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -13,7 +13,7 @@ severity: 'critical', }, annotations: { - message: 'There are {{ printf "%f" $value }} unhealthy ingester(s).', + message: 'Cortex cluster %(alert_aggregation_variables)s has {{ printf "%%f" $value }} unhealthy ingester(s).' % $._config, }, }, { @@ -35,8 +35,8 @@ }, annotations: { message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, + The route {{ $labels.route }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors. + ||| % $._config, }, }, { @@ -98,8 +98,8 @@ }, annotations: { message: ||| - Incorrect results for {{ printf "%.2f" $value }}% of queries. - |||, + The Cortex cluster %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% incorrect query results. 
+ ||| % $._config, }, }, { @@ -113,8 +113,8 @@ }, annotations: { message: ||| - An inconsistent runtime config file is used across cluster {{ $labels.job }}. - |||, + An inconsistent runtime config file is used across cluster %(alert_aggregation_variables)s. + ||| % $._config, }, }, { @@ -145,8 +145,8 @@ }, annotations: { message: ||| - There are {{ $value }} queued up queries in query-frontend. - |||, + There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-frontend. + ||| % $._config, }, }, { @@ -160,8 +160,8 @@ }, annotations: { message: ||| - There are {{ $value }} queued up queries in query-scheduler. - |||, + There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-scheduler. + ||| % $._config, }, }, { @@ -178,8 +178,8 @@ }, annotations: { message: ||| - Memcached {{ $labels.name }} used by Cortex in {{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. - |||, + Memcached {{ $labels.name }} used by Cortex %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors for {{ $labels.operation }} operation. + ||| % $._config, }, }, { @@ -430,8 +430,8 @@ }, annotations: { message: ||| - Chunk memcached cluster is too small, should be at least {{ printf "%.2f" $value }}GB. - |||, + Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB. + ||| % $._config, }, }, { @@ -448,8 +448,8 @@ }, annotations: { message: ||| - The number of in-memory series per ingester in {{ $labels.namespace }} is too high. - |||, + The number of in-memory series per ingester in %(alert_aggregation_variables)s is too high. + ||| % $._config, }, }, { @@ -464,8 +464,8 @@ }, annotations: { message: ||| - Ingesters in {{ $labels.namespace }} ingest too many samples per second. - |||, + Ingesters in %(alert_aggregation_variables)s ingest too many samples per second. + ||| % $._config, }, }, { @@ -483,8 +483,8 @@ }, annotations: { message: ||| - Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory. - |||, + Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory. + ||| % $._config, }, }, { @@ -502,8 +502,8 @@ }, annotations: { message: ||| - Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory. - |||, + Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory. + ||| % $._config, }, }, ], @@ -526,8 +526,8 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. - |||, + Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% write (push) errors. + ||| % $._config, }, }, { @@ -545,8 +545,8 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. - |||, + Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors while evaluating rules. + ||| % $._config, }, }, { @@ -563,8 +563,8 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. 
- |||, + Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% missed iterations for the rule group {{ $labels.rule_group }}. + ||| % $._config, }, }, { @@ -579,8 +579,8 @@ }, annotations: { message: ||| - Cortex Rulers {{ $labels.job }} are experiencing errors when checking the ring for rule group ownership. - |||, + Cortex Rulers in %(alert_aggregation_variables)s are experiencing errors when checking the ring for rule group ownership. + ||| % $._config, }, }, ], @@ -600,7 +600,7 @@ severity: 'warning', }, annotations: { - message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.', + message: 'Cortex instance {{ $labels.instance }} in %(alert_aggregation_variables)s sees incorrect number of gossip members.' % $._config, }, }, ], diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/jsonnet/mimir-mixin/alerts/blocks.libsonnet index d1157f38438..a60ac2da263 100644 --- a/jsonnet/mimir-mixin/alerts/blocks.libsonnet +++ b/jsonnet/mimir-mixin/alerts/blocks.libsonnet @@ -9,24 +9,24 @@ alert: 'CortexIngesterHasNotShippedBlocks', 'for': '15m', expr: ||| - (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4) + (min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4) and - (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0) + (max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0) and # Only if the ingester has ingested samples over the last 4h. - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) and # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance # had ingested samples in the past, then no traffic was received for a long period and then it starts # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving # samples, while the a block shipping is expected within the next 4h. - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0) - |||, + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0) + ||| % $._config, labels: { severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' 
% $._config, }, }, { @@ -35,15 +35,15 @@ alert: 'CortexIngesterHasNotShippedBlocksSinceStart', 'for': '4h', expr: ||| - (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0) + (max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0) and - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) - |||, + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) + ||| % $._config, labels: { severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config, }, }, { @@ -61,7 +61,7 @@ severity: 'critical', }, annotations: { - message: "Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.", + message: "Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config, }, }, { @@ -77,7 +77,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config, }, }, { @@ -89,7 +89,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB head.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config, }, }, { @@ -101,7 +101,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to create TSDB checkpoint.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' % $._config, }, }, { @@ -113,7 +113,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete TSDB checkpoint.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' % $._config, }, }, { @@ -125,7 +125,7 @@ severity: 'warning', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB WAL.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config, }, }, { @@ -137,7 +137,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} got a corrupted TSDB WAL.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, }, }, { @@ -150,7 +150,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to write to TSDB WAL.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' 
% $._config, }, }, { @@ -166,7 +166,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.', + message: 'Cortex Querier {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' % $._config, }, }, { @@ -177,20 +177,20 @@ expr: ||| 100 * ( ( - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) - - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) ) / - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) ) > 1 - |||, + ||| % $._config, labels: { severity: 'warning', }, annotations: { - message: 'Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.', + message: 'Cortex Queries in %(alert_aggregation_variables)s are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%%.0f" $value }}%% of queries.' % $._config, }, }, { @@ -206,20 +206,20 @@ severity: 'critical', }, annotations: { - message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.', + message: 'Cortex Store Gateway {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config, }, }, { // Alert if the bucket index has not been updated for a given user. alert: 'CortexBucketIndexNotUpdated', expr: ||| - min by(namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 - |||, + min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + ||| % $._config, labels: { severity: 'critical', }, annotations: { - message: 'Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}.', + message: 'Cortex bucket index for tenant {{ $labels.user }} in %(alert_aggregation_variables)s has not been updated since {{ $value | humanizeDuration }}.' % $._config, }, }, { @@ -227,13 +227,13 @@ alert: 'CortexTenantHasPartialBlocks', 'for': '6h', expr: ||| - max by(namespace, user) (cortex_bucket_blocks_partials_count) > 0 - |||, + max by(%(alert_aggregation_labels)s, user) (cortex_bucket_blocks_partials_count) > 0 + ||| % $._config, labels: { severity: 'warning', }, annotations: { - message: 'Cortex tenant {{ $labels.user }} in {{ $labels.namespace }} has {{ $value }} partial blocks.', + message: 'Cortex tenant {{ $labels.user }} in %(alert_aggregation_variables)s has {{ $value }} partial blocks.' 
% $._config, }, }, ], diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/jsonnet/mimir-mixin/alerts/compactor.libsonnet index 1f28a7e54a1..5538545e249 100644 --- a/jsonnet/mimir-mixin/alerts/compactor.libsonnet +++ b/jsonnet/mimir-mixin/alerts/compactor.libsonnet @@ -14,7 +14,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 6 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully cleaned up blocks in the last 6 hours.' % $._config, }, }, { @@ -30,7 +30,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not run compaction in the last 24 hours.' % $._config, }, }, { @@ -44,7 +44,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not run compaction in the last 24 hours.' % $._config, }, }, { @@ -57,7 +57,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} failed to run 2 consecutive compactions.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s failed to run 2 consecutive compactions.' % $._config, }, }, { @@ -73,7 +73,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not uploaded any block in the last 24 hours.' % $._config, }, }, { @@ -87,7 +87,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not uploaded any block in the last 24 hours.' % $._config, }, }, ], diff --git a/jsonnet/mimir-mixin/groups.libsonnet b/jsonnet/mimir-mixin/groups.libsonnet index 6d33ea3661d..c2c35f90d21 100644 --- a/jsonnet/mimir-mixin/groups.libsonnet +++ b/jsonnet/mimir-mixin/groups.libsonnet @@ -41,5 +41,22 @@ alert_aggregation_labels_override ) else group_by_cluster, + + // This field contains contains the Prometheus template variables that should + // be used to display values of the configured "group_by_cluster" (or the + // deprecated "alert_aggregation_labels"). + alert_aggregation_variables: + std.join( + '/', + // Generate the variable replacement for each label. + std.map( + function(l) '{{ $labels.%s }}' % l, + // Split the configured labels by comma and remove whitespaces. 
+ std.map( + function(l) std.strReplace(l, ' ', ''), + std.split($._config.alert_aggregation_labels, ',') + ), + ), + ), }, } From c0625cfdd9b35c912eb2c591d0e825ab82f24a5b Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 5 Jul 2021 10:57:45 +0200 Subject: [PATCH 310/364] Improved CortexRequestLatency playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 54 +++++++++++++++++++-------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 093ff99d75f..2630e1ff4c7 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -87,33 +87,57 @@ How to **fix**: 1. Assuming shuffle-sharding is enabled, scaling up ingesters will lower the number of tenants per ingester. However, the effect of this change will be visible only after `-blocks-storage.tsdb.close-idle-tsdb-timeout` period so you may have to temporarily increase the limit ### CortexRequestLatency -First establish if the alert is for read or write latency. The alert should say. -#### Write Latency -Using the Cortex write dashboard, find the cluster which reported the high write latency and deduce where in the stack the latency is being introduced: - -distributor: It is quite normal for the distributor P99 latency to be 50-100ms, and for the ingesters to be ~5ms. If the distributor latency is higher than this, you may need to scale up the distributors. If there is a high error rate being introduced at the distributors (400s or 500s) this has been know to induce latency. +This alert fires when a specific Cortex route is experiencing an high latency. -ingesters: It is very unusual for ingester latency to be high, as they just write to memory. They probably needs scaling up, but it is worth investigating what is going on first. - -#### Read Latency -Query performance is an known problem. When you get this alert, you need to work out if: (a) this is a operation issue / configuration (b) this is because of algorithms and inherently limited (c) this is a bug +The alert message includes both the Cortex service and route experiencing the high latency. Establish if the alert is about the read or write path based on that. -Using the Cortex read dashboard, find the cluster which reported the high read latency and deduce where in the stack the latency is being introduced. +#### Write Latency -query_frontend: If there is a significant P99 or avg latency difference between the frontend and the querier, you can't scale them up - we rely on their being two frontend. Is this latency coming from the cache? Scale that up. What the CPU usage of the query frontend service? Do we need to increase the CPU requests and have it scheduled to a less busy box? Note QPS on the querier will be higher than on the frontend as it splits queries into multiple smaller ones. +How to **investigate**: +- Check the `Cortex / Writes` dashboard + - Looking at the dashboard you should see in which Cortex service the high latency originates + - The panels in the dashboard are vertically sorted by the network path (eg. cortex-gw -> distributor -> ingester) +- Deduce where in the stack the latency is being introduced + - **`cortex-gw`** + - The cortex-gw may need to be scaled up. Use the `Cortex / Scaling` dashboard to check for CPU usage vs requests. + - There could be a problem with authentication (eg. slow to run auth layer) + - **`distributor`** + - Typically, distributor p99 latency is in the range 50-100ms. 
If the distributor latency is higher than this, you may need to scale up the distributors. + - **`ingester`** + - Typically, ingester p99 latency is in the range 5-50ms. If the ingester latency is higher than this, you should investigate the root cause before scaling up ingesters. -ingesters: Latency should be in the ~100ms - queries are in memory. If its more, check the CPU usage and consider scaling it up. NB scale ingesters slowly, 1-2 new replicas an hour. +#### Read Latency -If you think its provisioning / scaling is the problem, consult the scaling dashboard. These are just recommendations - make reasonable adjustments. +Query performance is a known issue. A query may be slow because of high cardinality, large time range and/or because not leveraging on cache (eg. querying series data not cached yet). When investigating this alert, you should check if it's caused by few slow queries or there's an operational / config issue to be fixed. -Right now most of the execution time will be spent in PromQL's innerEval. NB that the prepare (index and chunk fetch) are now interleaved with Eval, so you need to expand both to confirm if its flow execution of slow fetching. +How to **investigate**: +- Check the `Cortex / Reads` dashboard + - Looking at the dashboard you should see in which Cortex service the high latency originates + - The panels in the dashboard are vertically sorted by the network path (eg. cortex-gw -> query-frontend -> query->scheduler -> querier -> store-gateway) +- Check the `Cortex / Slow Queries` dashboard to find out if it's caused by few slow queries +- Deduce where in the stack the latency is being introduced + - **`cortex-gw`** + - The cortex-gw may need to be scaled up. Use the `Cortex / Scaling` dashboard to check for CPU usage vs requests. + - There could be a problem with authentication (eg. slow to run auth layer) + - **`query-frontend`** + - The query-frontend may beed to be scaled up. If the Cortex cluster is running with the query-scheduler, the query-frontend can be scaled up with no side effects, otherwise the maximum number of query-frontend replicas should be the configured `-querier.worker-parallelism`. + - **`querier`** + - Look at slow queries traces to find out where it's slow. + - Typically, slowness either comes from running PromQL engine (`innerEval`) or fetching chunks from ingesters and/or store-gateways. + - If slowness comes from running PromQL engine, typically there's not much we can do. Scaling up queriers may help only if querier nodes are overloaded. + - If slowness comes from fetching chunks from ingesters and/or store-gateways you should investigate deeper on the root cause. Common causes: + - High CPU utilization in ingesters + - Scale up ingesters + - Low cache hit ratio in the store-gateways + - If memcached eviction rate is high, then you should scale up memcached replicas. Check the recommendations by `Cortex / Scaling` dashboard and make reasonable adjustments as necessary. + - If memcached eviction rate is zero or very low, then it may be caused by "first time" queries ### CortexRequestErrors This alert fires when the rate of 5xx errors of a specific route is > 1% for some time. -This alert typically acts as a last resort to detect issues / outages. SLO alerts are expected to trigger earlier: if an **SLO alert** has triggered as well for the same read/write path, then you can ignore this alert and focus on the SLO one. +This alert typically acts as a last resort to detect issues / outages. 
SLO alerts are expected to trigger earlier: if an **SLO alert** has triggered as well for the same read/write path, then you can ignore this alert and focus on the SLO one (but the investigation procedure is typically the same). How to **investigate**: - Check for which route the alert fired From 03cfca37203f931e89dfbff776c4f29375d30786 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 5 Jul 2021 13:45:27 +0200 Subject: [PATCH 311/364] Added 'Per route p99 latency' to ruler configuration API Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/ruler.libsonnet | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index b9347f7fd8d..070a80a9569 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -1,6 +1,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { + local ruler_config_api_routes_re = 'api_prom_rules.*|api_prom_api_v1_(rules|alerts)', rulerQueries+:: { ruleEvaluations: { @@ -106,11 +107,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Configuration API (gateway)') .addPanel( $.panel('QPS') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_prom_rules.*|api_prom_api_v1_(rules|alerts)"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re]) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_prom_rules.*|api_prom_api_v1_(rules|alerts)')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', ruler_config_api_routes_re)]) + ) + .addPanel( + $.panel('Per route p99 Latency') + + $.queryPanel( + 'histogram_quantile(0.99, sum by (route, le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%s, route=~"%s"}))' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re], + '{{ route }}' + ) + + { yaxes: $.yaxes('s') } ) ) .addRow( From c9f5db8cfe8777cfa8840e926319f81ff78d3cc6 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 5 Jul 2021 14:01:36 +0200 Subject: [PATCH 312/364] Addressed review comments Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 30 +++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 2630e1ff4c7..769f980833a 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -90,7 +90,7 @@ How to **fix**: This alert fires when a specific Cortex route is experiencing an high latency. -The alert message includes both the Cortex service and route experiencing the high latency. Establish if the alert is about the read or write path based on that. +The alert message includes both the Cortex service and route experiencing the high latency. Establish if the alert is about the read or write path based on that (see [Cortex routes by path](#cortex-routes-by-path)). #### Write Latency @@ -106,6 +106,9 @@ How to **investigate**: - Typically, distributor p99 latency is in the range 50-100ms. 
If the distributor latency is higher than this, you may need to scale up the distributors. - **`ingester`** - Typically, ingester p99 latency is in the range 5-50ms. If the ingester latency is higher than this, you should investigate the root cause before scaling up ingesters. + - Check out the following alerts and fix them if firing: + - `CortexProvisioningTooManyActiveSeries` + - `CortexProvisioningTooManyWrites` #### Read Latency @@ -130,6 +133,7 @@ How to **investigate**: - High CPU utilization in ingesters - Scale up ingesters - Low cache hit ratio in the store-gateways + - Check `Memcached Overview` dashboard - If memcached eviction rate is high, then you should scale up memcached replicas. Check the recommendations by `Cortex / Scaling` dashboard and make reasonable adjustments as necessary. - If memcached eviction rate is zero or very low, then it may be caused by "first time" queries @@ -140,7 +144,7 @@ This alert fires when the rate of 5xx errors of a specific route is > 1% for som This alert typically acts as a last resort to detect issues / outages. SLO alerts are expected to trigger earlier: if an **SLO alert** has triggered as well for the same read/write path, then you can ignore this alert and focus on the SLO one (but the investigation procedure is typically the same). How to **investigate**: -- Check for which route the alert fired +- Check for which route the alert fired (see [Cortex routes by path](#cortex-routes-by-path)) - Write path: open the `Cortex / Writes` dashboard - Read path: open the `Cortex / Reads` dashboard - Looking at the dashboard you should see in which Cortex service the error originates @@ -588,6 +592,28 @@ This can be triggered if there are too many HA dedupe keys in etcd. We saw this }, ``` +## Cortex routes by path + +**Write path**: +- `/distributor.Distributor/Push` +- `/cortex.Ingester/Push` +- `api_v1_push` +- `api_prom_push` +- `api_v1_push_influx_write` + +**Read path**: +- `/schedulerpb.SchedulerForFrontend/FrontendLoop` +- `/cortex.Ingester/QueryStream` +- `/cortex.Ingester/QueryExemplars` +- `/gatewaypb.StoreGateway/Series` +- `api_prom_label` +- `api_prom_api_v1_query_exemplars` + +**Ruler / rules path**: +- `api_v1_rules` +- `api_v1_rules_namespace` +- `api_prom_rules_namespace` + ## Cortex blocks storage - What to do when things to wrong ## Recovering from a potential data loss incident From acb4a07b95664acf2090a5d206cc96ed3060a87f Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 6 Jul 2021 11:40:10 +0200 Subject: [PATCH 313/364] Aded object storage metrics for Ruler and Alertmanager Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet | 3 +++ jsonnet/mimir-mixin/dashboards/ruler.libsonnet | 3 +++ 2 files changed, 6 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index fa5c6abf6cd..b329ce6ba19 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -83,5 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')]) ) + ) + .addRows( + $.getObjectStoreRows('Alertmanager Configuration Object Store (Alertmanager accesses)', 'alertmanager-storage') ), } diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet 
b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet index 070a80a9569..bfa231b7c20 100644 --- a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/ruler.libsonnet @@ -248,5 +248,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; '{{ user }}' ) ) + ) + .addRows( + $.getObjectStoreRows('Ruler Configuration Object Store (Ruler accesses)', 'ruler-storage') ), } From 97a918c68174aacfb78db00738dac3d79d8745b7 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Wed, 14 Jul 2021 16:07:37 +0200 Subject: [PATCH 314/364] Add playbook entry for CortexGossipMembersMismatch. --- jsonnet/mimir-mixin/docs/playbooks.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 769f980833a..cecd400c039 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -572,7 +572,31 @@ How to **fix**: ### CortexGossipMembersMismatch -_TODO: this playbook has not been written yet._ +This alert fires when any instance does not register all other instances as members of the memberlist cluster. + +How it **works**: +- This alert applies when memberlist is used for the ring backing store. +- All Cortex instances, regardless of type, join the a single memberlist cluster. +- Each instance (=memberlist cluster member) should be able to see all others. +- Therefore the following should be equal for every instance: + - The reported number of cluster members (`memberlist_client_cluster_members_count`) + - The total number of currently responsive instances. + +How to **investigate**: +- The instance which has the incomplete view of the cluster (too few members) is specified in the alert. +- If the count is zero: + - It is possible that the joining the cluster has yet to succeed. + - The following log message indicates that the _initial_ initial join did not succeed: `failed to join memberlist cluster` + - The following log messages indicate that subsequent re-join attempts are failing: `re-joining memberlist cluster failed` + - If it is the case that the initial join failed, take action according to the reason given. +- Verify communication with other members by checking memberlist traffic is being sent and received by the instance using the following metrics: + - `memberlist_tcp_transport_packets_received_total` + - `memberlist_tcp_transport_packets_sent_total` +- If traffic is present, then verify there are no errors sending or receiving packets using the following metrics: + - `memberlist_tcp_transport_packets_sent_errors_total` + - `memberlist_tcp_transport_packets_received_errors_total` + - These errors (and others) can be found by searching for messages prefixed with `TCPTransport:`. +- Logs coming directly from memberlist are also logged by Cortex; they may indicate where to investigate further. These can be identified as such due to being tagged with `caller=memberlist_logger.go:xyz`. 
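As a starting point for the investigation, an instant query along these lines (the job regex is only an example and should be adapted to the components actually running in your cluster) surfaces the instances whose memberlist view is smaller than the number of currently responsive instances:
```
# Instances reporting fewer memberlist members than the number of running replicas.
memberlist_client_cluster_members_count
  < on (cluster, namespace) group_left
sum by (cluster, namespace) (up{job=~".+/(compactor|store-gateway|distributor|ingester.*|querier.*|ruler)"})
```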
### EtcdAllocatingTooMuchMemory From 61362fd5e53629a6736b3cfc78aff445c63fd328 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 14 Jul 2021 17:12:18 +0200 Subject: [PATCH 315/364] Clarify data loss related to 'not healthy index found' issue Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 769f980833a..6578970242d 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -354,7 +354,7 @@ level=error ts=2020-07-12T17:35:05.516823471Z caller=compactor.go:339 component= ``` When this happen you should: -1. Rename the block prefixing it with `corrupted-` so that it will be skipped by the compactor and queriers. Keep in mind that doing so the block will become invisible to the queriers too, so its series/samples will not be queried. It's safe to do it on a single block with compaction level 1 (because of the samples replication), but not on multiple overlapping level 1 blocks or any block with a compaction level > 1. +1. Rename the block prefixing it with `corrupted-` so that it will be skipped by the compactor and queriers. Keep in mind that doing so the block will become invisible to the queriers too, so its series/samples will not be queried. If the corruption affects only 1 block whose compaction `level` is 1 (the information is stored inside its `meta.json`) then Cortex guarantees no data loss because all the data is replicated across other blocks. In all other cases, there may be some data loss once you rename the block and stop querying it. 2. Ensure the compactor has recovered 3. Investigate offline the root cause (eg. download the corrupted block and debug it locally) From dd543ccd1eaaec41d472a708e0ea277d27d60c31 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 15 Jul 2021 08:56:38 +0200 Subject: [PATCH 316/364] Review comments. --- jsonnet/mimir-mixin/docs/playbooks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index cecd400c039..7837390179b 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -576,7 +576,7 @@ This alert fires when any instance does not register all other instances as memb How it **works**: - This alert applies when memberlist is used for the ring backing store. -- All Cortex instances, regardless of type, join the a single memberlist cluster. +- All Cortex instances using the ring, regardless of type, join a single memberlist cluster. - Each instance (=memberlist cluster member) should be able to see all others. - Therefore the following should be equal for every instance: - The reported number of cluster members (`memberlist_client_cluster_members_count`) @@ -587,7 +587,7 @@ How to **investigate**: - If the count is zero: - It is possible that the joining the cluster has yet to succeed. - The following log message indicates that the _initial_ initial join did not succeed: `failed to join memberlist cluster` - - The following log messages indicate that subsequent re-join attempts are failing: `re-joining memberlist cluster failed` + - The following log message indicates that subsequent re-join attempts are failing: `re-joining memberlist cluster failed` - If it is the case that the initial join failed, take action according to the reason given. 
- Verify communication with other members by checking memberlist traffic is being sent and received by the instance using the following metrics: - `memberlist_tcp_transport_packets_received_total` From 3248aae3031e5c4cfbb7d78b1a3535ab61b80a15 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 22 Jul 2021 19:24:54 +0200 Subject: [PATCH 317/364] Improve CortexIngesterReachingSeriesLimit playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 04f4b832ef2..98e0b94e605 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -53,7 +53,29 @@ How to **fix**: 1. **Temporarily increase the limit**
If the actual number of series is very close or already hit the limit, or if you foresee the ingester will hit the limit before dropping the stale series as effect of the scale up, you should also temporarily increase the limit. 1. **Check if shuffle-sharding shard size is correct**
- When shuffle-sharding is enabled, we target to 100K series / tenant / ingester. You can run `avg by (user) (cortex_ingester_memory_series_created_total{namespace=""} - cortex_ingester_memory_series_removed_total{namespace=""}) > 100000` to find out tenants with > 100K series / ingester. You may want to increase the shard size for these tenants. + - When shuffle-sharding is enabled, we target to 100K series / tenant / ingester assuming tenants on average uses 50% of their max series limit. + - Run the following **instant query** to find tenants that may cause an higher pressure on some ingesters: + ``` + ( + sum by(user) (cortex_ingester_memory_series_created_total{namespace=""} + - + cortex_ingester_memory_series_removed_total{namespace=""}) + ) + > + ( + max by(user) (cortex_overrides{namespace="",limit_name="max_global_series_per_user"}) + * + scalar(max(cortex_distributor_replication_factor{namespace=""})) + * + 0.5 + ) + > 200000 + + # Decomment the following to show only tenants beloging to a specific ingester's shard. + # and count by(user) (cortex_ingester_active_series{namespace="",pod="ingester-"}) + ``` + - Check the current shard size of each tenant in the output and, if they're not already sharded across all ingesters, you may consider to double their shard size + - The in-memory series in the ingesters will be effectively reduced at the TSDB Head compaction happening at least 1h after you increased the shard size for the affected tenants 1. **Scale up ingesters**
Scaling up ingesters will lower the number of series per ingester. However, the effect of this change will take up to 4h, because after the scale up we need to wait until all stale series are dropped from memory as the effect of TSDB head compaction, which could take up to 4h (with the default config, TSDB keeps in-memory series up to 3h old and it gets compacted every 2h). From ec2f95cf7aee906c741a769d0eec1726d83ee2e4 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 27 Jul 2021 19:04:00 +0200 Subject: [PATCH 318/364] Increased CortexIngesterReachingSeriesLimit critical alert threshold from 80% to 85% Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 9eefe7f822e..be47dd7f0d0 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -274,7 +274,7 @@ (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) and ignoring (limit) (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.8 + ) > 0.85 |||, 'for': '5m', labels: { From c5d98a9171a0979a77d35e7d1422415a560b7872 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Mon, 26 Jul 2021 19:08:00 +0200 Subject: [PATCH 319/364] Increase CortexIngesterReachingSeriesLimit warning `for` duration As it turns out, during normal shuffle-sharding operation, the 70% mark is often exceeded, but not by much. Rather than increasing the threshold to 75%, this commit increases the `for` duration to 3h, following the thought that we want this alert to fire if ingesters are constantly above the threshold even after stale series are flushed (which occurs every 2h, when the TSDB head is compacted). We flush series with a timestamp between [-3h, -1h] after the last compaction, so the worst case scenario is that it takes 3h to flush a stale series. Signed-off-by: beorn7 --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 9eefe7f822e..203623ec793 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -257,7 +257,7 @@ (cortex_ingester_instance_limits{limit="max_series"} > 0) ) > 0.7 |||, - 'for': '5m', + 'for': '3h', labels: { severity: 'warning', }, From 35b2479a75a26793b11f7b9edd14dfd35c41eca2 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 28 Jul 2021 14:06:00 +0200 Subject: [PATCH 320/364] Fix scaling dashboard to work on multi-zone ingesters Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/recording_rules.libsonnet | 63 ++++++++++++++----- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 00c7d701855..c7034cd825d 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -69,12 +69,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; rules: [ { // Convenience rule to get the number of replicas for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. 
record: 'cluster_namespace_deployment:actual_replicas:count', expr: ||| - sum by (cluster, namespace, deployment) (kube_deployment_spec_replicas) - or sum by (cluster, namespace, deployment) ( - label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)") + label_replace( + kube_deployment_spec_replicas, + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + or + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)"), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) ) |||, }, @@ -188,7 +197,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; expr: ||| ceil( (sum by (cluster, namespace) ( - cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester"} + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} ) / 4) / avg by (cluster, namespace) ( @@ -199,18 +208,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { // Convenience rule to get the CPU utilization for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. record: 'cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate', expr: ||| sum by (cluster, namespace, deployment) ( label_replace( - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) |||, }, { // Convenience rule to get the CPU request for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum', expr: ||| # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 @@ -223,8 +237,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ( sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests_cpu_cores, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) @@ -234,8 +251,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ( sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests{resource="cpu"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + kube_pod_container_resource_requests{resource="cpu"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) @@ -261,18 +281,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { // Convenience rule to get the Memory utilization for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. 
record: 'cluster_namespace_deployment:container_memory_usage_bytes:sum', expr: ||| sum by (cluster, namespace, deployment) ( label_replace( - container_memory_usage_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + container_memory_usage_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) |||, }, { // Convenience rule to get the Memory request for both a deployment and a statefulset. + // Multi-zone deployments are grouped together removing the "zone-X" suffix. record: 'cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum', expr: ||| # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 @@ -285,8 +310,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ( sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests_memory_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) @@ -296,8 +324,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ( sum by (cluster, namespace, deployment) ( label_replace( - kube_pod_container_resource_requests{resource="memory"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + label_replace( + kube_pod_container_resource_requests{resource="memory"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) ) From cec1a40697a4785085feb6768bcec44d59143a67 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 28 Jul 2021 14:58:04 +0200 Subject: [PATCH 321/364] Simplified cluster_namespace_deployment:actual_replicas:count recording rule Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/recording_rules.libsonnet | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index c7034cd825d..e960648630f 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -80,10 +80,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) or sum by (cluster, namespace, deployment) ( - label_replace( - label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)"), - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") ) |||, }, From 3d0e6f5b17113e014311f3898526eba25eacabd6 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 28 Jul 2021 16:08:52 +0200 Subject: [PATCH 322/364] Added a comment to explain '.*?' 
Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/recording_rules.libsonnet | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index e960648630f..433fa8e6270 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -75,6 +75,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum by (cluster, namespace, deployment) ( label_replace( kube_deployment_spec_replicas, + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -214,6 +216,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -238,6 +242,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_pod_container_resource_requests_cpu_cores, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -252,6 +258,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_pod_container_resource_requests{resource="cpu"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -287,6 +295,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; container_memory_usage_bytes, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -311,6 +321,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_pod_container_resource_requests_memory_bytes, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" ) ) @@ -325,6 +337,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; kube_pod_container_resource_requests{resource="memory"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
) ) From 432c4a3011f15de59c25c7b2588faf33f2a87c80 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 29 Jul 2021 10:21:28 +0200 Subject: [PATCH 323/364] Fix rollout dashboard to work with multi-zone deployments Signed-off-by: Marco Pracucci --- .../dashboards/rollout-progress.libsonnet | 49 +++++++++++++++---- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet index 83a5abb7a2f..948a468d0c9 100644 --- a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet @@ -6,7 +6,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gateway_job_matcher: $.jobMatcher($._config.job_names.gateway), gateway_write_routes_regex: 'api_(v1|prom)_push', gateway_read_routes_regex: '(prometheus|api_prom)_api_v1_.+', - all_services_regex: std.join('|', ['cortex-gw', 'distributor', 'ingester', 'query-frontend', 'querier', 'compactor', 'store-gateway', 'ruler', 'alertmanager']), + all_services_regex: std.join('|', ['cortex-gw', 'distributor', 'ingester.*', 'query-frontend', 'querier', 'compactor', 'store-gateway', 'ruler', 'alertmanager']), }, 'cortex-rollout-progress.json': @@ -22,29 +22,60 @@ local utils = import 'mixin-utils/utils.libsonnet'; // $.panel('Rollout progress') + $.barGauge([ + // Multi-zone deployments are grouped together removing the "zone-X" suffix. + // After the grouping, the resulting label is called "cortex_service". ||| ( - kube_statefulset_status_replicas_updated{%(namespace_matcher)s,statefulset=~"%(all_services_regex)s"} + sum by(cortex_service) ( + label_replace( + kube_statefulset_status_replicas_updated{%(namespace_matcher)s,statefulset=~"%(all_services_regex)s"}, + "cortex_service", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?" + ) + ) / - kube_statefulset_replicas{%(namespace_matcher)s} + sum by(cortex_service) ( + label_replace( + kube_statefulset_replicas{%(namespace_matcher)s}, + "cortex_service", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?" + ) + ) ) and ( - kube_statefulset_replicas{%(namespace_matcher)s} + sum by(cortex_service) ( + label_replace( + kube_statefulset_replicas{%(namespace_matcher)s}, + "cortex_service", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?" + ) + ) > 0 ) ||| % config, ||| ( - kube_deployment_status_replicas_updated{%(namespace_matcher)s,deployment=~"%(all_services_regex)s"} + sum by(cortex_service) ( + label_replace( + kube_deployment_status_replicas_updated{%(namespace_matcher)s,deployment=~"%(all_services_regex)s"}, + "cortex_service", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) / - kube_deployment_spec_replicas{%(namespace_matcher)s} + sum by(cortex_service) ( + label_replace( + kube_deployment_spec_replicas{%(namespace_matcher)s}, + "cortex_service", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) ) and ( - kube_deployment_spec_replicas{%(namespace_matcher)s} + sum by(cortex_service) ( + label_replace( + kube_deployment_spec_replicas{%(namespace_matcher)s}, + "cortex_service", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
+ ) + ) > 0 ) ||| % config, ], legends=[ - '{{statefulset}}', - '{{deployment}}', + '{{cortex_service}}', ], thresholds=[ { color: 'yellow', value: null }, { color: 'yellow', value: 0.999 }, From a5950dcee4a9ccc6cd1a0d1bed66d5ddcae807a9 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 29 Jul 2021 10:23:57 +0200 Subject: [PATCH 324/364] Fixed legends Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet index 948a468d0c9..e481ce6aee8 100644 --- a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet @@ -76,6 +76,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % config, ], legends=[ '{{cortex_service}}', + '{{cortex_service}}', ], thresholds=[ { color: 'yellow', value: null }, { color: 'yellow', value: 0.999 }, From 4498eb11019e17c0fd6d82506fa535dbe6ddf533 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 25 May 2021 10:19:43 +0200 Subject: [PATCH 325/364] Extend Alertmanager dashboard with currently unused metrics. Metrics for general operation: - Added "Tenants" stat panel using: `cortex_alertmanager_tenants_discovered` - Added "Tenant Configuration Sync" row using: `cortex_alertmanager_sync_configs_failed_total` `cortex_alertmanager_sync_configs_total` `cortex_alertmanager_ring_check_errors_total` Metrics specific to sharding operation: - Added "Sharding Initial State Sync" row using: `cortex_alertmanager_state_initial_sync_completed_total` `cortex_alertmanager_state_initial_sync_completed_total` `cortex_alertmanager_state_initial_sync_duration_seconds` - Added "Sharding State Operations" row using: `cortex_alertmanager_state_fetch_replica_state_total` `cortex_alertmanager_state_fetch_replica_state_failed_total` `cortex_alertmanager_state_replication_total` `cortex_alertmanager_state_replication_failed_total` `cortex_alertmanager_partial_state_merges_total` `cortex_alertmanager_partial_state_merges_failed_total` `cortex_alertmanager_state_persist_total` `cortex_alertmanager_state_persist_failed_total` --- .../dashboards/alertmanager.libsonnet | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index b329ce6ba19..6d7ee562f90 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Total Silences') + $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short') ) + .addPanel( + $.panel('Tenants') + + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager'), format='short') + ) ) .addRow( $.row('Alerts Received') @@ -86,5 +90,150 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRows( $.getObjectStoreRows('Alertmanager Configuration Object Store (Alertmanager accesses)', 'alertmanager-storage') + ) + .addRow( + $.row('Replication') + .addPanel( + $.panel('Tenants (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_tenants_owned{%s})' % $.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + .addPanel( + $.panel('Alerts (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_alerts{%s})' % 
$.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + .addPanel( + $.panel('Silences (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + ) + .addRow( + $.row('Tenant Configuration Sync') + .addPanel( + $.panel('Syncs/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Syncs/sec (By Reason)') + + $.queryPanel( + 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + '{{reason}}' + ) + ) + .addPanel( + $.panel('Ring Check Errors/sec') + + $.queryPanel( + 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'errors' + ) + ) + ) + .addRow( + $.row('Sharding Initial State Sync') + .addPanel( + $.panel('Syncs/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_initial_sync_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Syncs/sec (By Outcome)') + + $.queryPanel( + 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + '{{outcome}}' + ) + ) + .addPanel( + $.panel('Duration') + + utils.latencyRecordingRulePanel('cortex_alertmanager_state_initial_sync_duration_seconds', $.jobSelector('alertmanager')) + ) + ) + .addRow( + $.row('Sharding State Operations') + .addPanel( + $.panel('Replica Fetches/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Replica Updates/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Partial Merges/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % 
$.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Remote Storage Persists/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) ), } From 00d7414577271824a8b1ae8dfba6f7f8d1986c80 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 27 May 2021 12:55:33 +0200 Subject: [PATCH 326/364] Review comments + fix latency panel. --- .../dashboards/alertmanager.libsonnet | 52 +++++++------------ 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 6d7ee562f90..922b2861bc2 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -94,26 +94,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Replication') .addPanel( - $.panel('Tenants (By Instance)') + + $.panel('Per %s Tenants' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) (cortex_alertmanager_tenants_owned{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) .addPanel( - $.panel('Alerts (By Instance)') + + $.panel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) (cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) .addPanel( - $.panel('Silences (By Instance)') + + $.panel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) (cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) @@ -150,37 +150,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding Initial State Sync') + $.row('Sharding Runtime State Sync') .addPanel( $.panel('Syncs/sec') + - $.queryPanel( - [ - ||| - sum(rate(cortex_alertmanager_state_initial_sync_total{%s}[$__rate_interval])) - - - sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval])) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), - ], - ['success', 'failed'] - ) - ) - .addPanel( - $.panel('Syncs/sec (By Outcome)') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), '{{outcome}}' ) ) .addPanel( - $.panel('Duration') + - utils.latencyRecordingRulePanel('cortex_alertmanager_state_initial_sync_duration_seconds', $.jobSelector('alertmanager')) + $.panel('Sync duration') + + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % 
$.jobMatcher('alertmanager')) ) - ) - .addRow( - $.row('Sharding State Operations') .addPanel( - $.panel('Replica Fetches/sec') + + $.panel('Fetch state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -193,8 +176,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ['success', 'failed'] ) ) + ) + .addRow( + $.row('Sharding State Operations') .addPanel( - $.panel('Replica Updates/sec') + + $.panel('Replicate state to other alertmanagers /sec') + $.queryPanel( [ ||| @@ -208,7 +194,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Partial Merges/sec') + + $.panel('Merge state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -222,7 +208,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Remote Storage Persists/sec') + + $.panel('Persist state to remote storage /sec') + $.queryPanel( [ ||| From b0e76f9fcc3ee5ad5d12b99e79338f6d4ddf7a4f Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Fri, 30 Jul 2021 11:49:02 +0200 Subject: [PATCH 327/364] Review comments. --- jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 922b2861bc2..6f578b11357 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -150,16 +150,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding Runtime State Sync') + $.row('Sharding Initial State Sync') .addPanel( - $.panel('Syncs/sec') + + $.panel('Initial syncs/sec') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), '{{outcome}}' ) ) .addPanel( - $.panel('Sync duration') + + $.panel('Initial sync duration') + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) ) .addPanel( @@ -178,7 +178,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding State Operations') + $.row('Sharding Runtime State Sync') .addPanel( $.panel('Replicate state to other alertmanagers /sec') + $.queryPanel( From 918a35f537e50982768187c18f15ff2f9eb71e06 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Mon, 16 Aug 2021 16:29:34 -0500 Subject: [PATCH 328/364] Clarify the gsutil mv command for moving corrupted blocks Signed-off-by: Tyler Reid --- jsonnet/mimir-mixin/docs/playbooks.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 98e0b94e605..085e38dce53 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -380,16 +380,14 @@ When this happen you should: 2. Ensure the compactor has recovered 3. Investigate offline the root cause (eg. download the corrupted block and debug it locally) -To rename a block stored on GCS you can use the `gsutil` CLI: - +To rename a block stored on GCS you can use the `gsutil` CLI command: ``` -# Replace the placeholders: -# - BUCKET: bucket name -# - TENANT: tenant ID -# - BLOCK: block ID - gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ``` +Where: +- `BUCKET` is the gcs bucket name the compactor is using. 
The cell's bucket name is specified as the `blocks_storage_bucket_name` in the cell configuration +- `TENANT` is the tenant id reported in the example error message above as `REDACTED-TENANT` +- `BLOCK` is the last part of the file path reported as `REDACTED-BLOCK` in the example error message above ### CortexBucketIndexNotUpdated From 98d38ccbebda7b3fd7fddc195540b1e3128c3a56 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Tue, 17 Aug 2021 09:52:07 -0500 Subject: [PATCH 329/364] Modify log message to fit example command Signed-off-by: Tyler Reid --- jsonnet/mimir-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 085e38dce53..5dcf8d52746 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -372,7 +372,7 @@ How to **investigate**: The compactor may fail to compact blocks due a corrupted block index found in one of the source blocks: ``` -level=error ts=2020-07-12T17:35:05.516823471Z caller=compactor.go:339 component=compactor msg="failed to compact user blocks" user=REDACTED err="compaction: group 0@6672437747845546250: block with not healthy index found /data/compact/0@6672437747845546250/REDACTED; Compaction level 1; Labels: map[__org_id__:REDACTED]: 1/1183085 series have an average of 1.000 out-of-order chunks: 0.000 of these are exact duplicates (in terms of data and time range)" +level=error ts=2020-07-12T17:35:05.516823471Z caller=compactor.go:339 component=compactor msg="failed to compact user blocks" user=REDACTED-TENANT err="compaction: group 0@6672437747845546250: block with not healthy index found /data/compact/0@6672437747845546250/REDACTED-BLOCK; Compaction level 1; Labels: map[__org_id__:REDACTED]: 1/1183085 series have an average of 1.000 out-of-order chunks: 0.000 of these are exact duplicates (in terms of data and time range)" ``` When this happen you should: From bf14ff540de641b59db084fadfe1636ea199af73 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Wed, 18 Aug 2021 09:32:26 +0000 Subject: [PATCH 330/364] Update grafana-builder from Mar 2019 to Feb 2021 Brings in the following changes: - Use default as a picker value for datasource variable grafana/jsonnet-libshttps://github.com/grafana/cortex-jsonnet/pull/204 - allow table link in new tab grafana/jsonnet-libshttps://github.com/grafana/cortex-jsonnet/pull/238 - allow setting a default datasource grafana/jsonnet-libshttps://github.com/grafana/cortex-jsonnet/pull/301 - Add textPanel grafana/jsonnet-libshttps://github.com/grafana/cortex-jsonnet/pull/341 - make status code label name overrideable in qpsPanel grafana/jsonnet-libshttps://github.com/grafana/cortex-jsonnet/pull/397 - use $__rate_interval over $__interval grafana/jsonnet-libshttps://github.com/grafana/cortex-jsonnet/pull/401 - Set shared tooltip to false by default grafana/jsonnet-libshttps://github.com/grafana/cortex-jsonnet/pull/458 - Use custom 'all' value to avoid massive regexes in queries. 
grafana/jsonnet-libshttps://github.com/grafana/cortex-jsonnet/pull/469 https://github.com/grafana/jsonnet-libs/commits/master/grafana-builder/ --- jsonnet/mimir-mixin/jsonnetfile.lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/jsonnetfile.lock.json b/jsonnet/mimir-mixin/jsonnetfile.lock.json index 6490fafd9e4..a1b021910f4 100644 --- a/jsonnet/mimir-mixin/jsonnetfile.lock.json +++ b/jsonnet/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "8f9d72b2e35b5f3cc1b7c2a8af9bbae7658804e2", - "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" + "version": "0d13e5ba1b3a4c29015738c203d92ea39f71ebe2", + "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" }, { "source": { From 303186409324541b4c6c7ce90bef91f1d0576f3d Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 24 Aug 2021 11:30:10 +0200 Subject: [PATCH 331/364] Match query-frontend/query-scheduler/querier custom deployments by default Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- jsonnet/mimir-mixin/config.libsonnet | 8 ++++---- jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet | 2 +- jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index e74ab846ca6..42c1e5f78f8 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -593,7 +593,7 @@ expr: ||| memberlist_client_cluster_members_count != on (%s) group_left - sum by (%s) (up{job=~".+/(admin-api|compactor|store-gateway|distributor|ingester.*|querier|cortex|ruler)"}) + sum by (%s) (up{job=~".+/(admin-api|compactor|store-gateway|distributor|ingester.*|querier.*|cortex|ruler)"}) ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], 'for': '5m', labels: { diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 917ffd51599..94de5dd7080 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -26,12 +26,12 @@ // These are used by the dashboards and allow for the simultaneous display of // microservice and single binary cortex clusters. job_names: { - ingester: '(ingester.*|cortex$)', // Match also ingester-blocks, which is used during the migration from chunks to blocks. + ingester: '(ingester.*|cortex$)', // Match also custom and per-zone ingester deployments. distributor: '(distributor|cortex$)', - querier: '(querier|cortex$)', + querier: '(querier.*|cortex$)', // Match also custom querier deployments. ruler: '(ruler|cortex$)', - query_frontend: '(query-frontend|cortex$)', - query_scheduler: 'query-scheduler', // Not part of single-binary. + query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. + query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. 
table_manager: '(table-manager|cortex$)', store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', diff --git a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet index e481ce6aee8..16c54095570 100644 --- a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet @@ -6,7 +6,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gateway_job_matcher: $.jobMatcher($._config.job_names.gateway), gateway_write_routes_regex: 'api_(v1|prom)_push', gateway_read_routes_regex: '(prometheus|api_prom)_api_v1_.+', - all_services_regex: std.join('|', ['cortex-gw', 'distributor', 'ingester.*', 'query-frontend', 'querier', 'compactor', 'store-gateway', 'ruler', 'alertmanager']), + all_services_regex: std.join('|', ['cortex-gw', 'distributor', 'ingester.*', 'query-frontend.*', 'query-scheduler.*', 'querier.*', 'compactor', 'store-gateway', 'ruler', 'alertmanager']), }, 'cortex-rollout-progress.json': diff --git a/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet b/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet index a732388a067..90916facdc8 100644 --- a/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet @@ -16,7 +16,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; targets: [ { // Filter out the remote read endpoint. - expr: '{cluster=~"$cluster",namespace=~"$namespace",name="query-frontend"} |= "query stats" != "/api/v1/read" | logfmt | org_id=~"${tenant_id}" | response_time > ${min_duration}', + expr: '{cluster=~"$cluster",namespace=~"$namespace",name=~"query-frontend.*"} |= "query stats" != "/api/v1/read" | logfmt | org_id=~"${tenant_id}" | response_time > ${min_duration}', instant: false, legendFormat: '', range: true, From 0ea95cd7abaa26a13a220c81029253af4ca705d3 Mon Sep 17 00:00:00 2001 From: George Robinson Date: Wed, 25 Aug 2021 11:19:49 +0100 Subject: [PATCH 332/364] Create playbooks for sharded alertmanager --- jsonnet/mimir-mixin/docs/playbooks.md | 89 +++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 5dcf8d52746..a1b0fac0fa7 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -636,6 +636,95 @@ This can be triggered if there are too many HA dedupe keys in etcd. We saw this }, ``` +### CortexAlertmanagerSyncConfigsFailing + +How it **works**: + +This alert is fired when the multi-tenant alertmanager cannot load alertmanager configs from the remote object store for at least 30 minutes. + +Loading the alertmanager configs can happen in the following situations: + +1. When the multi tenant alertmanager is started +2. Each time it polls for config changes in the alertmanager +3. When there is a ring change + +The metric for this alert is cortex_alertmanager_sync_configs_failed_total and is incremented each time one of the above fails. + +When there is a ring change or the interval has elapsed, a failure to load configs from the store is logged as a warning. + +How to **investigate**: + +Look at the error message that is logged and attempt to understand what is causing the failure. I.e. it could be a networking issue, incorrect configuration for the store, etc. 
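To find which replicas to inspect, an instant query like the following (illustrative; add your usual cluster/namespace matchers) lists the alertmanager instances whose configuration syncs are currently failing:
```
# Alertmanager replicas with a non-zero config sync failure rate over the last 5 minutes.
sum by (job, instance) (rate(cortex_alertmanager_sync_configs_failed_total[5m])) > 0
```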
+ +### CortexAlertmanagerRingCheckFailing + +How it **works**: + +This alert is fired when the multi-tenant alertmanager has been unable to check if one or more tenants should be owned on this shard for at least 10 minutes. + +When the alertmanager loads its configuration on start up, when it polls for config changes or when there is a ring change it must check the ring to see if the tenant is still owned on this shard. To prevent one error from causing the loading of all configurations to fail we assume that on error the tenant is NOT owned for this shard. If checking the ring continues to fail then some tenants might not be assigned an alertmanager and might not be able to receive notifications for their alerts. + +The metric for this alert is cortex_alertmanager_ring_check_errors_total. + +How to **investigate**: + +Look at the error message that is logged and attempt to understand what is causing the failure. In most cases the error will be encountered when attempting to read from the ring, which can fail if there is an issue with in-use backend implementation. + +### CortexAlertmanagerPartialStateMergeFailing + +How it **works**: + +This alert is fired when the multi-tenant alertmanager attempts to merge a partial state for something that it either does not know about or the partial state cannot be merged with the existing local state. State merges are gRPC messages that are gossiped between a shard and the corresponding alertmanager instance in other shards. + +The metric for this alert is cortex_alertmanager_partial_state_merges_failed_total. + +How to **investigate**: + +The error is not currently logged on the receiver side. If this alert is firing, it is likely that CortexAlertmanagerReplicationFailing is firing also, so instead follow the investigation steps for that alert, with the assumption that the issue is not RPC/communication related. + +### CortexAlertmanagerReplicationFailing + +How it **works**: + +This alert is fired when the multi-tenant alertmanager attempts to replicate a state update for a tenant (i.e. a silence or a notification) to another alertmanager instance but failed. This could be due to an RPC/communication error or the other alertmanager being unable to merge the state with its own local state. + +The metric for this alert is cortex_alertmanager_state_replication_failed_total. + +How to **investigate**: + +When state replication fails it gets logged as an error in the alertmanager that attempted the state replication. Check the error message in the log to understand the cause of the error (i.e. was it due to an RPC/communication error or was there an error in the receiving alertmanager). + +### CortexAlertmanagerPersistStateFailing + +How it **works**: + +This alert is fired when the multi-tenant alertmanager cannot persist its state to the remote object store. This operation is attempted periodically (every 15m by default). + +Each alertmanager writes its state (silences, notification log) to the remote object storage and the cortex_alertmanager_state_persist_failed_total metric is incremented each time this fails. The alert fires if this fails for an hour or more. + +How to **investigate**: + +Each failure to persist state to the remote object storage is logged. Find the reason in the Alertmanager container logs with the text “failed to persist state”. Possibles reasons: + +- The most probable cause is that remote write failed. Try to investigate why based on the message (network issue, storage issue). 
If the error indicates the issue might be transient, then you can wait until the next periodic attempt and see if it succeeds. +- It is also possible that encoding the state failed. This does not depend on external factors as it is just pulling state from the Alertmanager internal state. It may indicate a bug in the encoding method. + +### CortexAlertmanagerInitialSyncFailed + +How it **works**: + +When a tenant replica becomes owned it is assigned to an alertmanager instance. The alertmanager instance attempts to read the state from other alertmanager instances. If no other alertmanager instances could replicate the full state then it attempts to read the full state from the remote object store. This alert fires when both of these operations fail. + +Note that the case where there is no state for this user in remote object storage, is not treated as a failure. This is expected when a new tenant becomes active for the first time. + +How to **investigate**: + +When an alertmanager cannot read the state for a tenant from storage it gets logged as the following error: "failed to read state from storage; continuing anyway". The possible causes of this error could be: + +- The state could not be merged because it might be invalid and could not be decoded. This could indicate data corruption and therefore a bug in the reading or writing of the state, and would need further investigation. +- The state could not be read from storage. This could be due to a networking issue such as a timeout or an authentication and authorization issue with the remote object store. + + ## Cortex routes by path **Write path**: From fd26edbdaa89befbf57d78bf4ea1f9fa7e0a13d9 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 24 Aug 2021 13:15:54 +0200 Subject: [PATCH 333/364] Add new alerts for alertmanager sharding mode of operation. --- jsonnet/mimir-mixin/alerts.libsonnet | 1 + .../mimir-mixin/alerts/alertmanager.libsonnet | 98 +++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 jsonnet/mimir-mixin/alerts/alertmanager.libsonnet diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet index 771c62c89d5..4dc1f85c247 100644 --- a/jsonnet/mimir-mixin/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts.libsonnet @@ -1,6 +1,7 @@ { prometheusAlerts+:: (import 'alerts/alerts.libsonnet') + + (import 'alerts/alertmanager.libsonnet') + (if std.member($._config.storage_engine, 'blocks') then diff --git a/jsonnet/mimir-mixin/alerts/alertmanager.libsonnet b/jsonnet/mimir-mixin/alerts/alertmanager.libsonnet new file mode 100644 index 00000000000..e73d04b3e1a --- /dev/null +++ b/jsonnet/mimir-mixin/alerts/alertmanager.libsonnet @@ -0,0 +1,98 @@ +{ + groups+: [ + { + name: 'alertmanager_alerts', + rules: [ + { + alert: 'CortexAlertmanagerSyncConfigsFailing', + expr: ||| + rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 + |||, + 'for': '30m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage. + |||, + }, + }, + { + alert: 'CortexAlertmanagerRingCheckFailing', + expr: ||| + rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring. 
+ |||, + }, + }, + { + alert: 'CortexAlertmanagerPartialStateMergeFailing', + expr: ||| + rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica. + |||, + }, + }, + { + alert: 'CortexAlertmanagerReplicationFailing', + expr: ||| + rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicating partial state to its replicas. + |||, + }, + }, + { + alert: 'CortexAlertmanagerPersistStateFailing', + expr: ||| + rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 + |||, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snaphots to remote storage. + |||, + }, + }, + { + alert: 'CortexAlertmanagerInitialSyncFailed', + expr: ||| + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up. + |||, + }, + }, + ], + }, + ], +} From a283035f4e87aca57dfbe9352a39cd64b9ac8b65 Mon Sep 17 00:00:00 2001 From: Duologic Date: Wed, 25 Aug 2021 14:31:40 +0200 Subject: [PATCH 334/364] fix(rules): upstream recording rule switched to sum_irate ref: https://github.com/kubernetes-monitoring/kubernetes-mixin/pull/619 --- jsonnet/mimir-mixin/recording_rules.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 433fa8e6270..4074cc008f1 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -213,7 +213,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; sum by (cluster, namespace, deployment) ( label_replace( label_replace( - node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate, + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it From cad752b798209089a7c1a200b9909f187375e7b3 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Thu, 26 Aug 2021 14:47:59 +0200 Subject: [PATCH 335/364] Fix CortexIngesterReachingSeriesLimit playbook Signed-off-by: Arve Knudsen --- jsonnet/mimir-mixin/docs/playbooks.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index a1b0fac0fa7..a009952efcb 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -26,11 +26,11 @@ If nothing obvious from the above, check for increased load: ### CortexIngesterReachingSeriesLimit -This alert fires when the `max_series` per ingester instance limit is enabled and the actual number of in-memory series in a ingester is reaching the limit. 
Once the limit is reached, writes to the ingester will fail (5xx) for new series, while appending samples to existing ones will continue to succeed. +This alert fires when the `max_series` per ingester instance limit is enabled and the actual number of in-memory series in an ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new series, while appending samples to existing ones will continue to succeed. In case of **emergency**: -- If the actual number of series is very close or already hit the limit, then you can increase the limit via runtime config to gain some time -- Increasing the limit will increase the ingesters memory utilization. Please monitor the ingesters memory utilization via the `Cortex / Writes Resources` dashboard +- If the actual number of series is very close to or already hit the limit, then you can increase the limit via runtime config to gain some time +- Increasing the limit will increase the ingesters' memory utilization. Please monitor the ingesters' memory utilization via the `Cortex / Writes Resources` dashboard How the limit is **configured**: - The limit can be configured either on CLI (`-ingester.instance-limits.max-series`) or in the runtime config: @@ -51,10 +51,10 @@ How the limit is **configured**: How to **fix**: 1. **Temporarily increase the limit**
- If the actual number of series is very close or already hit the limit, or if you foresee the ingester will hit the limit before dropping the stale series as effect of the scale up, you should also temporarily increase the limit. -1. **Check if shuffle-sharding shard size is correct**
- - When shuffle-sharding is enabled, we target to 100K series / tenant / ingester assuming tenants on average uses 50% of their max series limit. - - Run the following **instant query** to find tenants that may cause an higher pressure on some ingesters: + If the actual number of series is very close to or already hit the limit, or if you foresee the ingester will hit the limit before dropping the stale series as an effect of the scale up, you should also temporarily increase the limit. +2. **Check if shuffle-sharding shard size is correct**
+ - When shuffle-sharding is enabled, we target up to 100K series / tenant / ingester assuming tenants on average use 50% of their max series limit. + - Run the following **instant query** to find tenants that may cause higher pressure on some ingesters: ``` ( sum by(user) (cortex_ingester_memory_series_created_total{namespace=""} @@ -75,17 +75,17 @@ How to **fix**: # and count by(user) (cortex_ingester_active_series{namespace="",pod="ingester-"}) ``` - Check the current shard size of each tenant in the output and, if they're not already sharded across all ingesters, you may consider to double their shard size - - The in-memory series in the ingesters will be effectively reduced at the TSDB Head compaction happening at least 1h after you increased the shard size for the affected tenants -1. **Scale up ingesters**
+ - The in-memory series in the ingesters will be effectively reduced at the TSDB head compaction happening at least 1h after you increased the shard size for the affected tenants +3. **Scale up ingesters**
Scaling up ingesters will lower the number of series per ingester. However, the effect of this change will take up to 4h, because after the scale up we need to wait until all stale series are dropped from memory as the effect of TSDB head compaction, which could take up to 4h (with the default config, TSDB keeps in-memory series up to 3h old and it gets compacted every 2h). ### CortexIngesterReachingTenantsLimit -This alert fires when the `max_tenants` per ingester instance limit is enabled and the actual number of tenants in a ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new tenants, while they will continue to succeed for previously existing ones. +This alert fires when the `max_tenants` per ingester instance limit is enabled and the actual number of tenants in an ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new tenants, while they will continue to succeed for previously existing ones. In case of **emergency**: -- If the actual number of tenants is very close or already hit the limit, then you can increase the limit via runtime config to gain some time -- Increasing the limit will increase the ingesters memory utilization. Please monitor the ingesters memory utilization via the `Cortex / Writes Resources` dashboard +- If the actual number of tenants is very close to or already hit the limit, then you can increase the limit via runtime config to gain some time +- Increasing the limit will increase the ingesters' memory utilization. Please monitor the ingesters' memory utilization via the `Cortex / Writes Resources` dashboard How the limit is **configured**: - The limit can be configured either on CLI (`-ingester.instance-limits.max-tenants`) or in the runtime config: @@ -577,7 +577,7 @@ How it **works**: - Having 2+ ingesters `OOMKilled` may cause a cluster outage - Ingester memory baseline usage is primarily influenced by memory allocated by the process (mostly go heap) and mmap-ed files (used by TSDB) - Ingester memory short spikes are primarily influenced by queries and TSDB head compaction into new blocks (occurring every 2h) -- A pod gets `OOMKilled` once its working set memory reaches the configured limit, so it's important to prevent ingesters memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) +- A pod gets `OOMKilled` once its working set memory reaches the configured limit, so it's important to prevent ingesters' memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) How to **fix**: - Check if the issue occurs only for few ingesters. 
If so: From 64b9842a10b184d7cff26ec2e23cfb6be75cc1a9 Mon Sep 17 00:00:00 2001 From: Jack Baldry Date: Tue, 11 May 2021 17:44:15 +0100 Subject: [PATCH 336/364] feat: Allow configuration of ring members in gossip alerts Signed-off-by: Jack Baldry --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 4 ++-- jsonnet/mimir-mixin/config.libsonnet | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 42c1e5f78f8..81ef396a3e0 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -593,8 +593,8 @@ expr: ||| memberlist_client_cluster_members_count != on (%s) group_left - sum by (%s) (up{job=~".+/(admin-api|compactor|store-gateway|distributor|ingester.*|querier.*|cortex|ruler)"}) - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + sum by (%s) (up{job=~".+/%s"}) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels, $._config.job_names.ring_members], 'for': '5m', labels: { severity: 'warning', diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 94de5dd7080..e0e76a92516 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -33,6 +33,7 @@ query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. table_manager: '(table-manager|cortex$)', + ring_members: '(distributor|ingester|querier|cortex|ruler)', store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. From 24a3c9143ca3fa990e989697c427acb5780eba56 Mon Sep 17 00:00:00 2001 From: Jack Baldry Date: Tue, 11 May 2021 17:45:08 +0100 Subject: [PATCH 337/364] fix: Add store-gateway and compactor ring_members Also re-order names for readability. Signed-off-by: Jack Baldry --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index e0e76a92516..803bbb7923f 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -33,7 +33,7 @@ query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. table_manager: '(table-manager|cortex$)', - ring_members: '(distributor|ingester|querier|cortex|ruler)', + ring_members: '(compactor|distributor|ingester|querier|ruler|store-gateway|cortex)', store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. 
From 2b39d4aaf7c80a20ad54b090081870f5643ec07d Mon Sep 17 00:00:00 2001 From: Jack Baldry Date: Mon, 24 May 2021 10:54:46 +0100 Subject: [PATCH 338/364] fix: Match all ingester workloads and avoid matching the cortex-gateway Signed-off-by: Jack Baldry --- jsonnet/mimir-mixin/config.libsonnet | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 803bbb7923f..a9eccf75d07 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -33,7 +33,9 @@ query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. table_manager: '(table-manager|cortex$)', - ring_members: '(compactor|distributor|ingester|querier|ruler|store-gateway|cortex)', + // ingester-.* accommodates multiple ingester StatefulSets or Deployments. + // cortex$ prevents matching the cortex-gateway. + ring_members: '(compactor|distributor|ingester-.*|querier|ruler|store-gateway|cortex$)', store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. From 10e0dfdcf8525bc47c274c27dc6fc6fb29b3847c Mon Sep 17 00:00:00 2001 From: Jack Baldry Date: Thu, 3 Jun 2021 11:51:26 +0100 Subject: [PATCH 339/364] feat: Optionally allow use of array or string to configure ring members Signed-off-by: Jack Baldry --- jsonnet/mimir-mixin/config.libsonnet | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index a9eccf75d07..386745c27cc 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -33,9 +33,13 @@ query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. table_manager: '(table-manager|cortex$)', + // If non-null, this list is joined together to form the regexp for the ring_members matcher value. + // to completely, override the matcher value, you can make this null and override the `ring_members` + // field instead of this. // ingester-.* accommodates multiple ingester StatefulSets or Deployments. // cortex$ prevents matching the cortex-gateway. - ring_members: '(compactor|distributor|ingester-.*|querier|ruler|store-gateway|cortex$)', + ring_members_list: ['compactor', 'distributor', 'ingester-.*', 'querier', 'ruler', 'store-gateway', 'cortex$'], + ring_members: if self.ring_members_list != null then '(%s)' % std.join('|', self.ring_members_list) else '', store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. 
From 2bbc3c7d2c0f8b24c05236e1f775e737bc943363 Mon Sep 17 00:00:00 2001 From: Jack Baldry Date: Tue, 20 Jul 2021 10:40:39 +0100 Subject: [PATCH 340/364] address review feedback Signed-off-by: Jack Baldry --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 16 +++++++++++----- jsonnet/mimir-mixin/config.libsonnet | 8 +------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 81ef396a3e0..3c407b4813a 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -1,4 +1,9 @@ { + // simpleRegexpOpt produces a simple regexp that matches all strings in the input array. + local simpleRegexpOpt(strings) = + assert std.isArray(strings) : 'simpleRegexpOpt requires that `strings` is an array of strings`'; + '(' + std.join('|', strings) + ')', + groups+: [ { name: 'cortex_alerts', @@ -590,11 +595,12 @@ rules: [ { alert: 'CortexGossipMembersMismatch', - expr: ||| - memberlist_client_cluster_members_count - != on (%s) group_left - sum by (%s) (up{job=~".+/%s"}) - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels, $._config.job_names.ring_members], + expr: + ||| + memberlist_client_cluster_members_count + != on (%s) group_left + sum by (%s) (up{job=~".+/%s"}) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels, simpleRegexpOpt($._config.job_names.ring_members)], 'for': '5m', labels: { severity: 'warning', diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 386745c27cc..3a5ac33cfba 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -33,13 +33,7 @@ query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. table_manager: '(table-manager|cortex$)', - // If non-null, this list is joined together to form the regexp for the ring_members matcher value. - // to completely, override the matcher value, you can make this null and override the `ring_members` - // field instead of this. - // ingester-.* accommodates multiple ingester StatefulSets or Deployments. - // cortex$ prevents matching the cortex-gateway. - ring_members_list: ['compactor', 'distributor', 'ingester-.*', 'querier', 'ruler', 'store-gateway', 'cortex$'], - ring_members: if self.ring_members_list != null then '(%s)' % std.join('|', self.ring_members_list) else '', + ring_members: ['compactor', 'distributor', 'ingester-.*', 'querier', 'ruler', 'store-gateway', 'cortex'], store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. From cbed1b67c38b6a459e55a73a47bf2b71a7754b5e Mon Sep 17 00:00:00 2001 From: Jack Baldry Date: Fri, 27 Aug 2021 09:35:21 +0100 Subject: [PATCH 341/364] fix: Correct ingester and querier regexps Signed-off-by: Jack Baldry --- jsonnet/mimir-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/config.libsonnet b/jsonnet/mimir-mixin/config.libsonnet index 3a5ac33cfba..9b8d81a283c 100644 --- a/jsonnet/mimir-mixin/config.libsonnet +++ b/jsonnet/mimir-mixin/config.libsonnet @@ -33,7 +33,7 @@ query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. 
query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. table_manager: '(table-manager|cortex$)', - ring_members: ['compactor', 'distributor', 'ingester-.*', 'querier', 'ruler', 'store-gateway', 'cortex'], + ring_members: ['compactor', 'distributor', 'ingester.*', 'querier.*', 'ruler', 'store-gateway', 'cortex'], store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', compactor: 'compactor.*', // Match also custom compactor deployments. From 5b6235cce4c7648573875a23e2a7e4a8cdfcf5fc Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 2 Sep 2021 12:16:27 +0200 Subject: [PATCH 342/364] Fixes to initial state sync panels on alertmanager dashboard. 1) Change minimal interval to 1m for sync duration and fetch state panels. This is in order to show infrequent events at smaller time windows. 2) Change syncs/sec panel to reflect absolute value of metric not rate. The initial sync only occurs once per-tenant so the counter value is essentially 0 or 1. Due to how per-tenant metrics are aggregated, the external facing metric really acts more like a gauge reflecting the number of tenants which achieved each outcome. Also, stack this panel as it becomes easier to visually see when the initial syncs have completed for all tenants (e.g. during a rollout). --- .../dashboards/alertmanager.libsonnet | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 6f578b11357..33b257b539d 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -152,15 +152,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Initial State Sync') .addPanel( - $.panel('Initial syncs/sec') + + $.panel('Tenant initial sync outcomes') + $.queryPanel( - 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum by(outcome) (cortex_alertmanager_state_initial_sync_completed_total{%s})' % $.jobMatcher('alertmanager'), '{{outcome}}' - ) + ) + + $.stack ) .addPanel( $.panel('Initial sync duration') + - $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + } ) .addPanel( $.panel('Fetch state from other alertmanagers /sec') + @@ -174,7 +182,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] - ) + ) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + } ) ) .addRow( From f2774b899bb50502082ef99a47a1cce5200fdfb9 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 7 Sep 2021 11:46:11 +0200 Subject: [PATCH 343/364] Add rate back to Alertmanager dashboard initial syncs panel. The metric in fact does act like a counter due to soft deletion of the per-user registry when the user is unconfigured (e.g. moved to another instance or configuration deleted). 
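
To make the counter-versus-gauge distinction concrete, here is a minimal sketch of the two readings of this metric, assuming it sits inside the dashboard object where the `$.queryPanel` and `$.jobMatcher` helpers used elsewhere in this mixin are in scope:

```
// Read as a gauge: roughly how many tenants have completed an initial
// sync with each outcome (what the previous panel displayed).
$.queryPanel(
  'sum by(outcome) (cortex_alertmanager_state_initial_sync_completed_total{%s})' % $.jobMatcher('alertmanager'),
  '{{outcome}}'
)

// Read as a counter: the per-second rate of initial syncs, which also
// captures tenants being re-synced after moving to another instance.
$.queryPanel(
  'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
  '{{outcome}}'
)
```
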
--- .../mimir-mixin/dashboards/alertmanager.libsonnet | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 33b257b539d..7e2e3c581aa 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -152,12 +152,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Initial State Sync') .addPanel( - $.panel('Tenant initial sync outcomes') + + $.panel('Initial syncs /sec') + $.queryPanel( - 'sum by(outcome) (cortex_alertmanager_state_initial_sync_completed_total{%s})' % $.jobMatcher('alertmanager'), + 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), '{{outcome}}' - ) + - $.stack + ) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + } ) .addPanel( $.panel('Initial sync duration') + From 7a840652d6e438fbbdcc143e496f14a0324018cb Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Tue, 7 Sep 2021 14:41:37 +0200 Subject: [PATCH 344/364] Make the overrides metric name configurable. We (Grafana Labs) are about to put in a new system to control and export data about limits and we'll need to use a different name. This shouldn't affect our OSS users. Signed-off-by: Goutham Veeramachaneni --- jsonnet/mimir-mixin/recording_rules.libsonnet | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 4074cc008f1..1ce2519ad78 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -6,6 +6,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; max_samples_per_sec_per_ingester: 80e3, max_samples_per_sec_per_distributor: 240e3, limit_utilisation_target: 0.6, + cortex_overrides_metric: 'cortex_overrides', } + $._config + $._group_config, prometheusRules+:: { groups+: [ @@ -114,7 +115,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, expr: ||| ceil( - sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"}) + sum by (cluster, namespace) (%(cortex_overrides_metric)s{limit_name="ingestion_rate"}) * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s ) ||| % _config, @@ -166,7 +167,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, expr: ||| ceil( - sum by (cluster, namespace) (cortex_overrides{limit_name="max_global_series_per_user"}) + sum by (cluster, namespace) (%(cortex_overrides_metric)s{limit_name="max_global_series_per_user"}) * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s ) ||| % _config, @@ -181,7 +182,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, expr: ||| ceil( - sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"}) + sum by (cluster, namespace) (%(cortex_overrides_metric)s{limit_name="ingestion_rate"}) * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s ) ||| % _config, From 7d042edeb602e12496064acc4b7b23b8bef09273 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Tue, 14 Sep 2021 12:50:01 +0200 Subject: [PATCH 345/364] Improve Cortex / Queries dashboard Signed-off-by: Marco Pracucci --- .../mimir-mixin/dashboards/queries.libsonnet | 64 ++++++++++++++----- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git 
a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/jsonnet/mimir-mixin/dashboards/queries.libsonnet index fedbc949235..259f5dfabd3 100644 --- a/jsonnet/mimir-mixin/dashboards/queries.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/queries.libsonnet @@ -33,30 +33,55 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Query Frontend - Results Cache') + $.row('Query Frontend - Query Splitting and Results Cache') .addPanel( - $.panel('Cache Hit %') + + $.panel('Intervals per Query') + + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'splitting rate') + + $.panelDescription( + 'Intervals per Query', + ||| + The average number of splitted queries (partitioned by time) executed a single input query. + ||| + ), + ) + .addPanel( + $.panel('Results Cache Hit %') + $.queryPanel('sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - $.panel('Cache misses') + + $.panel('Results Cache misses') + $.queryPanel('sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), ) ) .addRow( - $.row('Query Frontend - Sharding/Splitting') - .addPanel( - $.panel('Intervals per Query') + - $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'partition rate'), - ) - .addPanel( - $.panel('Sharded Queries %') + - $.queryPanel('sum(rate(cortex_frontend_mapped_asts_total{%s}[1m])) / sum(rate(cortex_frontend_split_queries_total{%s}[1m])) * 100' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'shard rate'), - ) - .addPanel( - $.panel('Sharding factor') + - $.queryPanel('sum(rate(cortex_frontend_sharded_queries_total{%s}[1m])) / sum(rate(cortex_frontend_mapped_asts_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Average'), + $.row('Query Frontend - Query sharding') + .addPanel( + $.panel('Sharded Queries Ratio') + + $.queryPanel(||| + sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{%s}[$__rate_interval])) / + sum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'sharded queries ratio') + + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) } + + $.panelDescription( + 'Sharded Queries Ratio', + ||| + The % of queries that have been successfully rewritten and executed in a shardable way. + This panel takes in account only type of queries which are supported by query sharding (eg. range queries). 
+ ||| + ), + ) + .addPanel( + $.panel('Number of Sharded Queries per Query') + + $.latencyPanel('cortex_frontend_sharded_queries_per_query', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + + { yaxes: $.yaxes('short') } + + $.panelDescription( + 'Number of Sharded Queries per Query', + ||| + How many sharded queries have been executed for a single input query. It tracks only queries which have + been successfully rewritten in a shardable way. + ||| + ), ) ) .addRow( @@ -248,5 +273,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Index-header lazy load duration') + $.latencyPanel('cortex_bucket_store_indexheader_lazy_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) + .addPanel( + $.panel('Series hash cache hit ratio') + + $.queryPanel(||| + sum(rate(cortex_bucket_store_series_hash_cache_hits_total{%s}[$__rate_interval])) + / + sum(rate(cortex_bucket_store_series_hash_cache_requests_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'hit ratio') + + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) ), } From a861947d56ef82304212298f4417c200f3db4958 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 2 Sep 2021 19:37:13 +0200 Subject: [PATCH 346/364] Add recording rules for speeding up Alertmanager dashboard. With large numbers of tenants the queries for some panels on thos dashboard can become quite slow as the metrics exposed are per-tenant. --- .../dashboards/alertmanager.libsonnet | 38 ++++++------ jsonnet/mimir-mixin/recording_rules.libsonnet | 61 +++++++++++++++++++ 2 files changed, 80 insertions(+), 19 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 7e2e3c581aa..f7350057e46 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -11,11 +11,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Total Alerts') + - $.statPanel('sum(cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), format='short') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short') ) .addPanel( $.panel('Total Silences') + - $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short') ) .addPanel( $.panel('Tenants') + @@ -29,11 +29,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s}) - - sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -46,11 +46,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - 
sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) + sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) - - sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) + sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -61,13 +61,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; [ ||| ( - sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) by(integration) + sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) by(integration) - - sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration) + sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration) ) > 0 or on () vector(0) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)' % $.jobMatcher('alertmanager'), + 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'), ], ['success - {{ integration }}', 'failed - {{ integration }}'] ) @@ -104,7 +104,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -112,7 +112,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_state_replication_total{%s}:rate5m) - - sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_state_replication_failed_total{%s}:rate5m) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total{%s}:rate5m)' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_partial_state_merges_total{%s}:rate5m) - - 
sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total{%s}:rate5m) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total{%s}:rate5m)' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 1ce2519ad78..990f8c32115 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -364,6 +364,67 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) |||, }, + // Aggregations of per-user Alertmanager metrics used in dashboards. + { + record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label, + expr: ||| + sum by (cluster, job, %s) (cortex_alertmanager_alerts) + ||| % $._config.per_instance_label, + }, + { + record: 'cluster_job_%s:cortex_alertmanager_silences:sum' % $._config.per_instance_label, + expr: ||| + sum by (cluster, job, %s) (cortex_alertmanager_silences) + ||| % $._config.per_instance_label, + }, + { + record: 'cluster_job:cortex_alertmanager_alerts_received_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + |||, + }, + { + record: 'cluster_job_integration:cortex_alertmanager_notifications_total:rate5m', + expr: ||| + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + |||, + }, + { + record: 'cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m', + expr: ||| + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_replication_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_persist_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_persist_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_persist_failed_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_persist_failed_total[5m])) + |||, + }, ], }, ], From 43c14231c698ec259afb8a439fe69a27ce86287d Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Wed, 15 Sep 2021 14:44:53 +0200 Subject: [PATCH 347/364] Fixes from testing. 
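
The fixes are small but easy to miss: the dashboard queries had the job matcher embedded inside the recorded metric name, which does not parse as PromQL, and the recording rules recorded the state-persist counters where the dashboard expects the partial-state-merge ones. A sketch of the selector fix, with `%s` standing for the `$.jobMatcher('alertmanager')` expansion as elsewhere in the dashboard code:

```
// Broken: ':rate5m' trails the label matcher, so the query does not parse.
'sum(cluster_job:cortex_alertmanager_state_replication_total{%s}:rate5m)'

// Fixed: the matcher follows the complete recorded metric name.
'sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s})'
```
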
--- .../mimir-mixin/dashboards/alertmanager.libsonnet | 12 ++++++------ jsonnet/mimir-mixin/recording_rules.libsonnet | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index f7350057e46..8897034eea9 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(cluster_job:cortex_alertmanager_state_replication_total{%s}:rate5m) + sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s}) - - sum(cluster_job:cortex_alertmanager_state_replication_failed_total{%s}:rate5m) + sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total{%s}:rate5m)' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(cluster_job:cortex_alertmanager_partial_state_merges_total{%s}:rate5m) + sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s}) - - sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total{%s}:rate5m) + sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total{%s}:rate5m)' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 990f8c32115..439f44de8e4 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -414,15 +414,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; |||, }, { - record: 'cluster_job:cortex_alertmanager_state_persist_total:rate5m', + record: 'cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m', expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_state_persist_total[5m])) + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) |||, }, { - record: 'cluster_job:cortex_alertmanager_state_persist_failed_total:rate5m', + record: 'cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m', expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_state_persist_failed_total[5m])) + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) |||, }, ], From 1033b9deec7ff716fd7df82bc74e07e260dc9cc7 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Wed, 15 Sep 2021 14:53:34 +0200 Subject: [PATCH 348/364] Move rules to their own group. 
--- jsonnet/mimir-mixin/recording_rules.libsonnet | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index 439f44de8e4..a438cabfa7a 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -364,6 +364,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) |||, }, + ], + }, + { + name: 'cortex_alertmanager_rules', + rules: [ // Aggregations of per-user Alertmanager metrics used in dashboards. { record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label, From 17e71c4e395e482da9c847cb0aeb7e04263a46ff Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Mon, 4 Oct 2021 09:09:09 +0200 Subject: [PATCH 349/364] Split `cortex_api` recording rule group into three groups. This is a workaround for large clusters where this group can become slow to evaluate. --- jsonnet/mimir-mixin/recording_rules.libsonnet | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/jsonnet/mimir-mixin/recording_rules.libsonnet index a438cabfa7a..0383524787b 100644 --- a/jsonnet/mimir-mixin/recording_rules.libsonnet +++ b/jsonnet/mimir-mixin/recording_rules.libsonnet @@ -11,10 +11,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; prometheusRules+:: { groups+: [ { - name: 'cortex_api', + name: 'cortex_api_1', + rules: + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job']), + }, + { + name: 'cortex_api_2', + rules: + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job', 'route']), + }, + { + name: 'cortex_api_3', rules: - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job']) + - utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'job', 'route']) + utils.histogramRules('cortex_request_duration_seconds', ['cluster', 'namespace', 'job', 'route']), }, { From 6787211d3ff8e07627bf2c08f5542026dbc00a32 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 6 Oct 2021 13:04:40 +0200 Subject: [PATCH 350/364] Update gsutil installation playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index a009952efcb..98416a6d2d1 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -904,13 +904,12 @@ After this preparation, one can use `kubectl exec -t -i clone-ingester-7-dataacc tar -zxvf gsutil.tar.gz ./gsutil/gsutil --help ``` -3. Create `/etc/boto.cfg` with the following content: +3. Configure credentials ``` - [GoogleCompute] - service_account = default + gsutil config -e - [Plugin] - plugin_directory = /usr/lib/python3.8/site-packages/google_compute_engine/boto + # Private key path: /var/secrets/google/credentials.json + # Project ID: your google project ID ``` ### Deleting a StatefulSet with persistent volumes From bbb9b341929eb06468808d26615139f5f4026689 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 12 Oct 2021 15:39:28 +0200 Subject: [PATCH 351/364] Use `$._config.job_names.gateway` in resources dashboards. This fixes panels where `cortex-gw` was hardcoded. 
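
Because the panels now read the gateway name from `$._config.job_names.gateway`, a deployment whose gateway is not called `cortex-gw` can repoint the resource dashboards with a small override when consuming the mixin. A hedged sketch — the import path and the extra `internal-gateway` pattern are illustrative, not taken from this repository:

```
(import 'mimir-mixin/mixin.libsonnet') + {
  _config+:: {
    job_names+: {
      // Regex for the gateway; the resource panels match it against the
      // container (and job) labels of the gateway pods.
      gateway: '(gateway|internal-gateway|cortex-gw.*)',
    },
  },
}
```
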
--- .../dashboards/alertmanager-resources.libsonnet | 4 ++-- jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet | 8 ++++---- jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet | 4 ++-- jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet index 8a719d52f25..415060206f7 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet @@ -7,10 +7,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.containerCPUUsagePanel('CPU', 'cortex-gw'), + $.containerCPUUsagePanel('CPU', $._config.job_names.gateway), ) .addPanel( - $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), + $.containerMemoryWorkingSetPanel('Memory (workingset)', $._config.job_names.gateway), ) .addPanel( $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.gateway), diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet index fe467f5b9c6..981614ac83e 100644 --- a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -145,8 +145,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerCPUUsagePanel(title, containerName):: $.panel(title) + $.queryPanel([ - 'sum by(%s) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__rate_interval]))' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], - 'min(container_spec_cpu_quota{%s,container="%s"} / container_spec_cpu_period{%s,container="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], + 'sum by(%s) (rate(container_cpu_usage_seconds_total{%s,container=~"%s"}[$__rate_interval]))' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], + 'min(container_spec_cpu_quota{%s,container=~"%s"} / container_spec_cpu_period{%s,container=~"%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], ], ['{{%s}}' % $._config.per_instance_label, 'limit']) + { seriesOverrides: [ @@ -164,8 +164,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel([ // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up // summing the memory of the old instance/pod (whose metric will be stale for 5m) to the new instance/pod. 
- 'max by(%s) (container_memory_working_set_bytes{%s,container="%s"})' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], - 'min(container_spec_memory_limit_bytes{%s,container="%s"} > 0)' % [$.namespaceMatcher(), containerName], + 'max by(%s) (container_memory_working_set_bytes{%s,container=~"%s"})' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], + 'min(container_spec_memory_limit_bytes{%s,container=~"%s"} > 0)' % [$.namespaceMatcher(), containerName], ], ['{{%s}}' % $._config.per_instance_label, 'limit']) + { seriesOverrides: [ diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet index a1b36272a38..f0750c885ac 100644 --- a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet @@ -7,10 +7,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.containerCPUUsagePanel('CPU', 'cortex-gw'), + $.containerCPUUsagePanel('CPU', $._config.job_names.gateway), ) .addPanel( - $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), + $.containerMemoryWorkingSetPanel('Memory (workingset)', $._config.job_names.gateway), ) .addPanel( $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.gateway), diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet index 85d7f4c48b5..64f83ef1cca 100644 --- a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet @@ -7,10 +7,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.containerCPUUsagePanel('CPU', 'cortex-gw'), + $.containerCPUUsagePanel('CPU', $._config.job_names.gateway), ) .addPanel( - $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), + $.containerMemoryWorkingSetPanel('Memory (workingset)', $._config.job_names.gateway), ) .addPanel( $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.gateway), From c180588611eb25bd4063430bbb656f4e755d96bb Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 09:58:42 +0200 Subject: [PATCH 352/364] Fine tune CortexIngesterReachingSeriesLimit alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 3c407b4813a..c8e925fb2d5 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -260,7 +260,7 @@ (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) and ignoring (limit) (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.7 + ) > 0.8 |||, 'for': '3h', labels: { @@ -279,7 +279,7 @@ (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) and ignoring (limit) (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.85 + ) > 0.9 |||, 'for': '5m', labels: { From eebc52906c8cb3e2661a6b43365a79a16b78adb5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 10:52:44 +0200 Subject: [PATCH 353/364] Add CortexRolloutStuck alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 61 +++++++++++++++++++++ jsonnet/mimir-mixin/docs/playbooks.md | 10 ++++ 2 files 
changed, 71 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 3c407b4813a..74c4ff6cfa4 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -412,6 +412,67 @@ }, ], }, + { + name: 'cortex-rollout-alerts', + rules: [ + { + alert: 'CortexRolloutStuck', + expr: ||| + ( + max without (revision) ( + kube_statefulset_status_current_revision + unless + kube_statefulset_status_update_revision + ) + * + ( + kube_statefulset_replicas + != + kube_statefulset_status_replicas_updated + ) + ) and ( + changes(kube_statefulset_status_replicas_updated[15m]) + == + 0 + ) + * on(%s) group_left max by(%s) (cortex_build_info) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + The {{ $labels.statefulset }} rollout is stuck in %(alert_aggregation_variables)s. + ||| % $._config, + }, + }, + { + alert: 'CortexRolloutStuck', + expr: ||| + ( + kube_deployment_spec_replicas + != + kube_deployment_status_replicas_updated + ) and ( + changes(kube_deployment_status_replicas_updated[15m]) + == + 0 + ) + * on(%s) group_left max by(%s) (cortex_build_info) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + The {{ $labels.deployment }} rollout is stuck in %(alert_aggregation_variables)s. + ||| % $._config, + }, + }, + ], + }, { name: 'cortex-provisioning', rules: [ diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 98416a6d2d1..2393bc94a66 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -231,6 +231,16 @@ How to **investigate**: _If the alert `CortexIngesterTSDBHeadCompactionFailed` fired as well, then give priority to it because that could be the cause._ +### CortexRolloutStuck + +This alert fires when a Cortex service rollout is stuck, which means the number of updated replicas doesn't match the expected one and looks there's no progress in the rollout. The alert monitors services deployed as Kubernetes `StatefulSet` and `Deployment`. + +How to **investigate**: +- Run `kubectl -n get pods -l name=` to get a list of running pods +- Ensure there's no pod in a failing state (eg. `Error`, `OOMKilled`, `CrashLoopBackOff`) +- Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) +- Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information + #### Ingester hit the disk capacity If the ingester hit the disk capacity, any attempt to append samples will fail. 
You should: From ea3274f377f52ca1b220cc757f0406ed0bd9531c Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 11:25:07 +0200 Subject: [PATCH 354/364] Fixed playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 2393bc94a66..e61f24ff57e 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -231,16 +231,6 @@ How to **investigate**: _If the alert `CortexIngesterTSDBHeadCompactionFailed` fired as well, then give priority to it because that could be the cause._ -### CortexRolloutStuck - -This alert fires when a Cortex service rollout is stuck, which means the number of updated replicas doesn't match the expected one and looks there's no progress in the rollout. The alert monitors services deployed as Kubernetes `StatefulSet` and `Deployment`. - -How to **investigate**: -- Run `kubectl -n get pods -l name=` to get a list of running pods -- Ensure there's no pod in a failing state (eg. `Error`, `OOMKilled`, `CrashLoopBackOff`) -- Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) -- Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information - #### Ingester hit the disk capacity If the ingester hit the disk capacity, any attempt to append samples will fail. You should: @@ -734,6 +724,15 @@ When an alertmanager cannot read the state for a tenant from storage it gets log - The state could not be merged because it might be invalid and could not be decoded. This could indicate data corruption and therefore a bug in the reading or writing of the state, and would need further investigation. - The state could not be read from storage. This could be due to a networking issue such as a timeout or an authentication and authorization issue with the remote object store. +### CortexRolloutStuck + +This alert fires when a Cortex service rollout is stuck, which means the number of updated replicas doesn't match the expected one and looks there's no progress in the rollout. The alert monitors services deployed as Kubernetes `StatefulSet` and `Deployment`. + +How to **investigate**: +- Run `kubectl -n get pods -l name=` to get a list of running pods +- Ensure there's no pod in a failing state (eg. `Error`, `OOMKilled`, `CrashLoopBackOff`) +- Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. 
`1/1` or `2/2`) +- Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information ## Cortex routes by path From dcb33069acf6501e4acf08c52927aa85526103a9 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 11:22:43 +0200 Subject: [PATCH 355/364] Added CortexFailingToTalkToConsul alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 26 +++++++++++++++++++++ jsonnet/mimir-mixin/docs/playbooks.md | 12 ++++++++++ 2 files changed, 38 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 993323edb68..d042ce09b84 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -715,5 +715,31 @@ }, ], }, + { + name: 'cortex-consul-alerts', + rules: [ + { + alert: 'CortexFailingToTalkToConsul', + expr: ||| + ( + sum by(%s, pod, status_code, kv_name) (rate(cortex_consul_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(%s, pod, status_code, kv_name) (rate(cortex_consul_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. + == 1 + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to Consul store ${{ labels.kv_name }}. + ||| % $._config, + }, + }, + ], + }, ], } diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index e61f24ff57e..1a925466331 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -734,6 +734,18 @@ How to **investigate**: - Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) - Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information +### CortexFailingToTalkToConsul + +This alert fires if a Cortex instance is failing to run any operation on Consul. + +How it **works**: +- Consul is typically used to store the hash ring state. +- If an instance is failing to talk to Consul, either the instance can't update the heartbeat in the ring or is failing to receive ring updates. + +How to **investigate**: +- Ensure Consul is up and running. +- Investigate the logs of the affected instance to find the specific error occurring when talking to Consul. + ## Cortex routes by path **Write path**: From d79b304e77d3599f6a278966fe2a331acedb1f62 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 11:28:39 +0200 Subject: [PATCH 356/364] Fixed alert message Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index d042ce09b84..2553d793b17 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -735,7 +735,7 @@ }, annotations: { message: ||| - Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to Consul store ${{ labels.kv_name }}. + Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to Consul store {{ $labels.kv_name }}. 
||| % $._config, }, }, From d02bb6b3662de8b3eb14d57620aef5a8d152a6d8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 Oct 2021 09:45:09 +0200 Subject: [PATCH 357/364] Update alert to be generic to KV stores Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 47 +++++++++------------ jsonnet/mimir-mixin/docs/playbooks.md | 12 +++--- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 2553d793b17..59022dd8afc 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -235,6 +235,27 @@ |||, }, }, + { + alert: 'CortexKVStoreFailure', + expr: ||| + ( + sum by(%s, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(%s, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. + == 1 + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to the KV store {{ $labels.kv_name }}. + ||| % $._config, + }, + }, { alert: 'CortexMemoryMapAreasTooHigh', expr: ||| @@ -715,31 +736,5 @@ }, ], }, - { - name: 'cortex-consul-alerts', - rules: [ - { - alert: 'CortexFailingToTalkToConsul', - expr: ||| - ( - sum by(%s, pod, status_code, kv_name) (rate(cortex_consul_request_duration_seconds_count{status_code!~"2.+"}[1m])) - / - sum by(%s, pod, status_code, kv_name) (rate(cortex_consul_request_duration_seconds_count[1m])) - ) - # We want to get alerted only in case there's a constant failure. - == 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to Consul store {{ $labels.kv_name }}. - ||| % $._config, - }, - }, - ], - }, ], } diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 1a925466331..180ed50d8c3 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -734,17 +734,19 @@ How to **investigate**: - Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) - Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information -### CortexFailingToTalkToConsul +### CortexKVStoreFailure -This alert fires if a Cortex instance is failing to run any operation on Consul. +This alert fires if a Cortex instance is failing to run any operation on a KV store (eg. consul or etcd). How it **works**: - Consul is typically used to store the hash ring state. -- If an instance is failing to talk to Consul, either the instance can't update the heartbeat in the ring or is failing to receive ring updates. +- Etcd is typically used to store by the HA tracker (distributor) to deduplicate samples. +- If an instance is failing operations on the **hash ring**, either the instance can't update the heartbeat in the ring or is failing to receive ring updates. 
+- If an instance is failing operations on the **HA tracker** backend, either the instance can't update the authoritative replica or is failing to receive updates. How to **investigate**: -- Ensure Consul is up and running. -- Investigate the logs of the affected instance to find the specific error occurring when talking to Consul. +- Ensure Consul/Etcd is up and running. +- Investigate the logs of the affected instance to find the specific error occurring when talking to Consul/Etcd. ## Cortex routes by path From 6f1bc1865a8cb0c28bb6b83dcd614ddb055ebb9d Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Mon, 18 Oct 2021 12:10:58 +0100 Subject: [PATCH 358/364] Add README --- jsonnet/mimir-mixin/README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 jsonnet/mimir-mixin/README.md diff --git a/jsonnet/mimir-mixin/README.md b/jsonnet/mimir-mixin/README.md new file mode 100644 index 00000000000..768fad9c5c5 --- /dev/null +++ b/jsonnet/mimir-mixin/README.md @@ -0,0 +1,18 @@ +# Monitoring for Mimir + +To generate the Grafana dashboards and Prometheus alerts for Mimir: + +## Usage + +```console +$ GO111MODULE=on go get github.com/monitoring-mixins/mixtool/cmd/mixtool +$ GO111MODULE=on go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb +$ git clone https://github.com/grafana/mimir.git +$ make build-mixin +``` + +This will leave all the alerts and dashboards in jsonnet/mimir-mixin/mimir-mixin.zip (or jsonnet/mimir-mixin/out). + +## Known Problems + +If you get an error like `cannot use cli.StringSliceFlag literal (type cli.StringSliceFlag) as type cli.Flag in slice literal` when installing [mixtool](https://github.com/monitoring-mixins/mixtool/issues/27), make sure you set `GO111MODULE=on` before `go get`. From 4e65adf94250d121aff8f635f144d41e14b95a85 Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Mon, 18 Oct 2021 12:12:26 +0100 Subject: [PATCH 359/364] Add mimir-mixin CI checks --- .github/workflows/test-build-deploy.yml | 2 ++ Makefile | 35 ++++++++++++++++++- jsonnet/mimir-mixin/.gitignore | 3 ++ jsonnet/mimir-mixin/scripts/lint-playbooks.sh | 28 +++++++++++++++ mimir-build-image/Dockerfile | 22 +++++++----- 5 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 jsonnet/mimir-mixin/.gitignore create mode 100755 jsonnet/mimir-mixin/scripts/lint-playbooks.sh diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index 7b2ebe88130..f67d58f740f 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -36,6 +36,8 @@ jobs: run: make BUILD_IN_CONTAINER=false check-protos - name: Check Generated Documentation run: make BUILD_IN_CONTAINER=false check-doc + - name: Check Mixin + run: make BUILD_IN_CONTAINER=false check-mixin - name: Check White Noise. run: make BUILD_IN_CONTAINER=false check-white-noise - name: Check License Header diff --git a/Makefile b/Makefile index 1eac6e43d7a..8b53eab7aec 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # WARNING: do not commit to a repository! 
-include Makefile.local -.PHONY: all test integration-tests cover clean images protos exes dist doc clean-doc check-doc push-multiarch-build-image license check-license format +.PHONY: all test integration-tests cover clean images protos exes dist doc clean-doc check-doc push-multiarch-build-image license check-license format check-mixin check-mixin-jb check-mixin-mixtool checkin-mixin-playbook build-mixin format-mixin .DEFAULT_GOAL := all # Version number @@ -25,6 +25,12 @@ GIT_REVISION := $(shell git rev-parse --short HEAD) GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) UPTODATE := .uptodate +# path to jsonnetfmt +JSONNET_FMT := jsonnetfmt + +# path to the mimir/mixin +MIXIN_PATH := jsonnet/mimir-mixin + .PHONY: image-tag image-tag: @echo $(IMAGE_TAG) @@ -313,6 +319,33 @@ clean-white-noise: check-white-noise: clean-white-noise @git diff --exit-code --quiet -- '*.md' || (echo "Please remove trailing whitespaces running 'make clean-white-noise'" && false) +check-mixin: format-mixin check-mixin-jb check-mixin-mixtool check-mixin-playbook + @git diff --exit-code --quiet -- $(MIXIN_PATH) || (echo "Please format mixin by running 'make format-mixin'" && false) + + @cd $(MIXIN_PATH) && \ + jb install && \ + mixtool lint mixin.libsonnet + +check-mixin-jb: + @cd $(MIXIN_PATH) && \ + jb install + +check-mixin-mixtool: check-mixin-jb + @cd $(MIXIN_PATH) && \ + mixtool lint mixin.libsonnet + +check-mixin-playbook: build-mixin + @$(MIXIN_PATH)/scripts/lint-playbooks.sh + +build-mixin: check-mixin-jb + @rm -rf $(MIXIN_PATH)/out && mkdir $(MIXIN_PATH)/out + @cd $(MIXIN_PATH) && \ + mixtool generate all --output-alerts out/alerts.yaml --output-rules out/rules.yaml --directory out/dashboards mixin.libsonnet && \ + zip -q -r mimir-mixin.zip out + +format-mixin: + @find $(MIXIN_PATH) -type f -name '*.libsonnet' -print -o -name '*.jsonnet' -print | xargs jsonnetfmt -i + web-serve: cd website && hugo --config config.toml --minify -v server diff --git a/jsonnet/mimir-mixin/.gitignore b/jsonnet/mimir-mixin/.gitignore new file mode 100644 index 00000000000..7aac0df8ce9 --- /dev/null +++ b/jsonnet/mimir-mixin/.gitignore @@ -0,0 +1,3 @@ +/out/ +/vendor/ +/mimir-mixin.zip diff --git a/jsonnet/mimir-mixin/scripts/lint-playbooks.sh b/jsonnet/mimir-mixin/scripts/lint-playbooks.sh new file mode 100755 index 00000000000..7aa92122ab4 --- /dev/null +++ b/jsonnet/mimir-mixin/scripts/lint-playbooks.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -eu -o pipefail + +SCRIPT_DIR=$(realpath "$(dirname "${0}")") + +# List all alerts. +ALERTS=$(yq eval '.groups.[].rules.[].alert' "${SCRIPT_DIR}/../out/alerts.yaml" 2> /dev/stdout) +if [ $? -ne 0 ]; then + echo "Unable to list alerts. Got output:" + echo "$ALERTS" + exit 1 +elif [ -z "$ALERTS" ]; then + echo "No alerts found. Something went wrong with the listing." + exit 1 +fi + +# Check if each alert is referenced in the playbooks. +STATUS=0 + +for ALERT in $ALERTS; do + if ! 
grep -q "${ALERT}$" "${SCRIPT_DIR}/../docs/playbooks.md"; then + echo "Missing playbook for: $ALERT" + STATUS=1 + fi +done + +exit $STATUS diff --git a/mimir-build-image/Dockerfile b/mimir-build-image/Dockerfile index d4e8d0d8319..326cb6aabbf 100644 --- a/mimir-build-image/Dockerfile +++ b/mimir-build-image/Dockerfile @@ -36,15 +36,19 @@ RUN GOARCH=$(go env GOARCH) && \ RUN curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh| sh -s -- -b /usr/bin v1.27.0 -RUN GO111MODULE=on go get \ - github.com/client9/misspell/cmd/misspell@v0.3.4 \ - github.com/golang/protobuf/protoc-gen-go@v1.3.1 \ - github.com/gogo/protobuf/protoc-gen-gogoslick@v1.3.0 \ - github.com/gogo/protobuf/gogoproto@v1.3.0 \ - github.com/weaveworks/tools/cover@bdd647e92546027e12cdde3ae0714bb495e43013 \ - github.com/fatih/faillint@v1.5.0 \ - github.com/campoy/embedmd@v1.0.0 \ - && rm -rf /go/pkg /go/src /root/.cache +RUN GO111MODULE=on \ + go get github.com/client9/misspell/cmd/misspell@v0.3.4 && \ + go get github.com/golang/protobuf/protoc-gen-go@v1.3.1 && \ + go get github.com/gogo/protobuf/protoc-gen-gogoslick@v1.3.0 && \ + go get github.com/gogo/protobuf/gogoproto@v1.3.0 && \ + go get github.com/weaveworks/tools/cover@bdd647e92546027e12cdde3ae0714bb495e43013 && \ + go get github.com/fatih/faillint@v1.5.0 && \ + go get github.com/campoy/embedmd@v1.0.0 && \ + go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.4.0 && \ + go get github.com/monitoring-mixins/mixtool/cmd/mixtool@bca3066 && \ + go get github.com/mikefarah/yq/v4@v4.13.4 && \ + go get github.com/google/go-jsonnet/cmd/jsonnetfmt@v0.17.0 && \ + rm -rf /go/pkg /go/src /root/.cache ENV NODE_PATH=/usr/lib/node_modules COPY build.sh / From 8b4b9423826a1e2270eba9301f4bf7a3d4344cc2 Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Tue, 19 Oct 2021 09:07:58 +0100 Subject: [PATCH 360/364] Update build image --- .github/workflows/test-build-deploy.yml | 6 +++--- Makefile | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index f67d58f740f..66f6622e139 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -10,7 +10,7 @@ jobs: lint: runs-on: ubuntu-20.04 container: - image: us.gcr.io/kubernetes-dev/mimir-build-image:add-prettier-08d2e2a61 + image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-4e65adf94 credentials: username: _json_key password: ${{ secrets.gcr_json_key }} @@ -46,7 +46,7 @@ jobs: test: runs-on: ubuntu-20.04 container: - image: us.gcr.io/kubernetes-dev/mimir-build-image:add-prettier-08d2e2a61 + image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-4e65adf94 credentials: username: _json_key password: ${{ secrets.gcr_json_key }} @@ -70,7 +70,7 @@ jobs: build: runs-on: ubuntu-20.04 container: - image: us.gcr.io/kubernetes-dev/mimir-build-image:add-prettier-08d2e2a61 + image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-4e65adf94 credentials: username: _json_key password: ${{ secrets.gcr_json_key }} diff --git a/Makefile b/Makefile index 8b53eab7aec..edffb635ac6 100644 --- a/Makefile +++ b/Makefile @@ -126,7 +126,7 @@ mimir-build-image/$(UPTODATE): mimir-build-image/* # All the boiler plate for building golang follows: SUDO := $(shell docker info >/dev/null 2>&1 || echo "sudo -E") BUILD_IN_CONTAINER := true -LATEST_BUILD_IMAGE_TAG ?= add-prettier-08d2e2a61 +LATEST_BUILD_IMAGE_TAG ?= 
20211018_import-cortex-mixin-4e65adf94 # TTY is parameterized to allow Google Cloud Builder to run builds, # as it currently disallows TTY devices. This value needs to be overridden From c62520540f815cd90636d696190942a5aa68cba9 Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Tue, 19 Oct 2021 10:31:19 +0100 Subject: [PATCH 361/364] Move to operations folder --- Makefile | 2 +- {jsonnet => operations}/mimir-mixin/.gitignore | 0 {jsonnet => operations}/mimir-mixin/README.md | 0 {jsonnet => operations}/mimir-mixin/alerts.libsonnet | 0 .../mimir-mixin/alerts/alertmanager.libsonnet | 0 {jsonnet => operations}/mimir-mixin/alerts/alerts.libsonnet | 0 {jsonnet => operations}/mimir-mixin/alerts/blocks.libsonnet | 0 {jsonnet => operations}/mimir-mixin/alerts/compactor.libsonnet | 0 {jsonnet => operations}/mimir-mixin/config.libsonnet | 0 {jsonnet => operations}/mimir-mixin/dashboards.libsonnet | 0 .../mimir-mixin/dashboards/alertmanager-resources.libsonnet | 0 .../mimir-mixin/dashboards/alertmanager.libsonnet | 0 {jsonnet => operations}/mimir-mixin/dashboards/chunks.libsonnet | 0 .../mimir-mixin/dashboards/compactor-resources.libsonnet | 0 .../mimir-mixin/dashboards/compactor.libsonnet | 0 .../mimir-mixin/dashboards/comparison.libsonnet | 0 {jsonnet => operations}/mimir-mixin/dashboards/config.libsonnet | 0 .../mimir-mixin/dashboards/dashboard-utils.libsonnet | 0 .../mimir-mixin/dashboards/object-store.libsonnet | 0 .../mimir-mixin/dashboards/queries.libsonnet | 0 .../mimir-mixin/dashboards/reads-resources.libsonnet | 0 {jsonnet => operations}/mimir-mixin/dashboards/reads.libsonnet | 0 .../mimir-mixin/dashboards/rollout-progress.libsonnet | 0 {jsonnet => operations}/mimir-mixin/dashboards/ruler.libsonnet | 0 .../mimir-mixin/dashboards/scaling.libsonnet | 0 .../mimir-mixin/dashboards/slow-queries.libsonnet | 0 .../mimir-mixin/dashboards/writes-resources.libsonnet | 0 {jsonnet => operations}/mimir-mixin/dashboards/writes.libsonnet | 0 {jsonnet => operations}/mimir-mixin/docs/playbooks.md | 0 {jsonnet => operations}/mimir-mixin/groups.libsonnet | 0 {jsonnet => operations}/mimir-mixin/jsonnetfile.json | 0 {jsonnet => operations}/mimir-mixin/jsonnetfile.lock.json | 0 {jsonnet => operations}/mimir-mixin/mixin.libsonnet | 0 {jsonnet => operations}/mimir-mixin/recording_rules.libsonnet | 0 {jsonnet => operations}/mimir-mixin/scripts/lint-playbooks.sh | 0 35 files changed, 1 insertion(+), 1 deletion(-) rename {jsonnet => operations}/mimir-mixin/.gitignore (100%) rename {jsonnet => operations}/mimir-mixin/README.md (100%) rename {jsonnet => operations}/mimir-mixin/alerts.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/alerts/alertmanager.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/alerts/alerts.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/alerts/blocks.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/alerts/compactor.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/config.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/alertmanager-resources.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/alertmanager.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/chunks.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/compactor-resources.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/compactor.libsonnet (100%) rename {jsonnet => 
operations}/mimir-mixin/dashboards/comparison.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/config.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/dashboard-utils.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/object-store.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/queries.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/reads-resources.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/reads.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/rollout-progress.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/ruler.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/scaling.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/slow-queries.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/writes-resources.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/dashboards/writes.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/docs/playbooks.md (100%) rename {jsonnet => operations}/mimir-mixin/groups.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/jsonnetfile.json (100%) rename {jsonnet => operations}/mimir-mixin/jsonnetfile.lock.json (100%) rename {jsonnet => operations}/mimir-mixin/mixin.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/recording_rules.libsonnet (100%) rename {jsonnet => operations}/mimir-mixin/scripts/lint-playbooks.sh (100%) diff --git a/Makefile b/Makefile index edffb635ac6..b39b09eb662 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ UPTODATE := .uptodate JSONNET_FMT := jsonnetfmt # path to the mimir/mixin -MIXIN_PATH := jsonnet/mimir-mixin +MIXIN_PATH := operations/mimir-mixin .PHONY: image-tag image-tag: diff --git a/jsonnet/mimir-mixin/.gitignore b/operations/mimir-mixin/.gitignore similarity index 100% rename from jsonnet/mimir-mixin/.gitignore rename to operations/mimir-mixin/.gitignore diff --git a/jsonnet/mimir-mixin/README.md b/operations/mimir-mixin/README.md similarity index 100% rename from jsonnet/mimir-mixin/README.md rename to operations/mimir-mixin/README.md diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/operations/mimir-mixin/alerts.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/alerts.libsonnet rename to operations/mimir-mixin/alerts.libsonnet diff --git a/jsonnet/mimir-mixin/alerts/alertmanager.libsonnet b/operations/mimir-mixin/alerts/alertmanager.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/alerts/alertmanager.libsonnet rename to operations/mimir-mixin/alerts/alertmanager.libsonnet diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/alerts/alerts.libsonnet rename to operations/mimir-mixin/alerts/alerts.libsonnet diff --git a/jsonnet/mimir-mixin/alerts/blocks.libsonnet b/operations/mimir-mixin/alerts/blocks.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/alerts/blocks.libsonnet rename to operations/mimir-mixin/alerts/blocks.libsonnet diff --git a/jsonnet/mimir-mixin/alerts/compactor.libsonnet b/operations/mimir-mixin/alerts/compactor.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/alerts/compactor.libsonnet rename to operations/mimir-mixin/alerts/compactor.libsonnet diff --git a/jsonnet/mimir-mixin/config.libsonnet b/operations/mimir-mixin/config.libsonnet similarity index 100% rename from 
jsonnet/mimir-mixin/config.libsonnet rename to operations/mimir-mixin/config.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards.libsonnet b/operations/mimir-mixin/dashboards.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards.libsonnet rename to operations/mimir-mixin/dashboards.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet b/operations/mimir-mixin/dashboards/alertmanager-resources.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/alertmanager-resources.libsonnet rename to operations/mimir-mixin/dashboards/alertmanager-resources.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/operations/mimir-mixin/dashboards/alertmanager.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet rename to operations/mimir-mixin/dashboards/alertmanager.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/chunks.libsonnet b/operations/mimir-mixin/dashboards/chunks.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/chunks.libsonnet rename to operations/mimir-mixin/dashboards/chunks.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet b/operations/mimir-mixin/dashboards/compactor-resources.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/compactor-resources.libsonnet rename to operations/mimir-mixin/dashboards/compactor-resources.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/compactor.libsonnet b/operations/mimir-mixin/dashboards/compactor.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/compactor.libsonnet rename to operations/mimir-mixin/dashboards/compactor.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/comparison.libsonnet b/operations/mimir-mixin/dashboards/comparison.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/comparison.libsonnet rename to operations/mimir-mixin/dashboards/comparison.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/config.libsonnet b/operations/mimir-mixin/dashboards/config.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/config.libsonnet rename to operations/mimir-mixin/dashboards/config.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/dashboard-utils.libsonnet rename to operations/mimir-mixin/dashboards/dashboard-utils.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/object-store.libsonnet b/operations/mimir-mixin/dashboards/object-store.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/object-store.libsonnet rename to operations/mimir-mixin/dashboards/object-store.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/queries.libsonnet b/operations/mimir-mixin/dashboards/queries.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/queries.libsonnet rename to operations/mimir-mixin/dashboards/queries.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet b/operations/mimir-mixin/dashboards/reads-resources.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/reads-resources.libsonnet rename to operations/mimir-mixin/dashboards/reads-resources.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet similarity index 100% rename 
from jsonnet/mimir-mixin/dashboards/reads.libsonnet rename to operations/mimir-mixin/dashboards/reads.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet b/operations/mimir-mixin/dashboards/rollout-progress.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/rollout-progress.libsonnet rename to operations/mimir-mixin/dashboards/rollout-progress.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/ruler.libsonnet b/operations/mimir-mixin/dashboards/ruler.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/ruler.libsonnet rename to operations/mimir-mixin/dashboards/ruler.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/scaling.libsonnet b/operations/mimir-mixin/dashboards/scaling.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/scaling.libsonnet rename to operations/mimir-mixin/dashboards/scaling.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet b/operations/mimir-mixin/dashboards/slow-queries.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/slow-queries.libsonnet rename to operations/mimir-mixin/dashboards/slow-queries.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet b/operations/mimir-mixin/dashboards/writes-resources.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/writes-resources.libsonnet rename to operations/mimir-mixin/dashboards/writes-resources.libsonnet diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/operations/mimir-mixin/dashboards/writes.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/dashboards/writes.libsonnet rename to operations/mimir-mixin/dashboards/writes.libsonnet diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/operations/mimir-mixin/docs/playbooks.md similarity index 100% rename from jsonnet/mimir-mixin/docs/playbooks.md rename to operations/mimir-mixin/docs/playbooks.md diff --git a/jsonnet/mimir-mixin/groups.libsonnet b/operations/mimir-mixin/groups.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/groups.libsonnet rename to operations/mimir-mixin/groups.libsonnet diff --git a/jsonnet/mimir-mixin/jsonnetfile.json b/operations/mimir-mixin/jsonnetfile.json similarity index 100% rename from jsonnet/mimir-mixin/jsonnetfile.json rename to operations/mimir-mixin/jsonnetfile.json diff --git a/jsonnet/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json similarity index 100% rename from jsonnet/mimir-mixin/jsonnetfile.lock.json rename to operations/mimir-mixin/jsonnetfile.lock.json diff --git a/jsonnet/mimir-mixin/mixin.libsonnet b/operations/mimir-mixin/mixin.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/mixin.libsonnet rename to operations/mimir-mixin/mixin.libsonnet diff --git a/jsonnet/mimir-mixin/recording_rules.libsonnet b/operations/mimir-mixin/recording_rules.libsonnet similarity index 100% rename from jsonnet/mimir-mixin/recording_rules.libsonnet rename to operations/mimir-mixin/recording_rules.libsonnet diff --git a/jsonnet/mimir-mixin/scripts/lint-playbooks.sh b/operations/mimir-mixin/scripts/lint-playbooks.sh similarity index 100% rename from jsonnet/mimir-mixin/scripts/lint-playbooks.sh rename to operations/mimir-mixin/scripts/lint-playbooks.sh From 61250efee763c27e599146c3a50148d7a7c0e346 Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Tue, 19 Oct 2021 10:54:46 +0100 Subject: [PATCH 362/364] Add missing zip to build-image --- 
.github/workflows/test-build-deploy.yml | 6 +++--- mimir-build-image/Dockerfile | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index 66f6622e139..8308ac1af8d 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -10,7 +10,7 @@ jobs: lint: runs-on: ubuntu-20.04 container: - image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-4e65adf94 + image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-c62520540 credentials: username: _json_key password: ${{ secrets.gcr_json_key }} @@ -46,7 +46,7 @@ jobs: test: runs-on: ubuntu-20.04 container: - image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-4e65adf94 + image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-c62520540 credentials: username: _json_key password: ${{ secrets.gcr_json_key }} @@ -70,7 +70,7 @@ jobs: build: runs-on: ubuntu-20.04 container: - image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-4e65adf94 + image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-c62520540 credentials: username: _json_key password: ${{ secrets.gcr_json_key }} diff --git a/mimir-build-image/Dockerfile b/mimir-build-image/Dockerfile index 326cb6aabbf..aae51a49dd9 100644 --- a/mimir-build-image/Dockerfile +++ b/mimir-build-image/Dockerfile @@ -6,7 +6,7 @@ FROM golang:1.16.6-buster ARG goproxyValue ENV GOPROXY=${goproxyValue} -RUN apt-get update && apt-get install -y curl python-requests python-yaml file jq unzip protobuf-compiler libprotobuf-dev shellcheck && \ +RUN apt-get update && apt-get install -y curl python-requests python-yaml file jq zip unzip protobuf-compiler libprotobuf-dev shellcheck && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* RUN go get -u golang.org/x/tools/cmd/goimports@3fce476f0a782aeb5034d592c189e63be4ba6c9e RUN curl -sL https://deb.nodesource.com/setup_14.x | bash - From e7b4eab3c3d23470c35a52884cb8e5b2052a67d3 Mon Sep 17 00:00:00 2001 From: Christian Simon Date: Tue, 19 Oct 2021 11:18:55 +0100 Subject: [PATCH 363/364] Run prettifier on playbooks.md --- operations/mimir-mixin/docs/playbooks.md | 148 ++++++++++++++++------- 1 file changed, 107 insertions(+), 41 deletions(-) diff --git a/operations/mimir-mixin/docs/playbooks.md b/operations/mimir-mixin/docs/playbooks.md index 180ed50d8c3..8534b0c14bd 100644 --- a/operations/mimir-mixin/docs/playbooks.md +++ b/operations/mimir-mixin/docs/playbooks.md @@ -8,11 +8,13 @@ This document contains playbooks, or at least a checklist of what to look for, f ## Alerts ### CortexIngesterRestarts + First, check if the alert is for a single ingester or multiple. Even if the alert is only for one ingester, it's best to follow up by checking `kubectl get pods --namespace=` every few minutes, or looking at the query `rate(kube_pod_container_status_restarts_total{container="ingester"}[30m]) > 0` just until you're sure there isn't a larger issue causing multiple restarts. Next, check `kubectl get events`, with and without the addition of the `--namespace` flag, to look for node restarts or other related issues. Grep or something similar to filter the output can be useful here. The most common cause of this alert is a single cloud providers node restarting and causing the ingester on that node to be rescheduled somewhere else. 
In events you're looking for things like: + ``` 57m Normal NodeControllerEviction Pod Marking for deletion Pod ingester-01 from Node cloud-provider-node-01 37m Normal SuccessfulDelete ReplicaSet (combined from similar events): Deleted pod: ingester-01 @@ -21,6 +23,7 @@ In events you're looking for things like: ``` If nothing obvious from the above, check for increased load: + - If there is an increase in the number of active series and the memory provisioned is not enough, scale up the ingesters horizontally to have the same number of series as before per ingester. - If we had an outage and once Cortex is back up, the incoming traffic increases. (or) The clients have their Prometheus remote-write lagging and starts to send samples at a higher rate (again, an increase in traffic but in terms of number of samples). Scale up the ingester horizontally in this case too. @@ -29,10 +32,12 @@ If nothing obvious from the above, check for increased load: This alert fires when the `max_series` per ingester instance limit is enabled and the actual number of in-memory series in an ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new series, while appending samples to existing ones will continue to succeed. In case of **emergency**: + - If the actual number of series is very close to or already hit the limit, then you can increase the limit via runtime config to gain some time - Increasing the limit will increase the ingesters' memory utilization. Please monitor the ingesters' memory utilization via the `Cortex / Writes Resources` dashboard How the limit is **configured**: + - The limit can be configured either on CLI (`-ingester.instance-limits.max-series`) or in the runtime config: ``` ingester_limits: @@ -50,32 +55,37 @@ How the limit is **configured**: - The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_series"}` How to **fix**: + 1. **Temporarily increase the limit**
If the actual number of series is very close to or already hit the limit, or if you foresee the ingester will hit the limit before dropping the stale series as an effect of the scale up, you should also temporarily increase the limit. 2. **Check if shuffle-sharding shard size is correct**
- - When shuffle-sharding is enabled, we target up to 100K series / tenant / ingester assuming tenants on average use 50% of their max series limit. - - Run the following **instant query** to find tenants that may cause higher pressure on some ingesters: - ``` - ( - sum by(user) (cortex_ingester_memory_series_created_total{namespace=""} - - - cortex_ingester_memory_series_removed_total{namespace=""}) - ) - > - ( - max by(user) (cortex_overrides{namespace="",limit_name="max_global_series_per_user"}) - * - scalar(max(cortex_distributor_replication_factor{namespace=""})) - * - 0.5 - ) - > 200000 - - # Decomment the following to show only tenants beloging to a specific ingester's shard. - # and count by(user) (cortex_ingester_active_series{namespace="",pod="ingester-"}) - ``` - - Check the current shard size of each tenant in the output and, if they're not already sharded across all ingesters, you may consider to double their shard size - - The in-memory series in the ingesters will be effectively reduced at the TSDB head compaction happening at least 1h after you increased the shard size for the affected tenants + +- When shuffle-sharding is enabled, we target up to 100K series / tenant / ingester assuming tenants on average use 50% of their max series limit. +- Run the following **instant query** to find tenants that may cause higher pressure on some ingesters: + + ``` + ( + sum by(user) (cortex_ingester_memory_series_created_total{namespace=""} + - + cortex_ingester_memory_series_removed_total{namespace=""}) + ) + > + ( + max by(user) (cortex_overrides{namespace="",limit_name="max_global_series_per_user"}) + * + scalar(max(cortex_distributor_replication_factor{namespace=""})) + * + 0.5 + ) + > 200000 + + # Decomment the following to show only tenants beloging to a specific ingester's shard. + # and count by(user) (cortex_ingester_active_series{namespace="",pod="ingester-"}) + ``` + +- Check the current shard size of each tenant in the output and, if they're not already sharded across all ingesters, you may consider to double their shard size +- The in-memory series in the ingesters will be effectively reduced at the TSDB head compaction happening at least 1h after you increased the shard size for the affected tenants + 3. **Scale up ingesters**
Scaling up ingesters will lower the number of series per ingester. However, the effect of this change will take up to 4h, because after the scale up we need to wait until all stale series are dropped from memory as the effect of TSDB head compaction, which could take up to 4h (with the default config, TSDB keeps in-memory series up to 3h old and it gets compacted every 2h). @@ -84,10 +94,12 @@ How to **fix**: This alert fires when the `max_tenants` per ingester instance limit is enabled and the actual number of tenants in an ingester is reaching the limit. Once the limit is reached, writes to the ingester will fail (5xx) for new tenants, while they will continue to succeed for previously existing ones. In case of **emergency**: + - If the actual number of tenants is very close to or already hit the limit, then you can increase the limit via runtime config to gain some time - Increasing the limit will increase the ingesters' memory utilization. Please monitor the ingesters' memory utilization via the `Cortex / Writes Resources` dashboard How the limit is **configured**: + - The limit can be configured either on CLI (`-ingester.instance-limits.max-tenants`) or in the runtime config: ``` ingester_limits: @@ -105,6 +117,7 @@ How the limit is **configured**: - The configured limit can be queried via `cortex_ingester_instance_limits{limit="max_tenants"}` How to **fix**: + 1. Ensure shuffle-sharding is enabled in the Cortex cluster 1. Assuming shuffle-sharding is enabled, scaling up ingesters will lower the number of tenants per ingester. However, the effect of this change will be visible only after `-blocks-storage.tsdb.close-idle-tsdb-timeout` period so you may have to temporarily increase the limit @@ -117,6 +130,7 @@ The alert message includes both the Cortex service and route experiencing the hi #### Write Latency How to **investigate**: + - Check the `Cortex / Writes` dashboard - Looking at the dashboard you should see in which Cortex service the high latency originates - The panels in the dashboard are vertically sorted by the network path (eg. cortex-gw -> distributor -> ingester) @@ -137,6 +151,7 @@ How to **investigate**: Query performance is a known issue. A query may be slow because of high cardinality, large time range and/or because not leveraging on cache (eg. querying series data not cached yet). When investigating this alert, you should check if it's caused by few slow queries or there's an operational / config issue to be fixed. How to **investigate**: + - Check the `Cortex / Reads` dashboard - Looking at the dashboard you should see in which Cortex service the high latency originates - The panels in the dashboard are vertically sorted by the network path (eg. cortex-gw -> query-frontend -> query->scheduler -> querier -> store-gateway) @@ -166,6 +181,7 @@ This alert fires when the rate of 5xx errors of a specific route is > 1% for som This alert typically acts as a last resort to detect issues / outages. SLO alerts are expected to trigger earlier: if an **SLO alert** has triggered as well for the same read/write path, then you can ignore this alert and focus on the SLO one (but the investigation procedure is typically the same). 
How to **investigate**: + - Check for which route the alert fired (see [Cortex routes by path](#cortex-routes-by-path)) - Write path: open the `Cortex / Writes` dashboard - Read path: open the `Cortex / Reads` dashboard @@ -175,9 +191,11 @@ How to **investigate**: - If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there ### CortexTransferFailed + This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n describe pod ` ### CortexIngesterUnhealthy + This alert goes off when an ingester is marked as unhealthy. Check the ring web page to see which is marked as unhealthy. You could then check the logs to see if there are any related to that ingester ex: `kubectl logs -f ingester-01 --namespace=prod`. A simple way to resolve this may be to click the "Forgot" button on the ring page, especially if the pod doesn't exist anymore. It might not exist anymore because it was on a node that got shut down, so you could check to see if there are any logs related to the node that pod is/was on, ex: `kubectl get events --namespace=prod | grep cloud-provider-node`. ### CortexMemoryMapAreasTooHigh @@ -185,10 +203,12 @@ This alert goes off when an ingester is marked as unhealthy. Check the ring web This alert fires when a Cortex process has a number of memory map areas close to the limit. The limit is a per-process limit imposed by the kernel and this issue is typically caused by a large number of mmap-ed failes. How to **fix**: + - Increase the limit on your system: `sysctl -w vm.max_map_count=` - If it's caused by a store-gateway, consider enabling `-blocks-storage.bucket-store.index-header-lazy-loading-enabled=true` to lazy mmap index-headers at query time More information: + - [Kernel doc](https://www.kernel.org/doc/Documentation/sysctl/vm.txt) - [Side effects when increasing `vm.max_map_count`](https://www.suse.com/support/kb/doc/?id=000016692) @@ -204,6 +224,7 @@ In general, pushing samples can fail due to problems with Cortex operations (eg. This alert fires only for first kind of problems, and not for problems caused by limits or invalid rules. How to **fix**: + - Investigate the ruler logs to find out the reason why ruler cannot write samples. Note that ruler logs all push errors, including "user errors", but those are not causing the alert to fire. Focus on problems with ingesters. ### CortexRulerTooManyFailedQueries @@ -215,6 +236,7 @@ Each rule evaluation may fail due to many reasons, eg. due to invalid PromQL exp There is a category of errors that is more important: errors due to failure to read data from store-gateways or ingesters. These errors would result in 500 when run from querier. This alert fires if there is too many of such failures. How to **fix**: + - Investigate the ruler logs to find out the reason why ruler cannot evaluate queries. Note that ruler logs rule evaluation errors even for "user errors", but those are not causing the alert to fire. Focus on problems with ingesters or store-gateways. 
### CortexRulerMissedEvaluations @@ -226,6 +248,7 @@ _TODO: this playbook has not been written yet._ This alert fires when a Cortex ingester is not uploading any block to the long-term storage. An ingester is expected to upload a block to the storage every block range period (defaults to 2h) and if a longer time elapse since the last successful upload it means something is not working correctly. How to **investigate**: + - Ensure the ingester is receiving write-path traffic (samples to ingest) - Look for any upload error in the ingester logs (ie. networking or authentication issues) @@ -237,8 +260,9 @@ If the ingester hit the disk capacity, any attempt to append samples will fail. 1. Increase the disk size and restart the ingester. If the ingester is running in Kubernetes with a Persistent Volume, please refers to [Resizing Persistent Volumes using Kubernetes](#resizing-persistent-volumes-using-kubernetes). 2. Investigate why the disk capacity has been hit - - Was the disk just too small? - - Was there an issue compacting TSDB head and the WAL is increasing indefinitely? + +- Was the disk just too small? +- Was there an issue compacting TSDB head and the WAL is increasing indefinitely? ### CortexIngesterHasNotShippedBlocksSinceStart @@ -249,6 +273,7 @@ Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks This alert fires when a Cortex ingester has compacted some blocks but such blocks haven't been successfully uploaded to the storage yet. How to **investigate**: + - Look for details in the ingester logs ### CortexIngesterTSDBHeadCompactionFailed @@ -258,11 +283,13 @@ This alert fires when a Cortex ingester is failing to compact the TSDB head into A TSDB instance is opened for each tenant writing at least 1 series to the ingester and its head contains the in-memory series not flushed to a block yet. Once the TSDB head is compactable, the ingester will try to compact it every 1 minute. If the TSDB head compaction repeatedly fails, it means it's failing to compact a block from the in-memory series for at least 1 tenant, and it's a critical condition that should be immediately investigated. The cause triggering this alert could **lead to**: + - Ingesters run out of memory - Ingesters run out of disk space - Queries return partial results after `-querier.query-ingesters-within` time since the beginning of the incident How to **investigate**: + - Look for details in the ingester logs ### CortexIngesterTSDBHeadTruncationFailed @@ -272,6 +299,7 @@ This alert fires when a Cortex ingester fails to truncate the TSDB head. The TSDB head is the in-memory store used to keep series and samples not compacted into a block yet. If head truncation fails for a long time, the ingester disk might get full as it won't continue to the WAL truncation stage and the subsequent ingester restart may take a long time or even go into an OOMKilled crash loop because of the huge WAL to replay. For this reason, it's important to investigate and address the issue as soon as it happen. How to **investigate**: + - Look for details in the ingester logs ### CortexIngesterTSDBCheckpointCreationFailed @@ -279,6 +307,7 @@ How to **investigate**: This alert fires when a Cortex ingester fails to create a TSDB checkpoint. How to **investigate**: + - Look for details in the ingester logs - If the checkpoint fails because of a `corruption in segment`, you can restart the ingester because at next startup TSDB will try to "repair" it. 
After restart, if the issue is repaired and the ingester is running, you should also get paged by `CortexIngesterTSDBWALCorrupted` to signal you the WAL was corrupted and manual investigation is required. @@ -289,6 +318,7 @@ This alert fires when a Cortex ingester fails to delete a TSDB checkpoint. Generally, this is not an urgent issue, but manual investigation is required to find the root cause of the issue and fix it. How to **investigate**: + - Look for details in the ingester logs ### CortexIngesterTSDBWALTruncationFailed @@ -296,6 +326,7 @@ How to **investigate**: This alert fires when a Cortex ingester fails to truncate the TSDB WAL. How to **investigate**: + - Look for details in the ingester logs ### CortexIngesterTSDBWALCorrupted @@ -311,6 +342,7 @@ If this alert fires during a **checkpoint creation**, you should have also been This alert fires when a Cortex ingester is failing to log records to the TSDB WAL on disk. How to **investigate**: + - Look for details in the ingester logs ### CortexQuerierHasNotScanTheBucket @@ -318,6 +350,7 @@ How to **investigate**: This alert fires when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket since a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. How to **investigate**: + - Look for any scan error in the querier logs (ie. networking or rate limiting issues) ### CortexQuerierHighRefetchRate @@ -325,6 +358,7 @@ How to **investigate**: This alert fires when there's an high number of queries for which series have been refetched from a different store-gateway because of missing blocks. This could happen for a short time whenever a store-gateway ring resharding occurs (e.g. during/after an outage or while rolling out store-gateway) but store-gateways should reconcile in a short time. This alert fires if the issue persist for an unexpected long time and thus it should be investigated. How to **investigate**: + - Ensure there are no errors related to blocks scan or sync in the queriers and store-gateways - Check store-gateway logs to see if all store-gateway have successfully completed a blocks sync @@ -333,6 +367,7 @@ How to **investigate**: This alert fires when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results. How to **investigate**: + - Look for any scan error in the store-gateway logs (ie. networking or rate limiting issues) ### CortexCompactorHasNotSuccessfullyCleanedUpBlocks @@ -340,6 +375,7 @@ How to **investigate**: This alert fires when a Cortex compactor is not successfully deleting blocks marked for deletion for a long time. How to **investigate**: + - Ensure the compactor is not crashing during compaction (ie. `OOMKilled`) - Look for any error in the compactor logs (ie. bucket Delete API errors) @@ -352,6 +388,7 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. 
How to **investigate**: + - If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` has fired as well, then investigate that issue first - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first - Ensure ingesters are successfully shipping blocks to the storage @@ -364,6 +401,7 @@ This alert fires if the compactor is not able to successfully compact all discov When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, other blocks compaction is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block. How to **investigate**: + - Look for any error in the compactor logs - Corruption: [`not healthy index found`](#compactor-is-failing-because-of-not-healthy-index-found) @@ -376,15 +414,19 @@ level=error ts=2020-07-12T17:35:05.516823471Z caller=compactor.go:339 component= ``` When this happen you should: + 1. Rename the block prefixing it with `corrupted-` so that it will be skipped by the compactor and queriers. Keep in mind that doing so the block will become invisible to the queriers too, so its series/samples will not be queried. If the corruption affects only 1 block whose compaction `level` is 1 (the information is stored inside its `meta.json`) then Cortex guarantees no data loss because all the data is replicated across other blocks. In all other cases, there may be some data loss once you rename the block and stop querying it. 2. Ensure the compactor has recovered 3. Investigate offline the root cause (eg. download the corrupted block and debug it locally) To rename a block stored on GCS you can use the `gsutil` CLI command: + ``` gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ``` + Where: + - `BUCKET` is the gcs bucket name the compactor is using. The cell's bucket name is specified as the `blocks_storage_bucket_name` in the cell configuration - `TENANT` is the tenant id reported in the example error message above as `REDACTED-TENANT` - `BLOCK` is the last part of the file path reported as `REDACTED-BLOCK` in the example error message above @@ -394,6 +436,7 @@ Where: This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. How to **investigate**: + - Ensure the compactor is successfully running - Look for any error in the compactor logs @@ -405,6 +448,7 @@ This alert fires when Cortex finds partial blocks for a given tenant. A partial 2. A block deletion has been interrupted and `deletion-mark.json` has been deleted before `meta.json` How to **investigate**: + - Look for the block ID in the logs. Example Loki query: ``` {cluster="",namespace="",container="compactor"} |= "skipped partial block" @@ -421,17 +465,19 @@ This alert is only related to the chunks storage. This can happen because of 2 r WAL corruptions are only detected at startups, so at this point the WAL/Checkpoint would have been repaired automatically. So we can only check what happened and if there was any data loss and take actions to avoid this happening in future. 1. Check if there was any node restarts that force killed pods. 
If there is, then the corruption is from the non graceful shutdown of ingesters, which is generally fine. You can: - * Describe the pod to see the last state. - * Use `kube_pod_info` to check the node for the pod. `node_boot_time_seconds` to see if node just booted (which also indicates restart). - * You can use `eventrouter` logs to double check. - * Check ingester logs to check if the shutdown logs are missing at that time. + +- Describe the pod to see the last state. +- Use `kube_pod_info` to check the node for the pod. `node_boot_time_seconds` to see if node just booted (which also indicates restart). +- You can use `eventrouter` logs to double check. +- Check ingester logs to check if the shutdown logs are missing at that time. + 2. To confirm this, in the logs, check the WAL segment on which the corruption happened (let's say `X`) and the last checkpoint attempt number (let's say `Y`, this is the last WAL segment that was present when checkpointing started). 3. If `X > Y`, then it's most likely an abrupt restart of ingester and the corruption would be on the last few records of the last segment. To verify this, check the file timestamps of WAL segment `X` and `X - 1` if they were recent. 4. If `X < Y`, then the corruption was in some WAL segment which was not the last one. This indicates faulty disk and some data loss on that ingester. 5. In case of faulty disk corruption, if the number or ingesters that had corruption within the chunk flush age: - 1. Less than the quorum number for your replication factor: No data loss, because there is a guarantee that the data is replicated. For example, if replication factor is 3, then it's fine if corruption was on 1 ingester. - 2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. - 3. Equal or more than the replication factor: Then there is definitely some data loss. +6. Less than the quorum number for your replication factor: No data loss, because there is a guarantee that the data is replicated. For example, if replication factor is 3, then it's fine if corruption was on 1 ingester. +7. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. +8. Equal or more than the replication factor: Then there is definitely some data loss. ### CortexTableSyncFailure @@ -448,6 +494,7 @@ This alert fires if multiple replicas of the same Cortex service are using a dif The Cortex runtime config is a config file which gets live reloaded by Cortex at runtime. In order for Cortex to work properly, the loaded config is expected to be the exact same across multiple replicas of the same Cortex service (eg. distributors, ingesters, ...). When the config changes, there may be short periods of time during which some replicas have loaded the new config and others are still running on the previous one, but it shouldn't last for more than few minutes. How to **investigate**: + - Check how many different config file versions (hashes) are reported ``` count by (sha256) (cortex_runtime_config_hash{namespace=""}) @@ -466,6 +513,7 @@ This alert fires if Cortex is unable to reload the runtime config. This typically means an invalid runtime config was deployed. 
Cortex keeps running with the previous (valid) version of the runtime config; running Cortex replicas and the system availability shouldn't be affected, but new replicas won't be able to startup until the runtime config is fixed. How to **investigate**: + - Check the latest runtime config update (it's likely to be broken) - Check Cortex logs to get more details about what's wrong with the config @@ -480,12 +528,14 @@ The procedure to investigate it is the same as the one for [`CortexSchedulerQuer This alert fires if queries are piling up in the query-scheduler. How it **works**: + - A query-frontend API endpoint is called to execute a query - The query-frontend enqueues the request to the query-scheduler - The query-scheduler is responsible for dispatching enqueued queries to idle querier workers - The querier runs the query, sends the response back directly to the query-frontend and notifies the query-scheduler that it can process another query How to **investigate**: + - Are queriers in a crash loop (eg. OOMKilled)? - `OOMKilled`: temporarily increase queriers memory request/limit - `panic`: look for the stack trace in the logs and investigate from there @@ -505,6 +555,7 @@ How to **investigate**: This alert fires if Cortex memcached client is experiencing an high error rate for a specific cache and operation. How to **investigate**: + - The alert reports which cache is experiencing issue - `metadata-cache`: object store metadata cache - `index-cache`: TSDB index cache @@ -549,6 +600,7 @@ _This alert applies to Cortex chunks storage only._ This alert fires if the average number of in-memory series per ingester is above our target (1.5M). How to **fix**: + - Scale up ingesters - To find out the Cortex clusters where ingesters should be scaled up and how many minimum replicas are expected: ``` @@ -562,6 +614,7 @@ How to **fix**: This alert fires if the average number of samples ingested / sec in ingesters is above our target. How to **fix**: + - Scale up ingesters - To compute the desired number of ingesters to satisfy the average samples rate you can run the following query, replacing `` with the namespace to analyse and `` with the target number of samples/sec per ingester (check out the alert threshold to see the current target): ``` @@ -573,6 +626,7 @@ How to **fix**: This alert fires when an ingester memory utilization is getting closer to the limit. How it **works**: + - Cortex ingesters are a stateful service - Having 2+ ingesters `OOMKilled` may cause a cluster outage - Ingester memory baseline usage is primarily influenced by memory allocated by the process (mostly go heap) and mmap-ed files (used by TSDB) @@ -580,6 +634,7 @@ How it **works**: - A pod gets `OOMKilled` once its working set memory reaches the configured limit, so it's important to prevent ingesters' memory utilization (working set memory) from getting close to the limit (we need to keep at least 30% room for spikes due to queries) How to **fix**: + - Check if the issue occurs only for few ingesters. If so: - Restart affected ingesters 1 by 1 (proceed with the next one once the previous pod has restarted and it's Ready) ``` @@ -595,6 +650,7 @@ How to **fix**: This alert fires when any instance does not register all other instances as members of the memberlist cluster. How it **works**: + - This alert applies when memberlist is used for the ring backing store. - All Cortex instances using the ring, regardless of type, join a single memberlist cluster. 
 - Each instance (=memberlist cluster member) should be able to see all others.
@@ -603,6 +659,7 @@ How it **works**:
 - The total number of currently responsive instances.

 How to **investigate**:
+
 - The instance which has the incomplete view of the cluster (too few members) is specified in the alert.
 - If the count is zero:
   - It is possible that joining the cluster has yet to succeed.
@@ -729,6 +786,7 @@ When an alertmanager cannot read the state for a tenant from storage it gets log
 This alert fires when a Cortex service rollout is stuck, which means the number of updated replicas doesn't match the expected one and it looks like there's no progress in the rollout. The alert monitors services deployed as Kubernetes `StatefulSet` and `Deployment`.

 How to **investigate**:
+
 - Run `kubectl -n <namespace> get pods -l name=<name>` to get a list of running pods
 - Ensure there's no pod in a failing state (eg. `Error`, `OOMKilled`, `CrashLoopBackOff`)
 - Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`)
@@ -739,18 +797,21 @@ How to **investigate**:
 This alert fires if a Cortex instance is failing to run any operation on a KV store (eg. consul or etcd).

 How it **works**:
+
 - Consul is typically used to store the hash ring state.
 - Etcd is typically used by the HA tracker (distributor) to deduplicate samples.
 - If an instance is failing operations on the **hash ring**, either the instance can't update the heartbeat in the ring or is failing to receive ring updates.
 - If an instance is failing operations on the **HA tracker** backend, either the instance can't update the authoritative replica or is failing to receive updates.

 How to **investigate**:
+
 - Ensure Consul/Etcd is up and running.
 - Investigate the logs of the affected instance to find the specific error occurring when talking to Consul/Etcd.

 ## Cortex routes by path

 **Write path**:
+
 - `/distributor.Distributor/Push`
 - `/cortex.Ingester/Push`
 - `api_v1_push`
@@ -758,6 +819,7 @@ How to **investigate**:
 - `api_v1_push_influx_write`

 **Read path**:
+
 - `/schedulerpb.SchedulerForFrontend/FrontendLoop`
 - `/cortex.Ingester/QueryStream`
 - `/cortex.Ingester/QueryExemplars`
 - `api_prom_api_v1_query_exemplars`
@@ -766,6 +828,7 @@ How to **investigate**:
 **Ruler / rules path**:
+
 - `api_v1_rules`
 - `api_v1_rules_namespace`
 - `api_prom_rules_namespace`
@@ -826,7 +889,9 @@ To take a **GCP persistent disk snapshot**:
 Halting the ingesters should be the **very last resort** because of the side effects. To halt the ingesters, while preserving their disk and without disrupting the cluster write path, you need to:

 1. Create a second pool of ingesters
-   - Uses the functions `newIngesterStatefulSet()`, `newIngesterPdb()`
+
+- Uses the functions `newIngesterStatefulSet()`, `newIngesterPdb()`
+
 2. Wait until the second pool is up and running
 3. Halt existing ingesters (scale down to 0 or delete their statefulset)
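For step 3, scaling down is usually preferable to deleting the StatefulSet because it keeps the PVCs (and therefore the disks) around. A minimal sketch with `kubectl`, assuming the StatefulSet is named `ingester` and `<namespace>` is a placeholder for your namespace:

```
# Scale the old ingester pool down to zero replicas; the StatefulSet's PVCs (and the disks behind them) are kept.
kubectl -n <namespace> scale statefulset ingester --replicas=0

# Confirm no pods from the old pool are left running.
kubectl -n <namespace> get pods -l name=ingester
```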
@@ -865,7 +930,7 @@ metadata:
   name: clone-ingester-7-pv
 spec:
   accessModes:
-  - ReadWriteOnce
+    - ReadWriteOnce
   capacity:
     storage: 150Gi
   gcePersistentDisk:
@@ -892,20 +957,20 @@ spec:
 apiVersion: v1
 kind: Pod
 metadata:
-  name: clone-ingester-7-dataaccess
+  name: clone-ingester-7-dataaccess
 spec:
-  containers:
+  containers:
   - name: alpine
     image: alpine:latest
-    command: ['sleep', 'infinity']
+    command: ["sleep", "infinity"]
     volumeMounts:
-    - name: mypvc
-      mountPath: /data
+      - name: mypvc
+        mountPath: /data
     resources:
       requests:
         cpu: 500m
         memory: 1024Mi
-  volumes:
+  volumes:
   - name: mypvc
     persistentVolumeClaim:
       claimName: clone-ingester-7-pvc
@@ -928,6 +993,7 @@ After this preparation, one can use `kubectl exec -t -i clone-ingester-7-dataacc
    ./gsutil/gsutil --help
    ```
 3. Configure credentials
+
    ```
    gsutil config -e
@@ -944,7 +1010,6 @@ A PVC can be manually deleted by an operator. When a PVC claim is deleted, what
 - `Retain`: the volume will not be deleted until the PV resource is manually deleted from Kubernetes
 - `Delete`: the volume will be automatically deleted
-

 ## Log lines

 ### Log line containing 'sample with repeated timestamp but different value'
@@ -952,5 +1017,6 @@ A PVC can be manually deleted by an operator. When a PVC claim is deleted, what
 This means a sample with the same timestamp as the latest one was received with a different value. The number of occurrences is recorded in the `cortex_discarded_samples_total` metric with the label `reason="new-value-for-timestamp"`.

 Possible reasons for this are:
+
 - Incorrect relabelling rules can cause a label to be dropped from a series so that multiple series have the same labels. If these series were collected from the same target they will have the same timestamp.
 - The exporter being scraped sets the same timestamp on every scrape. Note that exporters should generally not set timestamps.
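To gauge how much data is being discarded for this reason, and for which tenants, a query along these lines can help. This is only a sketch: it assumes the usual `user` (tenant) label on `cortex_discarded_samples_total`, and `<namespace>` is a placeholder to replace with the affected namespace:

```
sum by (user) (
  rate(cortex_discarded_samples_total{reason="new-value-for-timestamp", namespace="<namespace>"}[5m])
) > 0
```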
From 93f9b88e39075c83635b2b0d65220f5f9f83dc8f Mon Sep 17 00:00:00 2001
From: Christian Simon
Date: Tue, 19 Oct 2021 13:33:42 +0100
Subject: [PATCH 364/364] Update build-image

---
 .github/workflows/test-build-deploy.yml | 6 +++---
 Makefile                                | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml
index 8308ac1af8d..88357824d7e 100644
--- a/.github/workflows/test-build-deploy.yml
+++ b/.github/workflows/test-build-deploy.yml
@@ -10,7 +10,7 @@ jobs:
   lint:
     runs-on: ubuntu-20.04
     container:
-      image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-c62520540
+      image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-e7b4eab3c
      credentials:
        username: _json_key
        password: ${{ secrets.gcr_json_key }}
@@ -46,7 +46,7 @@ jobs:
   test:
     runs-on: ubuntu-20.04
     container:
-      image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-c62520540
+      image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-e7b4eab3c
      credentials:
        username: _json_key
        password: ${{ secrets.gcr_json_key }}
@@ -70,7 +70,7 @@ jobs:
   build:
     runs-on: ubuntu-20.04
     container:
-      image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-c62520540
+      image: us.gcr.io/kubernetes-dev/mimir-build-image:20211018_import-cortex-mixin-e7b4eab3c
      credentials:
        username: _json_key
        password: ${{ secrets.gcr_json_key }}
diff --git a/Makefile b/Makefile
index b39b09eb662..c7c59d47e1b 100644
--- a/Makefile
+++ b/Makefile
@@ -126,7 +126,7 @@ mimir-build-image/$(UPTODATE): mimir-build-image/*
 # All the boiler plate for building golang follows:
 SUDO := $(shell docker info >/dev/null 2>&1 || echo "sudo -E")
 BUILD_IN_CONTAINER := true
-LATEST_BUILD_IMAGE_TAG ?= 20211018_import-cortex-mixin-4e65adf94
+LATEST_BUILD_IMAGE_TAG ?= 20211018_import-cortex-mixin-e7b4eab3c

 # TTY is parameterized to allow Google Cloud Builder to run builds,
 # as it currently disallows TTY devices. This value needs to be overridden