Skip to content

Commit

Permalink
Merge branch 'main' into aja_alertmanager
Browse files Browse the repository at this point in the history
  • Loading branch information
aallawala authored Oct 18, 2021
2 parents 3c18fd8 + 7503360 commit 9fad351
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
* [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. #387
* [ENHANCEMENT] Add `CortexRolloutStuck` alert. #405
* [ENHANCEMENT] Added `CortexKVStoreFailure` alert. #406
* [ENHANCEMENT] Add ability to override `datasource` for generated dashboards. #407
* [ENHANCEMENT] Use alertmanager job name for alertmanager dashboard panels. #411
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329
Expand Down
3 changes: 3 additions & 0 deletions cortex-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,8 @@

// The routes to exclude from alerts.
alert_excluded_routes: [],

// Name of the datasource that the generated dashboards should attach to.
dashboard_datasource: 'default',
},
}
2 changes: 1 addition & 1 deletion cortex-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
// - default tags,
// - some links that propagate the selected cluster.
dashboard(title)::
super.dashboard(title) + {
super.dashboard(title=title, datasource=$._config.dashboard_datasource) + {
addRowIf(condition, row)::
if condition
then self.addRow(row)
Expand Down
36 changes: 18 additions & 18 deletions cortex-mixin/dashboards/ruler.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,19 @@ local utils = import 'mixin-utils/utils.libsonnet';
})
.addPanel(
$.panel('Active Configurations') +
$.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher('ruler'), format='short')
$.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher($._config.job_names.ruler), format='short')
)
.addPanel(
$.panel('Total Rules') +
$.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher('ruler'), format='short')
$.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), format='short')
)
.addPanel(
$.panel('Read from Ingesters - QPS') +
$.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher('ruler'), format='reqps')
$.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps')
)
.addPanel(
$.panel('Write to Ingesters - QPS') +
$.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher('ruler'), format='reqps')
$.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps')
)
)
.addRow(
Expand All @@ -89,16 +89,16 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panel('EPS') +
$.queryPanel(
[
$.rulerQueries.ruleEvaluations.success % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
$.rulerQueries.ruleEvaluations.failure % $.jobMatcher('ruler'),
$.rulerQueries.ruleEvaluations.success % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)],
$.rulerQueries.ruleEvaluations.failure % $.jobMatcher($._config.job_names.ruler),
],
['success', 'failed'],
),
)
.addPanel(
$.panel('Latency') +
$.queryPanel(
$.rulerQueries.ruleEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
$.rulerQueries.ruleEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)],
'average'
),
)
Expand Down Expand Up @@ -126,22 +126,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Writes (Ingesters)')
.addPanel(
$.panel('QPS') +
$.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher('ruler'))
$.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler))
)
.addPanel(
$.panel('Latency') +
$.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher('ruler'))
$.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler))
)
)
.addRow(
$.row('Reads (Ingesters)')
.addPanel(
$.panel('QPS') +
$.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler'))
$.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler))
)
.addPanel(
$.panel('Latency') +
$.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler'))
$.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler))
)
)
.addRowIf(
Expand Down Expand Up @@ -208,34 +208,34 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Notifications')
.addPanel(
$.panel('Delivery Errors') +
$.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}')
$.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}')
)
.addPanel(
$.panel('Queue Length') +
$.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}')
$.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}')
)
.addPanel(
$.panel('Dropped') +
$.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher('ruler'), '{{ user }}')
$.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher($._config.job_names.ruler), '{{ user }}')
)
)
.addRow(
($.row('Group Evaluations') + { collapse: true })
.addPanel(
$.panel('Missed Iterations') +
$.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher('ruler'), '{{ user }}'),
$.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher($._config.job_names.ruler), '{{ user }}'),
)
.addPanel(
$.panel('Latency') +
$.queryPanel(
$.rulerQueries.groupEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
$.rulerQueries.groupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)],
'{{ user }}'
),
)
.addPanel(
$.panel('Failures') +
$.queryPanel(
$.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher('ruler')], '{{ rule_group }}'
$.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher($._config.job_names.ruler)], '{{ rule_group }}'
)
)
)
Expand All @@ -244,7 +244,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addPanel(
$.panel('Latency') +
$.queryPanel(
$.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
$.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)],
'{{ user }}'
)
)
Expand Down

0 comments on commit 9fad351

Please sign in to comment.