diff --git a/internal/selfmonitor/config/expr_builder.go b/internal/selfmonitor/config/expr_builder.go
index 0d17c1139..0e1170017 100644
--- a/internal/selfmonitor/config/expr_builder.go
+++ b/internal/selfmonitor/config/expr_builder.go
@@ -78,6 +78,23 @@ func (eb *exprBuilder) greaterThan(value float64) *exprBuilder {
 	return eb
 }
 
+func (eb *exprBuilder) equal(value float64) *exprBuilder {
+	eb.expr = fmt.Sprintf("%s == %s", eb.expr, strconv.FormatFloat(value, 'f', -1, 64))
+	return eb
+}
+
 func (eb *exprBuilder) build() string {
 	return eb.expr
 }
+
+func and(exprs ...string) string {
+	return strings.Join(wrapInParentheses(exprs), " and ")
+}
+
+func wrapInParentheses(input []string) []string {
+	wrapped := make([]string, len(input))
+	for i, str := range input {
+		wrapped[i] = fmt.Sprintf("(%s)", str)
+	}
+	return wrapped
+}
diff --git a/internal/selfmonitor/config/fluent_bit_rule_builder.go b/internal/selfmonitor/config/fluent_bit_rule_builder.go
index e07deda5c..87eca16e6 100644
--- a/internal/selfmonitor/config/fluent_bit_rule_builder.go
+++ b/internal/selfmonitor/config/fluent_bit_rule_builder.go
@@ -1,5 +1,9 @@
 package config
 
+import (
+	"time"
+)
+
 const (
 	fluentBitMetricsServiceName        = "telemetry-fluent-bit-metrics"
 	fluentBitSidecarMetricsServiceName = "telemetry-fluent-bit-exporter-metrics"
@@ -11,6 +15,9 @@ const (
 
 	bufferUsage300MB = 300000000
 	bufferUsage900MB = 900000000
+
+	// alertWaitTime is the time an alert is in pending state before firing
+	alertWaitTime = 1 * time.Minute
 )
 
 type fluentBitRuleBuilder struct {
@@ -19,10 +26,10 @@ type fluentBitRuleBuilder struct {
 func (rb fluentBitRuleBuilder) rules() []Rule {
 	return []Rule{
 		rb.exporterSentRule(),
-		rb.receiverReadRule(),
 		rb.exporterDroppedRule(),
 		rb.bufferInUseRule(),
 		rb.bufferFullRule(),
+		rb.noLogsDeliveredRule(),
 	}
 }
 
@@ -36,16 +43,6 @@ func (rb fluentBitRuleBuilder) exporterSentRule() Rule {
 	}
 }
 
-func (rb fluentBitRuleBuilder) receiverReadRule() Rule {
-	return Rule{
-		Alert: rb.namePrefix() + RuleNameLogAgentReceiverReadLogs,
-		Expr: rate(metricFluentBitInputBytesTotal, selectService(fluentBitMetricsServiceName)).
-			sumBy(labelPipelineName).
-			greaterThan(0).
-			build(),
-	}
-}
-
 func (rb fluentBitRuleBuilder) exporterDroppedRule() Rule {
 	return Rule{
 		Alert: rb.namePrefix() + RuleNameLogAgentExporterDroppedLogs,
@@ -74,6 +71,24 @@ func (rb fluentBitRuleBuilder) bufferFullRule() Rule {
 	}
 }
 
+func (rb fluentBitRuleBuilder) noLogsDeliveredRule() Rule {
+	receiverReadExpr := rate(metricFluentBitInputBytesTotal, selectService(fluentBitMetricsServiceName)).
+		sumBy(labelPipelineName).
+		greaterThan(0).
+		build()
+
+	exporterNotSentExpr := rate(metricFluentBitOutputProcBytesTotal, selectService(fluentBitMetricsServiceName)).
+		sumBy(labelPipelineName).
+		equal(0).
+		build()
+
+	return Rule{
+		Alert: rb.namePrefix() + RuleNameLogAgentNoLogsDelivered,
+		Expr:  and(receiverReadExpr, exporterNotSentExpr),
+		For:   alertWaitTime,
+	}
+}
+
 func (rb fluentBitRuleBuilder) namePrefix() string {
 	return ruleNamePrefix(typeLogPipeline)
 }
diff --git a/internal/selfmonitor/config/rules.go b/internal/selfmonitor/config/rules.go
index 282707b0b..d0ef7739a 100644
--- a/internal/selfmonitor/config/rules.go
+++ b/internal/selfmonitor/config/rules.go
@@ -21,6 +21,7 @@ const (
 	RuleNameLogAgentExporterDroppedLogs = "AgentExporterDroppedLogs"
 	RuleNameLogAgentBufferInUse         = "AgentBufferInUse"
 	RuleNameLogAgentBufferFull          = "AgentBufferFull"
+	RuleNameLogAgentNoLogsDelivered     = "AgentNoLogsDelivered"
 
 	// Common rule labels
 	labelService = "service"
diff --git a/internal/selfmonitor/config/rules_test.go b/internal/selfmonitor/config/rules_test.go
index 3523f181a..f09f7d4cc 100644
--- a/internal/selfmonitor/config/rules_test.go
+++ b/internal/selfmonitor/config/rules_test.go
@@ -48,17 +48,17 @@ func TestMakeRules(t *testing.T) {
 	require.Equal(t, "LogAgentExporterSentLogs", ruleGroup.Rules[10].Alert)
 	require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_proc_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[10].Expr)
 
-	require.Equal(t, "LogAgentReceiverReadLogs", ruleGroup.Rules[11].Alert)
-	require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_input_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[11].Expr)
+	require.Equal(t, "LogAgentExporterDroppedLogs", ruleGroup.Rules[11].Alert)
+	require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_dropped_records_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[11].Expr)
 
-	require.Equal(t, "LogAgentExporterDroppedLogs", ruleGroup.Rules[12].Alert)
-	require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_dropped_records_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[12].Expr)
+	require.Equal(t, "LogAgentBufferInUse", ruleGroup.Rules[12].Alert)
+	require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 300000000", ruleGroup.Rules[12].Expr)
 
-	require.Equal(t, "LogAgentBufferInUse", ruleGroup.Rules[13].Alert)
-	require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 300000000", ruleGroup.Rules[13].Expr)
+	require.Equal(t, "LogAgentBufferFull", ruleGroup.Rules[13].Alert)
+	require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 900000000", ruleGroup.Rules[13].Expr)
 
-	require.Equal(t, "LogAgentBufferFull", ruleGroup.Rules[14].Alert)
-	require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 900000000", ruleGroup.Rules[14].Expr)
+	require.Equal(t, "LogAgentNoLogsDelivered", ruleGroup.Rules[14].Alert)
+	require.Equal(t, "(sum by (pipeline_name) (rate(fluentbit_input_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0) and (sum by (pipeline_name) (rate(fluentbit_output_proc_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) == 0)", ruleGroup.Rules[14].Expr)
 }
 
 func TestMatchesLogPipelineRule(t *testing.T) {
diff --git a/internal/selfmonitor/prober/log_pipeline_prober.go b/internal/selfmonitor/prober/log_pipeline_prober.go
index 8711928ab..ff5494837 100644
--- a/internal/selfmonitor/prober/log_pipeline_prober.go
+++ b/internal/selfmonitor/prober/log_pipeline_prober.go
@@ -64,9 +64,7 @@ func (p *LogPipelineProber) someDataDropped(alerts []promv1.Alert, pipelineName
 }
 
 func (p *LogPipelineProber) noLogsDelivered(alerts []promv1.Alert, pipelineName string) bool {
-	receiverReadLogs := p.isFiring(alerts, config.RuleNameLogAgentReceiverReadLogs, pipelineName)
-	exporterSentLogs := p.isFiring(alerts, config.RuleNameLogAgentExporterSentLogs, pipelineName)
-	return receiverReadLogs && !exporterSentLogs
+	return p.isFiring(alerts, config.RuleNameLogAgentNoLogsDelivered, pipelineName)
 }
 
 func (p *LogPipelineProber) bufferFillingUp(alerts []promv1.Alert, pipelineName string) bool {
@@ -78,11 +76,8 @@ func (p *LogPipelineProber) healthy(alerts []promv1.Alert, pipelineName string)
 	bufferInUse := p.isFiring(alerts, config.RuleNameLogAgentBufferInUse, pipelineName)
 	bufferFull := p.isFiring(alerts, config.RuleNameLogAgentBufferFull, pipelineName)
 	exporterDroppedLogs := p.isFiring(alerts, config.RuleNameLogAgentExporterDroppedLogs, pipelineName)
-
-	// The pipeline is healthy if either no logs are being read or all logs are being sent
-	receiverReadLogs := p.isFiring(alerts, config.RuleNameLogAgentReceiverReadLogs, pipelineName)
-	exporterSentLogs := p.isFiring(alerts, config.RuleNameLogAgentExporterSentLogs, pipelineName)
-	return !(bufferInUse || bufferFull || exporterDroppedLogs) && (!receiverReadLogs || exporterSentLogs)
+	noLogsDelivered := p.isFiring(alerts, config.RuleNameLogAgentNoLogsDelivered, pipelineName)
+	return !(bufferInUse || bufferFull || exporterDroppedLogs || noLogsDelivered)
 }
 
 func (p *LogPipelineProber) isFiring(alerts []promv1.Alert, ruleName, pipelineName string) bool {
diff --git a/internal/selfmonitor/prober/log_pipeline_prober_test.go b/internal/selfmonitor/prober/log_pipeline_prober_test.go
index 7d1e42c36..9b133eb68 100644
--- a/internal/selfmonitor/prober/log_pipeline_prober_test.go
+++ b/internal/selfmonitor/prober/log_pipeline_prober_test.go
@@ -61,6 +61,25 @@ func TestLogPipelineProber(t *testing.T) {
 				},
 			},
 		},
+		{
+			name:         "pending alert",
+			pipelineName: "cls",
+			alerts: promv1.AlertsResult{
+				Alerts: []promv1.Alert{
+					{
+						Labels: model.LabelSet{
+							"alertname": "LogAgentBufferFull",
+						},
+						State: promv1.AlertStatePending,
+					},
+				},
+			},
+			expected: LogPipelineProbeResult{
+				PipelineProbeResult: PipelineProbeResult{
+					Healthy: true,
+				},
+			},
+		},
 		{
 			name:         "alert missing pipeline_name label",
 			pipelineName: "cls",
@@ -222,13 +241,13 @@ func TestLogPipelineProber(t *testing.T) {
 			},
 		},
 		{
-			name:         "receiver read logs and exporter did not send logs firing",
+			name:         "no logs delivered firing",
 			pipelineName: "cls",
 			alerts: promv1.AlertsResult{
 				Alerts: []promv1.Alert{
 					{
 						Labels: model.LabelSet{
-							"alertname":     "LogAgentReceiverReadLogs",
+							"alertname":     "LogAgentNoLogsDelivered",
 							"pipeline_name": "cls",
 						},
 						State: promv1.AlertStateFiring,
@@ -240,17 +259,10 @@ func TestLogPipelineProber(t *testing.T) {
 			},
 		},
 		{
-			name:         "exporter read logs and exporter sent logs firing",
+			name:         "exporter sent logs firing",
 			pipelineName: "cls",
 			alerts: promv1.AlertsResult{
 				Alerts: []promv1.Alert{
-					{
-						Labels: model.LabelSet{
-							"alertname":     "LogAgentReceiverReadLogs",
-							"pipeline_name": "cls",
-						},
-						State: promv1.AlertStateFiring,
-					},
 					{
 						Labels: model.LabelSet{
 							"alertname":     "LogAgentExporterSentLogs",
diff --git a/internal/selfmonitor/prober/otel_pipeline_prober_test.go b/internal/selfmonitor/prober/otel_pipeline_prober_test.go
index 96175cc9b..00e96822e 100644
--- a/internal/selfmonitor/prober/otel_pipeline_prober_test.go
+++ b/internal/selfmonitor/prober/otel_pipeline_prober_test.go
@@ -61,6 +61,25 @@ func TestOTelPipelineProber(t *testing.T) {
 				},
 			},
 		},
+		{
+			name:         "pending alert",
+			pipelineName: "cls",
+			alerts: promv1.AlertsResult{
+				Alerts: []promv1.Alert{
+					{
+						Labels: model.LabelSet{
+							"alertname": "TraceGatewayExporterDroppedData",
+						},
+						State: promv1.AlertStatePending,
+					},
+				},
+			},
+			expected: OTelPipelineProbeResult{
+				PipelineProbeResult: PipelineProbeResult{
+					Healthy: true,
+				},
+			},
+		},
 		{
 			name:         "alert missing pipeline_name label",
 			pipelineName: "cls",