Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: False positive NoLogsDelivered LogPipeline status reason #1397

Merged
Merged
17 changes: 17 additions & 0 deletions internal/selfmonitor/config/expr_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,23 @@ func (eb *exprBuilder) greaterThan(value float64) *exprBuilder {
return eb
}

// equal appends an equality comparison ("== value") to the expression built so
// far and returns the same builder so that further calls can be chained.
func (eb *exprBuilder) equal(value float64) *exprBuilder {
	// 'f' with precision -1 prints the value without an exponent, using the
	// fewest digits that still represent it exactly.
	operand := strconv.FormatFloat(value, 'f', -1, 64)
	eb.expr = fmt.Sprintf("%s == %s", eb.expr, operand)

	return eb
}

// build returns the expression string accumulated in the builder.
func (eb *exprBuilder) build() string {
return eb.expr
}

// and combines the given expressions into one expression: every operand is
// passed through wrapInParentheses and the results are joined with " and ".
func and(exprs ...string) string {
	operands := wrapInParentheses(exprs)

	return strings.Join(operands, " and ")
}

// wrapInParentheses returns a new slice with the same length as input in which
// each element is the corresponding input string enclosed in parentheses.
// The input slice itself is left untouched.
func wrapInParentheses(input []string) []string {
	result := make([]string, 0, len(input))
	for _, item := range input {
		result = append(result, fmt.Sprintf("(%s)", item))
	}

	return result
}
37 changes: 26 additions & 11 deletions internal/selfmonitor/config/fluent_bit_rule_builder.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package config

import (
"time"
)

const (
fluentBitMetricsServiceName = "telemetry-fluent-bit-metrics"
fluentBitSidecarMetricsServiceName = "telemetry-fluent-bit-exporter-metrics"
Expand All @@ -11,6 +15,9 @@ const (

bufferUsage300MB = 300000000
bufferUsage900MB = 900000000

// alertWaitTime is how long an alert stays in the pending state before it fires.
alertWaitTime = 1 * time.Minute
)

type fluentBitRuleBuilder struct {
Expand All @@ -19,10 +26,10 @@ type fluentBitRuleBuilder struct {
// rules returns the alert rules produced by this builder: one entry per
// rule-building method, in the order listed below.
// NOTE(review): TestMakeRules asserts on the generated rules by index, so the
// order of this slice appears to be significant — confirm before reordering.
func (rb fluentBitRuleBuilder) rules() []Rule {
return []Rule{
rb.exporterSentRule(),
rb.receiverReadRule(),
rb.exporterDroppedRule(),
rb.bufferInUseRule(),
rb.bufferFullRule(),
rb.noLogsDeliveredRule(),
}
}

Expand All @@ -36,16 +43,6 @@ func (rb fluentBitRuleBuilder) exporterSentRule() Rule {
}
}

// receiverReadRule builds the rule whose alert name is the builder's prefix
// followed by RuleNameLogAgentReceiverReadLogs. Its expression is the rate of
// metricFluentBitInputBytesTotal for the Fluent Bit metrics service, summed by
// pipeline name and compared with "greater than 0".
func (rb fluentBitRuleBuilder) receiverReadRule() Rule {
	inputBytesRate := rate(metricFluentBitInputBytesTotal, selectService(fluentBitMetricsServiceName))
	readExpr := inputBytesRate.sumBy(labelPipelineName).greaterThan(0).build()

	return Rule{
		Alert: rb.namePrefix() + RuleNameLogAgentReceiverReadLogs,
		Expr:  readExpr,
	}
}

func (rb fluentBitRuleBuilder) exporterDroppedRule() Rule {
return Rule{
Alert: rb.namePrefix() + RuleNameLogAgentExporterDroppedLogs,
Expand Down Expand Up @@ -74,6 +71,24 @@ func (rb fluentBitRuleBuilder) bufferFullRule() Rule {
}
}

func (rb fluentBitRuleBuilder) noLogsDeliveredRule() Rule {
receiverReadExpr := rate(metricFluentBitInputBytesTotal, selectService(fluentBitMetricsServiceName)).
sumBy(labelPipelineName).
greaterThan(0).
build()

exporterNotSentExpr := rate(metricFluentBitOutputProcBytesTotal, selectService(fluentBitMetricsServiceName)).
sumBy(labelPipelineName).
equal(0).
TeodorSAP marked this conversation as resolved.
Show resolved Hide resolved
build()

return Rule{
Alert: rb.namePrefix() + RuleNameLogAgentNoLogsDelivered,
Expr: and(receiverReadExpr, exporterNotSentExpr),
For: alertWaitTime,
}
}

// namePrefix returns ruleNamePrefix(typeLogPipeline); the rule-building methods
// of this builder prepend it to their alert names.
func (rb fluentBitRuleBuilder) namePrefix() string {
return ruleNamePrefix(typeLogPipeline)
}
1 change: 1 addition & 0 deletions internal/selfmonitor/config/rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ const (
RuleNameLogAgentExporterDroppedLogs = "AgentExporterDroppedLogs"
RuleNameLogAgentBufferInUse = "AgentBufferInUse"
RuleNameLogAgentBufferFull = "AgentBufferFull"
RuleNameLogAgentNoLogsDelivered = "AgentNoLogsDelivered"

// Common rule labels
labelService = "service"
Expand Down
16 changes: 8 additions & 8 deletions internal/selfmonitor/config/rules_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,17 @@ func TestMakeRules(t *testing.T) {
require.Equal(t, "LogAgentExporterSentLogs", ruleGroup.Rules[10].Alert)
require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_proc_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[10].Expr)

require.Equal(t, "LogAgentReceiverReadLogs", ruleGroup.Rules[11].Alert)
require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_input_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[11].Expr)
require.Equal(t, "LogAgentExporterDroppedLogs", ruleGroup.Rules[11].Alert)
require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_dropped_records_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[11].Expr)

require.Equal(t, "LogAgentExporterDroppedLogs", ruleGroup.Rules[12].Alert)
require.Equal(t, "sum by (pipeline_name) (rate(fluentbit_output_dropped_records_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0", ruleGroup.Rules[12].Expr)
require.Equal(t, "LogAgentBufferInUse", ruleGroup.Rules[12].Alert)
require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 300000000", ruleGroup.Rules[12].Expr)

require.Equal(t, "LogAgentBufferInUse", ruleGroup.Rules[13].Alert)
require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 300000000", ruleGroup.Rules[13].Expr)
require.Equal(t, "LogAgentBufferFull", ruleGroup.Rules[13].Alert)
require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 900000000", ruleGroup.Rules[13].Expr)

require.Equal(t, "LogAgentBufferFull", ruleGroup.Rules[14].Alert)
require.Equal(t, "telemetry_fsbuffer_usage_bytes{service=\"telemetry-fluent-bit-exporter-metrics\"} > 900000000", ruleGroup.Rules[14].Expr)
require.Equal(t, "LogAgentNoLogsDelivered", ruleGroup.Rules[14].Alert)
require.Equal(t, "(sum by (pipeline_name) (rate(fluentbit_input_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) > 0) and (sum by (pipeline_name) (rate(fluentbit_output_proc_bytes_total{service=\"telemetry-fluent-bit-metrics\"}[5m])) == 0)", ruleGroup.Rules[14].Expr)
}

func TestMatchesLogPipelineRule(t *testing.T) {
Expand Down
11 changes: 3 additions & 8 deletions internal/selfmonitor/prober/log_pipeline_prober.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@ func (p *LogPipelineProber) someDataDropped(alerts []promv1.Alert, pipelineName
}

// noLogsDelivered returns the result of p.isFiring for the
// config.RuleNameLogAgentNoLogsDelivered rule and the given pipeline.
//
// The check relies on that single dedicated rule instead of combining the
// separate "receiver read logs" and "exporter sent logs" alerts here, so there
// is exactly one return path and no unreachable statement after it.
func (p *LogPipelineProber) noLogsDelivered(alerts []promv1.Alert, pipelineName string) bool {
	return p.isFiring(alerts, config.RuleNameLogAgentNoLogsDelivered, pipelineName)
}

func (p *LogPipelineProber) bufferFillingUp(alerts []promv1.Alert, pipelineName string) bool {
Expand All @@ -78,11 +76,8 @@ func (p *LogPipelineProber) healthy(alerts []promv1.Alert, pipelineName string)
bufferInUse := p.isFiring(alerts, config.RuleNameLogAgentBufferInUse, pipelineName)
bufferFull := p.isFiring(alerts, config.RuleNameLogAgentBufferFull, pipelineName)
exporterDroppedLogs := p.isFiring(alerts, config.RuleNameLogAgentExporterDroppedLogs, pipelineName)

// The pipeline is healthy if either no logs are being read or all logs are being sent
receiverReadLogs := p.isFiring(alerts, config.RuleNameLogAgentReceiverReadLogs, pipelineName)
exporterSentLogs := p.isFiring(alerts, config.RuleNameLogAgentExporterSentLogs, pipelineName)
return !(bufferInUse || bufferFull || exporterDroppedLogs) && (!receiverReadLogs || exporterSentLogs)
noLogsDelivered := p.isFiring(alerts, config.RuleNameLogAgentNoLogsDelivered, pipelineName)
return !(bufferInUse || bufferFull || exporterDroppedLogs || noLogsDelivered)
}

func (p *LogPipelineProber) isFiring(alerts []promv1.Alert, ruleName, pipelineName string) bool {
Expand Down
32 changes: 22 additions & 10 deletions internal/selfmonitor/prober/log_pipeline_prober_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,25 @@ func TestLogPipelineProber(t *testing.T) {
},
},
},
{
name: "pending alert",
pipelineName: "cls",
alerts: promv1.AlertsResult{
Alerts: []promv1.Alert{
{
Labels: model.LabelSet{
"alertname": "LogAgentBufferFull",
TeodorSAP marked this conversation as resolved.
Show resolved Hide resolved
},
State: promv1.AlertStatePending,
},
},
},
expected: LogPipelineProbeResult{
PipelineProbeResult: PipelineProbeResult{
Healthy: true,
},
},
},
{
name: "alert missing pipeline_name label",
pipelineName: "cls",
Expand Down Expand Up @@ -222,13 +241,13 @@ func TestLogPipelineProber(t *testing.T) {
},
},
{
name: "receiver read logs and exporter did not send logs firing",
name: "no logs delivered firing",
pipelineName: "cls",
alerts: promv1.AlertsResult{
Alerts: []promv1.Alert{
{
Labels: model.LabelSet{
"alertname": "LogAgentReceiverReadLogs",
"alertname": "LogAgentNoLogsDelivered",
"pipeline_name": "cls",
},
State: promv1.AlertStateFiring,
Expand All @@ -240,17 +259,10 @@ func TestLogPipelineProber(t *testing.T) {
},
},
{
name: "exporter read logs and exporter sent logs firing",
name: "exporter sent logs firing",
pipelineName: "cls",
alerts: promv1.AlertsResult{
Alerts: []promv1.Alert{
{
Labels: model.LabelSet{
"alertname": "LogAgentReceiverReadLogs",
"pipeline_name": "cls",
},
State: promv1.AlertStateFiring,
},
{
Labels: model.LabelSet{
"alertname": "LogAgentExporterSentLogs",
Expand Down
19 changes: 19 additions & 0 deletions internal/selfmonitor/prober/otel_pipeline_prober_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,25 @@ func TestOTelPipelineProber(t *testing.T) {
},
},
},
{
name: "pending alert",
pipelineName: "cls",
alerts: promv1.AlertsResult{
Alerts: []promv1.Alert{
{
Labels: model.LabelSet{
"alertname": "TraceGatewayExporterDroppedData",
},
State: promv1.AlertStatePending,
},
},
},
expected: OTelPipelineProbeResult{
PipelineProbeResult: PipelineProbeResult{
Healthy: true,
},
},
},
{
name: "alert missing pipeline_name label",
pipelineName: "cls",
Expand Down
Loading