From 5ce0f715dd1a1b70a488277a9f333a4556595efc Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 2 Apr 2024 18:06:38 +0200 Subject: [PATCH] policystatemetrics: timeout for ListTracingPolicies This patch adds a timeout for ListTracingPolicies. It can be the case that the sensor manager is stuck or misbehaving. This patch (combined with the previous one) ensures that metrics will continue after a timeout. Tested manually using: ```diff diff --git a/pkg/metrics/policystatemetrics/policystatemetrics_test.go b/pkg/metrics/policystatemetrics/policystatemetrics_test.go index 227306b65..fd581392b 100644 --- a/pkg/metrics/policystatemetrics/policystatemetrics_test.go +++ b/pkg/metrics/policystatemetrics/policystatemetrics_test.go @@ -9,6 +9,7 @@ import ( "io" "strings" "testing" + "time" "github.com/cilium/tetragon/pkg/observer" tus "github.com/cilium/tetragon/pkg/testutils/sensors" @@ -57,3 +58,22 @@ tetragon_tracingpolicy_loaded{state="load_error"} %d err = testutil.CollectAndCompare(collector, expectedMetrics(1, 0, 0, 0)) assert.NoError(t, err) } + +func TestTimeout(t *testing.T) { + reg := prometheus.NewRegistry() + + manager := tus.GetTestSensorManager(context.TODO(), t).Manager + observer.SetSensorManager(manager) + t.Cleanup(observer.ResetSensorManager) + + collector := newPolicyStateCollector() + reg.Register(collector) + + go func() { + err := manager.SleepForTesting(context.TODO(), t, 1*time.Second) + assert.NoError(t, err) + }() + + err := testutil.CollectAndCompare(collector, strings.NewReader("")) + assert.NoError(t, err) +} diff --git a/pkg/sensors/manager.go b/pkg/sensors/manager.go index eaf908340..291a58c8f 100644 --- a/pkg/sensors/manager.go +++ b/pkg/sensors/manager.go @@ -8,6 +8,8 @@ import ( "errors" "fmt" "strings" + "testing" + "time" "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1" @@ -96,6 +98,13 @@ func startSensorManager( logger.GetLogger().Debugf("stopping sensor controller...") done = true err = nil + + // NB(kkourt): for testing + case *sensorManagerSleep: + time.Sleep(op.d) + err = nil + default: err = fmt.Errorf("unknown sensorOp: %v", op) } @@ -421,6 +430,13 @@ type sensorCtlStop struct { retChan chan error } +// sensorManagerSleep just sleeps. Intended only for testing. +type sensorManagerSleep struct { + ctx context.Context + retChan chan error + d time.Duration +} + type LoadArg struct{} type UnloadArg = LoadArg @@ -436,5 +452,18 @@ func (s *sensorEnable) sensorOpDone(e error) { s.retChan <- e } func (s *sensorDisable) sensorOpDone(e error) { s.retChan <- e } func (s *sensorList) sensorOpDone(e error) { s.retChan <- e } func (s *sensorCtlStop) sensorOpDone(e error) { s.retChan <- e } +func (s *sensorManagerSleep) sensorOpDone(e error) { s.retChan <- e } type sensorCtlHandle = chan<- sensorOp + +func (h *Manager) SleepForTesting(ctx context.Context, t *testing.T, d time.Duration) error { + retc := make(chan error) + op := &sensorManagerSleep{ + ctx: ctx, + retChan: retc, + d: d, + } + + h.sensorCtl <- op + return <-retc +} ``` Signed-off-by: Kornilios Kourtis --- pkg/metrics/policystatemetrics/policystatemetrics.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/metrics/policystatemetrics/policystatemetrics.go b/pkg/metrics/policystatemetrics/policystatemetrics.go index e59b7721869..6bf2ef4107f 100644 --- a/pkg/metrics/policystatemetrics/policystatemetrics.go +++ b/pkg/metrics/policystatemetrics/policystatemetrics.go @@ -6,6 +6,7 @@ package policystatemetrics import ( "context" "strings" + "time" "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/logger" @@ -49,7 +50,10 @@ func (c *policyStateCollector) Collect(ch chan<- prometheus.Metric) { logger.GetLogger().Debug("failed retrieving the sensor manager: manager is nil") return } - list, err := sm.ListTracingPolicies(context.Background()) + + ctx, cancel := context.WithTimeout(context.TODO(), 900*time.Millisecond) + defer cancel() + list, err := sm.ListTracingPolicies(ctx) if err != nil { logger.GetLogger().WithError(err).Warn("error listing tracing policies to collect policies state") return