Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
policystatemetrics: timeout for ListTracingPolicies
This patch adds a timeout for ListTracingPolicies, since the sensor manager may be stuck or misbehaving. Combined with the previous patch, it ensures that metrics collection continues after a timeout. Tested manually using: ```diff diff --git a/pkg/metrics/policystatemetrics/policystatemetrics_test.go b/pkg/metrics/policystatemetrics/policystatemetrics_test.go index 227306b65..fd581392b 100644 --- a/pkg/metrics/policystatemetrics/policystatemetrics_test.go +++ b/pkg/metrics/policystatemetrics/policystatemetrics_test.go @@ -9,6 +9,7 @@ import ( "io" "strings" "testing" + "time" "github.com/cilium/tetragon/pkg/observer" tus "github.com/cilium/tetragon/pkg/testutils/sensors" @@ -57,3 +58,22 @@ tetragon_tracingpolicy_loaded{state="load_error"} %d err = testutil.CollectAndCompare(collector, expectedMetrics(1, 0, 0, 0)) assert.NoError(t, err) } + +func TestTimeout(t *testing.T) { + reg := prometheus.NewRegistry() + + manager := tus.GetTestSensorManager(context.TODO(), t).Manager + observer.SetSensorManager(manager) + t.Cleanup(observer.ResetSensorManager) + + collector := newPolicyStateCollector() + reg.Register(collector) + + go func() { + err := manager.SleepForTesting(context.TODO(), t, 1*time.Second) + assert.NoError(t, err) + }() + + err := testutil.CollectAndCompare(collector, strings.NewReader("")) + assert.NoError(t, err) +} diff --git a/pkg/sensors/manager.go b/pkg/sensors/manager.go index eaf908340..291a58c8f 100644 --- a/pkg/sensors/manager.go +++ b/pkg/sensors/manager.go @@ -8,6 +8,8 @@ import ( "errors" "fmt" "strings" + "testing" + "time" "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1" @@ -96,6 +98,13 @@ func startSensorManager( logger.GetLogger().Debugf("stopping sensor controller...") done = true err = nil + + // NB(kkourt): for testing + case *sensorManagerSleep: + time.Sleep(op.d) + err = nil + default: err = fmt.Errorf("unknown sensorOp: %v", op) } @@ -421,6 +430,13 @@ type
sensorCtlStop struct { retChan chan error } +// sensorManagerSleep just sleeps. Intended only for testing. +type sensorManagerSleep struct { + ctx context.Context + retChan chan error + d time.Duration +} + type LoadArg struct{} type UnloadArg = LoadArg @@ -436,5 +452,18 @@ func (s *sensorEnable) sensorOpDone(e error) { s.retChan <- e } func (s *sensorDisable) sensorOpDone(e error) { s.retChan <- e } func (s *sensorList) sensorOpDone(e error) { s.retChan <- e } func (s *sensorCtlStop) sensorOpDone(e error) { s.retChan <- e } +func (s *sensorManagerSleep) sensorOpDone(e error) { s.retChan <- e } type sensorCtlHandle = chan<- sensorOp + +func (h *Manager) SleepForTesting(ctx context.Context, t *testing.T, d time.Duration) error { + retc := make(chan error) + op := &sensorManagerSleep{ + ctx: ctx, + retChan: retc, + d: d, + } + + h.sensorCtl <- op + return <-retc +} ``` Signed-off-by: Kornilios Kourtis <kornilios@isovalent.com>
- Loading branch information