Skip to content

Commit

Permalink
Log partial selectors from workload attestation on context cancellati…
Browse files Browse the repository at this point in the history
…on (spiffe#4846)

* Log partial selectors from workload attestation on context cancellation

In the case when a client hangs up a connection to SPIRE Agent during
workload attestation (e.g. timeout, process dies, etc.) while the agent
is still discovering selectors from workload attestation plugins, it can
be helpful to log whatever selectors the agent could find for debugging
purposes. Today, only the pid is captured in logs in this case,
which is often not enough information to trace back the caller since:
- The caller process may have already died (e.g. crashlooping on startup)
- pid alone is only useful if you can query the state of running processes on the host around the time of workload attestation (often not practical)

Logging the partial set of selectors fetched by the agent before the
caller hung up can help diagnose affected workloads in case a workload attestation
plugin is misbehaving or a downstream dependency required by the plugin
is down/not responding with accurate information.

Signed-off-by: Ryan Turner <turner@uber.com>
  • Loading branch information
rturner3 authored and rushi47 committed Apr 11, 2024
1 parent 1ccf81b commit 69ff8e9
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 21 deletions.
19 changes: 16 additions & 3 deletions pkg/agent/attestor/workload/workload.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,33 @@ type attestor struct {
}

type Attestor interface {
Attest(ctx context.Context, pid int) []*common.Selector
Attest(ctx context.Context, pid int) ([]*common.Selector, error)
}

func New(config *Config) Attestor {
return newAttestor(config)
}

func newAttestor(config *Config) *attestor {
if config.selectorHook == nil {
config.selectorHook = func([]*common.Selector) {}
}

return &attestor{c: config}
}

type Config struct {
Catalog catalog.Catalog
Log logrus.FieldLogger
Metrics telemetry.Metrics

// Test hook called when selectors are obtained from a workload attestor plugin
selectorHook func([]*common.Selector)
}

// Attest invokes all workload attestor plugins against the provided PID. If an error
// is encountered, it is logged and selectors from the failing plugin are discarded.
func (wla *attestor) Attest(ctx context.Context, pid int) []*common.Selector {
func (wla *attestor) Attest(ctx context.Context, pid int) ([]*common.Selector, error) {
counter := telemetry_workload.StartAttestationCall(wla.c.Metrics)
defer counter.Done(nil)

Expand All @@ -63,8 +70,14 @@ func (wla *attestor) Attest(ctx context.Context, pid int) []*common.Selector {
select {
case s := <-sChan:
selectors = append(selectors, s...)
wla.c.selectorHook(selectors)
case err := <-errChan:
log.WithError(err).Error("Failed to collect all selectors for PID")
case <-ctx.Done():
// If the client times out before all workload attestation plugins have reported selectors or an error,
// it can be helpful to see the partial set of selectors discovered for debugging purposes.
log.WithField(telemetry.PartialSelectors, selectors).Error("Timed out collecting selectors for PID")
return nil, ctx.Err()
}
}

Expand All @@ -76,7 +89,7 @@ func (wla *attestor) Attest(ctx context.Context, pid int) []*common.Selector {
if pid != os.Getpid() {
log.WithField(telemetry.Selectors, selectors).Debug("PID attested to have selectors")
}
return selectors
return selectors, nil
}

// invokeAttestor invokes attestation against the supplied plugin. Should be called from a goroutine.
Expand Down
89 changes: 75 additions & 14 deletions pkg/agent/attestor/workload/workload_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ package attestor
import (
"context"
"errors"
"fmt"
"testing"

"github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/test"
"github.com/spiffe/spire/pkg/common/telemetry"
telemetry_workload "github.com/spiffe/spire/pkg/common/telemetry/agent/workloadapi"
Expand All @@ -25,15 +27,15 @@ var (

attestor1Pids = map[int32][]string{
1: nil,
2: []string{"bar"},
2: {"bar"},
// 3: attestor1 cannot attest process 3
4: []string{"bar"},
4: {"bar"},
}
attestor2Pids = map[int32][]string{
1: nil,
2: nil,
3: []string{"baz"},
4: []string{"baz"},
3: {"baz"},
4: {"baz"},
}
)

Expand All @@ -44,12 +46,14 @@ func TestWorkloadAttestor(t *testing.T) {
type WorkloadAttestorTestSuite struct {
suite.Suite

attestor *attestor
catalog *fakeagentcatalog.Catalog
attestor *attestor
catalog *fakeagentcatalog.Catalog
loggerHook *test.Hook
}

func (s *WorkloadAttestorTestSuite) SetupTest() {
log, _ := test.NewNullLogger()
log, hook := test.NewNullLogger()
s.loggerHook = hook

s.catalog = fakeagentcatalog.New()
s.attestor = newAttestor(&Config{
Expand All @@ -66,19 +70,23 @@ func (s *WorkloadAttestorTestSuite) TestAttestWorkload() {
)

// both attestors succeed but with no selectors
selectors := s.attestor.Attest(ctx, 1)
selectors, err := s.attestor.Attest(ctx, 1)
s.Assert().Nil(err)
s.Empty(selectors)

// attestor1 has selectors, but not attestor2
selectors = s.attestor.Attest(ctx, 2)
selectors, err = s.attestor.Attest(ctx, 2)
s.Assert().Nil(err)
spiretest.AssertProtoListEqual(s.T(), selectors1, selectors)

// attestor2 has selectors, attestor1 fails
selectors = s.attestor.Attest(ctx, 3)
selectors, err = s.attestor.Attest(ctx, 3)
s.Assert().Nil(err)
spiretest.AssertProtoListEqual(s.T(), selectors2, selectors)

// both have selectors
selectors = s.attestor.Attest(ctx, 4)
selectors, err = s.attestor.Attest(ctx, 4)
s.Assert().Nil(err)
util.SortSelectors(selectors)
combined := make([]*common.Selector, 0, len(selectors1)+len(selectors2))
combined = append(combined, selectors1...)
Expand All @@ -97,7 +105,8 @@ func (s *WorkloadAttestorTestSuite) TestAttestWorkloadMetrics() {
metrics := fakemetrics.New()
s.attestor.c.Metrics = metrics

selectors := s.attestor.Attest(ctx, 2)
selectors, err := s.attestor.Attest(ctx, 2)
s.Assert().Nil(err)

// Create expected metrics
expected := fakemetrics.New()
Expand All @@ -114,12 +123,13 @@ func (s *WorkloadAttestorTestSuite) TestAttestWorkloadMetrics() {
s.attestor.c.Metrics = metrics

// No selectors expected
selectors = s.attestor.Attest(ctx, 3)
selectors, err = s.attestor.Attest(ctx, 3)
s.Assert().Nil(err)
s.Empty(selectors)

// Create expected metrics with error key
expected = fakemetrics.New()
err := errors.New("some error")
err = errors.New("some error")
attestorCounter = telemetry_workload.StartAttestorCall(expected, "fake1")
attestorCounter.Done(&err)
telemetry_workload.AddDiscoveredSelectorsSample(expected, float32(0))
Expand All @@ -128,3 +138,54 @@ func (s *WorkloadAttestorTestSuite) TestAttestWorkloadMetrics() {

s.Require().Equal(expected.AllMetrics(), metrics.AllMetrics())
}

func (s *WorkloadAttestorTestSuite) TestAttestLogsPartialSelectorsOnContextCancellation() {
pid := 4
selectorC := make(chan []*common.Selector, 1)
s.attestor.c.selectorHook = func(selectors []*common.Selector) {
selectorC <- selectors
}

pluginC := make(chan struct{}, 1)
// Add one attestor that provides selectors and another that doesn't return before the test context is cancelled
s.catalog.SetWorkloadAttestors(
fakeworkloadattestor.New(s.T(), "fake1", attestor1Pids),
fakeworkloadattestor.NewTimeoutAttestor(s.T(), "faketimeoutattestor", pluginC),
)

defer func() {
// Unblock attestor that is blocking on channel
pluginC <- struct{}{}
}()

attestCh := make(chan struct{}, 1)
ctx, cancel := context.WithCancel(context.Background())
var selectors []*common.Selector
var attestErr error
go func(innerCtx context.Context, pid int) {
selectors, attestErr = s.attestor.Attest(innerCtx, pid)
attestCh <- struct{}{}
}(ctx, pid)

// Wait for one of the plugins to return selectors
partialSelectors := <-selectorC

// Cancel context to simulate caller hanging up in the middle of workload attestation
cancel()

// Wait for attestation goroutine to complete execution
<-attestCh

s.Assert().Nil(selectors)
s.Assert().Error(attestErr)
spiretest.AssertLogs(s.T(), s.loggerHook.AllEntries(), []spiretest.LogEntry{
{
Level: logrus.ErrorLevel,
Message: "Timed out collecting selectors for PID",
Data: logrus.Fields{
telemetry.PartialSelectors: fmt.Sprint(partialSelectors),
telemetry.PID: fmt.Sprint(pid),
},
},
})
}
5 changes: 4 additions & 1 deletion pkg/agent/endpoints/peertracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ func (a PeerTrackerAttestor) Attest(ctx context.Context) ([]*common.Selector, er
return nil, status.Error(codes.Internal, "peer tracker watcher missing from context")
}

selectors := a.Attestor.Attest(ctx, int(watcher.PID()))
selectors, err := a.Attestor.Attest(ctx, int(watcher.PID()))
if err != nil {
return nil, err
}

// Ensure that the original caller is still alive so that we know we didn't
// attest some other process that happened to be assigned the original PID
Expand Down
6 changes: 3 additions & 3 deletions pkg/agent/endpoints/peertracker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ func TestPeerTrackerAttestor(t *testing.T) {

type FakeAttestor struct{}

func (a FakeAttestor) Attest(_ context.Context, pid int) []*common.Selector {
func (a FakeAttestor) Attest(_ context.Context, pid int) ([]*common.Selector, error) {
if pid == os.Getpid() {
return []*common.Selector{{Type: "Type", Value: "Value"}}
return []*common.Selector{{Type: "Type", Value: "Value"}}, nil
}
return nil
return nil, nil
}

func WithFakeWatcher(alive bool) context.Context {
Expand Down
3 changes: 3 additions & 0 deletions pkg/common/telemetry/names.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,9 @@ const (
// ParentID tags parent ID for an entry
ParentID = "parent_id"

// PartialSelectors is a partial set of selectors for a workload
PartialSelectors = "partial_selectors"

// Path declares some logic path, likely on the file system
Path = "path"

Expand Down
32 changes: 32 additions & 0 deletions test/fakes/fakeworkloadattestor/timeout_attestor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package fakeworkloadattestor

import (
"context"
"testing"

workloadattestorv1 "github.com/spiffe/spire-plugin-sdk/proto/spire/plugin/agent/workloadattestor/v1"
"github.com/spiffe/spire/pkg/agent/plugin/workloadattestor"
"github.com/spiffe/spire/pkg/common/catalog"
"github.com/spiffe/spire/test/plugintest"
)

func NewTimeoutAttestor(t *testing.T, name string, c chan struct{}) workloadattestor.WorkloadAttestor {
server := workloadattestorv1.WorkloadAttestorPluginServer(&timeoutWorkloadAttestor{
c: c,
})
wa := new(workloadattestor.V1)
plugintest.Load(t, catalog.MakeBuiltIn(name, server), wa)
return wa
}

type timeoutWorkloadAttestor struct {
workloadattestorv1.UnimplementedWorkloadAttestorServer

c chan struct{}
}

func (twa *timeoutWorkloadAttestor) Attest(_ context.Context, _ *workloadattestorv1.AttestRequest) (*workloadattestorv1.AttestResponse, error) {
// Block on channel until test sends signal
<-twa.c
return &workloadattestorv1.AttestResponse{}, nil
}

0 comments on commit 69ff8e9

Please sign in to comment.