diff --git a/cmd/spire-agent/cli/run/run.go b/cmd/spire-agent/cli/run/run.go index 627182b525..ccd309f223 100644 --- a/cmd/spire-agent/cli/run/run.go +++ b/cmd/spire-agent/cli/run/run.go @@ -90,6 +90,7 @@ type agentConfig struct { AllowUnauthenticatedVerifiers bool `hcl:"allow_unauthenticated_verifiers"` AllowedForeignJWTClaims []string `hcl:"allowed_foreign_jwt_claims"` AvailabilityTarget string `hcl:"availability_target"` + X509SVIDCacheMaxSize int `hcl:"x509_svid_cache_max_size"` AuthorizedDelegates []string `hcl:"authorized_delegates"` @@ -494,6 +495,11 @@ func NewAgentConfig(c *Config, logOptions []log.Option, allowUnknownConfig bool) ac.LogReopener = log.ReopenOnSignal(logger, reopenableFile) } + if c.Agent.X509SVIDCacheMaxSize < 0 { + return nil, errors.New("x509_svid_cache_max_size should not be negative") + } + ac.X509SVIDCacheMaxSize = c.Agent.X509SVIDCacheMaxSize + td, err := common_cli.ParseTrustDomain(c.Agent.TrustDomain, logger) if err != nil { return nil, err diff --git a/cmd/spire-agent/cli/run/run_test.go b/cmd/spire-agent/cli/run/run_test.go index 164b4e0e4d..a4938ef2fc 100644 --- a/cmd/spire-agent/cli/run/run_test.go +++ b/cmd/spire-agent/cli/run/run_test.go @@ -897,6 +897,42 @@ func TestNewAgentConfig(t *testing.T) { require.Nil(t, c) }, }, + { + msg: "x509_svid_cache_max_size is set", + input: func(c *Config) { + c.Agent.X509SVIDCacheMaxSize = 100 + }, + test: func(t *testing.T, c *agent.Config) { + require.EqualValues(t, 100, c.X509SVIDCacheMaxSize) + }, + }, + { + msg: "x509_svid_cache_max_size is not set", + input: func(c *Config) { + }, + test: func(t *testing.T, c *agent.Config) { + require.EqualValues(t, 0, c.X509SVIDCacheMaxSize) + }, + }, + { + msg: "x509_svid_cache_max_size is zero", + input: func(c *Config) { + c.Agent.X509SVIDCacheMaxSize = 0 + }, + test: func(t *testing.T, c *agent.Config) { + require.EqualValues(t, 0, c.X509SVIDCacheMaxSize) + }, + }, + { + msg: "x509_svid_cache_max_size is negative", + expectError: true, + input: 
func(c *Config) { + c.Agent.X509SVIDCacheMaxSize = -10 + }, + test: func(t *testing.T, c *agent.Config) { + require.Nil(t, c) + }, + }, { msg: "allowed_foreign_jwt_claims provided", input: func(c *Config) { diff --git a/doc/spire_agent.md b/doc/spire_agent.md index c9e82ea940..7f8cd81d98 100644 --- a/doc/spire_agent.md +++ b/doc/spire_agent.md @@ -70,6 +70,7 @@ This may be useful for templating configuration files, for example across differ | `trust_domain` | The trust domain that this agent belongs to (should be no more than 255 characters) | | | `workload_x509_svid_key_type` | The workload X509 SVID key type <rsa-2048|ec-p256> | ec-p256 | | `availability_target` | The minimum amount of time desired to gracefully handle SPIRE Server or Agent downtime. This configurable influences how aggressively X509 SVIDs should be rotated. If set, must be at least 24h. See [Availability Target](#availability-target) | | +| `x509_svid_cache_max_size` | Soft limit of max number of SVIDs that would be stored in LRU cache | 1000 | | experimental | Description | Default | |:------------------------------|--------------------------------------------------------------------------------------|-------------------------| diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index b6358ec7b5..6b3f2d62a7 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -275,6 +275,7 @@ func (a *Agent) newManager(ctx context.Context, sto storage.Storage, cat catalog Storage: sto, SyncInterval: a.c.SyncInterval, UseSyncAuthorizedEntries: a.c.UseSyncAuthorizedEntries, + SVIDCacheMaxSize: a.c.X509SVIDCacheMaxSize, SVIDStoreCache: cache, NodeAttestor: na, RotationStrategy: rotationutil.NewRotationStrategy(a.c.AvailabilityTarget), diff --git a/pkg/agent/api/delegatedidentity/v1/service_test.go b/pkg/agent/api/delegatedidentity/v1/service_test.go index 6aaec07928..6375354d9c 100644 --- a/pkg/agent/api/delegatedidentity/v1/service_test.go +++ b/pkg/agent/api/delegatedidentity/v1/service_test.go @@ -996,7 
+996,7 @@ func (m *FakeManager) SubscribeToBundleChanges() *cache.BundleStream { func newTestCache() *cache.LRUCache { log, _ := test.NewNullLogger() - return cache.NewLRUCache(log, trustDomain1, bundle1, telemetry.Blackhole{}, clock.New()) + return cache.NewLRUCache(log, trustDomain1, bundle1, telemetry.Blackhole{}, cache.DefaultSVIDCacheMaxSize, clock.New()) } func generateSubscribeToX509SVIDMetrics() []fakemetrics.MetricItem { diff --git a/pkg/agent/config.go b/pkg/agent/config.go index 1239b820c6..b7b21a7d8e 100644 --- a/pkg/agent/config.go +++ b/pkg/agent/config.go @@ -66,6 +66,9 @@ type Config struct { // is used to sync entries from the server. UseSyncAuthorizedEntries bool + // X509SVIDCacheMaxSize is a soft limit of max number of SVIDs that would be stored in cache + X509SVIDCacheMaxSize int + // Trust domain and associated CA bundle TrustDomain spiffeid.TrustDomain TrustBundle []*x509.Certificate diff --git a/pkg/agent/manager/cache/lru_cache.go b/pkg/agent/manager/cache/lru_cache.go index 49d6bc5e32..f3786ea296 100644 --- a/pkg/agent/manager/cache/lru_cache.go +++ b/pkg/agent/manager/cache/lru_cache.go @@ -20,8 +20,8 @@ import ( ) const ( - // SVIDCacheMaxSize is the size for the cache - SVIDCacheMaxSize = 1000 + // DefaultSVIDCacheMaxSize is set when svidCacheMaxSize is not provided + DefaultSVIDCacheMaxSize = 1000 // SVIDSyncInterval is the interval at which SVIDs are synced with subscribers SVIDSyncInterval = 500 * time.Millisecond // Default batch size for processing tainted SVIDs @@ -139,6 +139,9 @@ type LRUCache struct { // svids are stored by entry IDs svids map[string]*X509SVID + // svidCacheMaxSize is a soft limit of max number of SVIDs that would be stored in cache + svidCacheMaxSize int + subscribeBackoffFn func() backoff.BackOff processingBatchSize int @@ -146,7 +149,11 @@ type LRUCache struct { taintedBatchProcessedCh chan struct{} } -func NewLRUCache(log logrus.FieldLogger, trustDomain spiffeid.TrustDomain, bundle *Bundle, metrics 
telemetry.Metrics, clk clock.Clock) *LRUCache { +func NewLRUCache(log logrus.FieldLogger, trustDomain spiffeid.TrustDomain, bundle *Bundle, metrics telemetry.Metrics, svidCacheMaxSize int, clk clock.Clock) *LRUCache { + if svidCacheMaxSize <= 0 { + svidCacheMaxSize = DefaultSVIDCacheMaxSize + } + return &LRUCache{ BundleCache: NewBundleCache(trustDomain, bundle), JWTSVIDCache: NewJWTSVIDCache(), @@ -160,8 +167,9 @@ func NewLRUCache(log logrus.FieldLogger, trustDomain spiffeid.TrustDomain, bundl bundles: map[spiffeid.TrustDomain]*spiffebundle.Bundle{ trustDomain: bundle, }, - svids: make(map[string]*X509SVID), - clk: clk, + svids: make(map[string]*X509SVID), + svidCacheMaxSize: svidCacheMaxSize, + clk: clk, subscribeBackoffFn: func() backoff.BackOff { return backoff.NewBackoff(clk, SVIDSyncInterval) }, @@ -433,7 +441,7 @@ func (c *LRUCache) UpdateEntries(update *UpdateEntries, checkSVID func(*common.R // entries with active subscribers which are not cached will be put in staleEntries map; // irrespective of what svid cache size as we cannot deny identity to a subscriber activeSubsByEntryID, recordsWithLastAccessTime := c.syncSVIDsWithSubscribers() - extraSize := len(c.svids) - SVIDCacheMaxSize + extraSize := len(c.svids) - c.svidCacheMaxSize // delete svids without subscribers and which have not been accessed since svidCacheExpiryTime if extraSize > 0 { @@ -772,7 +780,7 @@ func (c *LRUCache) syncSVIDsWithSubscribers() (map[string]struct{}, []recordAcce lastAccessTimestamps = append(lastAccessTimestamps, newRecordAccessEvent(record.lastAccessTimestamp, id)) } - remainderSize := SVIDCacheMaxSize - len(c.svids) + remainderSize := c.svidCacheMaxSize - len(c.svids) // add records which are not cached for remainder of cache size for id := range c.records { if len(c.staleEntries) >= remainderSize { diff --git a/pkg/agent/manager/cache/lru_cache_test.go b/pkg/agent/manager/cache/lru_cache_test.go index d1c1e62543..3047631e57 100644 --- 
a/pkg/agent/manager/cache/lru_cache_test.go +++ b/pkg/agent/manager/cache/lru_cache_test.go @@ -659,7 +659,8 @@ func TestLRUCacheSubscriberNotNotifiedOnOverlappingSVIDChanges(t *testing.T) { func TestLRUCacheSVIDCacheExpiry(t *testing.T) { clk := clock.NewMock(t) - cache := newTestLRUCacheWithConfig(clk) + svidCacheMaxSize := 10 + cache := newTestLRUCacheWithConfig(svidCacheMaxSize, clk) clk.Add(1 * time.Second) foo := makeRegistrationEntry("FOO", "A") @@ -703,7 +704,7 @@ func TestLRUCacheSVIDCacheExpiry(t *testing.T) { // Move clk by 2 seconds clk.Add(2 * time.Second) // update total of size+2 entries - updateEntries := createUpdateEntries(SVIDCacheMaxSize, makeBundles(bundleV1)) + updateEntries := createUpdateEntries(svidCacheMaxSize, makeBundles(bundleV1)) updateEntries.RegistrationEntries[foo.EntryId] = foo updateEntries.RegistrationEntries[bar.EntryId] = bar @@ -720,10 +721,10 @@ func TestLRUCacheSVIDCacheExpiry(t *testing.T) { sub.Finish() } } - assert.Equal(t, SVIDCacheMaxSize+2, cache.CountX509SVIDs()) + assert.Equal(t, svidCacheMaxSize+2, cache.CountX509SVIDs()) cache.UpdateEntries(updateEntries, nil) - assert.Equal(t, SVIDCacheMaxSize, cache.CountX509SVIDs()) + assert.Equal(t, svidCacheMaxSize, cache.CountX509SVIDs()) // foo SVID should be removed from cache as it does not have active subscriber assert.False(t, cache.notifySubscriberIfSVIDAvailable(makeSelectors("A"), subA.(*lruCacheSubscriber))) @@ -739,24 +740,25 @@ func TestLRUCacheSVIDCacheExpiry(t *testing.T) { require.Len(t, cache.GetStaleEntries(), 1) assert.Equal(t, foo, cache.GetStaleEntries()[0].Entry) - assert.Equal(t, SVIDCacheMaxSize, cache.CountX509SVIDs()) + assert.Equal(t, svidCacheMaxSize, cache.CountX509SVIDs()) } func TestLRUCacheMaxSVIDCacheSize(t *testing.T) { clk := clock.NewMock(t) - cache := newTestLRUCacheWithConfig(clk) + svidCacheMaxSize := 10 + cache := newTestLRUCacheWithConfig(svidCacheMaxSize, clk) - // create entries more than maxSvidCacheSize - updateEntries := 
createUpdateEntries(SVIDCacheMaxSize+2, makeBundles(bundleV1)) + // create entries more than svidCacheMaxSize + updateEntries := createUpdateEntries(svidCacheMaxSize+2, makeBundles(bundleV1)) cache.UpdateEntries(updateEntries, nil) - require.Len(t, cache.GetStaleEntries(), SVIDCacheMaxSize) + require.Len(t, cache.GetStaleEntries(), svidCacheMaxSize) cache.UpdateSVIDs(&UpdateSVIDs{ X509SVIDs: makeX509SVIDsFromStaleEntries(cache.GetStaleEntries()), }) require.Len(t, cache.GetStaleEntries(), 0) - assert.Equal(t, SVIDCacheMaxSize, cache.CountX509SVIDs()) + assert.Equal(t, svidCacheMaxSize, cache.CountX509SVIDs()) // Validate that active subscriber will still get SVID even if SVID count is at maxSvidCacheSize foo := makeRegistrationEntry("FOO", "A") @@ -767,25 +769,26 @@ func TestLRUCacheMaxSVIDCacheSize(t *testing.T) { cache.UpdateEntries(updateEntries, nil) require.Len(t, cache.GetStaleEntries(), 1) - assert.Equal(t, SVIDCacheMaxSize, cache.CountX509SVIDs()) + assert.Equal(t, svidCacheMaxSize, cache.CountX509SVIDs()) cache.UpdateSVIDs(&UpdateSVIDs{ X509SVIDs: makeX509SVIDs(foo), }) - assert.Equal(t, SVIDCacheMaxSize+1, cache.CountX509SVIDs()) + assert.Equal(t, svidCacheMaxSize+1, cache.CountX509SVIDs()) require.Len(t, cache.GetStaleEntries(), 0) } func TestSyncSVIDsWithSubscribers(t *testing.T) { clk := clock.NewMock(t) - cache := newTestLRUCacheWithConfig(clk) + svidCacheMaxSize := 5 + cache := newTestLRUCacheWithConfig(svidCacheMaxSize, clk) - updateEntries := createUpdateEntries(SVIDCacheMaxSize, makeBundles(bundleV1)) + updateEntries := createUpdateEntries(svidCacheMaxSize, makeBundles(bundleV1)) cache.UpdateEntries(updateEntries, nil) cache.UpdateSVIDs(&UpdateSVIDs{ X509SVIDs: makeX509SVIDsFromStaleEntries(cache.GetStaleEntries()), }) - assert.Equal(t, SVIDCacheMaxSize, cache.CountX509SVIDs()) + assert.Equal(t, svidCacheMaxSize, cache.CountX509SVIDs()) // Update foo but its SVID is not yet cached foo := makeRegistrationEntry("FOO", "A") @@ -803,7 +806,7 @@ func 
TestSyncSVIDsWithSubscribers(t *testing.T) { require.Len(t, cache.GetStaleEntries(), 1) assert.Equal(t, []*StaleEntry{{Entry: cache.records[foo.EntryId].entry}}, cache.GetStaleEntries()) - assert.Equal(t, SVIDCacheMaxSize, cache.CountX509SVIDs()) + assert.Equal(t, svidCacheMaxSize, cache.CountX509SVIDs()) } func TestNotifySubscriberWhenSVIDIsAvailable(t *testing.T) { @@ -828,12 +831,13 @@ func TestNotifySubscriberWhenSVIDIsAvailable(t *testing.T) { func TestSubscribeToWorkloadUpdatesLRUNoSelectors(t *testing.T) { clk := clock.NewMock(t) - cache := newTestLRUCacheWithConfig(clk) + svidCacheMaxSize := 1 + cache := newTestLRUCacheWithConfig(svidCacheMaxSize, clk) // Creating test entries, but this will not affect current test... foo := makeRegistrationEntry("FOO", "A") bar := makeRegistrationEntry("BAR", "B") - updateEntries := createUpdateEntries(SVIDCacheMaxSize, makeBundles(bundleV1)) + updateEntries := createUpdateEntries(svidCacheMaxSize, makeBundles(bundleV1)) updateEntries.RegistrationEntries[foo.EntryId] = foo updateEntries.RegistrationEntries[bar.EntryId] = bar cache.UpdateEntries(updateEntries, nil) @@ -874,7 +878,7 @@ func TestSubscribeToWorkloadUpdatesLRUNoSelectors(t *testing.T) { <-subWaitCh cache.SyncSVIDsWithSubscribers() - assert.Len(t, cache.GetStaleEntries(), SVIDCacheMaxSize) + assert.Len(t, cache.GetStaleEntries(), svidCacheMaxSize) cache.UpdateSVIDs(&UpdateSVIDs{ X509SVIDs: makeX509SVIDs(foo, bar), }) @@ -890,7 +894,7 @@ func TestSubscribeToWorkloadUpdatesLRUNoSelectors(t *testing.T) { func TestSubscribeToLRUCacheChanges(t *testing.T) { clk := clock.NewMock(t) - cache := newTestLRUCacheWithConfig(clk) + cache := newTestLRUCacheWithConfig(1, clk) foo := makeRegistrationEntry("FOO", "A") bar := makeRegistrationEntry("BAR", "B") @@ -979,7 +983,7 @@ func TestTaintX509SVIDs(t *testing.T) { batchProcessedCh := make(chan struct{}, 1) // Initialize cache with configuration - cache := newTestLRUCacheWithConfig(clk) + cache := newTestLRUCacheWithConfig(10, 
clk) cache.processingBatchSize = 4 cache.log = log cache.taintedBatchProcessedCh = batchProcessedCh @@ -1130,7 +1134,7 @@ func TestTaintX509SVIDsNoSVIDs(t *testing.T) { log.Level = logrus.DebugLevel // Initialize cache with configuration - cache := newTestLRUCacheWithConfig(clk) + cache := newTestLRUCacheWithConfig(10, clk) cache.log = log entries := createTestEntries(10) @@ -1216,9 +1220,19 @@ func TestMetrics(t *testing.T) { } func TestNewLRUCache(t *testing.T) { - // expected cache size - cache := newTestLRUCacheWithConfig(clock.NewMock(t)) - require.NotNil(t, cache) + // Negative value for svidCacheMaxSize should set default value in + // cache.svidCacheMaxSize + cache := newTestLRUCacheWithConfig(-5, clock.NewMock(t)) + require.Equal(t, DefaultSVIDCacheMaxSize, cache.svidCacheMaxSize) + + // Zero value for svidCacheMaxSize should set default value in + // cache.svidCacheMaxSize + cache = newTestLRUCacheWithConfig(0, clock.NewMock(t)) + require.Equal(t, DefaultSVIDCacheMaxSize, cache.svidCacheMaxSize) + + // Custom value for svidCacheMaxSize should propagate properly + cache = newTestLRUCacheWithConfig(55, clock.NewMock(t)) + require.Equal(t, 55, cache.svidCacheMaxSize) } func BenchmarkLRUCacheGlobalNotification(b *testing.B) { @@ -1269,12 +1283,12 @@ func BenchmarkLRUCacheGlobalNotification(b *testing.B) { func newTestLRUCache(t testing.TB) *LRUCache { log, _ := test.NewNullLogger() return NewLRUCache(log, spiffeid.RequireTrustDomainFromString("domain.test"), bundleV1, - telemetry.Blackhole{}, clock.NewMock(t)) + telemetry.Blackhole{}, 0, clock.NewMock(t)) } -func newTestLRUCacheWithConfig(clk clock.Clock) *LRUCache { +func newTestLRUCacheWithConfig(svidCacheMaxSize int, clk clock.Clock) *LRUCache { log, _ := test.NewNullLogger() - return NewLRUCache(log, trustDomain1, bundleV1, telemetry.Blackhole{}, clk) + return NewLRUCache(log, trustDomain1, bundleV1, telemetry.Blackhole{}, svidCacheMaxSize, clk) } // numEntries should not be more than 12 digits
diff --git a/pkg/agent/manager/config.go b/pkg/agent/manager/config.go index abba259284..16a33e3bde 100644 --- a/pkg/agent/manager/config.go +++ b/pkg/agent/manager/config.go @@ -66,7 +66,7 @@ func newManager(c *Config) *manager { } cache := managerCache.NewLRUCache(c.Log.WithField(telemetry.SubsystemName, telemetry.CacheManager), c.TrustDomain, c.Bundle, - c.Metrics, c.Clk) + c.Metrics, c.SVIDCacheMaxSize, c.Clk) rotCfg := &svid.RotatorConfig{ SVIDKeyManager: keymanager.ForSVID(c.Catalog.GetKeyManager()), diff --git a/test/integration/suites/fetch-x509-svids/04-create-registration-entries b/test/integration/suites/fetch-x509-svids/04-create-registration-entries index c70ae606b8..356c7ad908 100755 --- a/test/integration/suites/fetch-x509-svids/04-create-registration-entries +++ b/test/integration/suites/fetch-x509-svids/04-create-registration-entries @@ -1,8 +1,8 @@ #!/bin/bash -# LRU Cache size is 1000; we expect uid:1001 to receive all 1002 identities, -# and later on disconnect for the cache to be pruned back to 1000 -SIZE=1002 +# LRU Cache size is 8; we expect uid:1001 to receive all 10 identities, +# and later on disconnect for the cache to be pruned back to 8 +SIZE=10 # Create entries for uid 1001 for ((m=1;m<=$SIZE;m++)); do diff --git a/test/integration/suites/fetch-x509-svids/05-fetch-x509-svids b/test/integration/suites/fetch-x509-svids/05-fetch-x509-svids index e80fea74b0..2518884a74 100755 --- a/test/integration/suites/fetch-x509-svids/05-fetch-x509-svids +++ b/test/integration/suites/fetch-x509-svids/05-fetch-x509-svids @@ -1,7 +1,7 @@ #!/bin/bash -ENTRYCOUNT=1002 -CACHESIZE=1000 +ENTRYCOUNT=10 +CACHESIZE=8 X509SVIDCOUNT=$(docker compose exec -u 1001 -T spire-agent \ /opt/spire/bin/spire-agent api fetch x509 \ @@ -14,4 +14,4 @@ else fi # Call agent debug endpoints and check if extra X.509-SVIDs from cache are cleaned up -check-x509-svid-count "spire-agent" 1000 +check-x509-svid-count "spire-agent" $CACHESIZE diff --git 
a/test/integration/suites/fetch-x509-svids/06-create-registration-entries b/test/integration/suites/fetch-x509-svids/06-create-registration-entries index 5e50ceb10e..9870a14691 100755 --- a/test/integration/suites/fetch-x509-svids/06-create-registration-entries +++ b/test/integration/suites/fetch-x509-svids/06-create-registration-entries @@ -1,8 +1,8 @@ #!/bin/bash -# LRU Cache size is 1000; we expect uid:1002 to receive all 1002 identities, -# and later on disconnect for the cache to be pruned back to 1000 -SIZE=1002 +# LRU Cache size is 8; we expect uid:1002 to receive all 10 identities, +# and later on disconnect for the cache to be pruned back to 8 +SIZE=10 # Create entries for uid 1002 for ((m=1;m<=$SIZE;m++)); do diff --git a/test/integration/suites/fetch-x509-svids/07-fetch-x509-svids b/test/integration/suites/fetch-x509-svids/07-fetch-x509-svids index 103f2e3544..1c06253360 100755 --- a/test/integration/suites/fetch-x509-svids/07-fetch-x509-svids +++ b/test/integration/suites/fetch-x509-svids/07-fetch-x509-svids @@ -1,7 +1,7 @@ #!/bin/bash -ENTRYCOUNT=1002 -CACHESIZE=1000 +ENTRYCOUNT=10 +CACHESIZE=8 X509SVIDCOUNT=$(docker compose exec -u 1002 -T spire-agent \ /opt/spire/bin/spire-agent api fetch x509 \ diff --git a/test/integration/suites/fetch-x509-svids/conf/agent/agent.conf b/test/integration/suites/fetch-x509-svids/conf/agent/agent.conf index 11012a904b..3beb6fc476 100644 --- a/test/integration/suites/fetch-x509-svids/conf/agent/agent.conf +++ b/test/integration/suites/fetch-x509-svids/conf/agent/agent.conf @@ -7,6 +7,7 @@ agent { trust_bundle_path = "/opt/spire/conf/agent/bootstrap.crt" trust_domain = "domain.test" admin_socket_path = "/opt/debug.sock" + x509_svid_cache_max_size = 8 } plugins {