Skip to content

Commit

Permalink
fix: fix node resolution cache for nodes in maintenance mode
Browse files Browse the repository at this point in the history
There was a problem with the node resolution (a.k.a. DNS) cache of the nodes.
When a machine is in maintenance mode, there is a corresponding `MachineStatus` resource for it, but there isn't any `ClusterMachineIdentity`.
Both of these types trigger updates in the node resolution cache.
When a machine was never part of a cluster, the only source is `MachineStatus`, and the cache updates on it did not populate the machine ID in the cache.
This caused the GRPC router to pick the wrong destination.

Furthermore, we did not remove the cluster and node name information from the cache when a machine was removed from a cluster. This caused the cache to contain obsolete cluster information, causing Talos GRPC proxy to not proxy the requests correctly after a machine was removed from a cluster.

Co-authored-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
Signed-off-by: Utku Ozdemir <utku.ozdemir@siderolabs.com>
  • Loading branch information
utkuozdemir and Unix4ever committed Jan 31, 2025
1 parent 65244f6 commit 6f014b1
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 1 deletion.
3 changes: 3 additions & 0 deletions internal/backend/dns/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ func (d *Service) updateEntryByMachineStatus(res *omni.MachineStatus) {

info := d.machineIDToInfo[res.Metadata().ID()]

info.ID = res.Metadata().ID()
info.TalosVersion = version
info.managementEndpoint = res.TypedSpec().Value.ManagementAddress

Expand Down Expand Up @@ -316,6 +317,8 @@ func (d *Service) deleteIdentityMappings(id resource.ID) {
d.nodenameToID.remove(info.Name, id)
}

info.Cluster = ""
info.Name = ""
info.address = ""

d.machineIDToInfo[id] = info
Expand Down
46 changes: 45 additions & 1 deletion internal/backend/dns/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//nolint:goconst
package dns_test

import (
Expand Down Expand Up @@ -115,7 +116,7 @@ func (suite *ServiceSuite) TestResolve() {
// destroy the identity, assert that it doesn't resolve anymore
suite.Require().NoError(suite.state.Destroy(suite.ctx, identity.Metadata()))

expected = dns.NewInfo(cluster, "test-1", "test-1-node", "")
expected = dns.NewInfo("", "test-1", "", "")
expected.TalosVersion = machineStatus.TypedSpec().Value.TalosVersion

// still resolves by the node id, but has an empty address
Expand All @@ -129,6 +130,49 @@ func (suite *ServiceSuite) TestResolve() {
suite.assertResolve("test-1", zeroInfo)
}

func (suite *ServiceSuite) TestResolveAllocateAndDeallocate() {
expected := dns.NewInfo("", "test-1", "", "")

expected.TalosVersion = "3.2.1"

// In the maintenance mode, we only have MachineStatus, so we start with that
// (means cache will be initialized with the data on MachineStatus and nothing else - no ClusterMachineIdentity)
machineStatus := omni.NewMachineStatus(resources.DefaultNamespace, "test-1")

machineStatus.TypedSpec().Value.TalosVersion = "3.2.1"

suite.Require().NoError(suite.state.Create(suite.ctx, machineStatus))

suite.assertResolve("test-1", expected)

// allocate the machine to a cluster by creating a ClusterMachineIdentity

identity := omni.NewClusterMachineIdentity(resources.DefaultNamespace, "test-1")
identity.Metadata().Labels().Set(omni.LabelCluster, "test-cluster-1")

identity.TypedSpec().Value.Nodename = "test-1-node"

suite.Require().NoError(suite.state.Create(suite.ctx, identity))

// assert that cluster information gets resolved

expected.Cluster = "test-cluster-1"
expected.Name = "test-1-node"

suite.assertResolve("test-1", expected)

// deallocate the machine by destroying the ClusterMachineIdentity

suite.Require().NoError(suite.state.Destroy(suite.ctx, identity.Metadata()))

// assert that the machine still resolves but the cluster information is gone

expected.Cluster = ""
expected.Name = ""

suite.assertResolve("test-1", expected)
}

func (suite *ServiceSuite) assertResolveAddress(cluster, node, expected string) {
err := retry.Constant(3*time.Second, retry.WithUnits(100*time.Millisecond)).RetryWithContext(suite.ctx, func(context.Context) error {
resolved := suite.dnsService.Resolve(cluster, node)
Expand Down

0 comments on commit 6f014b1

Please sign in to comment.