fix: abort node watch on hostname change
The method `WaitForCacheSync` might block for a long time, and it looks
like it can block forever if the requested nodename doesn't match the
actual nodename.

Allow the wait to be aborted by running it in a goroutine and notifying
readiness of the watcher via the `notifyCh`.

Fixes #10163

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
smira committed Jan 21, 2025
1 parent 99ba539 commit 0b7fc7c
Showing 2 changed files with 27 additions and 7 deletions.
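To make the pattern in the commit message concrete before the diff, here is a minimal, runnable Go sketch (hypothetical names, not the Talos code): the blocking wait moves into a goroutine, and the caller selects on both the notification channel and the context, so cancelling the context (e.g. on a hostname change) abandons the wait instead of blocking forever.

package main

import (
    "context"
    "fmt"
    "time"
)

// startWatch stands in for NodeWatcher.Watch: it starts a wait that may
// never finish and reports readiness on the returned channel.
func startWatch(ctx context.Context) <-chan struct{} {
    notifyCh := make(chan struct{}, 1)

    go func() {
        // stand-in for informerFactory.WaitForCacheSync(ctx.Done()), which
        // can block indefinitely if the requested node name never appears
        select {
        case <-time.After(10 * time.Second):
        case <-ctx.Done():
            return
        }

        select {
        case notifyCh <- struct{}{}:
        default:
        }
    }()

    return notifyCh
}

func main() {
    // the context is cancelled early, e.g. because the hostname changed
    ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer cancel()

    select {
    case <-startWatch(ctx):
        fmt.Println("node cache synced")
    case <-ctx.Done():
        fmt.Println("gave up waiting:", ctx.Err()) // the caller is no longer stuck
    }
}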
@@ -92,11 +92,25 @@ func (r *NodeWatcher) Watch(ctx context.Context, logger *zap.Logger) (<-chan str
 
     informerFactory.Start(ctx.Done())
 
-    logger.Debug("waiting for node cache sync")
+    go func() {
+        logger.Debug("waiting for node cache sync")
 
-    informerFactory.WaitForCacheSync(ctx.Done())
+        result := informerFactory.WaitForCacheSync(ctx.Done())
 
-    logger.Debug("node cache sync done")
+        var synced bool
+
+        // result should contain a single entry
+        for _, v := range result {
+            synced = v
+        }
+
+        logger.Debug("node cache sync done", zap.Bool("synced", synced))
+
+        select {
+        case notifyCh <- struct{}{}:
+        default:
+        }
+    }()
 
     return notifyCh, watchErrCh, informerFactory.Shutdown, nil
 }
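Two details in this hunk are worth noting. `WaitForCacheSync` on a client-go `SharedInformerFactory` returns a `map[reflect.Type]bool` with one entry per registered informer; since only the node informer is registered here, the loop collapses it to a single `synced` flag. And the send to `notifyCh` is non-blocking: if nobody is receiving and the channel has no free buffer space, the notification is simply dropped. A self-contained sketch of that idiom, assuming a channel of capacity 1 (its construction is outside this hunk):

package main

import "fmt"

func main() {
    // capacity 1: a single readiness notification can be parked even if the
    // consumer is not receiving at that exact moment
    notifyCh := make(chan struct{}, 1)

    notify := func() {
        select {
        case notifyCh <- struct{}{}:
        default: // a notification is already pending; dropping this one is safe
        }
    }

    notify()
    notify() // hits the default case: the channel is already full

    <-notifyCh
    fmt.Println("got readiness notification")
}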
14 changes: 10 additions & 4 deletions internal/app/machined/pkg/controllers/k8s/node_status.go

@@ -69,6 +69,7 @@ func (ctrl *NodeStatusController) Run(ctx context.Context, r controller.Runtime,
         watchErrCh   <-chan error
         notifyCloser func()
         watchErrors  int
+        watchReady   bool
     )
 
     closeWatcher := func() {
@@ -82,7 +83,6 @@ func (ctrl *NodeStatusController) Run(ctx context.Context, r controller.Runtime,
             notifyCloser = nil
             notifyCh = nil
             watchErrCh = nil
-            watchErrors = 0
         }
 
         if kubernetesClient != nil {
@@ -91,6 +91,8 @@ func (ctrl *NodeStatusController) Run(ctx context.Context, r controller.Runtime,
            kubernetesClient = nil
        }
 
+        watchErrors = 0
+        watchReady = false
         nodewatcher = nil
     }
@@ -103,15 +105,14 @@ func (ctrl *NodeStatusController) Run(ctx context.Context, r controller.Runtime,
        case <-r.EventCh():
        case <-notifyCh:
            watchErrors = 0
+            watchReady = true
        case watchErr := <-watchErrCh:
            logger.Error("node watch error", zap.Error(watchErr), zap.Int("error_count", watchErrors))
 
            watchErrors++
 
            if watchErrors >= watchErrorsThreshold {
                closeWatcher()
-
-                watchErrors = 0
            } else {
                // keep waiting
                continue
@@ -164,6 +165,11 @@ func (ctrl *NodeStatusController) Run(ctx context.Context, r controller.Runtime,
            }
        }
 
+       if !watchReady {
+           // node watcher is not ready yet, skip updating output resource
+           continue
+       }
+
        touchedIDs := make(map[resource.ID]struct{})
 
        node, err := nodewatcher.Get()
@@ -172,7 +178,7 @@ func (ctrl *NodeStatusController) Run(ctx context.Context, r controller.Runtime,
        }
 
        if node != nil {
-           if err = safe.WriterModify[*k8s.NodeStatus](ctx, r, k8s.NewNodeStatus(k8s.NamespaceName, node.Name),
+           if err = safe.WriterModify(ctx, r, k8s.NewNodeStatus(k8s.NamespaceName, node.Name),
                func(res *k8s.NodeStatus) error {
                    res.TypedSpec().Nodename = node.Name
                    res.TypedSpec().Unschedulable = node.Spec.Unschedulable
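Condensed, the controller loop after this commit behaves roughly as below. This is a hypothetical, self-contained skeleton, not the real controller: `watchErrorsThreshold`, `closeWatcher`, and the final reconcile print stand in for the actual Talos code.

package main

import (
    "context"
    "fmt"
    "time"
)

const watchErrorsThreshold = 5

// run is a condensed stand-in for NodeStatusController.Run after this commit.
func run(ctx context.Context, notifyCh <-chan struct{}, watchErrCh <-chan error) error {
    var (
        watchErrors int
        watchReady  bool
    )

    closeWatcher := func() {
        // ... tear down the watcher and the kubernetes client ...
        watchErrors = 0
        watchReady = false
    }

    for {
        select {
        case <-ctx.Done():
            return nil
        case <-notifyCh:
            // the watcher's cache synced (or the sync attempt finished)
            watchErrors = 0
            watchReady = true
        case err := <-watchErrCh:
            fmt.Println("node watch error:", err, "count:", watchErrors)

            watchErrors++

            if watchErrors >= watchErrorsThreshold {
                closeWatcher()
            } else {
                continue // keep waiting
            }
        }

        if !watchReady {
            // cache not synced yet: don't publish a NodeStatus from an empty cache
            continue
        }

        fmt.Println("reconciling node status")
    }
}

func main() {
    notifyCh := make(chan struct{}, 1)
    watchErrCh := make(chan error)

    notifyCh <- struct{}{} // simulate the watcher signalling readiness

    ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
    defer cancel()

    _ = run(ctx, notifyCh, watchErrCh) // prints "reconciling node status", then exits
}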
