From 823aec5d2e110184e2e991a4a6644b1b149a59f2 Mon Sep 17 00:00:00 2001 From: Sridhar Gaddam Date: Mon, 5 Jun 2023 21:19:55 +0530 Subject: [PATCH] Fix race in GN pod for out-of-order RemoteEndpoint events When there is a gateway migration in a remote cluster or if there is any stale endpoint on the Broker associated with the remoteCluster, the events might come in out of order which can create issues for datapath connectivity. This PR includes the necessary checks in Globalnet pod to ignore any stale events. Related to: https://github.com/submariner-io/submariner/pull/2399 Signed-off-by: Sridhar Gaddam --- pkg/globalnet/controllers/gateway_monitor.go | 32 +++++++++++++++++--- pkg/globalnet/controllers/types.go | 21 +++++++------ 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/pkg/globalnet/controllers/gateway_monitor.go b/pkg/globalnet/controllers/gateway_monitor.go index 4fa189d1a..a6512d2f5 100644 --- a/pkg/globalnet/controllers/gateway_monitor.go +++ b/pkg/globalnet/controllers/gateway_monitor.go @@ -37,6 +37,7 @@ import ( "github.com/submariner-io/submariner/pkg/netlink" routeAgent "github.com/submariner-io/submariner/pkg/routeagent_driver/constants" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/dynamic" @@ -47,11 +48,12 @@ import ( func NewGatewayMonitor(spec Specification, localCIDRs []string, config *watcher.Config) (Interface, error) { // We'll panic if config is nil, this is intentional gatewayMonitor := &gatewayMonitor{ - baseController: newBaseController(), - spec: spec, - isGatewayNode: atomic.Bool{}, - localSubnets: sets.New(localCIDRs...).UnsortedList(), - remoteSubnets: sets.New[string](), + baseController: newBaseController(), + spec: spec, + isGatewayNode: atomic.Bool{}, + localSubnets: sets.New(localCIDRs...).UnsortedList(), + remoteSubnets: sets.New[string](), + remoteEndpointTimeStamp: map[string]metav1.Time{}, } var err error @@ -143,6 +145,14 @@ func (g *gatewayMonitor) handleCreatedOrUpdatedEndpoint(obj runtime.Object, _ in logger.V(log.DEBUG).Infof("In processNextEndpoint, endpoint info: %+v", endpoint) if endpoint.Spec.ClusterID != g.spec.ClusterID { + lastProcessedTime, ok := g.remoteEndpointTimeStamp[endpoint.Spec.ClusterID] + + if ok && lastProcessedTime.After(endpoint.CreationTimestamp.Time) { + logger.Infof("Ignoring new remote %#v since a later endpoint was already"+ + "processed", endpoint) + return false + } + logger.V(log.DEBUG).Infof("Endpoint %q, host: %q belongs to a remote cluster", endpoint.Spec.ClusterID, endpoint.Spec.Hostname) @@ -168,6 +178,8 @@ func (g *gatewayMonitor) handleCreatedOrUpdatedEndpoint(obj runtime.Object, _ in } } + g.remoteEndpointTimeStamp[endpoint.Spec.ClusterID] = endpoint.CreationTimestamp + return false } @@ -206,6 +218,16 @@ func (g *gatewayMonitor) handleCreatedOrUpdatedEndpoint(obj runtime.Object, _ in func (g *gatewayMonitor) handleRemovedEndpoint(obj runtime.Object, _ int) bool { endpoint := obj.(*v1.Endpoint) + lastProcessedTime, ok := g.remoteEndpointTimeStamp[endpoint.Spec.ClusterID] + + if ok && lastProcessedTime.After(endpoint.CreationTimestamp.Time) { + logger.Infof("Ignoring deleted remote %#v since a later endpoint was already"+ + "processed", endpoint) + return false + } + + delete(g.remoteEndpointTimeStamp, endpoint.Spec.ClusterID) + logger.V(log.DEBUG).Infof("Informed of removed endpoint for gateway monitor: %v", endpoint) hostname, err := os.Hostname() diff --git a/pkg/globalnet/controllers/types.go b/pkg/globalnet/controllers/types.go index b3a49acd6..5173cb2e2 100644 --- a/pkg/globalnet/controllers/types.go +++ b/pkg/globalnet/controllers/types.go @@ -91,16 +91,17 @@ type baseController struct { type gatewayMonitor struct { *baseController - syncerConfig *syncer.ResourceSyncerConfig - endpointWatcher watcher.Interface - spec Specification - ipt iptables.Interface - isGatewayNode atomic.Bool - nodeName string - localSubnets []string - remoteSubnets sets.Set[string] - controllersMutex sync.Mutex // Protects controllers - controllers []Interface + syncerConfig *syncer.ResourceSyncerConfig + endpointWatcher watcher.Interface + remoteEndpointTimeStamp map[string]metav1.Time + spec Specification + ipt iptables.Interface + isGatewayNode atomic.Bool + nodeName string + localSubnets []string + remoteSubnets sets.Set[string] + controllersMutex sync.Mutex // Protects controllers + controllers []Interface } type baseSyncerController struct {