diff --git a/go.mod b/go.mod index 0cbddd3..4f1da5b 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/deepmap/oapi-codegen v1.11.0 github.com/projectsyn/lieutenant-api v0.7.0 github.com/stretchr/testify v1.8.0 + go.uber.org/multierr v1.6.0 golang.org/x/crypto v0.0.0-20220525230936-793ad666bf5e gopkg.in/alecthomas/kingpin.v2 v2.2.6 k8s.io/api v0.21.2 @@ -46,6 +47,7 @@ require ( github.com/taion809/haikunator v0.0.0-20150324135039-4e414e676fd1 // indirect github.com/valyala/bytebufferpool v1.0.0 // indirect github.com/valyala/fasttemplate v1.2.1 // indirect + go.uber.org/atomic v1.7.0 // indirect golang.org/x/net v0.0.0-20220513224357-95641704303c // indirect golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d // indirect golang.org/x/sys v0.0.0-20220513210249-45d2b4557a2a // indirect diff --git a/go.sum b/go.sum index 0a69684..38bb40d 100644 --- a/go.sum +++ b/go.sum @@ -805,6 +805,7 @@ go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= +go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/multierr v0.0.0-20180122172545-ddea229ff1df/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= @@ -812,6 +813,7 @@ go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/ go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= go.uber.org/multierr v1.4.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= +go.uber.org/multierr v1.6.0 
h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA= go.uber.org/zap v0.0.0-20180814183419-67bc79d13d15/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= diff --git a/main.go b/main.go index 05f1c67..775382b 100644 --- a/main.go +++ b/main.go @@ -45,6 +45,7 @@ func main() { app.Flag("region", "Cloud region this cluster is running in").StringVar(&agent.CloudRegion) app.Flag("distribution", "Kubernetes distribution this cluster is running").StringVar(&agent.Distribution) app.Flag("namespace", "Namespace in which steward is running").Default("syn").StringVar(&agent.Namespace) + app.Flag("operator-namespace", "Namespace in which the ArgoCD operator will be running").Default("syn-argocd-operator").StringVar(&agent.OperatorNamespace) app.Flag("argo-image", "Image to be used for the Argo CD deployments").Default(images.DefaultArgoCDImage).StringVar(&agent.ArgoCDImage) app.Flag("redis-image", "Image to be used for the Argo CD Redis deployment").Default(images.DefaultRedisImage).StringVar(&agent.RedisImage) diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index 52a0411..8ee45d6 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -21,15 +21,16 @@ import ( // Agent configures the cluster agent type Agent struct { - APIURL *url.URL - Token string - ClusterID string - CloudType string - CloudRegion string - Distribution string - Namespace string - ArgoCDImage string - RedisImage string + APIURL *url.URL + Token string + ClusterID string + CloudType string + CloudRegion string + Distribution string + Namespace string + OperatorNamespace string + ArgoCDImage string + RedisImage string facts factCollector } @@ -125,7 +126,7 @@ func (a *Agent) registerCluster(ctx context.Context, config *rest.Config, apiCli return } - if err := argocd.Apply(ctx, config, a.Namespace, a.ArgoCDImage, 
a.RedisImage, apiClient, cluster); err != nil { + if err := argocd.Apply(ctx, config, a.Namespace, a.OperatorNamespace, a.ArgoCDImage, a.RedisImage, apiClient, cluster); err != nil { klog.Error(err) } }
diff --git a/pkg/argocd/argocd.go b/pkg/argocd/argocd.go index 6285840..0f9a74c 100644 --- a/pkg/argocd/argocd.go +++ b/pkg/argocd/argocd.go
@@ -2,15 +2,18 @@ package argocd import ( "context" + "fmt" + "time" "github.com/projectsyn/lieutenant-api/pkg/api" + "go.uber.org/multierr" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/klog" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" )
@@ -34,7 +37,7 @@ var ( ) // Apply reconciles the Argo CD deployments -func Apply(ctx context.Context, config *rest.Config, namespace, argoImage, redisArgoImage string, apiClient *api.Client, cluster *api.Cluster) error { +func Apply(ctx context.Context, config *rest.Config, namespace, operatorNamespace, argoImage, redisArgoImage string, apiClient *api.Client, cluster *api.Cluster) error { clientset, err := kubernetes.NewForConfig(config) if err != nil { return err
@@ -58,14 +61,19 @@ func Apply(ctx context.Context, config *rest.Config, namespace, argoImage, redis } if err == nil && len(argos.Items) > 0 { - return nil
+		// An ArgoCD custom resource exists in our namespace
+		// Wrap only on failure: fmt.Errorf yields a non-nil error even for a nil cause.
+		if err := fixArgoOperatorDeadlock(ctx, clientset, config, namespace, operatorNamespace); err != nil {
+			return fmt.Errorf("could not fix argocd operator deadlock: %w", err)
+		}
+		return nil
 } deployments, err := clientset.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{ LabelSelector: "app.kubernetes.io/part-of=argocd", }) if err != nil { - return err + return fmt.Errorf("Could not list ArgoCD deployments: %w", err) } expectedDeploymentCount := 3 foundDeploymentCount := len(deployments.Items)
@@ -74,7 +82,7 @@ func Apply(ctx context.Context, config *rest.Config, namespace, argoImage, redis LabelSelector: "app.kubernetes.io/part-of=argocd", }) if err != nil { - return err + return fmt.Errorf("Could not list ArgoCD statefulsets: %w", err) } expectedStatefulSetCount := 1 foundStatefulSetCount := len(statefulsets.Items)
@@ -123,3 +131,73 @@ func bootstrapArgo(ctx context.Context, clientset *kubernetes.Clientset, config return nil }
+
+// fixArgoOperatorDeadlock deletes the pods in operatorNamespace so that the
+// ArgoCD operator restarts, but only when the Argo CD instance in namespace
+// looks stuck.
+//
+// It returns nil without changing anything when more than configMapThreshold
+// config maps labelled app.kubernetes.io/part-of=argocd exist in namespace,
+// or when any pod in operatorNamespace is younger than restartGracePeriod.
+// Otherwise it first deletes the Argo CD secret if that secret has no owner
+// references, then deletes every pod in operatorNamespace. Pod deletion
+// errors are collected and returned together; a pod that has already
+// disappeared is not reported as an error.
+//
+// config is not used by this function.
+func fixArgoOperatorDeadlock(ctx context.Context, clientset *kubernetes.Clientset, config *rest.Config, namespace, operatorNamespace string) error {
+	const (
+		// NOTE(review): presumably the operator adds further config maps
+		// once it reconciles successfully; confirm why the cut-off is 2.
+		configMapThreshold = 2
+		// The restart is postponed while any operator pod is younger than this.
+		restartGracePeriod = 10 * time.Minute
+	)
+
+	configmaps, err := clientset.CoreV1().ConfigMaps(namespace).List(ctx, metav1.ListOptions{
+		LabelSelector: "app.kubernetes.io/part-of=argocd",
+	})
+	if err != nil {
+		return fmt.Errorf("Could not list ArgoCD config maps: %w", err)
+	}
+	if len(configmaps.Items) > configMapThreshold {
+		// no restart required
+		return nil
+	}
+
+	pods, err := clientset.CoreV1().Pods(operatorNamespace).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return fmt.Errorf("Could not list ArgoCD operator pods: %w", err)
+	}
+	for _, pod := range pods.Items {
+		if time.Since(pod.CreationTimestamp.Time) < restartGracePeriod {
+			klog.Info("ArgoCD Operator pod was recently created, waiting to reboot...")
+			return nil
+		}
+	}
+
+	// if there still exists an argocd-secret not managed by the operator, clean it up:
+	secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, argoSecretName, metav1.GetOptions{})
+	if err != nil && !errors.IsNotFound(err) {
+		return fmt.Errorf("Could not get ArgoCD secret: %w", err)
+	}
+	if err == nil && len(secret.ObjectMeta.OwnerReferences) == 0 {
+		klog.Info("Deleting steward-managed ArgoCD secret")
+		if err := clientset.CoreV1().Secrets(namespace).Delete(ctx, argoSecretName, metav1.DeleteOptions{}); err != nil {
+			return fmt.Errorf("Could not delete steward-managed ArgoCD secret: %w", err)
+		}
+	}
+
+	klog.Info("Rebooting ArgoCD operator to resolve deadlock...")
+	// Named errs, not errors, so the apimachinery errors package stays usable.
+	var errs []error
+	for _, pod := range pods.Items {
+		klog.Infof("Removing pod %s", pod.Name)
+		err := clientset.CoreV1().Pods(operatorNamespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+		if err != nil && !errors.IsNotFound(err) {
+			errs = append(errs, fmt.Errorf("could not delete ArgoCD operator pod %s: %w", pod.Name, err))
+		}
+	}
+
+	return multierr.Combine(errs...)
+}