Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reboot ArgoCD operator when a possible deadlock situation occurs #128

Merged
merged 2 commits into from
Jul 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/deepmap/oapi-codegen v1.11.0
github.com/projectsyn/lieutenant-api v0.7.0
github.com/stretchr/testify v1.8.0
go.uber.org/multierr v1.6.0
golang.org/x/crypto v0.0.0-20220525230936-793ad666bf5e
gopkg.in/alecthomas/kingpin.v2 v2.2.6
k8s.io/api v0.21.2
Expand Down Expand Up @@ -46,6 +47,7 @@ require (
github.com/taion809/haikunator v0.0.0-20150324135039-4e414e676fd1 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasttemplate v1.2.1 // indirect
go.uber.org/atomic v1.7.0 // indirect
golang.org/x/net v0.0.0-20220513224357-95641704303c // indirect
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d // indirect
golang.org/x/sys v0.0.0-20220513210249-45d2b4557a2a // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -805,13 +805,15 @@ go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/multierr v0.0.0-20180122172545-ddea229ff1df/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4=
go.uber.org/multierr v1.4.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4=
go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU=
go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA=
go.uber.org/zap v0.0.0-20180814183419-67bc79d13d15/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
Expand Down
1 change: 1 addition & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ func main() {
app.Flag("region", "Cloud region this cluster is running in").StringVar(&agent.CloudRegion)
app.Flag("distribution", "Kubernetes distribution this cluster is running").StringVar(&agent.Distribution)
app.Flag("namespace", "Namespace in which steward is running").Default("syn").StringVar(&agent.Namespace)
app.Flag("operator-namespace", "Namespace in which the ArgoCD operator will be running").Default("syn-argocd-operator").StringVar(&agent.OperatorNamespace)
app.Flag("argo-image", "Image to be used for the Argo CD deployments").Default(images.DefaultArgoCDImage).StringVar(&agent.ArgoCDImage)
app.Flag("redis-image", "Image to be used for the Argo CD Redis deployment").Default(images.DefaultRedisImage).StringVar(&agent.RedisImage)

Expand Down
21 changes: 11 additions & 10 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,16 @@ import (

// Agent configures the cluster agent
type Agent struct {
APIURL *url.URL
Token string
ClusterID string
CloudType string
CloudRegion string
Distribution string
Namespace string
ArgoCDImage string
RedisImage string
APIURL *url.URL
Token string
ClusterID string
CloudType string
CloudRegion string
Distribution string
Namespace string
OperatorNamespace string
ArgoCDImage string
RedisImage string

facts factCollector
}
Expand Down Expand Up @@ -125,7 +126,7 @@ func (a *Agent) registerCluster(ctx context.Context, config *rest.Config, apiCli
return
}

if err := argocd.Apply(ctx, config, a.Namespace, a.ArgoCDImage, a.RedisImage, apiClient, cluster); err != nil {
if err := argocd.Apply(ctx, config, a.Namespace, a.OperatorNamespace, a.ArgoCDImage, a.RedisImage, apiClient, cluster); err != nil {
klog.Error(err)
}
}
Expand Down
68 changes: 63 additions & 5 deletions pkg/argocd/argocd.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@ package argocd

import (
"context"
"fmt"
"time"

"github.com/projectsyn/lieutenant-api/pkg/api"
"go.uber.org/multierr"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/klog"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
)

Expand All @@ -34,7 +37,7 @@ var (
)

// Apply reconciles the Argo CD deployments
func Apply(ctx context.Context, config *rest.Config, namespace, argoImage, redisArgoImage string, apiClient *api.Client, cluster *api.Cluster) error {
func Apply(ctx context.Context, config *rest.Config, namespace, operatorNamespace, argoImage, redisArgoImage string, apiClient *api.Client, cluster *api.Cluster) error {
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return err
Expand All @@ -58,14 +61,16 @@ func Apply(ctx context.Context, config *rest.Config, namespace, argoImage, redis
}

if err == nil && len(argos.Items) > 0 {
return nil
// An ArgoCD custom resource exists in our namespace
err = fixArgoOperatorDeadlock(ctx, clientset, config, namespace, operatorNamespace)
return fmt.Errorf("could not fix argocd operator deadlock: %w", err)
}

deployments, err := clientset.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{
LabelSelector: "app.kubernetes.io/part-of=argocd",
})
if err != nil {
return err
return fmt.Errorf("Could not list ArgoCD deployments: %w", err)
}
expectedDeploymentCount := 3
foundDeploymentCount := len(deployments.Items)
Expand All @@ -74,7 +79,7 @@ func Apply(ctx context.Context, config *rest.Config, namespace, argoImage, redis
LabelSelector: "app.kubernetes.io/part-of=argocd",
})
if err != nil {
return err
return fmt.Errorf("Could not list ArgoCD statefulsets: %w", err)
}
expectedStatefulSetCount := 1
foundStatefulSetCount := len(statefulsets.Items)
Expand Down Expand Up @@ -123,3 +128,56 @@ func bootstrapArgo(ctx context.Context, clientset *kubernetes.Clientset, config

return nil
}

// fixArgoOperatorDeadlock restarts the ArgoCD operator pods when the ArgoCD
// installation in namespace appears to be stuck.
//
// It performs the following steps in order:
//
//  1. List the config maps labelled app.kubernetes.io/part-of=argocd in
//     namespace. If more than two exist, nothing is done and nil is returned.
//  2. List every pod in operatorNamespace. If any of them was created within
//     the last ten minutes, nothing is done and nil is returned, so that a
//     freshly (re)started operator is not restarted again straight away.
//  3. Look up the secret named argoSecretName in namespace and delete it when
//     it carries no owner references. A missing secret is not an error.
//  4. Delete every pod found in step 2.
//
// The config parameter is not used by this function.
//
// The returned error is nil on success or when no restart was necessary. List
// and secret failures are returned immediately; pod deletion failures are
// collected and returned together once every pod has been attempted.
func fixArgoOperatorDeadlock(ctx context.Context, clientset *kubernetes.Clientset, config *rest.Config, namespace, operatorNamespace string) error {
	const (
		// A restart is only considered while at most this many labelled
		// config maps exist.
		// NOTE(review): presumably a working operator creates more than
		// this many config maps — confirm against the operator's behaviour.
		maxConfigMapsWhenStuck = 2
		// Operator pods younger than this are left alone.
		restartGracePeriod = 10 * time.Minute
	)

	configmaps, err := clientset.CoreV1().ConfigMaps(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: "app.kubernetes.io/part-of=argocd",
	})
	if err != nil {
		return fmt.Errorf("Could not list ArgoCD config maps: %w", err)
	}
	if len(configmaps.Items) > maxConfigMapsWhenStuck {
		// no restart required
		return nil
	}

	podClient := clientset.CoreV1().Pods(operatorNamespace)
	pods, err := podClient.List(ctx, metav1.ListOptions{})
	if err != nil {
		return fmt.Errorf("Could not list ArgoCD operator pods: %w", err)
	}

	// Compute the cutoff once so every pod is compared against the same instant.
	cutoff := time.Now().Add(-restartGracePeriod)
	for i := range pods.Items {
		if pods.Items[i].CreationTimestamp.Time.After(cutoff) {
			klog.Info("ArgoCD Operator pod was recently created, waiting to reboot...")
			return nil
		}
	}

	// If there still exists an argocd-secret not managed by the operator
	// (i.e. without owner references), clean it up.
	secretClient := clientset.CoreV1().Secrets(namespace)
	secret, err := secretClient.Get(ctx, argoSecretName, metav1.GetOptions{})
	switch {
	case errors.IsNotFound(err):
		// Nothing to clean up.
	case err != nil:
		return fmt.Errorf("Could not get ArgoCD secret: %w", err)
	case len(secret.ObjectMeta.OwnerReferences) == 0:
		klog.Info("Deleting steward-managed ArgoCD secret")
		// The secret may vanish between Get and Delete; that is the desired
		// end state, so NotFound is not treated as a failure.
		if err := secretClient.Delete(ctx, argoSecretName, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
			return fmt.Errorf("Could not delete steward-managed ArgoCD secret: %w", err)
		}
	}

	klog.Info("Rebooting ArgoCD operator to resolve deadlock...")
	// Keep going after a failed delete so that as many pods as possible are
	// restarted; all failures are reported together.
	var errs error
	for i := range pods.Items {
		name := pods.Items[i].Name
		klog.Infof("Removing pod %s", name)
		err := podClient.Delete(ctx, name, metav1.DeleteOptions{})
		if err != nil && !errors.IsNotFound(err) {
			errs = multierr.Append(errs, fmt.Errorf("deleting pod %q: %w", name, err))
		}
	}

	return errs
}