Skip to content

Commit

Permalink
feat: support using exponential backoff between self heal attempts (#…
Browse files Browse the repository at this point in the history
…20275) (#20479)

Signed-off-by: Alexander Matyushentsev <AMatyushentsev@gmail.com>
  • Loading branch information
alexmt authored Oct 22, 2024
1 parent 4dab5bd commit db5876f
Show file tree
Hide file tree
Showing 17 changed files with 1,015 additions and 705 deletions.
7 changes: 6 additions & 1 deletion assets/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -6960,7 +6960,7 @@
},
"serverVersion": {
"type": "string",
"title": "DEPRECATED: use Info.ServerVersion field instead.\nThe server version"
"title": "Deprecated: use Info.ServerVersion field instead.\nThe server version"
},
"shard": {
"description": "Shard contains optional shard number. Calculated on the fly by the application controller if not specified.",
Expand Down Expand Up @@ -9020,6 +9020,11 @@
"description": "SyncOperation contains details about a sync operation.",
"type": "object",
"properties": {
"autoHealAttemptsCount": {
"type": "integer",
"format": "int64",
"title": "SelfHealAttemptsCount contains the number of auto-heal attempts"
},
"dryRun": {
"type": "boolean",
"title": "DryRun specifies to perform a `kubectl apply --dry-run` without actually performing the sync"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/redis/go-redis/v9"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"

Expand Down Expand Up @@ -53,6 +54,9 @@ func NewCommand() *cobra.Command {
repoServerAddress string
repoServerTimeoutSeconds int
selfHealTimeoutSeconds int
selfHealBackoffTimeoutSeconds int
selfHealBackoffFactor int
selfHealBackoffCapSeconds int
statusProcessors int
operationProcessors int
glogLevel int
Expand Down Expand Up @@ -148,6 +152,14 @@ func NewCommand() *cobra.Command {
kubectl := kubeutil.NewKubectl()
clusterSharding, err := sharding.GetClusterSharding(kubeClient, settingsMgr, shardingAlgorithm, enableDynamicClusterDistribution)
errors.CheckError(err)
var selfHealBackoff *wait.Backoff
if selfHealBackoffTimeoutSeconds != 0 {
selfHealBackoff = &wait.Backoff{
Duration: time.Duration(selfHealBackoffTimeoutSeconds) * time.Second,
Factor: float64(selfHealBackoffFactor),
Cap: time.Duration(selfHealBackoffCapSeconds) * time.Second,
}
}
appController, err = controller.NewApplicationController(
namespace,
settingsMgr,
Expand All @@ -160,6 +172,7 @@ func NewCommand() *cobra.Command {
hardResyncDuration,
time.Duration(appResyncJitter)*time.Second,
time.Duration(selfHealTimeoutSeconds)*time.Second,
selfHealBackoff,
time.Duration(repoErrorGracePeriod)*time.Second,
metricsPort,
metricsCacheExpiration,
Expand Down Expand Up @@ -209,7 +222,10 @@ func NewCommand() *cobra.Command {
command.Flags().IntVar(&glogLevel, "gloglevel", 0, "Set the glog logging level")
command.Flags().IntVar(&metricsPort, "metrics-port", common.DefaultPortArgoCDMetrics, "Start metrics server on given port")
command.Flags().DurationVar(&metricsCacheExpiration, "metrics-cache-expiration", env.ParseDurationFromEnv("ARGOCD_APPLICATION_CONTROLLER_METRICS_CACHE_EXPIRATION", 0*time.Second, 0, math.MaxInt64), "Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s)")
command.Flags().IntVar(&selfHealTimeoutSeconds, "self-heal-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_TIMEOUT_SECONDS", 5, 0, math.MaxInt32), "Specifies timeout between application self heal attempts")
command.Flags().IntVar(&selfHealTimeoutSeconds, "self-heal-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_TIMEOUT_SECONDS", 0, 0, math.MaxInt32), "Specifies timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts")
command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")
command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server")
Expand Down
27 changes: 24 additions & 3 deletions controller/appcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ type ApplicationController struct {
statusHardRefreshTimeout time.Duration
statusRefreshJitter time.Duration
selfHealTimeout time.Duration
selfHealBackOff *wait.Backoff
repoClientset apiclient.Clientset
db db.ArgoDB
settingsMgr *settings_util.SettingsManager
Expand Down Expand Up @@ -159,6 +160,7 @@ func NewApplicationController(
appHardResyncPeriod time.Duration,
appResyncJitter time.Duration,
selfHealTimeout time.Duration,
selfHealBackoff *wait.Backoff,
repoErrorGracePeriod time.Duration,
metricsPort int,
metricsCacheExpiration time.Duration,
Expand Down Expand Up @@ -198,6 +200,7 @@ func NewApplicationController(
auditLogger: argo.NewAuditLogger(namespace, kubeClientset, common.ApplicationController),
settingsMgr: settingsMgr,
selfHealTimeout: selfHealTimeout,
selfHealBackOff: selfHealBackoff,
clusterSharding: clusterSharding,
projByNameCache: sync.Map{},
applicationNamespaces: applicationNamespaces,
Expand Down Expand Up @@ -1878,6 +1881,9 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
InitiatedBy: appv1.OperationInitiator{Automated: true},
Retry: appv1.RetryStrategy{Limit: 5},
}
if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil {
op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount
}
if app.Spec.SyncPolicy.Retry != nil {
op.Retry = *app.Spec.SyncPolicy.Retry
}
Expand All @@ -1895,6 +1901,7 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
return nil, 0
} else if alreadyAttempted && selfHeal {
if shouldSelfHeal, retryAfter := ctrl.shouldSelfHeal(app); shouldSelfHeal {
op.Sync.SelfHealAttemptsCount++
for _, resource := range resources {
if resource.Status != appv1.SyncStatusCodeSynced {
op.Sync.Resources = append(op.Sync.Resources, appv1.SyncOperationResource{
Expand Down Expand Up @@ -2000,10 +2007,24 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application) (bool,
}

var retryAfter time.Duration
if app.Status.OperationState.FinishedAt == nil {
retryAfter = ctrl.selfHealTimeout
if ctrl.selfHealBackOff == nil {
if app.Status.OperationState.FinishedAt == nil {
retryAfter = ctrl.selfHealTimeout
} else {
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
}
} else {
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
backOff := *ctrl.selfHealBackOff
backOff.Steps = int(app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount)
var delay time.Duration
for backOff.Steps > 0 {
delay = backOff.Step()
}
if app.Status.OperationState.FinishedAt == nil {
retryAfter = delay
} else {
retryAfter = delay - time.Since(app.Status.OperationState.FinishedAt.Time)
}
}
return retryAfter <= 0, retryAfter
}
Expand Down
70 changes: 68 additions & 2 deletions controller/appcontroller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"testing"
"time"

clustercache "github.com/argoproj/gitops-engine/pkg/cache"
"github.com/argoproj/gitops-engine/pkg/utils/kube/kubetest"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/require"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"

clustercache "github.com/argoproj/gitops-engine/pkg/cache"
"k8s.io/utils/ptr"

"github.com/argoproj/argo-cd/v2/common"
statecache "github.com/argoproj/argo-cd/v2/controller/cache"
Expand Down Expand Up @@ -151,6 +153,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController {
time.Hour,
time.Second,
time.Minute,
nil,
time.Second*10,
common.DefaultPortArgoCDMetrics,
data.metricsCacheExpiration,
Expand Down Expand Up @@ -2132,3 +2135,66 @@ func TestAppStatusIsReplaced(t *testing.T) {
require.True(t, has)
require.Nil(t, val)
}

func assertDurationAround(t *testing.T, expected time.Duration, actual time.Duration) {
delta := time.Second / 2
assert.GreaterOrEqual(t, expected, actual-delta)
assert.LessOrEqual(t, expected, actual+delta)
}

func TestSelfHealExponentialBackoff(t *testing.T) {
ctrl := newFakeController(&fakeData{}, nil)
ctrl.selfHealBackOff = &wait.Backoff{
Factor: 3,
Duration: 2 * time.Second,
Cap: 5 * time.Minute,
}

app := &v1alpha1.Application{
Status: v1alpha1.ApplicationStatus{
OperationState: &v1alpha1.OperationState{
Operation: v1alpha1.Operation{
Sync: &v1alpha1.SyncOperation{},
},
},
},
}

testCases := []struct {
attempts int64
finishedAt *metav1.Time
expectedDuration time.Duration
shouldSelfHeal bool
}{{
attempts: 0,
finishedAt: ptr.To(metav1.Now()),
expectedDuration: 0,
shouldSelfHeal: true,
}, {
attempts: 1,
finishedAt: ptr.To(metav1.Now()),
expectedDuration: 2 * time.Second,
shouldSelfHeal: false,
}, {
attempts: 2,
finishedAt: ptr.To(metav1.Now()),
expectedDuration: 6 * time.Second,
shouldSelfHeal: false,
}, {
attempts: 3,
finishedAt: nil,
expectedDuration: 18 * time.Second,
shouldSelfHeal: false,
}}

for i := range testCases {
tc := testCases[i]
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount = tc.attempts
app.Status.OperationState.FinishedAt = tc.finishedAt
ok, duration := ctrl.shouldSelfHeal(app)
require.Equal(t, ok, tc.shouldSelfHeal)
assertDurationAround(t, tc.expectedDuration, duration)
})
}
}
7 changes: 5 additions & 2 deletions docs/operator-manual/argocd-cmd-params-cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ data:
controller.log.level: "info"
# Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s)
controller.metrics.cache.expiration: "24h0m0s"
# Specifies timeout between application self heal attempts (default 5)
controller.self.heal.timeout.seconds: "5"
# Specifies exponential backoff timeout parameters between application self heal attempts
controller.self.heal.timeout.seconds: "2"
controller.self.heal.backoff.factor: "3"
controller.self.heal.backoff.cap.seconds: "300"

# Cache expiration for app state (default 1h0m0s)
controller.app.state.cache.expiration: "1h0m0s"
# Specifies if resource health should be persisted in app CRD (default true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ argocd-application-controller [flags]
--repo-server-strict-tls Whether to use strict validation of the TLS cert presented by the repo server
--repo-server-timeout-seconds int Repo server RPC call timeout seconds. (default 60)
--request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0")
--self-heal-timeout-seconds int Specifies timeout between application self heal attempts (default 5)
--self-heal-backoff-cap-seconds int Specifies max timeout of exponential backoff between application self heal attempts (default 300)
--self-heal-backoff-factor int Specifies factor of exponential timeout between application self heal attempts (default 3)
--self-heal-backoff-timeout-seconds int Specifies initial timeout of exponential backoff between self heal attempts (default 2)
--self-heal-timeout-seconds int Specifies timeout between application self heal attempts
--sentinel stringArray Redis sentinel hostname and port (e.g. argocd-redis-ha-announce-0:6379).
--sentinelmaster string Redis sentinel master group name. (default "master")
--server string The address and port of the Kubernetes API server
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,24 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.factor
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,24 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.factor
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Expand Down
28 changes: 28 additions & 0 deletions manifests/core-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ spec:
sync:
description: Sync contains parameters for the operation
properties:
autoHealAttemptsCount:
description: SelfHealAttemptsCount contains the number of auto-heal
attempts
format: int64
type: integer
dryRun:
description: DryRun specifies to perform a `kubectl apply --dry-run`
without actually performing the sync
Expand Down Expand Up @@ -2537,6 +2542,11 @@ spec:
sync:
description: Sync contains parameters for the operation
properties:
autoHealAttemptsCount:
description: SelfHealAttemptsCount contains the number
of auto-heal attempts
format: int64
type: integer
dryRun:
description: DryRun specifies to perform a `kubectl apply
--dry-run` without actually performing the sync
Expand Down Expand Up @@ -21857,6 +21867,24 @@ spec:
key: controller.self.heal.timeout.seconds
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
valueFrom:
configMapKeyRef:
key: controller.self.heal.backoff.timeout.seconds
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
valueFrom:
configMapKeyRef:
key: controller.self.heal.backoff.factor
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
valueFrom:
configMapKeyRef:
key: controller.self.heal.backoff.cap.seconds
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Expand Down
10 changes: 10 additions & 0 deletions manifests/crds/application-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ spec:
sync:
description: Sync contains parameters for the operation
properties:
autoHealAttemptsCount:
description: SelfHealAttemptsCount contains the number of auto-heal
attempts
format: int64
type: integer
dryRun:
description: DryRun specifies to perform a `kubectl apply --dry-run`
without actually performing the sync
Expand Down Expand Up @@ -2536,6 +2541,11 @@ spec:
sync:
description: Sync contains parameters for the operation
properties:
autoHealAttemptsCount:
description: SelfHealAttemptsCount contains the number
of auto-heal attempts
format: int64
type: integer
dryRun:
description: DryRun specifies to perform a `kubectl apply
--dry-run` without actually performing the sync
Expand Down
Loading

0 comments on commit db5876f

Please sign in to comment.