Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support using exponential backoff between self heal attempts (#20275) #20478

Merged
merged 1 commit into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions assets/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -8884,6 +8884,11 @@
"description": "SyncOperation contains details about a sync operation.",
"type": "object",
"properties": {
"autoHealAttemptsCount": {
"type": "integer",
"format": "int64",
"title": "SelfHealAttemptsCount contains the number of auto-heal attempts"
},
"dryRun": {
"type": "boolean",
"title": "DryRun specifies to perform a `kubectl apply --dry-run` without actually performing the sync"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/redis/go-redis/v9"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"

Expand Down Expand Up @@ -53,6 +54,9 @@ func NewCommand() *cobra.Command {
repoServerAddress string
repoServerTimeoutSeconds int
selfHealTimeoutSeconds int
selfHealBackoffTimeoutSeconds int
selfHealBackoffFactor int
selfHealBackoffCapSeconds int
statusProcessors int
operationProcessors int
glogLevel int
Expand Down Expand Up @@ -148,6 +152,14 @@ func NewCommand() *cobra.Command {
kubectl := kubeutil.NewKubectl()
clusterSharding, err := sharding.GetClusterSharding(kubeClient, settingsMgr, shardingAlgorithm, enableDynamicClusterDistribution)
errors.CheckError(err)
var selfHealBackoff *wait.Backoff
if selfHealBackoffTimeoutSeconds != 0 {
selfHealBackoff = &wait.Backoff{
Duration: time.Duration(selfHealBackoffTimeoutSeconds) * time.Second,
Factor: float64(selfHealBackoffFactor),
Cap: time.Duration(selfHealBackoffCapSeconds) * time.Second,
}
}
appController, err = controller.NewApplicationController(
namespace,
settingsMgr,
Expand All @@ -160,6 +172,7 @@ func NewCommand() *cobra.Command {
hardResyncDuration,
time.Duration(appResyncJitter)*time.Second,
time.Duration(selfHealTimeoutSeconds)*time.Second,
selfHealBackoff,
time.Duration(repoErrorGracePeriod)*time.Second,
metricsPort,
metricsCacheExpiration,
Expand Down Expand Up @@ -209,7 +222,10 @@ func NewCommand() *cobra.Command {
command.Flags().IntVar(&glogLevel, "gloglevel", 0, "Set the glog logging level")
command.Flags().IntVar(&metricsPort, "metrics-port", common.DefaultPortArgoCDMetrics, "Start metrics server on given port")
command.Flags().DurationVar(&metricsCacheExpiration, "metrics-cache-expiration", env.ParseDurationFromEnv("ARGOCD_APPLICATION_CONTROLLER_METRICS_CACHE_EXPIRATION", 0*time.Second, 0, math.MaxInt64), "Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s)")
command.Flags().IntVar(&selfHealTimeoutSeconds, "self-heal-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_TIMEOUT_SECONDS", 5, 0, math.MaxInt32), "Specifies timeout between application self heal attempts")
command.Flags().IntVar(&selfHealTimeoutSeconds, "self-heal-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_TIMEOUT_SECONDS", 0, 0, math.MaxInt32), "Specifies timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts")
command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")
command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server")
Expand Down
27 changes: 24 additions & 3 deletions controller/appcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ type ApplicationController struct {
statusHardRefreshTimeout time.Duration
statusRefreshJitter time.Duration
selfHealTimeout time.Duration
selfHealBackOff *wait.Backoff
repoClientset apiclient.Clientset
db db.ArgoDB
settingsMgr *settings_util.SettingsManager
Expand Down Expand Up @@ -150,6 +151,7 @@ func NewApplicationController(
appHardResyncPeriod time.Duration,
appResyncJitter time.Duration,
selfHealTimeout time.Duration,
selfHealBackoff *wait.Backoff,
repoErrorGracePeriod time.Duration,
metricsPort int,
metricsCacheExpiration time.Duration,
Expand Down Expand Up @@ -189,6 +191,7 @@ func NewApplicationController(
auditLogger: argo.NewAuditLogger(namespace, kubeClientset, common.ApplicationController),
settingsMgr: settingsMgr,
selfHealTimeout: selfHealTimeout,
selfHealBackOff: selfHealBackoff,
clusterSharding: clusterSharding,
projByNameCache: sync.Map{},
applicationNamespaces: applicationNamespaces,
Expand Down Expand Up @@ -1874,6 +1877,9 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
InitiatedBy: appv1.OperationInitiator{Automated: true},
Retry: appv1.RetryStrategy{Limit: 5},
}
if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil {
op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount
}
if app.Spec.SyncPolicy.Retry != nil {
op.Retry = *app.Spec.SyncPolicy.Retry
}
Expand All @@ -1891,6 +1897,7 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
return nil, 0
} else if alreadyAttempted && selfHeal {
if shouldSelfHeal, retryAfter := ctrl.shouldSelfHeal(app); shouldSelfHeal {
op.Sync.SelfHealAttemptsCount++
for _, resource := range resources {
if resource.Status != appv1.SyncStatusCodeSynced {
op.Sync.Resources = append(op.Sync.Resources, appv1.SyncOperationResource{
Expand Down Expand Up @@ -1998,10 +2005,24 @@ func (ctrl *ApplicationController) shouldSelfHeal(app *appv1.Application) (bool,
}

var retryAfter time.Duration
if app.Status.OperationState.FinishedAt == nil {
retryAfter = ctrl.selfHealTimeout
if ctrl.selfHealBackOff == nil {
if app.Status.OperationState.FinishedAt == nil {
retryAfter = ctrl.selfHealTimeout
} else {
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
}
} else {
retryAfter = ctrl.selfHealTimeout - time.Since(app.Status.OperationState.FinishedAt.Time)
backOff := *ctrl.selfHealBackOff
backOff.Steps = int(app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount)
var delay time.Duration
for backOff.Steps > 0 {
delay = backOff.Step()
}
if app.Status.OperationState.FinishedAt == nil {
retryAfter = delay
} else {
retryAfter = delay - time.Since(app.Status.OperationState.FinishedAt.Time)
}
}
return retryAfter <= 0, retryAfter
}
Expand Down
70 changes: 68 additions & 2 deletions controller/appcontroller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,18 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"testing"
"time"

clustercache "github.com/argoproj/gitops-engine/pkg/cache"
"github.com/argoproj/gitops-engine/pkg/utils/kube/kubetest"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/require"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/rest"

clustercache "github.com/argoproj/gitops-engine/pkg/cache"

"github.com/argoproj/argo-cd/v2/common"
statecache "github.com/argoproj/argo-cd/v2/controller/cache"
"github.com/argoproj/argo-cd/v2/controller/sharding"
Expand Down Expand Up @@ -150,6 +151,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController {
time.Hour,
time.Second,
time.Minute,
nil,
time.Second*10,
common.DefaultPortArgoCDMetrics,
data.metricsCacheExpiration,
Expand Down Expand Up @@ -1976,3 +1978,67 @@ func TestAppStatusIsReplaced(t *testing.T) {
require.True(t, has)
require.Nil(t, val)
}

func assertDurationAround(t *testing.T, expected time.Duration, actual time.Duration) {
delta := time.Second / 2
assert.GreaterOrEqual(t, expected, actual-delta)
assert.LessOrEqual(t, expected, actual+delta)
}

func TestSelfHealExponentialBackoff(t *testing.T) {
ctrl := newFakeController(&fakeData{}, nil)
ctrl.selfHealBackOff = &wait.Backoff{
Factor: 3,
Duration: 2 * time.Second,
Cap: 5 * time.Minute,
}

app := &v1alpha1.Application{
Status: v1alpha1.ApplicationStatus{
OperationState: &v1alpha1.OperationState{
Operation: v1alpha1.Operation{
Sync: &v1alpha1.SyncOperation{},
},
},
},
}
now := metav1.Now()

testCases := []struct {
attempts int64
finishedAt *metav1.Time
expectedDuration time.Duration
shouldSelfHeal bool
}{{
attempts: 0,
finishedAt: &now,
expectedDuration: 0,
shouldSelfHeal: true,
}, {
attempts: 1,
finishedAt: &now,
expectedDuration: 2 * time.Second,
shouldSelfHeal: false,
}, {
attempts: 2,
finishedAt: &now,
expectedDuration: 6 * time.Second,
shouldSelfHeal: false,
}, {
attempts: 3,
finishedAt: nil,
expectedDuration: 18 * time.Second,
shouldSelfHeal: false,
}}

for i := range testCases {
tc := testCases[i]
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount = tc.attempts
app.Status.OperationState.FinishedAt = tc.finishedAt
ok, duration := ctrl.shouldSelfHeal(app)
require.Equal(t, ok, tc.shouldSelfHeal)
assertDurationAround(t, tc.expectedDuration, duration)
})
}
}
7 changes: 5 additions & 2 deletions docs/operator-manual/argocd-cmd-params-cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ data:
controller.log.level: "info"
# Prometheus metrics cache expiration (disabled by default. e.g. 24h0m0s)
controller.metrics.cache.expiration: "24h0m0s"
# Specifies timeout between application self heal attempts (default 5)
controller.self.heal.timeout.seconds: "5"
# Specifies exponential backoff timeout parameters between application self heal attempts
controller.self.heal.timeout.seconds: "2"
controller.self.heal.backoff.factor: "3"
controller.self.heal.backoff.cap.seconds: "300"

# Cache expiration for app state (default 1h0m0s)
controller.app.state.cache.expiration: "1h0m0s"
# Specifies if resource health should be persisted in app CRD (default true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ argocd-application-controller [flags]
--repo-server-strict-tls Whether to use strict validation of the TLS cert presented by the repo server
--repo-server-timeout-seconds int Repo server RPC call timeout seconds. (default 60)
--request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0")
--self-heal-timeout-seconds int Specifies timeout between application self heal attempts (default 5)
--self-heal-backoff-cap-seconds int Specifies max timeout of exponential backoff between application self heal attempts (default 300)
--self-heal-backoff-factor int Specifies factor of exponential timeout between application self heal attempts (default 3)
--self-heal-backoff-timeout-seconds int Specifies initial timeout of exponential backoff between self heal attempts (default 2)
--self-heal-timeout-seconds int Specifies timeout between application self heal attempts
--sentinel stringArray Redis sentinel hostname and port (e.g. argocd-redis-ha-announce-0:6379).
--sentinelmaster string Redis sentinel master group name. (default "master")
--server string The address and port of the Kubernetes API server
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,24 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.factor
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,24 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.factor
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Expand Down
28 changes: 28 additions & 0 deletions manifests/core-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ spec:
sync:
description: Sync contains parameters for the operation
properties:
autoHealAttemptsCount:
description: SelfHealAttemptsCount contains the number of auto-heal
attempts
format: int64
type: integer
dryRun:
description: DryRun specifies to perform a `kubectl apply --dry-run`
without actually performing the sync
Expand Down Expand Up @@ -2534,6 +2539,11 @@ spec:
sync:
description: Sync contains parameters for the operation
properties:
autoHealAttemptsCount:
description: SelfHealAttemptsCount contains the number
of auto-heal attempts
format: int64
type: integer
dryRun:
description: DryRun specifies to perform a `kubectl apply
--dry-run` without actually performing the sync
Expand Down Expand Up @@ -21799,6 +21809,24 @@ spec:
key: controller.self.heal.timeout.seconds
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS
valueFrom:
configMapKeyRef:
key: controller.self.heal.backoff.timeout.seconds
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR
valueFrom:
configMapKeyRef:
key: controller.self.heal.backoff.factor
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS
valueFrom:
configMapKeyRef:
key: controller.self.heal.backoff.cap.seconds
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Expand Down
10 changes: 10 additions & 0 deletions manifests/crds/application-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ spec:
sync:
description: Sync contains parameters for the operation
properties:
autoHealAttemptsCount:
description: SelfHealAttemptsCount contains the number of auto-heal
attempts
format: int64
type: integer
dryRun:
description: DryRun specifies to perform a `kubectl apply --dry-run`
without actually performing the sync
Expand Down Expand Up @@ -2533,6 +2538,11 @@ spec:
sync:
description: Sync contains parameters for the operation
properties:
autoHealAttemptsCount:
description: SelfHealAttemptsCount contains the number
of auto-heal attempts
format: int64
type: integer
dryRun:
description: DryRun specifies to perform a `kubectl apply
--dry-run` without actually performing the sync
Expand Down
Loading
Loading