diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index f878633..e6d8c5a 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -16,5 +16,5 @@ jobs: - name: golangci-lint uses: golangci/golangci-lint-action@v6 with: - version: v1.59 + version: v1.63 args: --timeout=5m diff --git a/cmd/observer/main.go b/cmd/observer/main.go index c495f88..04e832d 100644 --- a/cmd/observer/main.go +++ b/cmd/observer/main.go @@ -37,7 +37,6 @@ type app struct { cloudProviderName *string namespace *string addr *string - prometheusAddress *string dryMode *bool runImmediately *bool runOnce *bool @@ -50,18 +49,16 @@ type app struct { // newApp creates a new app and sets up the cobra flags func newApp(rootCmd *cobra.Command) *app { return &app{ - addr: rootCmd.PersistentFlags().String("addr", ":8080", "Address to listen on for /metrics"), - cloudProviderName: rootCmd.PersistentFlags().String("cloud-provider", "aws", "Which cloud provider to use, options: [aws]"), - namespaces: rootCmd.PersistentFlags().StringSlice("namespaces", []string{"kube-system"}, "Namespaces to watch for cycle request objects"), - namespace: rootCmd.PersistentFlags().String("namespace", "kube-system", "Namespaces to watch and create cnrs"), - dryMode: rootCmd.PersistentFlags().Bool("dry", false, "api-server drymode for applying CNRs"), - waitInterval: rootCmd.PersistentFlags().Duration("wait-interval", 2*time.Minute, "duration to wait after detecting changes before creating CNR objects. The window for letting changes on nodegroups settle before starting rotation"), - checkInterval: rootCmd.PersistentFlags().Duration("check-interval", 5*time.Minute, `duration interval to check for changes. e.g. 
run the loop every 5 minutes"`), - nodeStartupTime: rootCmd.PersistentFlags().Duration("node-startup-time", 2*time.Minute, "duration to wait after a cluster-autoscaler scaleUp event is detected"), - runImmediately: rootCmd.PersistentFlags().Bool("now", false, "makes the check loop run straight away on program start rather than wait for the check interval to elapse"), - runOnce: rootCmd.PersistentFlags().Bool("once", false, "run the check loop once then exit. also works with --now"), - prometheusAddress: rootCmd.PersistentFlags().String("prometheus-address", "prometheus", "Prometheus service address used to query cluster-autoscaler metrics"), - prometheusScrapeInterval: rootCmd.PersistentFlags().Duration("prometheus-scrape-interval", 40*time.Second, "Prometheus scrape interval used to detect change of value from prometheus query, needed to detect scaleUp event"), + addr: rootCmd.PersistentFlags().String("addr", ":8080", "Address to listen on for /metrics"), + cloudProviderName: rootCmd.PersistentFlags().String("cloud-provider", "aws", "Which cloud provider to use, options: [aws]"), + namespaces: rootCmd.PersistentFlags().StringSlice("namespaces", []string{"kube-system"}, "Namespaces to watch for cycle request objects"), + namespace: rootCmd.PersistentFlags().String("namespace", "kube-system", "Namespaces to watch and create cnrs"), + dryMode: rootCmd.PersistentFlags().Bool("dry", false, "api-server drymode for applying CNRs"), + waitInterval: rootCmd.PersistentFlags().Duration("wait-interval", 2*time.Minute, "duration to wait after detecting changes before creating CNR objects. The window for letting changes on nodegroups settle before starting rotation"), + checkInterval: rootCmd.PersistentFlags().Duration("check-interval", 5*time.Minute, `duration interval to check for changes. e.g. 
run the loop every 5 minutes"`), + nodeStartupTime: rootCmd.PersistentFlags().Duration("node-startup-time", 2*time.Minute, "duration to wait after a cluster-autoscaler scaleUp event is detected"), + runImmediately: rootCmd.PersistentFlags().Bool("now", false, "makes the check loop run straight away on program start rather than wait for the check interval to elapse"), + runOnce: rootCmd.PersistentFlags().Bool("once", false, "run the check loop once then exit. also works with --now"), } } @@ -126,7 +123,5 @@ func (a *app) run() { RunOnce: *a.runOnce, WaitInterval: *a.waitInterval, NodeStartupTime: *a.nodeStartupTime, - PrometheusAddress: *a.prometheusAddress, - PrometheusScrapeInterval: *a.prometheusScrapeInterval, } diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index 420b572..4c39de1 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -48,7 +48,7 @@ func (t *CycleNodeRequestTransitioner) transitionUndefined() (reconcile.Result, validationErrors := validation.IsDNS1035Label(t.cycleNodeRequest.Name) if len(validationErrors) > 0 { - return t.transitionToFailed(fmt.Errorf(strings.Join(validationErrors, ","))) + return t.transitionToFailed(fmt.Errorf("%s", strings.Join(validationErrors, ","))) } // Transition the object to pending diff --git a/pkg/controller/cyclenodestatus/transitioner/transitions.go b/pkg/controller/cyclenodestatus/transitioner/transitions.go index 2f821a7..16db5f3 100644 --- a/pkg/controller/cyclenodestatus/transitioner/transitions.go +++ b/pkg/controller/cyclenodestatus/transitioner/transitions.go @@ -153,7 +153,7 @@ func (t *CycleNodeStatusTransitioner) transitionDraining() (reconcile.Result, er // Fail with all of the combined encountered errors if we got any. If we failed inside the loop we would // potentially miss some important information in the logs. 
if len(unexpectedErrors) > 0 { - return t.transitionToFailed(fmt.Errorf(strings.Join(unexpectedErrors, "\n"))) + return t.transitionToFailed(fmt.Errorf("%s", strings.Join(unexpectedErrors, "\n"))) } // No serious errors were encountered. If we're done, move on. if finished { diff --git a/pkg/observer/controller.go b/pkg/observer/controller.go index cf7f30a..c622da6 100644 --- a/pkg/observer/controller.go +++ b/pkg/observer/controller.go @@ -2,16 +2,11 @@ package observer import ( "context" - "errors" "net/http" "sort" - "strconv" "time" - "github.com/cenkalti/backoff/v4" - "github.com/prometheus/client_golang/api" "github.com/prometheus/client_golang/prometheus/promhttp" - "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -19,7 +14,6 @@ import ( v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1" "github.com/atlassian-labs/cyclops/pkg/generation" "github.com/atlassian-labs/cyclops/pkg/k8s" - promv1 "github.com/prometheus/client_golang/api/prometheus/v1" ) var apiVersion = "undefined" //nolint:golint,varcheck,deadcode,unused @@ -277,66 +271,6 @@ func (c *controller) dropInProgressNodeGroups(nodeGroups v1.NodeGroupList, cnrs return restingNodeGroups } -// get the cluster-autoscaler last scaleUp activity time -func stringToTime(s string) (time.Time, error) { - sec, err := strconv.ParseInt(s, 10, 64) - if err != nil { - return time.Time{}, err - } - return time.Unix(sec, 0), nil -} - -// query cluster-autoscaler metrics to figure out if it's safe to start a new CNR -func (c *controller) safeToStartCycle() bool { - client, err := api.NewClient(api.Config{ - Address: c.PrometheusAddress, - }) - if err != nil { - // Prometheus might not be installed in the cluster. 
return true if it can't connect - klog.Errorln("Error creating client:", err) - return true - } - - v1api := promv1.NewAPI(client) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - // scaleDown metric is updated every cycle cluster-autoscaler is checking if the cluster should scaleDown - // scaleDown does not get checked and therefore not updated when the cluster is scaling up since no check for scaleDown is needed - result, warnings, err := v1api.Query(ctx, "cluster_autoscaler_last_activity{activity='scaleDown'}", time.Now()) - if err != nil { - // cluster-autoscaler might not be installed in the cluster. return true if it can't find the metrics of run the query - klog.Errorln("Error querying Prometheus:", err) - return true - } - if len(warnings) > 0 { - klog.Errorln("Warnings:", warnings) - } - - v := result.(model.Vector) - // cluster-autoscaler should always gives a response if it's active - if v.Len() == 0 { - klog.Errorln("Empty response from prometheus") - return true - } - - scaleUpTime := v[v.Len()-1].Value.String() - t, err := stringToTime(scaleUpTime) - if err != nil { - klog.Errorln("Error converting the time:", err) - return false - } - - // cluster_autoscaler_last_activity values will update every PrometheusScrapeInterval in non-scaling scenario - lastScaleEvent := time.Since(t) - if lastScaleEvent > c.PrometheusScrapeInterval { - klog.Infoln("Scale up event recently happened") - return false - } - klog.V(3).Infoln("No scale up event") - - return true -} - // createCNRs generates and applies CNRs from the changedNodeGroups func (c *controller) createCNRs(changedNodeGroups []*ListedNodeGroups) { klog.V(3).Infoln("applying") @@ -371,26 +305,6 @@ func (c *controller) nextRunTime() time.Time { return time.Now().UTC().Add(c.CheckInterval) } -func (c *controller) checkIfSafeToStartCycle() bool { - b := backoff.NewExponentialBackOff() - b.MaxElapsedTime = 120 * time.Second - - err := backoff.Retry(func() error { - 
if !c.safeToStartCycle() { - klog.Error("Cluster autoscaler scaleUp event in progress. Retry...") - return errors.New("cluster-autoscaler event in progress") - } - return nil - }, b) - - if err != nil { - klog.Errorln("there are still cluster-autoscaler scaleUp events") - return false - } - - return true -} - // Run runs the controller loops once. detecting lock, changes, and applying CNRs // implements cron.Job interface func (c *controller) Run() { @@ -420,11 +334,6 @@ func (c *controller) Run() { } } - // query cluster-autoscaler to check if it's safe to start a new CNR - if !c.checkIfSafeToStartCycle() { - return - } - // wait for the desired amount to allow any in progress changes to batch up klog.V(3).Infof("waiting for %v to allow changes to settle", c.WaitInterval) select { diff --git a/pkg/observer/types.go b/pkg/observer/types.go index 9eb39d1..7354729 100644 --- a/pkg/observer/types.go +++ b/pkg/observer/types.go @@ -21,10 +21,9 @@ type Controller interface { // Options contains the options config for a controller type Options struct { - CNRPrefix string - Namespace string - CheckSchedule string - PrometheusAddress string + CNRPrefix string + Namespace string + CheckSchedule string DryMode bool RunImmediately bool