Merge pull request #91 from atlassian-labs/mzhong/KUBE-9199-remove-pr…

…om-metrics-check-for-safetocycle remove safeToStartCycle check
atlassian-labs · Jan 20, 2025 · 913fc99 · 913fc99
2 parents bba30c9 + d11fd2b
commit 913fc99
Show file tree

Hide file tree

Showing 6 changed files with 16 additions and 112 deletions.
diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml
@@ -16,5 +16,5 @@ jobs:
       - name: golangci-lint
         uses: golangci/golangci-lint-action@v6
         with:
-          version: v1.59
+          version: v1.63
           args: --timeout=5m
diff --git a/cmd/observer/main.go b/cmd/observer/main.go
@@ -37,7 +37,6 @@ type app struct {
 	cloudProviderName        *string
 	namespace                *string
 	addr                     *string
-	prometheusAddress        *string
 	dryMode                  *bool
 	runImmediately           *bool
 	runOnce                  *bool
@@ -50,18 +49,16 @@ type app struct {
 // newApp creates a new app and sets up the cobra flags
 func newApp(rootCmd *cobra.Command) *app {
 	return &app{
-		addr:                     rootCmd.PersistentFlags().String("addr", ":8080", "Address to listen on for /metrics"),
-		cloudProviderName:        rootCmd.PersistentFlags().String("cloud-provider", "aws", "Which cloud provider to use, options: [aws]"),
-		namespaces:               rootCmd.PersistentFlags().StringSlice("namespaces", []string{"kube-system"}, "Namespaces to watch for cycle request objects"),
-		namespace:                rootCmd.PersistentFlags().String("namespace", "kube-system", "Namespaces to watch and create cnrs"),
-		dryMode:                  rootCmd.PersistentFlags().Bool("dry", false, "api-server drymode for applying CNRs"),
-		waitInterval:             rootCmd.PersistentFlags().Duration("wait-interval", 2*time.Minute, "duration to wait after detecting changes before creating CNR objects. The window for letting changes on nodegroups settle before starting rotation"),
-		checkInterval:            rootCmd.PersistentFlags().Duration("check-interval", 5*time.Minute, `duration interval to check for changes. e.g. run the loop every 5 minutes"`),
-		nodeStartupTime:          rootCmd.PersistentFlags().Duration("node-startup-time", 2*time.Minute, "duration to wait after a cluster-autoscaler scaleUp event is detected"),
-		runImmediately:           rootCmd.PersistentFlags().Bool("now", false, "makes the check loop run straight away on program start rather than wait for the check interval to elapse"),
-		runOnce:                  rootCmd.PersistentFlags().Bool("once", false, "run the check loop once then exit. also works with --now"),
-		prometheusAddress:        rootCmd.PersistentFlags().String("prometheus-address", "prometheus", "Prometheus service address used to query cluster-autoscaler metrics"),
-		prometheusScrapeInterval: rootCmd.PersistentFlags().Duration("prometheus-scrape-interval", 40*time.Second, "Prometheus scrape interval used to detect change of value from prometheus query, needed to detect scaleUp event"),
+		addr:              rootCmd.PersistentFlags().String("addr", ":8080", "Address to listen on for /metrics"),
+		cloudProviderName: rootCmd.PersistentFlags().String("cloud-provider", "aws", "Which cloud provider to use, options: [aws]"),
+		namespaces:        rootCmd.PersistentFlags().StringSlice("namespaces", []string{"kube-system"}, "Namespaces to watch for cycle request objects"),
+		namespace:         rootCmd.PersistentFlags().String("namespace", "kube-system", "Namespaces to watch and create cnrs"),
+		dryMode:           rootCmd.PersistentFlags().Bool("dry", false, "api-server drymode for applying CNRs"),
+		waitInterval:      rootCmd.PersistentFlags().Duration("wait-interval", 2*time.Minute, "duration to wait after detecting changes before creating CNR objects. The window for letting changes on nodegroups settle before starting rotation"),
+		checkInterval:     rootCmd.PersistentFlags().Duration("check-interval", 5*time.Minute, `duration interval to check for changes. e.g. run the loop every 5 minutes"`),
+		nodeStartupTime:   rootCmd.PersistentFlags().Duration("node-startup-time", 2*time.Minute, "duration to wait after a cluster-autoscaler scaleUp event is detected"),
+		runImmediately:    rootCmd.PersistentFlags().Bool("now", false, "makes the check loop run straight away on program start rather than wait for the check interval to elapse"),
+		runOnce:           rootCmd.PersistentFlags().Bool("once", false, "run the check loop once then exit. also works with --now"),
 	}
 }
 
@@ -126,7 +123,6 @@ func (a *app) run() {
 		RunOnce:                  *a.runOnce,
 		WaitInterval:             *a.waitInterval,
 		NodeStartupTime:          *a.nodeStartupTime,
-		PrometheusAddress:        *a.prometheusAddress,
 		PrometheusScrapeInterval: *a.prometheusScrapeInterval,
 	}
 

diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go
@@ -48,7 +48,7 @@ func (t *CycleNodeRequestTransitioner) transitionUndefined() (reconcile.Result,
 	validationErrors := validation.IsDNS1035Label(t.cycleNodeRequest.Name)
 
 	if len(validationErrors) > 0 {
-		return t.transitionToFailed(fmt.Errorf(strings.Join(validationErrors, ",")))
+		return t.transitionToFailed(fmt.Errorf("%s", strings.Join(validationErrors, ",")))
 	}
 
 	// Transition the object to pending

diff --git a/pkg/controller/cyclenodestatus/transitioner/transitions.go b/pkg/controller/cyclenodestatus/transitioner/transitions.go
@@ -153,7 +153,7 @@ func (t *CycleNodeStatusTransitioner) transitionDraining() (reconcile.Result, er
 	// Fail with all of the combined encountered errors if we got any. If we failed inside the loop we would
 	// potentially miss some important information in the logs.
 	if len(unexpectedErrors) > 0 {
-		return t.transitionToFailed(fmt.Errorf(strings.Join(unexpectedErrors, "\n")))
+		return t.transitionToFailed(fmt.Errorf("%s", strings.Join(unexpectedErrors, "\n")))
 	}
 	// No serious errors were encountered. If we're done, move on.
 	if finished {

diff --git a/pkg/observer/controller.go b/pkg/observer/controller.go
@@ -2,24 +2,18 @@ package observer
 
 import (
 	"context"
-	"errors"
 	"net/http"
 	"sort"
-	"strconv"
 	"time"
 
-	"github.com/cenkalti/backoff/v4"
-	"github.com/prometheus/client_golang/api"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
-	"github.com/prometheus/common/model"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/klog/v2"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1"
 	"github.com/atlassian-labs/cyclops/pkg/generation"
 	"github.com/atlassian-labs/cyclops/pkg/k8s"
-	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
 )
 
 var apiVersion = "undefined" //nolint:golint,varcheck,deadcode,unused
@@ -277,66 +271,6 @@ func (c *controller) dropInProgressNodeGroups(nodeGroups v1.NodeGroupList, cnrs
 	return restingNodeGroups
 }
 
-// get the cluster-autoscaler last scaleUp activity time
-func stringToTime(s string) (time.Time, error) {
-	sec, err := strconv.ParseInt(s, 10, 64)
-	if err != nil {
-		return time.Time{}, err
-	}
-	return time.Unix(sec, 0), nil
-}
-
-// query cluster-autoscaler metrics to figure out if it's safe to start a new CNR
-func (c *controller) safeToStartCycle() bool {
-	client, err := api.NewClient(api.Config{
-		Address: c.PrometheusAddress,
-	})
-	if err != nil {
-		// Prometheus might not be installed in the cluster. return true if it can't connect
-		klog.Errorln("Error creating client:", err)
-		return true
-	}
-
-	v1api := promv1.NewAPI(client)
-	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
-	defer cancel()
-	// scaleDown metric is updated every cycle cluster-autoscaler is checking if the cluster should scaleDown
-	// scaleDown does not get checked and therefore not updated when the cluster is scaling up since no check for scaleDown is needed
-	result, warnings, err := v1api.Query(ctx, "cluster_autoscaler_last_activity{activity='scaleDown'}", time.Now())
-	if err != nil {
-		// cluster-autoscaler might not be installed in the cluster. return true if it can't find the metrics of run the query
-		klog.Errorln("Error querying Prometheus:", err)
-		return true
-	}
-	if len(warnings) > 0 {
-		klog.Errorln("Warnings:", warnings)
-	}
-
-	v := result.(model.Vector)
-	// cluster-autoscaler should always gives a response if it's active
-	if v.Len() == 0 {
-		klog.Errorln("Empty response from prometheus")
-		return true
-	}
-
-	scaleUpTime := v[v.Len()-1].Value.String()
-	t, err := stringToTime(scaleUpTime)
-	if err != nil {
-		klog.Errorln("Error converting the time:", err)
-		return false
-	}
-
-	// cluster_autoscaler_last_activity values will update every PrometheusScrapeInterval in non-scaling scenario
-	lastScaleEvent := time.Since(t)
-	if lastScaleEvent > c.PrometheusScrapeInterval {
-		klog.Infoln("Scale up event recently happened")
-		return false
-	}
-	klog.V(3).Infoln("No scale up event")
-
-	return true
-}
-
 // createCNRs generates and applies CNRs from the changedNodeGroups
 func (c *controller) createCNRs(changedNodeGroups []*ListedNodeGroups) {
 	klog.V(3).Infoln("applying")
@@ -371,26 +305,6 @@ func (c *controller) nextRunTime() time.Time {
 	return time.Now().UTC().Add(c.CheckInterval)
 }
 
-func (c *controller) checkIfSafeToStartCycle() bool {
-	b := backoff.NewExponentialBackOff()
-	b.MaxElapsedTime = 120 * time.Second
-
-	err := backoff.Retry(func() error {
-		if !c.safeToStartCycle() {
-			klog.Error("Cluster autoscaler scaleUp event in progress. Retry...")
-			return errors.New("cluster-autoscaler event in progress")
-		}
-		return nil
-	}, b)
-
-	if err != nil {
-		klog.Errorln("there are still cluster-autoscaler scaleUp events")
-		return false
-	}
-
-	return true
-}
-
 // Run runs the controller loops once. detecting lock, changes, and applying CNRs
 // implements cron.Job interface
 func (c *controller) Run() {
@@ -420,11 +334,6 @@ func (c *controller) Run() {
 		}
 	}
 
-	// query cluster-autoscaler to check if it's safe to start a new CNR
-	if !c.checkIfSafeToStartCycle() {
-		return
-	}
-
 	// wait for the desired amount to allow any in progress changes to batch up
 	klog.V(3).Infof("waiting for %v to allow changes to settle", c.WaitInterval)
 	select {

diff --git a/pkg/observer/types.go b/pkg/observer/types.go
@@ -21,10 +21,9 @@ type Controller interface {
 
 // Options contains the options config for a controller
 type Options struct {
-	CNRPrefix         string
-	Namespace         string
-	CheckSchedule     string
-	PrometheusAddress string
+	CNRPrefix     string
+	Namespace     string
+	CheckSchedule string
 
 	DryMode        bool
 	RunImmediately bool