Skip to content

Commit

Permalink
Merge pull request #91 from atlassian-labs/mzhong/KUBE-9199-remove-pr…
Browse files Browse the repository at this point in the history
…om-metrics-check-for-safetocycle

remove safeToStartCycle check
  • Loading branch information
MinyiZ authored Jan 20, 2025
2 parents bba30c9 + d11fd2b commit 913fc99
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 112 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/golangci-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v6
with:
version: v1.59
version: v1.63
args: --timeout=5m
24 changes: 10 additions & 14 deletions cmd/observer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ type app struct {
cloudProviderName *string
namespace *string
addr *string
prometheusAddress *string
dryMode *bool
runImmediately *bool
runOnce *bool
Expand All @@ -50,18 +49,16 @@ type app struct {
// newApp creates a new app and sets up the cobra flags
func newApp(rootCmd *cobra.Command) *app {
return &app{
addr: rootCmd.PersistentFlags().String("addr", ":8080", "Address to listen on for /metrics"),
cloudProviderName: rootCmd.PersistentFlags().String("cloud-provider", "aws", "Which cloud provider to use, options: [aws]"),
namespaces: rootCmd.PersistentFlags().StringSlice("namespaces", []string{"kube-system"}, "Namespaces to watch for cycle request objects"),
namespace: rootCmd.PersistentFlags().String("namespace", "kube-system", "Namespaces to watch and create cnrs"),
dryMode: rootCmd.PersistentFlags().Bool("dry", false, "api-server drymode for applying CNRs"),
waitInterval: rootCmd.PersistentFlags().Duration("wait-interval", 2*time.Minute, "duration to wait after detecting changes before creating CNR objects. The window for letting changes on nodegroups settle before starting rotation"),
checkInterval: rootCmd.PersistentFlags().Duration("check-interval", 5*time.Minute, `duration interval to check for changes. e.g. run the loop every 5 minutes"`),
nodeStartupTime: rootCmd.PersistentFlags().Duration("node-startup-time", 2*time.Minute, "duration to wait after a cluster-autoscaler scaleUp event is detected"),
runImmediately: rootCmd.PersistentFlags().Bool("now", false, "makes the check loop run straight away on program start rather than wait for the check interval to elapse"),
runOnce: rootCmd.PersistentFlags().Bool("once", false, "run the check loop once then exit. also works with --now"),
prometheusAddress: rootCmd.PersistentFlags().String("prometheus-address", "prometheus", "Prometheus service address used to query cluster-autoscaler metrics"),
prometheusScrapeInterval: rootCmd.PersistentFlags().Duration("prometheus-scrape-interval", 40*time.Second, "Prometheus scrape interval used to detect change of value from prometheus query, needed to detect scaleUp event"),
addr: rootCmd.PersistentFlags().String("addr", ":8080", "Address to listen on for /metrics"),
cloudProviderName: rootCmd.PersistentFlags().String("cloud-provider", "aws", "Which cloud provider to use, options: [aws]"),
namespaces: rootCmd.PersistentFlags().StringSlice("namespaces", []string{"kube-system"}, "Namespaces to watch for cycle request objects"),
namespace: rootCmd.PersistentFlags().String("namespace", "kube-system", "Namespaces to watch and create cnrs"),
dryMode: rootCmd.PersistentFlags().Bool("dry", false, "api-server drymode for applying CNRs"),
waitInterval: rootCmd.PersistentFlags().Duration("wait-interval", 2*time.Minute, "duration to wait after detecting changes before creating CNR objects. The window for letting changes on nodegroups settle before starting rotation"),
checkInterval: rootCmd.PersistentFlags().Duration("check-interval", 5*time.Minute, `duration interval to check for changes. e.g. run the loop every 5 minutes"`),
nodeStartupTime: rootCmd.PersistentFlags().Duration("node-startup-time", 2*time.Minute, "duration to wait after a cluster-autoscaler scaleUp event is detected"),
runImmediately: rootCmd.PersistentFlags().Bool("now", false, "makes the check loop run straight away on program start rather than wait for the check interval to elapse"),
runOnce: rootCmd.PersistentFlags().Bool("once", false, "run the check loop once then exit. also works with --now"),
}
}

Expand Down Expand Up @@ -126,7 +123,6 @@ func (a *app) run() {
RunOnce: *a.runOnce,
WaitInterval: *a.waitInterval,
NodeStartupTime: *a.nodeStartupTime,
PrometheusAddress: *a.prometheusAddress,
PrometheusScrapeInterval: *a.prometheusScrapeInterval,
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func (t *CycleNodeRequestTransitioner) transitionUndefined() (reconcile.Result,
validationErrors := validation.IsDNS1035Label(t.cycleNodeRequest.Name)

if len(validationErrors) > 0 {
return t.transitionToFailed(fmt.Errorf(strings.Join(validationErrors, ",")))
return t.transitionToFailed(fmt.Errorf("%s", strings.Join(validationErrors, ",")))
}

// Transition the object to pending
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/cyclenodestatus/transitioner/transitions.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func (t *CycleNodeStatusTransitioner) transitionDraining() (reconcile.Result, er
// Fail with all of the combined encountered errors if we got any. If we failed inside the loop we would
// potentially miss some important information in the logs.
if len(unexpectedErrors) > 0 {
return t.transitionToFailed(fmt.Errorf(strings.Join(unexpectedErrors, "\n")))
return t.transitionToFailed(fmt.Errorf("%s", strings.Join(unexpectedErrors, "\n")))
}
// No serious errors were encountered. If we're done, move on.
if finished {
Expand Down
91 changes: 0 additions & 91 deletions pkg/observer/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,18 @@ package observer

import (
"context"
"errors"
"net/http"
"sort"
"strconv"
"time"

"github.com/cenkalti/backoff/v4"
"github.com/prometheus/client_golang/api"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/common/model"
corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/client"

v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1"
"github.com/atlassian-labs/cyclops/pkg/generation"
"github.com/atlassian-labs/cyclops/pkg/k8s"
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

var apiVersion = "undefined" //nolint:golint,varcheck,deadcode,unused
Expand Down Expand Up @@ -277,66 +271,6 @@ func (c *controller) dropInProgressNodeGroups(nodeGroups v1.NodeGroupList, cnrs
return restingNodeGroups
}

// get the cluster-autoscaler last scaleUp activity time
func stringToTime(s string) (time.Time, error) {
sec, err := strconv.ParseInt(s, 10, 64)
if err != nil {
return time.Time{}, err
}
return time.Unix(sec, 0), nil
}

// query cluster-autoscaler metrics to figure out if it's safe to start a new CNR
func (c *controller) safeToStartCycle() bool {
client, err := api.NewClient(api.Config{
Address: c.PrometheusAddress,
})
if err != nil {
// Prometheus might not be installed in the cluster. return true if it can't connect
klog.Errorln("Error creating client:", err)
return true
}

v1api := promv1.NewAPI(client)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
// scaleDown metric is updated every cycle cluster-autoscaler is checking if the cluster should scaleDown
// scaleDown does not get checked and therefore not updated when the cluster is scaling up since no check for scaleDown is needed
result, warnings, err := v1api.Query(ctx, "cluster_autoscaler_last_activity{activity='scaleDown'}", time.Now())
if err != nil {
// cluster-autoscaler might not be installed in the cluster. return true if it can't find the metrics of run the query
klog.Errorln("Error querying Prometheus:", err)
return true
}
if len(warnings) > 0 {
klog.Errorln("Warnings:", warnings)
}

v := result.(model.Vector)
// cluster-autoscaler should always gives a response if it's active
if v.Len() == 0 {
klog.Errorln("Empty response from prometheus")
return true
}

scaleUpTime := v[v.Len()-1].Value.String()
t, err := stringToTime(scaleUpTime)
if err != nil {
klog.Errorln("Error converting the time:", err)
return false
}

// cluster_autoscaler_last_activity values will update every PrometheusScrapeInterval in non-scaling scenario
lastScaleEvent := time.Since(t)
if lastScaleEvent > c.PrometheusScrapeInterval {
klog.Infoln("Scale up event recently happened")
return false
}
klog.V(3).Infoln("No scale up event")

return true
}

// createCNRs generates and applies CNRs from the changedNodeGroups
func (c *controller) createCNRs(changedNodeGroups []*ListedNodeGroups) {
klog.V(3).Infoln("applying")
Expand Down Expand Up @@ -371,26 +305,6 @@ func (c *controller) nextRunTime() time.Time {
return time.Now().UTC().Add(c.CheckInterval)
}

func (c *controller) checkIfSafeToStartCycle() bool {
b := backoff.NewExponentialBackOff()
b.MaxElapsedTime = 120 * time.Second

err := backoff.Retry(func() error {
if !c.safeToStartCycle() {
klog.Error("Cluster autoscaler scaleUp event in progress. Retry...")
return errors.New("cluster-autoscaler event in progress")
}
return nil
}, b)

if err != nil {
klog.Errorln("there are still cluster-autoscaler scaleUp events")
return false
}

return true
}

// Run runs the controller loops once. detecting lock, changes, and applying CNRs
// implements cron.Job interface
func (c *controller) Run() {
Expand Down Expand Up @@ -420,11 +334,6 @@ func (c *controller) Run() {
}
}

// query cluster-autoscaler to check if it's safe to start a new CNR
if !c.checkIfSafeToStartCycle() {
return
}

// wait for the desired amount to allow any in progress changes to batch up
klog.V(3).Infof("waiting for %v to allow changes to settle", c.WaitInterval)
select {
Expand Down
7 changes: 3 additions & 4 deletions pkg/observer/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,9 @@ type Controller interface {

// Options contains the options config for a controller
type Options struct {
CNRPrefix string
Namespace string
CheckSchedule string
PrometheusAddress string
CNRPrefix string
Namespace string
CheckSchedule string

DryMode bool
RunImmediately bool
Expand Down

0 comments on commit 913fc99

Please sign in to comment.