Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implement customizable ticker interval to mitigate Buildkite API rate limit issues #279

Merged
merged 10 commits into from
May 30, 2024
28 changes: 15 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,19 +95,21 @@ Available Commands:
version Prints the version

Flags:
--agent-token-secret string name of the Buildkite agent token secret (default "buildkite-agent-token")
--buildkite-token string Buildkite API token with GraphQL scopes
--cluster-uuid string UUID of the Buildkite Cluster. The agent token must be for the Buildkite Cluster.
-f, --config string config file path
--debug debug logs
-h, --help help for agent-stack-k8s
--image string The image to use for the Buildkite agent (default "ghcr.io/buildkite/agent-stack-k8s/agent:latest")
--job-ttl duration time to retain kubernetes jobs after completion (default 10m0s)
--max-in-flight int max jobs in flight, 0 means no max (default 25)
--namespace string kubernetes namespace to create resources in (default "default")
--org string Buildkite organization name to watch
--profiler-address string Bind address to expose the pprof profiler (e.g. localhost:6060)
--tags strings A comma-separated list of agent tags. The "queue" tag must be unique (e.g. "queue=kubernetes,os=linux") (default [queue=kubernetes])
--agent-token-secret string name of the Buildkite agent token secret (default "buildkite-agent-token")
--buildkite-token string Buildkite API token with GraphQL scopes
--cluster-uuid string UUID of the Buildkite Cluster. The agent token must be for the Buildkite Cluster.
-f, --config string config file path
--debug debug logs
-h, --help help for agent-stack-k8s
--image string The image to use for the Buildkite agent (default "ghcr.io/buildkite/agent:3.73.1")
--image-pull-backoff-grace-period duration Duration after starting a pod that the controller will wait before considering cancelling a job due to ImagePullBackOff (e.g. when the podSpec specifies container images that cannot be pulled) (default 30s)
--job-ttl duration time to retain kubernetes jobs after completion (default 10m0s)
--max-in-flight int max jobs in flight, 0 means no max (default 25)
--namespace string kubernetes namespace to create resources in (default "default")
--org string Buildkite organization name to watch
--poll-interval duration time to wait between polling for new jobs (minimum 1s); note that increasing this causes jobs to be slower to start (default 1s)
--profiler-address string Bind address to expose the pprof profiler (e.g. localhost:6060)
--tags strings A comma-separated list of agent tags. The "queue" tag must be unique (e.g. "queue=kubernetes,os=linux") (default [queue=kubernetes])

Use "agent-stack-k8s [command] --help" for more information about a command.
```
Expand Down
8 changes: 8 additions & 0 deletions charts/agent-stack-k8s/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,12 @@
"title": "The max-in-flight Schema",
"examples": [100]
},
"poll-interval": {
"type": "string",
"default": "",
"title": "The poll-interval Schema",
"examples": ["1s", "1m"]
},
"org": {
"type": "string",
"default": "",
Expand Down Expand Up @@ -223,6 +229,7 @@
"image": "",
"debug": false,
"job-ttl": "5m",
"poll-interval": "5s",
"max-in-flight": 100,
"org": "",
"tags": []
Expand All @@ -241,6 +248,7 @@
"debug": false,
"jobTTL": "",
"maxInFlight": 100,
"pollInterval": "5s",
"org": "",
"tags": []
}
Expand Down
5 changes: 5 additions & 0 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ func AddConfigFlags(cmd *cobra.Command) {
10*time.Minute,
"time to retain kubernetes jobs after completion",
)
cmd.Flags().Duration(
"poll-interval",
time.Second,
"time to wait between polling for new jobs (minimum 1s); note that increasing this causes jobs to be slower to start",
)
cmd.Flags().String(
"cluster-uuid",
"",
Expand Down
1 change: 1 addition & 0 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ func TestReadAndParseConfig(t *testing.T) {
Image: "my.registry.dev/buildkite-agent:latest",
JobTTL: 300 * time.Second,
ImagePullBackOffGradePeriod: 60 * time.Second,
PollInterval: 5 * time.Second,
MaxInFlight: 100,
Namespace: "my-buildkite-ns",
Org: "my-buildkite-org",
Expand Down
1 change: 1 addition & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ debug: true
image: my.registry.dev/buildkite-agent:latest
job-ttl: 5m
image-pull-backoff-grace-period: 60s
poll-interval: 5s
max-in-flight: 100
namespace: my-buildkite-ns
org: my-buildkite-org
Expand Down
2 changes: 2 additions & 0 deletions internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version()
type Config struct {
Debug bool `json:"debug"`
JobTTL time.Duration `json:"job-ttl"`
PollInterval time.Duration `json:"poll-interval"`
AgentTokenSecret string `json:"agent-token-secret" validate:"required"`
BuildkiteToken string `json:"buildkite-token" validate:"required"`
Image string `json:"image" validate:"required"`
Expand Down Expand Up @@ -52,6 +53,7 @@ func (c Config) MarshalLogObject(enc zapcore.ObjectEncoder) error {
enc.AddBool("debug", c.Debug)
enc.AddString("image", c.Image)
enc.AddDuration("job-ttl", c.JobTTL)
enc.AddDuration("poll-interval", c.PollInterval)
enc.AddInt("max-in-flight", c.MaxInFlight)
enc.AddString("namespace", c.Namespace)
enc.AddString("org", c.Org)
Expand Down
13 changes: 7 additions & 6 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,13 @@ func Run(
}

m, err := monitor.New(logger.Named("monitor"), k8sClient, monitor.Config{
Namespace: cfg.Namespace,
Org: cfg.Org,
ClusterUUID: cfg.ClusterUUID,
MaxInFlight: cfg.MaxInFlight,
Tags: cfg.Tags,
Token: cfg.BuildkiteToken,
Namespace: cfg.Namespace,
Org: cfg.Org,
ClusterUUID: cfg.ClusterUUID,
MaxInFlight: cfg.MaxInFlight,
PollInterval: cfg.PollInterval,
Tags: cfg.Tags,
Token: cfg.BuildkiteToken,
})
if err != nil {
logger.Fatal("failed to create monitor", zap.Error(err))
Expand Down
19 changes: 12 additions & 7 deletions internal/controller/monitor/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ type Monitor struct {
}

// Config is the set of options passed to New when constructing a Monitor.
type Config struct {
Namespace string
Token string
ClusterUUID string
MaxInFlight int
Org string
Tags []string
Namespace string
Token string
ClusterUUID string
MaxInFlight int
// PollInterval is the period of the ticker created in Start.
// New raises any value below one second to one second.
PollInterval time.Duration
Org string
Tags []string
}

type JobHandler interface {
Expand All @@ -37,6 +38,10 @@ type JobHandler interface {
func New(logger *zap.Logger, k8s kubernetes.Interface, cfg Config) (*Monitor, error) {
graphqlClient := api.NewClient(cfg.Token)

if cfg.PollInterval < time.Second {
cfg.PollInterval = time.Second
}

return &Monitor{
gql: graphqlClient,
logger: logger,
Expand Down Expand Up @@ -119,7 +124,7 @@ func (m *Monitor) Start(ctx context.Context, handler JobHandler) <-chan error {

go func() {
logger.Info("started")
ticker := time.NewTicker(time.Second)
ticker := time.NewTicker(m.cfg.PollInterval)
defer ticker.Stop()

first := make(chan struct{}, 1)
Expand Down
10 changes: 6 additions & 4 deletions internal/integration/monitor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"os"
"testing"
"time"

"github.com/buildkite/agent-stack-k8s/v2/internal/controller/monitor"
"github.com/stretchr/testify/require"
Expand All @@ -13,10 +14,11 @@ import (

func TestInvalidOrg(t *testing.T) {
m, err := monitor.New(zap.Must(zap.NewDevelopment()), fake.NewSimpleClientset(), monitor.Config{
Token: os.Getenv("BUILDKITE_TOKEN"),
MaxInFlight: 1,
Org: "foo",
Tags: []string{"queue=default", "foo=bar"},
Token: os.Getenv("BUILDKITE_TOKEN"),
MaxInFlight: 1,
PollInterval: time.Second,
Org: "foo",
Tags: []string{"queue=default", "foo=bar"},
})
require.NoError(t, err)

Expand Down