Skip to content

Commit

Permalink
Derive image check pull policies from post-patch podSpec
Browse files Browse the repository at this point in the history
  • Loading branch information
DrJosh9000 committed Dec 18, 2024
1 parent 2138e80 commit 5a7e9bb
Show file tree
Hide file tree
Showing 15 changed files with 565 additions and 305 deletions.
12 changes: 12 additions & 0 deletions charts/agent-stack-k8s/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,18 @@
}
}
},
"default-image-pull-policy": {
"type": "string",
"description": "Configures a default image pull policy for containers that do not specify a pull policy, or containers created by the stack itself",
"default": "IfNotPresent",
"examples": ["Always", "IfNotPresent", "Never", ""]
},
"default-image-check-pull-policy": {
"type": "string",
"description": "Configures a default image pull policy for image-check init containers, used if an image pull policy is not set for the corresponding container in a podSpec or podSpecPatch",
"default": "",
"examples": ["Always", "IfNotPresent", "Never", ""]
},
"pod-spec-patch": {
"$ref": "https://kubernetesjsonschema.dev/master/_definitions.json#/definitions/io.k8s.api.core.v1.PodSpec"
}
Expand Down
10 changes: 10 additions & 0 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,16 @@ func AddConfigFlags(cmd *cobra.Command) {
config.DefaultEmptyJobGracePeriod,
"Duration after starting a Kubernetes job that the controller will wait before considering failing the job due to a missing pod (e.g. when the podSpec specifies a missing service account)",
)
cmd.Flags().String(
"default-image-pull-policy",
"IfNotPresent",
"Configures a default image pull policy for containers that do not specify a pull policy and non-init containers created by the stack itself",
)
cmd.Flags().String(
"default-image-check-pull-policy",
"",
"Sets a default PullPolicy for image-check init containers, used if an image pull policy is not set for the corresponding container in a podSpec or podSpecPatch",
)
cmd.Flags().Bool(
"prohibit-kubernetes-plugin",
false,
Expand Down
2 changes: 2 additions & 0 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ func TestReadAndParseConfig(t *testing.T) {
ClusterUUID: "beefcafe-abbe-baba-abba-deedcedecade",
ProhibitKubernetesPlugin: true,
GraphQLEndpoint: "http://graphql.buildkite.localhost/v1",
DefaultImagePullPolicy: "Never",
DefaultImageCheckPullPolicy: "IfNotPresent",

WorkspaceVolume: &corev1.Volume{
Name: "workspace-2-the-reckoning",
Expand Down
2 changes: 2 additions & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ job-creation-concurrency: 5
max-in-flight: 100
namespace: my-buildkite-ns
org: my-buildkite-org
default-image-pull-policy: Never
default-image-check-pull-policy: IfNotPresent

# Setting a custom GraphQL endpoint is usually only useful if you have a
# different instance of Buildkite itself available to run.
Expand Down
12 changes: 8 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/Khan/genqlient v0.7.0
github.com/buildkite/go-buildkite/v3 v3.13.0
github.com/buildkite/roko v1.2.0
github.com/distribution/reference v0.6.0
github.com/go-playground/locales v0.14.1
github.com/go-playground/universal-translator v0.18.1
github.com/go-playground/validator/v10 v10.23.0
Expand Down Expand Up @@ -69,7 +70,7 @@ require (
github.com/dustinkirkland/golang-petname v0.0.0-20240428194347-eebcea082ee0 // indirect
github.com/eapache/queue/v2 v2.0.0-20230407133247-75960ed334e4 // indirect
github.com/ebitengine/purego v0.7.1 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/fatih/color v1.18.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
Expand All @@ -95,6 +96,9 @@ require (
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/onsi/ginkgo/v2 v2.22.0 // indirect
github.com/onsi/gomega v1.36.0 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/outcaste-io/ristretto v0.2.3 // indirect
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/power-devops/perfstat v0.0.0-20220216144756-c35f1ee13d7c // indirect
Expand Down Expand Up @@ -149,7 +153,7 @@ require (
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/denisbrodbeck/machineid v1.0.1 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/emicklei/go-restful/v3 v3.12.1 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/go-chi/chi/v5 v5.1.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
Expand Down Expand Up @@ -194,7 +198,7 @@ require (
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/secure-systems-lab/go-securesystemslib v0.8.0 // indirect
github.com/spf13/afero v1.11.0 // indirect
github.com/spf13/cast v1.6.0 // indirect
github.com/spf13/cast v1.7.0 // indirect
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.10.0
github.com/subosito/gotenv v1.6.0 // indirect
Expand All @@ -216,7 +220,7 @@ require (
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.29.0 // indirect
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/net v0.31.0 // indirect
golang.org/x/oauth2 v0.24.0 // indirect
golang.org/x/sys v0.27.0 // indirect
Expand Down
27 changes: 16 additions & 11 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/r
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/dnephin/pflag v1.0.7 h1:oxONGlWxhmUct0YzKTgrpQv9AUA1wtPBn7zuSjJqptk=
github.com/dnephin/pflag v1.0.7/go.mod h1:uxE91IoWURlOiTUIA8Mq5ZZkAv3dPUfZNaT80Zm7OQE=
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
Expand All @@ -170,16 +172,17 @@ github.com/eapache/queue/v2 v2.0.0-20230407133247-75960ed334e4 h1:8EXxF+tCLqaVk8
github.com/eapache/queue/v2 v2.0.0-20230407133247-75960ed334e4/go.mod h1:I5sHm0Y0T1u5YjlyqC5GVArM7aNZRUYtTjmJ8mPJFds=
github.com/ebitengine/purego v0.7.1 h1:6/55d26lG3o9VCZX8lping+bZcmShseiqlh2bnUDiPA=
github.com/ebitengine/purego v0.7.1/go.mod h1:ah1In8AOtksoNK6yk5z1HTJeUkC1Ez4Wk2idgGslMwQ=
github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU=
github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM=
github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
Expand Down Expand Up @@ -367,10 +370,12 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/oleiade/reflections v1.1.0 h1:D+I/UsXQB4esMathlt0kkZRJZdUDmhv5zGi/HOwYTWo=
github.com/oleiade/reflections v1.1.0/go.mod h1:mCxx0QseeVCHs5Um5HhJeCKVC7AwS8kO67tky4rdisA=
github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM=
github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo=
github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4=
github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg=
github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo=
github.com/onsi/gomega v1.36.0 h1:Pb12RlruUtj4XUuPUqeEWc6j5DkVVVA49Uf6YLfC95Y=
github.com/onsi/gomega v1.36.0/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs=
github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
github.com/outcaste-io/ristretto v0.2.3 h1:AK4zt/fJ76kjlYObOeNwh4T3asEuaCmp26pOvUOL9w0=
Expand Down Expand Up @@ -448,8 +453,8 @@ github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0b
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8=
github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY=
github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0=
github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w=
github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
Expand Down Expand Up @@ -563,8 +568,8 @@ golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOM
golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=
golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 h1:yixxcjnhBmY0nkL253HFVIm0JsFHwrHdT3Yh6szTnfY=
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8/go.mod h1:jj3sYF3dwk5D+ghuXyeI3r5MFf+NT2An6/9dOA95KSI=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
Expand Down
5 changes: 5 additions & 0 deletions internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ type Config struct {
DefaultSidecarParams *SidecarParams `json:"default-sidecar-params" validate:"omitempty"`
DefaultMetadata Metadata `json:"default-metadata" validate:"omitempty"`

DefaultImagePullPolicy corev1.PullPolicy `json:"default-image-pull-policy" validate:"omitempty"`
DefaultImageCheckPullPolicy corev1.PullPolicy `json:"default-image-check-pull-policy" validate:"omitempty"`

// ProhibitKubernetesPlugin can be used to prevent alterations to the pod
// from the job (the kubernetes "plugin" in pipeline.yml). If enabled,
// jobs with a "kubernetes" plugin will fail.
Expand Down Expand Up @@ -120,6 +123,8 @@ func (c Config) MarshalLogObject(enc zapcore.ObjectEncoder) error {
if err := enc.AddReflected("default-metadata", c.DefaultMetadata); err != nil {
return err
}
enc.AddString("default-image-pull-policy", string(c.DefaultImagePullPolicy))
enc.AddString("default-image-check-pull-policy", string(c.DefaultImageCheckPullPolicy))
return nil
}

Expand Down
28 changes: 15 additions & 13 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,19 +87,21 @@ func Run(
// Scheduler does the complicated work of converting a Buildkite job into
// a pod to run that job. It talks to the k8s API to create pods.
sched := scheduler.New(logger.Named("scheduler"), k8sClient, scheduler.Config{
Namespace: cfg.Namespace,
Image: cfg.Image,
AgentTokenSecretName: cfg.AgentTokenSecret,
JobTTL: cfg.JobTTL,
AdditionalRedactedVars: cfg.AdditionalRedactedVars,
WorkspaceVolume: cfg.WorkspaceVolume,
AgentConfig: cfg.AgentConfig,
DefaultCheckoutParams: cfg.DefaultCheckoutParams,
DefaultCommandParams: cfg.DefaultCommandParams,
DefaultSidecarParams: cfg.DefaultSidecarParams,
DefaultMetadata: cfg.DefaultMetadata,
PodSpecPatch: cfg.PodSpecPatch,
ProhibitK8sPlugin: cfg.ProhibitKubernetesPlugin,
Namespace: cfg.Namespace,
Image: cfg.Image,
AgentTokenSecretName: cfg.AgentTokenSecret,
JobTTL: cfg.JobTTL,
AdditionalRedactedVars: cfg.AdditionalRedactedVars,
WorkspaceVolume: cfg.WorkspaceVolume,
AgentConfig: cfg.AgentConfig,
DefaultCheckoutParams: cfg.DefaultCheckoutParams,
DefaultCommandParams: cfg.DefaultCommandParams,
DefaultSidecarParams: cfg.DefaultSidecarParams,
DefaultMetadata: cfg.DefaultMetadata,
DefaultImagePullPolicy: cfg.DefaultImagePullPolicy,
DefaultImageCheckPullPolicy: cfg.DefaultImageCheckPullPolicy,
PodSpecPatch: cfg.PodSpecPatch,
ProhibitK8sPlugin: cfg.ProhibitKubernetesPlugin,
})

informerFactory, err := NewInformerFactory(k8sClient, cfg.Namespace, cfg.Tags)
Expand Down
33 changes: 15 additions & 18 deletions internal/controller/scheduler/job_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ type jobWatcher struct {

// Tracks stalling jobs (jobs that have yet to create pods).
stallingJobsMu sync.Mutex
stallingJobs map[*batchv1.Job]struct{}
stallingJobs map[uuid.UUID]*batchv1.Job

// Tracks jobs that are being cleaned up (to avoid repeats).
ignoredJobsMu sync.RWMutex
Expand All @@ -58,7 +58,7 @@ func NewJobWatcher(logger *zap.Logger, k8sClient kubernetes.Interface, cfg *conf
logger: logger,
k8s: k8sClient,
cfg: cfg,
stallingJobs: make(map[*batchv1.Job]struct{}),
stallingJobs: make(map[uuid.UUID]*batchv1.Job),
ignoredJobs: make(map[uuid.UUID]struct{}),
}
jobsStallingGaugeFunc = func() int {
Expand Down Expand Up @@ -118,13 +118,14 @@ func (w *jobWatcher) OnDelete(prev any) {
if kjob == nil {
return
}
w.removeFromStalling(kjob)

jobUUID, err := jobUUIDForObject(kjob)
if err != nil {
return
}

w.removeFromStalling(jobUUID)

// The job is gone, so we can stop ignoring it (if it comes back).
w.unignoreJob(jobUUID)

Expand All @@ -145,10 +146,10 @@ func (w *jobWatcher) runChecks(ctx context.Context, kjob *batchv1.Job) {
}

if model.JobFinished(kjob) {
w.removeFromStalling(kjob)
w.removeFromStalling(jobUUID)
w.checkFinishedWithoutPod(ctx, log, kjob)
} else {
w.checkStalledWithoutPod(log, kjob)
w.checkStalledWithoutPod(log, jobUUID, kjob)
}
}

Expand All @@ -171,7 +172,7 @@ func (w *jobWatcher) checkFinishedWithoutPod(ctx context.Context, log *zap.Logge
w.failJob(ctx, log, kjob, message)
}

func (w *jobWatcher) checkStalledWithoutPod(log *zap.Logger, kjob *batchv1.Job) {
func (w *jobWatcher) checkStalledWithoutPod(log *zap.Logger, jobUUID uuid.UUID, kjob *batchv1.Job) {
log.Debug("Checking job for stalling without a pod")

// If the job is not finished and there is no pod, it should start one
Expand All @@ -184,7 +185,7 @@ func (w *jobWatcher) checkStalledWithoutPod(log *zap.Logger, kjob *batchv1.Job)
}
if pods > 0 {
// All's well with the world.
w.removeFromStalling(kjob)
w.removeFromStalling(jobUUID)
return
}

Expand All @@ -193,7 +194,7 @@ func (w *jobWatcher) checkStalledWithoutPod(log *zap.Logger, kjob *batchv1.Job)
return
}

w.addToStalling(kjob)
w.addToStalling(jobUUID, kjob)
}

func (w *jobWatcher) fetchEvents(ctx context.Context, log *zap.Logger, kjob *batchv1.Job) string {
Expand Down Expand Up @@ -248,16 +249,16 @@ func (w *jobWatcher) formatEvents(evlist *corev1.EventList) string {
return tw.Render()
}

func (w *jobWatcher) addToStalling(kjob *batchv1.Job) {
func (w *jobWatcher) addToStalling(jobUUID uuid.UUID, kjob *batchv1.Job) {
w.stallingJobsMu.Lock()
defer w.stallingJobsMu.Unlock()
w.stallingJobs[kjob] = struct{}{}
w.stallingJobs[jobUUID] = kjob
}

func (w *jobWatcher) removeFromStalling(kjob *batchv1.Job) {
func (w *jobWatcher) removeFromStalling(jobUUID uuid.UUID) {
w.stallingJobsMu.Lock()
defer w.stallingJobsMu.Unlock()
delete(w.stallingJobs, kjob)
delete(w.stallingJobs, jobUUID)
}

func (w *jobWatcher) stalledJobChecker(ctx context.Context) {
Expand All @@ -274,21 +275,17 @@ func (w *jobWatcher) stalledJobChecker(ctx context.Context) {
// Gather stalled jobs
var stalled []*batchv1.Job
w.stallingJobsMu.Lock()
for kjob := range w.stallingJobs {
for jobUUID, kjob := range w.stallingJobs {
if time.Since(kjob.Status.StartTime.Time) < w.cfg.EmptyJobGracePeriod {
continue
}

// ignore it from now until it is deleted
jobUUID, err := jobUUIDForObject(kjob)
if err != nil {
continue
}
w.ignoreJob(jobUUID)

// Move it from w.stalling into stalled
stalled = append(stalled, kjob)
delete(w.stallingJobs, kjob)
delete(w.stallingJobs, jobUUID)
}
w.stallingJobsMu.Unlock()

Expand Down
13 changes: 10 additions & 3 deletions internal/controller/scheduler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,10 @@ var (
// Pod watcher metrics

var (
// Overridden to return len(jobCancelCheckers) by podWatcher.
jobCancelCheckerGaugeFunc = func() int { return 0 }
podWatcherIgnoredJobsGaugeFunc = func() int { return 0 }
// Overridden by podWatcher.
jobCancelCheckerGaugeFunc = func() int { return 0 }
podWatcherIgnoredJobsGaugeFunc = func() int { return 0 }
watchingForImageFailureGaugeFunc = func() int { return 0 }

_ = promauto.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: promNamespace,
Expand All @@ -149,6 +150,12 @@ var (
Name: "num_ignored_jobs",
Help: "Current count of jobs ignored for podWatcher checks",
}, func() float64 { return float64(podWatcherIgnoredJobsGaugeFunc()) })
_ = promauto.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: promNamespace,
Subsystem: "pod_watcher",
Name: "num_watching_for_image_failure",
Help: "Current count of pods being watched for potential image-related failures",
}, func() float64 { return float64(watchingForImageFailureGaugeFunc()) })

podWatcherOnAddEventCounter = promauto.NewCounter(prometheus.CounterOpts{
Namespace: promNamespace,
Expand Down
Loading

0 comments on commit 5a7e9bb

Please sign in to comment.