Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Image check improvements #459

Merged
merged 1 commit into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions charts/agent-stack-k8s/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,18 @@
}
}
},
"default-image-pull-policy": {
"type": "string",
"description": "Configures a default image pull policy for containers that do not specify a pull policy, or containers created by the stack itself",
"default": "IfNotPresent",
"examples": ["Always", "IfNotPresent", "Never", ""]
},
"default-image-check-pull-policy": {
"type": "string",
"description": "Configures a default image pull policy for image-check init containers, used if an image pull policy is not set for the corresponding container in a podSpec or podSpecPatch",
"default": "",
"examples": ["Always", "IfNotPresent", "Never", ""]
},
"pod-spec-patch": {
"$ref": "https://kubernetesjsonschema.dev/master/_definitions.json#/definitions/io.k8s.api.core.v1.PodSpec"
}
Expand Down
10 changes: 10 additions & 0 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,16 @@ func AddConfigFlags(cmd *cobra.Command) {
config.DefaultEmptyJobGracePeriod,
"Duration after starting a Kubernetes job that the controller will wait before considering failing the job due to a missing pod (e.g. when the podSpec specifies a missing service account)",
)
cmd.Flags().String(
"default-image-pull-policy",
"IfNotPresent",
"Configures a default image pull policy for containers that do not specify a pull policy and non-init containers created by the stack itself",
)
cmd.Flags().String(
"default-image-check-pull-policy",
"",
"Sets a default PullPolicy for image-check init containers, used if an image pull policy is not set for the corresponding container in a podSpec or podSpecPatch",
)
cmd.Flags().Bool(
"prohibit-kubernetes-plugin",
false,
Expand Down
2 changes: 2 additions & 0 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ func TestReadAndParseConfig(t *testing.T) {
ClusterUUID: "beefcafe-abbe-baba-abba-deedcedecade",
ProhibitKubernetesPlugin: true,
GraphQLEndpoint: "http://graphql.buildkite.localhost/v1",
DefaultImagePullPolicy: "Never",
DefaultImageCheckPullPolicy: "IfNotPresent",

WorkspaceVolume: &corev1.Volume{
Name: "workspace-2-the-reckoning",
Expand Down
2 changes: 2 additions & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ job-creation-concurrency: 5
max-in-flight: 100
namespace: my-buildkite-ns
org: my-buildkite-org
default-image-pull-policy: Never
default-image-check-pull-policy: IfNotPresent

# Setting a custom GraphQL endpoint is usually only useful if you have a
# different instance of Buildkite itself available to run.
Expand Down
12 changes: 8 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/Khan/genqlient v0.7.0
github.com/buildkite/go-buildkite/v3 v3.13.0
github.com/buildkite/roko v1.2.0
github.com/distribution/reference v0.6.0
github.com/go-playground/locales v0.14.1
github.com/go-playground/universal-translator v0.18.1
github.com/go-playground/validator/v10 v10.23.0
Expand Down Expand Up @@ -69,7 +70,7 @@ require (
github.com/dustinkirkland/golang-petname v0.0.0-20240428194347-eebcea082ee0 // indirect
github.com/eapache/queue/v2 v2.0.0-20230407133247-75960ed334e4 // indirect
github.com/ebitengine/purego v0.7.1 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/fatih/color v1.18.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
Expand All @@ -95,6 +96,9 @@ require (
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/onsi/ginkgo/v2 v2.22.0 // indirect
github.com/onsi/gomega v1.36.0 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/outcaste-io/ristretto v0.2.3 // indirect
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/power-devops/perfstat v0.0.0-20220216144756-c35f1ee13d7c // indirect
Expand Down Expand Up @@ -149,7 +153,7 @@ require (
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/denisbrodbeck/machineid v1.0.1 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/emicklei/go-restful/v3 v3.12.1 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/go-chi/chi/v5 v5.1.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
Expand Down Expand Up @@ -194,7 +198,7 @@ require (
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/secure-systems-lab/go-securesystemslib v0.8.0 // indirect
github.com/spf13/afero v1.11.0 // indirect
github.com/spf13/cast v1.6.0 // indirect
github.com/spf13/cast v1.7.0 // indirect
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.10.0
github.com/subosito/gotenv v1.6.0 // indirect
Expand All @@ -216,7 +220,7 @@ require (
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.29.0 // indirect
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
golang.org/x/net v0.31.0 // indirect
golang.org/x/oauth2 v0.24.0 // indirect
golang.org/x/sys v0.27.0 // indirect
Expand Down
27 changes: 16 additions & 11 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/r
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/dnephin/pflag v1.0.7 h1:oxONGlWxhmUct0YzKTgrpQv9AUA1wtPBn7zuSjJqptk=
github.com/dnephin/pflag v1.0.7/go.mod h1:uxE91IoWURlOiTUIA8Mq5ZZkAv3dPUfZNaT80Zm7OQE=
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
Expand All @@ -170,16 +172,17 @@ github.com/eapache/queue/v2 v2.0.0-20230407133247-75960ed334e4 h1:8EXxF+tCLqaVk8
github.com/eapache/queue/v2 v2.0.0-20230407133247-75960ed334e4/go.mod h1:I5sHm0Y0T1u5YjlyqC5GVArM7aNZRUYtTjmJ8mPJFds=
github.com/ebitengine/purego v0.7.1 h1:6/55d26lG3o9VCZX8lping+bZcmShseiqlh2bnUDiPA=
github.com/ebitengine/purego v0.7.1/go.mod h1:ah1In8AOtksoNK6yk5z1HTJeUkC1Ez4Wk2idgGslMwQ=
github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU=
github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM=
github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
Expand Down Expand Up @@ -367,10 +370,12 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/oleiade/reflections v1.1.0 h1:D+I/UsXQB4esMathlt0kkZRJZdUDmhv5zGi/HOwYTWo=
github.com/oleiade/reflections v1.1.0/go.mod h1:mCxx0QseeVCHs5Um5HhJeCKVC7AwS8kO67tky4rdisA=
github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM=
github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo=
github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4=
github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg=
github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo=
github.com/onsi/gomega v1.36.0 h1:Pb12RlruUtj4XUuPUqeEWc6j5DkVVVA49Uf6YLfC95Y=
github.com/onsi/gomega v1.36.0/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs=
github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
github.com/outcaste-io/ristretto v0.2.3 h1:AK4zt/fJ76kjlYObOeNwh4T3asEuaCmp26pOvUOL9w0=
Expand Down Expand Up @@ -448,8 +453,8 @@ github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0b
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8=
github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY=
github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0=
github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w=
github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
Expand Down Expand Up @@ -563,8 +568,8 @@ golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOM
golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=
golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 h1:yixxcjnhBmY0nkL253HFVIm0JsFHwrHdT3Yh6szTnfY=
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8/go.mod h1:jj3sYF3dwk5D+ghuXyeI3r5MFf+NT2An6/9dOA95KSI=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
Expand Down
5 changes: 5 additions & 0 deletions internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ type Config struct {
DefaultSidecarParams *SidecarParams `json:"default-sidecar-params" validate:"omitempty"`
DefaultMetadata Metadata `json:"default-metadata" validate:"omitempty"`

DefaultImagePullPolicy corev1.PullPolicy `json:"default-image-pull-policy" validate:"omitempty"`
DefaultImageCheckPullPolicy corev1.PullPolicy `json:"default-image-check-pull-policy" validate:"omitempty"`

// ProhibitKubernetesPlugin can be used to prevent alterations to the pod
// from the job (the kubernetes "plugin" in pipeline.yml). If enabled,
// jobs with a "kubernetes" plugin will fail.
Expand Down Expand Up @@ -120,6 +123,8 @@ func (c Config) MarshalLogObject(enc zapcore.ObjectEncoder) error {
if err := enc.AddReflected("default-metadata", c.DefaultMetadata); err != nil {
return err
}
enc.AddString("default-image-pull-policy", string(c.DefaultImagePullPolicy))
enc.AddString("default-image-check-pull-policy", string(c.DefaultImageCheckPullPolicy))
return nil
}

Expand Down
28 changes: 15 additions & 13 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,19 +87,21 @@ func Run(
// Scheduler does the complicated work of converting a Buildkite job into
// a pod to run that job. It talks to the k8s API to create pods.
sched := scheduler.New(logger.Named("scheduler"), k8sClient, scheduler.Config{
Namespace: cfg.Namespace,
Image: cfg.Image,
AgentTokenSecretName: cfg.AgentTokenSecret,
JobTTL: cfg.JobTTL,
AdditionalRedactedVars: cfg.AdditionalRedactedVars,
WorkspaceVolume: cfg.WorkspaceVolume,
AgentConfig: cfg.AgentConfig,
DefaultCheckoutParams: cfg.DefaultCheckoutParams,
DefaultCommandParams: cfg.DefaultCommandParams,
DefaultSidecarParams: cfg.DefaultSidecarParams,
DefaultMetadata: cfg.DefaultMetadata,
PodSpecPatch: cfg.PodSpecPatch,
ProhibitK8sPlugin: cfg.ProhibitKubernetesPlugin,
Namespace: cfg.Namespace,
Image: cfg.Image,
AgentTokenSecretName: cfg.AgentTokenSecret,
JobTTL: cfg.JobTTL,
AdditionalRedactedVars: cfg.AdditionalRedactedVars,
WorkspaceVolume: cfg.WorkspaceVolume,
AgentConfig: cfg.AgentConfig,
DefaultCheckoutParams: cfg.DefaultCheckoutParams,
DefaultCommandParams: cfg.DefaultCommandParams,
DefaultSidecarParams: cfg.DefaultSidecarParams,
DefaultMetadata: cfg.DefaultMetadata,
DefaultImagePullPolicy: cfg.DefaultImagePullPolicy,
DefaultImageCheckPullPolicy: cfg.DefaultImageCheckPullPolicy,
PodSpecPatch: cfg.PodSpecPatch,
ProhibitK8sPlugin: cfg.ProhibitKubernetesPlugin,
})

informerFactory, err := NewInformerFactory(k8sClient, cfg.Namespace, cfg.Tags)
Expand Down
33 changes: 15 additions & 18 deletions internal/controller/scheduler/job_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ type jobWatcher struct {

// Tracks stalling jobs (jobs that have yet to create pods).
stallingJobsMu sync.Mutex
stallingJobs map[*batchv1.Job]struct{}
stallingJobs map[uuid.UUID]*batchv1.Job

// Tracks jobs that are being cleaned up (to avoid repeats).
ignoredJobsMu sync.RWMutex
Expand All @@ -58,7 +58,7 @@ func NewJobWatcher(logger *zap.Logger, k8sClient kubernetes.Interface, cfg *conf
logger: logger,
k8s: k8sClient,
cfg: cfg,
stallingJobs: make(map[*batchv1.Job]struct{}),
stallingJobs: make(map[uuid.UUID]*batchv1.Job),
wolfeidau marked this conversation as resolved.
Show resolved Hide resolved
ignoredJobs: make(map[uuid.UUID]struct{}),
}
jobsStallingGaugeFunc = func() int {
Expand Down Expand Up @@ -118,13 +118,14 @@ func (w *jobWatcher) OnDelete(prev any) {
if kjob == nil {
return
}
w.removeFromStalling(kjob)

jobUUID, err := jobUUIDForObject(kjob)
if err != nil {
return
}

w.removeFromStalling(jobUUID)

// The job is gone, so we can stop ignoring it (if it comes back).
w.unignoreJob(jobUUID)

Expand All @@ -145,10 +146,10 @@ func (w *jobWatcher) runChecks(ctx context.Context, kjob *batchv1.Job) {
}

if model.JobFinished(kjob) {
w.removeFromStalling(kjob)
w.removeFromStalling(jobUUID)
w.checkFinishedWithoutPod(ctx, log, kjob)
} else {
w.checkStalledWithoutPod(log, kjob)
w.checkStalledWithoutPod(log, jobUUID, kjob)
}
}

Expand All @@ -171,7 +172,7 @@ func (w *jobWatcher) checkFinishedWithoutPod(ctx context.Context, log *zap.Logge
w.failJob(ctx, log, kjob, message)
}

func (w *jobWatcher) checkStalledWithoutPod(log *zap.Logger, kjob *batchv1.Job) {
func (w *jobWatcher) checkStalledWithoutPod(log *zap.Logger, jobUUID uuid.UUID, kjob *batchv1.Job) {
log.Debug("Checking job for stalling without a pod")

// If the job is not finished and there is no pod, it should start one
Expand All @@ -184,7 +185,7 @@ func (w *jobWatcher) checkStalledWithoutPod(log *zap.Logger, kjob *batchv1.Job)
}
if pods > 0 {
// All's well with the world.
w.removeFromStalling(kjob)
w.removeFromStalling(jobUUID)
return
}

Expand All @@ -193,7 +194,7 @@ func (w *jobWatcher) checkStalledWithoutPod(log *zap.Logger, kjob *batchv1.Job)
return
}

w.addToStalling(kjob)
w.addToStalling(jobUUID, kjob)
}

func (w *jobWatcher) fetchEvents(ctx context.Context, log *zap.Logger, kjob *batchv1.Job) string {
Expand Down Expand Up @@ -248,16 +249,16 @@ func (w *jobWatcher) formatEvents(evlist *corev1.EventList) string {
return tw.Render()
}

func (w *jobWatcher) addToStalling(kjob *batchv1.Job) {
func (w *jobWatcher) addToStalling(jobUUID uuid.UUID, kjob *batchv1.Job) {
w.stallingJobsMu.Lock()
defer w.stallingJobsMu.Unlock()
w.stallingJobs[kjob] = struct{}{}
w.stallingJobs[jobUUID] = kjob
}

func (w *jobWatcher) removeFromStalling(kjob *batchv1.Job) {
func (w *jobWatcher) removeFromStalling(jobUUID uuid.UUID) {
w.stallingJobsMu.Lock()
defer w.stallingJobsMu.Unlock()
delete(w.stallingJobs, kjob)
delete(w.stallingJobs, jobUUID)
}

func (w *jobWatcher) stalledJobChecker(ctx context.Context) {
Expand All @@ -274,21 +275,17 @@ func (w *jobWatcher) stalledJobChecker(ctx context.Context) {
// Gather stalled jobs
var stalled []*batchv1.Job
w.stallingJobsMu.Lock()
for kjob := range w.stallingJobs {
for jobUUID, kjob := range w.stallingJobs {
if time.Since(kjob.Status.StartTime.Time) < w.cfg.EmptyJobGracePeriod {
continue
}

// ignore it from now until it is deleted
jobUUID, err := jobUUIDForObject(kjob)
if err != nil {
continue
}
w.ignoreJob(jobUUID)

// Move it from w.stalling into stalled
stalled = append(stalled, kjob)
delete(w.stallingJobs, kjob)
delete(w.stallingJobs, jobUUID)
}
w.stallingJobsMu.Unlock()

Expand Down
13 changes: 10 additions & 3 deletions internal/controller/scheduler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,10 @@ var (
// Pod watcher metrics

var (
// Overridden to return len(jobCancelCheckers) by podWatcher.
jobCancelCheckerGaugeFunc = func() int { return 0 }
podWatcherIgnoredJobsGaugeFunc = func() int { return 0 }
// Overridden by podWatcher.
jobCancelCheckerGaugeFunc = func() int { return 0 }
podWatcherIgnoredJobsGaugeFunc = func() int { return 0 }
watchingForImageFailureGaugeFunc = func() int { return 0 }

_ = promauto.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: promNamespace,
Expand All @@ -149,6 +150,12 @@ var (
Name: "num_ignored_jobs",
Help: "Current count of jobs ignored for podWatcher checks",
}, func() float64 { return float64(podWatcherIgnoredJobsGaugeFunc()) })
_ = promauto.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: promNamespace,
Subsystem: "pod_watcher",
Name: "num_watching_for_image_failure",
Help: "Current count of pods being watched for potential image-related failures",
}, func() float64 { return float64(watchingForImageFailureGaugeFunc()) })

podWatcherOnAddEventCounter = promauto.NewCounter(prometheus.CounterOpts{
Namespace: promNamespace,
Expand Down
Loading