From 434f4d2e5cf86e4ae35a25748afe0119a26edfc2 Mon Sep 17 00:00:00 2001
From: googs1025
Date: Thu, 4 Jul 2024 09:32:54 +0800
Subject: [PATCH] feat: add metrics for jobset

---
 main.go                              |  3 ++
 pkg/constants/constants.go           |  3 ++
 pkg/controllers/failure_policy.go    |  3 ++
 pkg/controllers/jobset_controller.go |  3 ++
 pkg/metrics/metrics.go               | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++
 pkg/metrics/metrics_test.go          | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 150 insertions(+)
 create mode 100644 pkg/metrics/metrics.go
 create mode 100644 pkg/metrics/metrics_test.go

diff --git a/main.go b/main.go
index 9fd97006..1a45de09 100644
--- a/main.go
+++ b/main.go
@@ -38,6 +38,7 @@ import (
 
 	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 	"sigs.k8s.io/jobset/pkg/controllers"
+	"sigs.k8s.io/jobset/pkg/metrics"
 	"sigs.k8s.io/jobset/pkg/util/cert"
 	"sigs.k8s.io/jobset/pkg/webhooks"
 	//+kubebuilder:scaffold:imports
@@ -88,6 +89,8 @@ func main() {
 		os.Exit(1)
 	}
 
+	metrics.Register()
+
 	mgr, err := ctrl.NewManager(kubeConfig, ctrl.Options{
 		Scheme: scheme,
 		Metrics: server.Options{
diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go
index 4d641ab7..c1f8a038 100644
--- a/pkg/constants/constants.go
+++ b/pkg/constants/constants.go
@@ -17,6 +17,9 @@ limitations under the License.
 package constants
 
 const (
+	// JobSetName is the subsystem prefix used for the JobSet controller metrics.
+	JobSetName = "jobset"
+
 	// JobOwnerKey is the field used to build the JobSet index, which enables looking up Jobs
 	// by the owner JobSet quickly.
 	JobOwnerKey = ".metadata.controller"
diff --git a/pkg/controllers/failure_policy.go b/pkg/controllers/failure_policy.go
index 0c09c291..1860f5fa 100644
--- a/pkg/controllers/failure_policy.go
+++ b/pkg/controllers/failure_policy.go
@@ -26,6 +26,7 @@ import (
 
 	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 	"sigs.k8s.io/jobset/pkg/constants"
+	"sigs.k8s.io/jobset/pkg/metrics"
 )
 
 // actionFunctionMap relates jobset failure policy action names to the appropriate behavior during jobset reconciliation.
@@ -258,6 +259,8 @@ func makeFailedConditionOpts(reason, msg string) *conditionOpts {
 func setJobSetFailedCondition(js *jobset.JobSet, reason, msg string, updateStatusOpts *statusUpdateOpts) {
 	setCondition(js, makeFailedConditionOpts(reason, msg), updateStatusOpts)
 	js.Status.TerminalState = string(jobset.JobSetFailed)
+	// Count the failure; the metric label value is the JobSet's namespace/name.
+	metrics.FailedCase(fmt.Sprintf("%s/%s", js.Namespace, js.Name))
 }
 
 // findJobFailureTimeAndReason is a helper function which extracts the Job failure condition from a Job,
diff --git a/pkg/controllers/jobset_controller.go b/pkg/controllers/jobset_controller.go
index 86ce029a..7073cce7 100644
--- a/pkg/controllers/jobset_controller.go
+++ b/pkg/controllers/jobset_controller.go
@@ -41,6 +41,7 @@ import (
 
 	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 	"sigs.k8s.io/jobset/pkg/constants"
+	"sigs.k8s.io/jobset/pkg/metrics"
 	"sigs.k8s.io/jobset/pkg/util/collections"
 	"sigs.k8s.io/jobset/pkg/util/placement"
 )
@@ -948,6 +949,8 @@ func updateCondition(js *jobset.JobSet, opts *conditionOpts) bool {
 func setJobSetCompletedCondition(js *jobset.JobSet, updateStatusOpts *statusUpdateOpts) {
 	setCondition(js, makeCompletedConditionsOpts(), updateStatusOpts)
 	js.Status.TerminalState = string(jobset.JobSetCompleted)
+	// Count the completion; the metric label value is the JobSet's namespace/name.
+	metrics.CompletedCase(fmt.Sprintf("%s/%s", js.Namespace, js.Name))
 }
 
 // setJobSetSuspendedCondition sets a condition on the JobSet status indicating it is currently suspended.
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
new file mode 100644
index 00000000..4a954d35
--- /dev/null
+++ b/pkg/metrics/metrics.go
@@ -0,0 +1,68 @@
+/*
+Copyright 2023 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package metrics defines the Prometheus metrics exposed by the JobSet controller.
+package metrics
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+
+	"sigs.k8s.io/jobset/pkg/constants"
+)
+
+var (
+	// FailedTotal counts the JobSets that were marked as failed. Together with the
+	// subsystem prefix it is exported as jobset_failed_total.
+	FailedTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: constants.JobSetName,
+			Name:      "failed_total",
+			Help:      "The total number of failed JobSets",
+		}, []string{"jobset_name"},
+	)
+
+	// CompletedTotal counts the JobSets that were marked as completed. Together with
+	// the subsystem prefix it is exported as jobset_completed_total.
+	CompletedTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: constants.JobSetName,
+			Name:      "completed_total",
+			Help:      "The total number of completed JobSets",
+		}, []string{"jobset_name"},
+	)
+)
+
+// FailedCase increments FailedTotal for a single JobSet.
+// namespaceName is used as the label value and has the form "namespace/name".
+func FailedCase(namespaceName string) {
+	FailedTotal.WithLabelValues(namespaceName).Inc()
+}
+
+// CompletedCase increments CompletedTotal for a single JobSet.
+// namespaceName is used as the label value and has the form "namespace/name".
+func CompletedCase(namespaceName string) {
+	CompletedTotal.WithLabelValues(namespaceName).Inc()
+}
+
+// Register adds the JobSet metrics to the controller-runtime metrics registry.
+// It panics if a collector is registered twice, so call it once at startup.
+func Register() {
+	metrics.Registry.MustRegister(
+		FailedTotal,
+		CompletedTotal,
+	)
+}
diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go
new file mode 100644
index 00000000..03dffe27
--- /dev/null
+++ b/pkg/metrics/metrics_test.go
@@ -0,0 +1,70 @@
+/*
+Copyright 2023 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+)
+
+// Label values ("namespace/name") shared by the tests below.
+const (
+	jobSetKey1 = "default/jobset-test1"
+	jobSetKey2 = "default/jobset-test2"
+)
+
+// expectCounts verifies that vec holds exactly two series: jobSetKey1 with a
+// value of 2 and jobSetKey2 with a value of 1.
+func expectCounts(t *testing.T, vec *prometheus.CounterVec) {
+	t.Helper()
+
+	if count := testutil.CollectAndCount(vec); count != 2 {
+		t.Errorf("Expecting %d metrics, got: %d", 2, count)
+	}
+
+	want := map[string]float64{jobSetKey1: 2, jobSetKey2: 1}
+	for key, value := range want {
+		if got := testutil.ToFloat64(vec.WithLabelValues(key)); got != value {
+			t.Errorf("Expecting %s to have value %v, but got %v", key, value, got)
+		}
+	}
+}
+
+func TestFailedCase(t *testing.T) {
+	// testutil collects from the vector itself, so it is not registered with the
+	// default registry; resetting keeps the test repeatable under -count=N.
+	FailedTotal.Reset()
+
+	FailedCase(jobSetKey1)
+	FailedCase(jobSetKey2)
+	FailedCase(jobSetKey1)
+
+	expectCounts(t, FailedTotal)
+}
+
+func TestCompletedCase(t *testing.T) {
+	// Reset rather than register, for the same reason as in TestFailedCase.
+	CompletedTotal.Reset()
+
+	CompletedCase(jobSetKey1)
+	CompletedCase(jobSetKey2)
+	CompletedCase(jobSetKey1)
+
+	expectCounts(t, CompletedTotal)
+}