Skip to content

Commit

Permalink
feat: add metrics for jobset
Browse files Browse the repository at this point in the history
  • Loading branch information
googs1025 committed Jul 4, 2024
1 parent 0105ce5 commit 434f4d2
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 0 deletions.
3 changes: 3 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (

jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
"sigs.k8s.io/jobset/pkg/controllers"
"sigs.k8s.io/jobset/pkg/metrics"
"sigs.k8s.io/jobset/pkg/util/cert"
"sigs.k8s.io/jobset/pkg/webhooks"
//+kubebuilder:scaffold:imports
Expand Down Expand Up @@ -88,6 +89,8 @@ func main() {
os.Exit(1)
}

metrics.Register()

mgr, err := ctrl.NewManager(kubeConfig, ctrl.Options{
Scheme: scheme,
Metrics: server.Options{
Expand Down
3 changes: 3 additions & 0 deletions pkg/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ limitations under the License.
package constants

const (
// JobSetName is the component name "jobset"; it is used as the subsystem (name prefix) of the Prometheus metrics.
JobSetName = "jobset"

// JobOwnerKey is the field used to build the JobSet index, which enables looking up Jobs
// by the owner JobSet quickly.
JobOwnerKey = ".metadata.controller"
Expand Down
3 changes: 3 additions & 0 deletions pkg/controllers/failure_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
"sigs.k8s.io/jobset/pkg/constants"
"sigs.k8s.io/jobset/pkg/metrics"
)

// actionFunctionMap relates jobset failure policy action names to the appropriate behavior during jobset reconciliation.
Expand Down Expand Up @@ -258,6 +259,8 @@ func makeFailedConditionOpts(reason, msg string) *conditionOpts {
func setJobSetFailedCondition(js *jobset.JobSet, reason, msg string, updateStatusOpts *statusUpdateOpts) {
	// Record the failed condition and flag the JobSet as terminally failed.
	failedOpts := makeFailedConditionOpts(reason, msg)
	setCondition(js, failedOpts, updateStatusOpts)
	js.Status.TerminalState = string(jobset.JobSetFailed)

	// Count the failure under the JobSet's "namespace/name" key.
	jobSetKey := fmt.Sprintf("%s/%s", js.Namespace, js.Name)
	metrics.FailedCase(jobSetKey)
}

// findJobFailureTimeAndReason is a helper function which extracts the Job failure condition from a Job,
Expand Down
3 changes: 3 additions & 0 deletions pkg/controllers/jobset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (

jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
"sigs.k8s.io/jobset/pkg/constants"
"sigs.k8s.io/jobset/pkg/metrics"
"sigs.k8s.io/jobset/pkg/util/collections"
"sigs.k8s.io/jobset/pkg/util/placement"
)
Expand Down Expand Up @@ -948,6 +949,8 @@ func updateCondition(js *jobset.JobSet, opts *conditionOpts) bool {
func setJobSetCompletedCondition(js *jobset.JobSet, updateStatusOpts *statusUpdateOpts) {
	// Record the completed condition and flag the JobSet as terminally completed.
	completedOpts := makeCompletedConditionsOpts()
	setCondition(js, completedOpts, updateStatusOpts)
	js.Status.TerminalState = string(jobset.JobSetCompleted)

	// Count the completion under the JobSet's "namespace/name" key.
	jobSetKey := fmt.Sprintf("%s/%s", js.Namespace, js.Name)
	metrics.CompletedCase(jobSetKey)
}

// setJobSetSuspendedCondition sets a condition on the JobSet status indicating it is currently suspended.
Expand Down
61 changes: 61 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"

"sigs.k8s.io/jobset/pkg/constants"
)

var (
	// FailedTotal counts how many times a JobSet was marked failed; it is
	// incremented through FailedCase. The subsystem already contributes the
	// "jobset" prefix, so the metric name itself must not repeat it: the
	// fully-qualified name is "jobset_failed_total" rather than the stuttering
	// "jobset_jobset_failed_total". The single label carries the JobSet's
	// "namespace/name" key and follows Prometheus' snake_case label convention.
	FailedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.JobSetName,
			Name:      "failed_total",
			Help:      `The total number of failed JobSets`,
		}, []string{"jobset_name"},
	)

	// CompletedTotal counts how many times a JobSet was marked completed; it is
	// incremented through CompletedCase. Its fully-qualified name is
	// "jobset_completed_total", and its single label carries the JobSet's
	// "namespace/name" key.
	CompletedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.JobSetName,
			Name:      "completed_total",
			Help:      `The total number of completed JobSets`,
		}, []string{"jobset_name"},
	)
)

// FailedCase bumps the failed counter for one JobSet by one.
// namespaceName is used verbatim as the label value; callers pass the
// JobSet key in "namespace/name" form.
func FailedCase(namespaceName string) {
	failedCounter := FailedTotal.WithLabelValues(namespaceName)
	failedCounter.Inc()
}

// CompletedCase bumps the completed counter for one JobSet by one.
// namespaceName is used verbatim as the label value; callers pass the
// JobSet key in "namespace/name" form.
func CompletedCase(namespaceName string) {
	completedCounter := CompletedTotal.WithLabelValues(namespaceName)
	completedCounter.Inc()
}

func Register() {
metrics.Registry.MustRegister(
FailedTotal,
CompletedTotal,
)
}
65 changes: 65 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"fmt"
"testing"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)

// TestFailedCase checks that FailedCase keeps one counter series per JobSet
// key and increments only the series that matches the key it is given.
func TestFailedCase(t *testing.T) {
	// Register with a private registry instead of the global default one:
	// MustRegister panics on a duplicate registration, which broke repeated
	// runs in the same process (e.g. `go test -count=2`).
	prometheus.NewRegistry().MustRegister(FailedTotal)

	// Start from, and leave behind, an empty vector so series recorded by
	// earlier runs or other tests cannot skew the expected counts.
	FailedTotal.Reset()
	t.Cleanup(FailedTotal.Reset)

	firstKey := fmt.Sprintf("%s/%s", "default", "jobset-test1")
	secondKey := fmt.Sprintf("%s/%s", "default", "jobset-test2")

	FailedCase(firstKey)
	FailedCase(secondKey)
	FailedCase(firstKey)

	// Two distinct keys were used, so exactly two series must exist.
	if count := testutil.CollectAndCount(FailedTotal); count != 2 {
		t.Errorf("Expecting %d metrics, got: %d", 2, count)
	}

	expected := map[string]float64{
		firstKey:  2,
		secondKey: 1,
	}
	for key, want := range expected {
		if got := testutil.ToFloat64(FailedTotal.WithLabelValues(key)); got != want {
			t.Errorf("Expecting %s to have value %v, but got %v", key, want, got)
		}
	}
}

// TestCompletedCase checks that CompletedCase keeps one counter series per
// JobSet key and increments only the series that matches the key it is given.
func TestCompletedCase(t *testing.T) {
	// Register with a private registry instead of the global default one:
	// MustRegister panics on a duplicate registration, which broke repeated
	// runs in the same process (e.g. `go test -count=2`).
	prometheus.NewRegistry().MustRegister(CompletedTotal)

	// Start from, and leave behind, an empty vector so series recorded by
	// earlier runs or other tests cannot skew the expected counts.
	CompletedTotal.Reset()
	t.Cleanup(CompletedTotal.Reset)

	firstKey := fmt.Sprintf("%s/%s", "default", "jobset-test1")
	secondKey := fmt.Sprintf("%s/%s", "default", "jobset-test2")

	CompletedCase(firstKey)
	CompletedCase(secondKey)
	CompletedCase(firstKey)

	// Two distinct keys were used, so exactly two series must exist.
	if count := testutil.CollectAndCount(CompletedTotal); count != 2 {
		t.Errorf("Expecting %d metrics, got: %d", 2, count)
	}

	// Failure messages report the key actually under test (the original
	// printed unrelated "lws-sample-*" names copied from another project).
	expected := map[string]float64{
		firstKey:  2,
		secondKey: 1,
	}
	for key, want := range expected {
		if got := testutil.ToFloat64(CompletedTotal.WithLabelValues(key)); got != want {
			t.Errorf("Expecting %s to have value %v, but got %v", key, want, got)
		}
	}
}

0 comments on commit 434f4d2

Please sign in to comment.