Skip to content

Commit

Permalink
Add prometheus metrics for experiment and trial (#870)
Browse files Browse the repository at this point in the history
  • Loading branch information
hougangliu authored and k8s-ci-robot committed Oct 11, 2019
1 parent dd1fb5c commit 030b691
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 0 deletions.
4 changes: 4 additions & 0 deletions pkg/controller.v1alpha3/experiment/experiment_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ func (r *ReconcileExperiment) Reconcile(request reconcile.Request) (reconcile.Re
}
instance := original.DeepCopy()

if needUpdate, finalizers := needUpdateFinalizers(instance); needUpdate {
return r.updateFinalizers(instance, finalizers)
}

if instance.IsCompleted() && !instance.HasRunningTrials() {

return reconcile.Result{}, nil
Expand Down
48 changes: 48 additions & 0 deletions pkg/controller.v1alpha3/experiment/experiment_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,19 @@ import (
"k8s.io/apimachinery/pkg/types"
k8syaml "k8s.io/apimachinery/pkg/util/yaml"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

experimentsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1alpha3"
suggestionsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1alpha3"
trialsv1alpha3 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1alpha3"
utilv1alpha3 "github.com/kubeflow/katib/pkg/controller.v1alpha3/experiment/util"
"github.com/kubeflow/katib/pkg/controller.v1alpha3/util"
)

const (
updatePrometheusMetrics = "update-prometheus-metrics"
)

func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1alpha3.Experiment, trialAssignment *suggestionsv1alpha3.TrialAssignment) error {
BUFSIZE := 1024
logger := log.WithValues("Experiment", types.NamespacedName{Name: expInstance.GetName(), Namespace: expInstance.GetNamespace()})
Expand Down Expand Up @@ -62,3 +68,45 @@ func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1alph
return nil

}

func needUpdateFinalizers(exp *experimentsv1alpha3.Experiment) (bool, []string) {
deleted := !exp.ObjectMeta.DeletionTimestamp.IsZero()
pendingFinalizers := exp.GetFinalizers()
contained := false
for _, elem := range pendingFinalizers {
if elem == updatePrometheusMetrics {
contained = true
break
}
}

if !deleted && !contained {
finalizers := append(pendingFinalizers, updatePrometheusMetrics)
return true, finalizers
}
if deleted && contained {
finalizers := []string{}
for _, pendingFinalizer := range pendingFinalizers {
if pendingFinalizer != updatePrometheusMetrics {
finalizers = append(finalizers, pendingFinalizer)
}
}
return true, finalizers
}
return false, []string{}
}

func (r *ReconcileExperiment) updateFinalizers(instance *experimentsv1alpha3.Experiment, finalizers []string) (reconcile.Result, error) {
instance.SetFinalizers(finalizers)
if err := r.Update(context.TODO(), instance); err != nil {
return reconcile.Result{}, err
} else {
if !instance.ObjectMeta.DeletionTimestamp.IsZero() {
utilv1alpha3.IncreaseExperimentsDeletedCount()
} else {
utilv1alpha3.IncreaseExperimentsCreatedCount()
}
// Need to requeue because finalizer update does not change metadata.generation
return reconcile.Result{Requeue: true}, err
}
}
66 changes: 66 additions & 0 deletions pkg/controller.v1alpha3/experiment/util/prometheus_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package util

import (
"github.com/prometheus/client_golang/prometheus"

"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
experimentsDeletedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_experiment_deleted",
Help: "Counts number of Experiment deleted",
})
experimentsCreatedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_experiment_created",
Help: "Counts number of Experiment created",
})
experimentsSucceededCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_experiment_succeeded",
Help: "Counts number of Experiment succeeded",
})
experimentsFailedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_experiment_failed",
Help: "Counts number of Experiment failed",
})
)

func init() {
metrics.Registry.MustRegister(
experimentsDeletedCount,
experimentsCreatedCount,
experimentsSucceededCount,
experimentsFailedCount)
}

func IncreaseExperimentsDeletedCount() {
experimentsDeletedCount.Inc()
}

func IncreaseExperimentsCreatedCount() {
experimentsCreatedCount.Inc()
}

func IncreaseExperimentsSucceededCount() {
experimentsSucceededCount.Inc()
}

func IncreaseExperimentsFailedCount() {
experimentsFailedCount.Inc()
}
4 changes: 4 additions & 0 deletions pkg/controller.v1alpha3/experiment/util/status_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,27 +145,31 @@ func UpdateExperimentStatusCondition(instance *experimentsv1alpha3.Experiment, i
msg := "Experiment has succeeded because Objective goal has reached"
instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg)
instance.Status.CompletionTime = &now
IncreaseExperimentsSucceededCount()
return
}

if (instance.Spec.MaxTrialCount != nil) && (completedTrialsCount >= *instance.Spec.MaxTrialCount) {
msg := "Experiment has succeeded because max trial count has reached"
instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg)
instance.Status.CompletionTime = &now
IncreaseExperimentsSucceededCount()
return
}

if getSuggestionDone && (instance.Status.TrialsPending+instance.Status.TrialsRunning) == 0 {
msg := "Experiment has succeeded because suggestion service has reached the end"
instance.MarkExperimentStatusSucceeded(ExperimentSucceededReason, msg)
instance.Status.CompletionTime = &now
IncreaseExperimentsSucceededCount()
return
}

if (instance.Spec.MaxFailedTrialCount != nil) && (failedTrialsCount >= *instance.Spec.MaxFailedTrialCount) {
msg := "Experiment has failed because max failed count has reached"
instance.MarkExperimentStatusFailed(ExperimentFailedReason, msg)
instance.Status.CompletionTime = &now
IncreaseExperimentsFailedCount()
return
}

Expand Down
66 changes: 66 additions & 0 deletions pkg/controller.v1alpha3/trial/prometheus_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package trial

import (
"github.com/prometheus/client_golang/prometheus"

"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
trialsDeletedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_trial_deleted",
Help: "Counts number of Trial deleted",
})
trialsCreatedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_trial_created",
Help: "Counts number of Trial created",
})
trialsSucceededCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_trial_succeeded",
Help: "Counts number of Trial succeeded",
})
trialsFailedCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "katib_trial_failed",
Help: "Counts number of Trial failed",
})
)

func init() {
metrics.Registry.MustRegister(
trialsDeletedCount,
trialsCreatedCount,
trialsSucceededCount,
trialsFailedCount)
}

func IncreaseTrialsDeletedCount() {
trialsDeletedCount.Inc()
}

func IncreaseTrialsCreatedCount() {
trialsCreatedCount.Inc()
}

func IncreaseTrialsSucceededCount() {
trialsSucceededCount.Inc()
}

func IncreaseTrialsFailedCount() {
trialsFailedCount.Inc()
}
10 changes: 10 additions & 0 deletions pkg/controller.v1alpha3/trial/trial_controller_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1alpha3.Tri

eventMsg := fmt.Sprintf("Job %s has succeeded", deployedJob.GetName())
r.recorder.Eventf(instance, corev1.EventTypeNormal, JobSucceededReason, eventMsg)
IncreaseTrialsSucceededCount()
} else {
msg := "Metrics are not available"
instance.MarkTrialStatusSucceeded(corev1.ConditionFalse, TrialMetricsUnavailableReason, msg)
Expand All @@ -113,6 +114,7 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1alpha3.Tri
jobConditionMessage := (*jobCondition).Message
eventMsg := fmt.Sprintf("Job %s has failed: %s", deployedJob.GetName(), jobConditionMessage)
r.recorder.Eventf(instance, corev1.EventTypeNormal, JobFailedReason, eventMsg)
IncreaseTrialsFailedCount()
}
//else nothing to do
return
Expand Down Expand Up @@ -145,15 +147,23 @@ func (r *ReconcileTrial) UpdateTrialStatusObservation(instance *trialsv1alpha3.T
}

func (r *ReconcileTrial) updateFinalizers(instance *trialsv1alpha3.Trial, finalizers []string) (reconcile.Result, error) {
isDelete := true
if !instance.ObjectMeta.DeletionTimestamp.IsZero() {
if _, err := r.DeleteTrialObservationLog(instance); err != nil {
return reconcile.Result{}, err
}
} else {
isDelete = false
}
instance.SetFinalizers(finalizers)
if err := r.Update(context.TODO(), instance); err != nil {
return reconcile.Result{}, err
} else {
if isDelete {
IncreaseTrialsDeletedCount()
} else {
IncreaseTrialsCreatedCount()
}
// Need to requeue because finalizer update does not change metadata.generation
return reconcile.Result{Requeue: true}, err
}
Expand Down

0 comments on commit 030b691

Please sign in to comment.