From 4f8386aab95c22d9b0f23fbf6b717d95f97b30d0 Mon Sep 17 00:00:00 2001
From: Beraldo Leal
Date: Tue, 24 Sep 2024 17:03:29 -0400
Subject: [PATCH] metrics: add initial support for SLI/SLO metrics

Introduces high-level SLI metrics that can help derive SLOs for
kata-remote.

Currently, the osc-monitor image only includes the kata-monitor binary.
The idea is to reuse the same image to also include the metrics-server.

For now, the idea is to collect metrics for kata-remote, but we can
expand later to get more metrics.

Signed-off-by: Beraldo Leal
---
 cmd/metrics/metrics.go                     | 242 +++++++++++++++++++++
 config/default/kustomization.yaml          |   1 +
 config/metrics/kustomization.yaml          |   4 +
 config/metrics/metrics-deployment.yaml     |  27 +++
 config/metrics/metrics-service.yaml        |  15 ++
 config/metrics/metrics-servicemonitor.yaml |  16 ++
 6 files changed, 305 insertions(+)
 create mode 100644 cmd/metrics/metrics.go
 create mode 100644 config/metrics/kustomization.yaml
 create mode 100644 config/metrics/metrics-deployment.yaml
 create mode 100644 config/metrics/metrics-service.yaml
 create mode 100644 config/metrics/metrics-servicemonitor.yaml

diff --git a/cmd/metrics/metrics.go b/cmd/metrics/metrics.go
new file mode 100644
index 00000000..f8465fc0
--- /dev/null
+++ b/cmd/metrics/metrics.go
@@ -0,0 +1,242 @@
+// Command metrics-server exposes high-level SLI gauges for kata-remote
+// workloads and the KataConfig installation state on a Prometheus /metrics
+// endpoint. The gauges are refreshed from the cluster on every scrape.
+package main
+
+import (
+	"context"
+	"log"
+	"net/http"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/client-go/dynamic"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+)
+
+const (
+	// runtimeClassName is the RuntimeClass whose availability and pods are tracked.
+	runtimeClassName = "kata-remote"
+
+	// listenAddr is the address the /metrics endpoint is served on.
+	listenAddr = ":8091"
+
+	// collectTimeout bounds the API calls made for a single scrape so that a
+	// slow API server cannot block the handler indefinitely.
+	collectTimeout = 8 * time.Second
+)
+
+// kataConfigGVR identifies the KataConfig custom resource read through the
+// dynamic client.
+var kataConfigGVR = schema.GroupVersionResource{
+	Group:    "kataconfiguration.openshift.io",
+	Version:  "v1",
+	Resource: "kataconfigs",
+}
+
+var (
+	runtimeClassAvailable = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "kata_remote_runtimeclass_available",
+		Help: "Indicates if the " + runtimeClassName + " RuntimeClass is available (1) or not (0).",
+	})
+
+	kataConfigInstallationSuccess = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "kata_config_installation_success",
+		Help: "Indicates if KataConfig installation is successful (1) or not (0).",
+	})
+
+	// Despite the "ratio" in its name, this gauge holds a percentage (0-100).
+	kataRemoteWorkloadFailureRatio = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "kata_remote_workload_failure_ratio",
+		Help: "Percentage of " + runtimeClassName + " workloads that have failed.",
+	})
+
+	totalKataRemotePods = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "kata_total_remote_pods",
+		Help: "Total number of " + runtimeClassName + " pods across all namespaces, regardless of their status.",
+	})
+
+	failedKataRemotePods = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "kata_failed_remote_pods",
+		Help: "Total number of " + runtimeClassName + " pods across all namespaces whose phase is neither Running nor Succeeded.",
+	})
+)
+
+// countKataRemotePods lists the pods of every namespace and returns how many
+// use the kata-remote RuntimeClass and how many of those are in a phase other
+// than Running or Succeeded (Pending and Unknown therefore count as failed).
+func countKataRemotePods(ctx context.Context, clientset *kubernetes.Clientset) (total, failed int, err error) {
+	pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return 0, 0, err
+	}
+
+	// Index instead of ranging by value to avoid copying each Pod struct.
+	for i := range pods.Items {
+		pod := &pods.Items[i]
+		if pod.Spec.RuntimeClassName == nil || *pod.Spec.RuntimeClassName != runtimeClassName {
+			continue
+		}
+		total++
+		if pod.Status.Phase != "Running" && pod.Status.Phase != "Succeeded" {
+			failed++
+		}
+	}
+	return total, failed, nil
+}
+
+// kataConfigReady reports whether a KataConfig object's status describes a
+// finished installation: not in progress and every node counted as ready.
+// A status that lacks the node counters is treated as not ready, so an empty
+// or not-yet-populated status is never reported as a success.
+//
+// NOTE(review): the field names below are read as-is from .status; confirm
+// them against the KataConfig CRD served by the cluster.
+func kataConfigReady(object map[string]interface{}) bool {
+	status, found, err := unstructured.NestedMap(object, "status")
+	if err != nil || !found {
+		return false
+	}
+
+	// A missing inProgress flag reads as false; only a wrongly typed value is
+	// treated as a failure.
+	inProgress, _, err := unstructured.NestedBool(status, "inProgress")
+	if err != nil || inProgress {
+		return false
+	}
+
+	ready, found, err := unstructured.NestedInt64(status, "readyNodeCount")
+	if err != nil || !found {
+		return false
+	}
+	total, found, err := unstructured.NestedInt64(status, "totalNodeCount")
+	if err != nil || !found {
+		return false
+	}
+	return ready == total
+}
+
+// kataConfigInstalled reports whether the first KataConfig found in the
+// cluster is fully installed. A list error or the absence of any KataConfig
+// yields false.
+func kataConfigInstalled(ctx context.Context, dynamicClient dynamic.Interface) bool {
+	kataConfigs, err := dynamicClient.Resource(kataConfigGVR).List(ctx, metav1.ListOptions{})
+	if err != nil || len(kataConfigs.Items) == 0 {
+		return false
+	}
+	return kataConfigReady(kataConfigs.Items[0].Object)
+}
+
+// boolToGauge maps a boolean onto the 1/0 convention used by the gauges.
+func boolToGauge(ok bool) float64 {
+	if ok {
+		return 1
+	}
+	return 0
+}
+
+// collectMetricsData refreshes every gauge from the current cluster state.
+//
+// Values are computed first and each gauge is written exactly once, so a
+// concurrent scrape never observes the transient zeros of a reset-then-set
+// sequence. Pod metrics are only gathered when the kata-remote RuntimeClass
+// exists; otherwise, or when listing pods fails, they are reported as 0.
+func collectMetricsData(clientset *kubernetes.Clientset, dynamicClient dynamic.Interface) {
+	ctx, cancel := context.WithTimeout(context.Background(), collectTimeout)
+	defer cancel()
+
+	var total, failed int
+
+	// Any error, including NotFound, is reported as "not available".
+	_, err := clientset.NodeV1().RuntimeClasses().Get(ctx, runtimeClassName, metav1.GetOptions{})
+	available := err == nil
+	if available {
+		total, failed, err = countKataRemotePods(ctx, clientset)
+		if err != nil {
+			log.Printf("Error listing pods: %v", err)
+		}
+	}
+
+	var failurePercent float64
+	if total > 0 {
+		failurePercent = float64(failed) / float64(total) * 100
+	}
+
+	runtimeClassAvailable.Set(boolToGauge(available))
+	totalKataRemotePods.Set(float64(total))
+	failedKataRemotePods.Set(float64(failed))
+	kataRemoteWorkloadFailureRatio.Set(failurePercent)
+	kataConfigInstallationSuccess.Set(boolToGauge(kataConfigInstalled(ctx, dynamicClient)))
+}
+
+// getKubernetesClients builds the typed and the dynamic Kubernetes client from
+// the in-cluster configuration. It returns the first error encountered.
+func getKubernetesClients() (*kubernetes.Clientset, dynamic.Interface, error) {
+	config, err := rest.InClusterConfig()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	clientset, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	dynamicClient, err := dynamic.NewForConfig(config)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return clientset, dynamicClient, nil
+}
+
+// metricsHandler returns a handler that refreshes the gauges and then serves
+// the default Prometheus registry. The exposition handler is built once rather
+// than on every request.
+func metricsHandler(clientset *kubernetes.Clientset, dynamicClient dynamic.Interface) http.Handler {
+	exposition := promhttp.Handler()
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		collectMetricsData(clientset, dynamicClient)
+		exposition.ServeHTTP(w, r)
+	})
+}
+
+// main registers the gauges, builds the Kubernetes clients and serves /metrics
+// until the HTTP server fails.
+func main() {
+	prometheus.MustRegister(
+		runtimeClassAvailable,
+		kataConfigInstallationSuccess,
+		kataRemoteWorkloadFailureRatio,
+		totalKataRemotePods,
+		failedKataRemotePods,
+	)
+
+	clientset, dynamicClient, err := getKubernetesClients()
+	if err != nil {
+		log.Fatalf("Error setting up Kubernetes clients: %v", err)
+	}
+
+	// A dedicated mux keeps handlers registered on http.DefaultServeMux by
+	// other packages off this port.
+	mux := http.NewServeMux()
+	mux.Handle("/metrics", metricsHandler(clientset, dynamicClient))
+
+	// Timeouts keep slow or stalled clients from pinning connections. The
+	// write timeout leaves room for the collection done inside the handler.
+	server := &http.Server{
+		Addr:              listenAddr,
+		Handler:           mux,
+		ReadHeaderTimeout: 5 * time.Second,
+		ReadTimeout:       10 * time.Second,
+		WriteTimeout:      2 * collectTimeout,
+	}
+
+	log.Println("Starting OSC metrics server on port " + listenAddr)
+	log.Fatal(server.ListenAndServe())
+}
diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml
index fc05f5b0..6396d2fa 100644
--- a/config/default/kustomization.yaml
+++ b/config/default/kustomization.yaml
@@ -18,6 +18,7 @@ resources:
 - ../rbac
 - ../manager
 - ../kata-monitor
+- ../metrics
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
 # crd/kustomization.yaml
 - ../webhook
diff --git a/config/metrics/kustomization.yaml b/config/metrics/kustomization.yaml
new file mode 100644
index 00000000..186bad51
--- /dev/null
+++ b/config/metrics/kustomization.yaml
@@ -0,0 +1,4 @@
+resources:
+  - metrics-deployment.yaml
+  - metrics-service.yaml
+  - metrics-servicemonitor.yaml
diff --git a/config/metrics/metrics-deployment.yaml b/config/metrics/metrics-deployment.yaml
new file mode 100644
index 00000000..6fb8ec4e
--- /dev/null
+++ b/config/metrics/metrics-deployment.yaml
@@ -0,0 +1,27 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: operator-metrics-server
+  namespace: openshift-sandboxed-containers-operator
+  labels:
+    app: operator-metrics-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: operator-metrics-server
+  template:
+    metadata:
+      labels:
+        app: operator-metrics-server
+    spec:
+      containers:
+      - name: metrics-server
+        image: registry.redhat.io/openshift-sandboxed-containers/osc-monitor-rhel9:1.7.0
+        command: ["/metrics-server"]
+        ports:
+        - containerPort: 8091
+        resources:
+          requests:
+            memory: "64Mi"
+            cpu: "50m"
diff --git a/config/metrics/metrics-service.yaml b/config/metrics/metrics-service.yaml
new file mode 100644
index 00000000..ad0321c2
--- /dev/null
+++ b/config/metrics/metrics-service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: operator-metrics-service
+  namespace: openshift-sandboxed-containers-operator
+  labels:
+    app: operator-metrics-server
+spec:
+  selector:
+    app: operator-metrics-server
+  ports:
+    - name: metrics
+      protocol: TCP
+      port: 8091
+      targetPort: 8091
diff --git a/config/metrics/metrics-servicemonitor.yaml b/config/metrics/metrics-servicemonitor.yaml
new file mode 100644
index 00000000..01b8ac5d
--- /dev/null
+++ b/config/metrics/metrics-servicemonitor.yaml
@@ -0,0 +1,16 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: operator-metrics-monitor
+  namespace: openshift-sandboxed-containers-operator
+  labels:
+    release: prometheus-operator
+spec:
+  selector:
+    matchLabels:
+      app: operator-metrics-server
+  endpoints:
+    - port: metrics
+      path: /metrics
+      interval: 30s
+      scrapeTimeout: 10s