Skip to content

Commit

Permalink
Merge pull request #461 from beraldoleal/sli-metrics
Browse files Browse the repository at this point in the history
metrics: add initial support for SLI/SLO metrics
  • Loading branch information
gkurz authored Oct 7, 2024
2 parents c315250 + 4f8386a commit 8877ce1
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 0 deletions.
158 changes: 158 additions & 0 deletions cmd/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package main

import (
"context"
"log"
"net/http"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)

// runtimeClassName is the name of the RuntimeClass this exporter looks up, and
// the value a pod's spec.runtimeClassName must have for the pod to be counted.
const runtimeClassName = "kata-remote"

// Gauges exported on the /metrics endpoint. All of them are refreshed by
// collectMetricsData.
var (
	// runtimeClassAvailable is 1 when the kata-remote RuntimeClass can be
	// fetched from the API server and 0 otherwise.
	runtimeClassAvailable = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "kata_remote_runtimeclass_available",
		Help: "Indicates if the " + runtimeClassName + " RuntimeClass is available (1) or not (0).",
	})

	// kataConfigInstallationSuccess is 1 when the first listed KataConfig
	// reports that it is not in progress and that its ready node count equals
	// its total node count; 0 in every other case.
	kataConfigInstallationSuccess = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "kata_config_installation_success",
		Help: "Indicates if KataConfig installation is successful (1) or not (0).",
	})

	// kataRemoteWorkloadFailureRatio is failed/total kata-remote pods scaled to
	// a percentage (0-100), despite the "ratio" in its name. The metric name is
	// kept as is so existing queries keep working.
	kataRemoteWorkloadFailureRatio = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "kata_remote_workload_failure_ratio",
		Help: "Percentage of " + runtimeClassName + " workloads that have failed.",
	})

	// totalKataRemotePods counts every pod whose runtime class is kata-remote.
	totalKataRemotePods = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "kata_total_remote_pods",
		Help: "Total number of " + runtimeClassName + " pods across all namespaces, regardless of their status.",
	})

	// failedKataRemotePods counts kata-remote pods in any phase other than
	// Running or Succeeded (so Pending and Unknown pods are included).
	failedKataRemotePods = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "kata_failed_remote_pods",
		Help: "Total number of " + runtimeClassName + " pods across all namespaces whose phase is neither Running nor Succeeded.",
	})
)

// collectMetricsData refreshes every exported gauge from the current cluster
// state.
//
// It issues up to three API requests: a Get of the kata-remote RuntimeClass, a
// cluster-wide Pod list (only when that RuntimeClass exists) and a list of
// KataConfig custom resources through the dynamic client.
//
// All values are computed first and written to the gauges in one go at the
// end, so a scrape that runs concurrently never observes gauges that were
// reset to zero but not yet updated. API failures never abort collection: the
// affected gauges are reported as 0 and unexpected errors are logged.
func collectMetricsData(clientset *kubernetes.Clientset, dynamicClient dynamic.Interface) {
	ctx := context.TODO()

	var available, failureRatio float64
	var totalPods, failedPods int

	_, err := clientset.NodeV1().RuntimeClasses().Get(ctx, runtimeClassName, metav1.GetOptions{})
	switch {
	case err == nil:
		available = 1
		totalPods, failedPods = countKataRemotePods(ctx, clientset)
		if totalPods > 0 {
			failureRatio = float64(failedPods) / float64(totalPods) * 100
		}
	case !isNotFoundError(err):
		// A missing RuntimeClass is an expected state and stays silent; any
		// other failure (RBAC, connectivity, ...) is worth surfacing.
		log.Printf("Error getting RuntimeClass %s: %v", runtimeClassName, err)
	}

	var installed float64
	if kataConfigInstalled(ctx, dynamicClient) {
		installed = 1
	}

	runtimeClassAvailable.Set(available)
	kataRemoteWorkloadFailureRatio.Set(failureRatio)
	totalKataRemotePods.Set(float64(totalPods))
	failedKataRemotePods.Set(float64(failedPods))
	kataConfigInstallationSuccess.Set(installed)
}

// countKataRemotePods lists the pods of all namespaces and returns how many use
// the kata-remote runtime class and how many of those are in a phase other
// than Running or Succeeded. A failed list is logged and reported as (0, 0).
func countKataRemotePods(ctx context.Context, clientset *kubernetes.Clientset) (int, int) {
	pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
	if err != nil {
		log.Printf("Error listing pods: %v", err)
		return 0, 0
	}

	total, failed := 0, 0
	// Index instead of ranging by value to avoid copying every Pod struct.
	for i := range pods.Items {
		pod := &pods.Items[i]
		if pod.Spec.RuntimeClassName == nil || *pod.Spec.RuntimeClassName != runtimeClassName {
			continue
		}
		total++
		switch pod.Status.Phase {
		case "Running", "Succeeded":
			// Healthy from the point of view of this metric.
		default:
			failed++
		}
	}
	return total, failed
}

// kataConfigInstalled reports whether the first listed KataConfig has finished
// installing: its status must not be in progress and its ready node count must
// equal its total node count. Absent fields keep their zero value, but a field
// of an unexpected type makes the result false (and is logged) so that a
// malformed status is never reported as a successful installation.
//
// NOTE(review): the field names read from the status are taken as given;
// confirm them against the KataConfig CRD schema.
func kataConfigInstalled(ctx context.Context, dynamicClient dynamic.Interface) bool {
	gvr := schema.GroupVersionResource{
		Group:    "kataconfiguration.openshift.io",
		Version:  "v1",
		Resource: "kataconfigs",
	}

	list, err := dynamicClient.Resource(gvr).List(ctx, metav1.ListOptions{})
	if err != nil {
		if !isNotFoundError(err) {
			log.Printf("Error listing KataConfigs: %v", err)
		}
		return false
	}
	if len(list.Items) == 0 {
		return false
	}

	// Only the first KataConfig is inspected.
	status, found, err := unstructured.NestedMap(list.Items[0].Object, "status")
	if err != nil {
		log.Printf("Error reading KataConfig status: %v", err)
		return false
	}
	if !found {
		return false
	}

	inProgress, _, err := unstructured.NestedBool(status, "inProgress")
	if err != nil {
		log.Printf("Error reading KataConfig status.inProgress: %v", err)
		return false
	}
	readyNodes, _, err := unstructured.NestedInt64(status, "readyNodeCount")
	if err != nil {
		log.Printf("Error reading KataConfig status.readyNodeCount: %v", err)
		return false
	}
	totalNodes, _, err := unstructured.NestedInt64(status, "totalNodeCount")
	if err != nil {
		log.Printf("Error reading KataConfig status.totalNodeCount: %v", err)
		return false
	}

	return !inProgress && readyNodes == totalNodes
}

// isNotFoundError reports whether err is an API status error whose reason is
// NotFound. Only the error value itself is examined; a wrapped error is not
// recognised and is therefore merely logged by the callers.
func isNotFoundError(err error) bool {
	statusErr, ok := err.(interface{ Status() metav1.Status })
	return ok && statusErr.Status().Reason == metav1.StatusReasonNotFound
}

// getKubernetesClients builds the two API clients used by the exporter from
// the in-cluster configuration: a typed clientset and a dynamic client. The
// first error encountered is returned as is, with both clients nil.
func getKubernetesClients() (*kubernetes.Clientset, dynamic.Interface, error) {
	cfg, cfgErr := rest.InClusterConfig()
	if cfgErr != nil {
		return nil, nil, cfgErr
	}

	typedClient, typedErr := kubernetes.NewForConfig(cfg)
	if typedErr != nil {
		return nil, nil, typedErr
	}

	dynClient, dynErr := dynamic.NewForConfig(cfg)
	if dynErr != nil {
		return nil, nil, dynErr
	}

	return typedClient, dynClient, nil
}

// metricsHandler returns the HTTP handler for the metrics endpoint. On every
// request it first refreshes the gauges via collectMetricsData and then lets
// the Prometheus handler write the exposition.
func metricsHandler(clientset *kubernetes.Clientset, dynamicClient dynamic.Interface) http.Handler {
	// Build the Prometheus handler once instead of on every request; it is
	// request-independent, so constructing it inside the closure was
	// repeated work on each scrape.
	exposition := promhttp.Handler()

	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		collectMetricsData(clientset, dynamicClient)
		exposition.ServeHTTP(w, r)
	})
}

// main registers the gauges with the default Prometheus registry, creates the
// Kubernetes clients and serves /metrics on port 8091. It exits fatally when
// the clients cannot be created or when the HTTP server stops.
func main() {
	gauges := []prometheus.Collector{
		runtimeClassAvailable,
		kataConfigInstallationSuccess,
		kataRemoteWorkloadFailureRatio,
		totalKataRemotePods,
		failedKataRemotePods,
	}
	prometheus.MustRegister(gauges...)

	kubeClient, dynClient, clientErr := getKubernetesClients()
	if clientErr != nil {
		log.Fatalf("Error setting up Kubernetes clients: %v", clientErr)
	}

	// The handler is attached to the default mux, which ListenAndServe uses
	// when it is given a nil handler.
	http.Handle("/metrics", metricsHandler(kubeClient, dynClient))

	log.Println("Starting OSC metrics server on port :8091")
	serveErr := http.ListenAndServe(":8091", nil)
	log.Fatal(serveErr)
}
1 change: 1 addition & 0 deletions config/default/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ resources:
- ../rbac
- ../manager
- ../kata-monitor
- ../metrics
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
# crd/kustomization.yaml
- ../webhook
Expand Down
4 changes: 4 additions & 0 deletions config/metrics/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Manifests that make up the metrics server: its Deployment, the Service in
# front of it and the ServiceMonitor describing how it is scraped.
resources:
- metrics-deployment.yaml
- metrics-service.yaml
- metrics-servicemonitor.yaml
27 changes: 27 additions & 0 deletions config/metrics/metrics-deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Deployment running a single replica of the metrics server container.
# NOTE(review): no serviceAccountName is set here, while the server reads
# RuntimeClasses, Pods in all namespaces and KataConfigs - confirm that the
# service account it runs under is granted those permissions.
apiVersion: apps/v1
kind: Deployment
metadata:
name: operator-metrics-server
namespace: openshift-sandboxed-containers-operator
labels:
app: operator-metrics-server
spec:
replicas: 1
selector:
matchLabels:
app: operator-metrics-server
template:
metadata:
labels:
# Pod label used by the Deployment selector above.
app: operator-metrics-server
spec:
containers:
- name: metrics-server
image: registry.redhat.io/openshift-sandboxed-containers/osc-monitor-rhel9:1.7.0
command: ["/metrics-server"]
ports:
# Same port the server listens on (":8091" in cmd/metrics/metrics.go).
- containerPort: 8091
# Only requests are declared; no limits are set.
resources:
requests:
memory: "64Mi"
cpu: "50m"
14 changes: 14 additions & 0 deletions config/metrics/metrics-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Service exposing the metrics server pods on TCP port 8091.
apiVersion: v1
kind: Service
metadata:
name: operator-metrics-service
namespace: openshift-sandboxed-containers-operator
labels:
app: operator-metrics-server
spec:
# Selects pods carrying the app: operator-metrics-server label.
selector:
app: operator-metrics-server
ports:
# Note: this port entry has no name.
- protocol: TCP
port: 8091
targetPort: 8091
16 changes: 16 additions & 0 deletions config/metrics/metrics-servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ServiceMonitor asking the Prometheus Operator to scrape /metrics of the
# Services labelled app: operator-metrics-server every 30s.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: operator-metrics-monitor
  namespace: openshift-sandboxed-containers-operator
  labels:
    release: prometheus-operator
spec:
  selector:
    matchLabels:
      app: operator-metrics-server
  endpoints:
    # An endpoint's `port` field selects a Service port by *name*. The Service
    # in metrics-service.yaml declares its port without a name, so
    # `port: "8091"` would never match any target. Select the pod port by
    # number instead.
    - targetPort: 8091
      path: /metrics
      interval: 30s
      scrapeTimeout: 10s

0 comments on commit 8877ce1

Please sign in to comment.