From 01bda0a8b148ce2f42750bcbad7e31ea67af18b8 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Wed, 3 Apr 2024 17:40:30 +0200 Subject: [PATCH 01/13] NETOBSERV-1343: generate dashboards from metrics API - Add dashboard config to metrics API - Use this API internally for predefined dashboards - Allow using SingleStats - New dedicated buckets for latency histograms --- apis/flowmetrics/v1alpha1/flowmetric_types.go | 37 +++ controllers/flp/flp_pipeline_builder.go | 35 +-- .../monitoring/monitoring_controller.go | 30 +- controllers/monitoring/monitoring_objects.go | 8 +- pkg/dashboards/dashboard.go | 294 +++++------------- pkg/dashboards/dashboard_test.go | 118 +++---- pkg/dashboards/health.go | 93 +++--- pkg/dashboards/model.go | 110 +++---- pkg/dashboards/scopes.go | 68 ---- pkg/metrics/helper.go | 30 ++ pkg/metrics/predefined_charts.go | 285 +++++++++++++++++ pkg/metrics/predefined_metrics.go | 136 ++++---- pkg/metrics/predefined_metrics_test.go | 18 +- 13 files changed, 717 insertions(+), 545 deletions(-) delete mode 100644 pkg/dashboards/scopes.go create mode 100644 pkg/metrics/helper.go create mode 100644 pkg/metrics/predefined_charts.go diff --git a/apis/flowmetrics/v1alpha1/flowmetric_types.go b/apis/flowmetrics/v1alpha1/flowmetric_types.go index 9206e89ad..f50f05153 100644 --- a/apis/flowmetrics/v1alpha1/flowmetric_types.go +++ b/apis/flowmetrics/v1alpha1/flowmetric_types.go @@ -109,6 +109,43 @@ type FlowMetricSpec struct { // A list of buckets to use when `type` is "Histogram". The list must be parseable as floats. Prometheus default buckets will be used if unset. // +optional Buckets []string `json:"buckets,omitempty"` + + // When non-zero, scale factor (divider) of the value. Metric value = Flow value / Divider. 
+ // +optional + Divider float64 `json:"divider"` + + // Charts configuration + // +optional + Charts []Chart `json:"charts,omitempty"` +} + +type Unit string +type ChartType string + +const ( + UnitBytes Unit = "bytes" + UnitSeconds Unit = "seconds" + UnitBPS Unit = "Bps" + UnitPPS Unit = "pps" + ChartTypeSingleStat ChartType = "SingleStat" + ChartTypeLine ChartType = "Line" + ChartTypeStackArea ChartType = "StackArea" +) + +// Configures charts / dashboard generation associated to a metric +type Chart struct { + DashboardName string `json:"dashboardName"` + SectionName string `json:"sectionName"` + Title string `json:"title"` + Unit Unit `json:"unit"` + Type ChartType `json:"type"` + Queries []Query `json:"queries"` +} + +// Configures PromQL queries +type Query struct { + PromQL string `json:"promQL"` + Legend string `json:"legend"` } // FlowMetricStatus defines the observed state of FlowMetric diff --git a/controllers/flp/flp_pipeline_builder.go b/controllers/flp/flp_pipeline_builder.go index 193ba0d0d..2e0941384 100644 --- a/controllers/flp/flp_pipeline_builder.go +++ b/controllers/flp/flp_pipeline_builder.go @@ -199,23 +199,23 @@ func (b *PipelineBuilder) AddProcessorStages() error { } // obtain encode_prometheus stage from metrics_definitions - names := metrics.GetIncludeList(b.desired) - promMetrics := metrics.GetDefinitions(names) + allMetrics := metrics.MergePredefined(b.flowMetrics.Items, b.desired) - for i := range b.flowMetrics.Items { - fm := &b.flowMetrics.Items[i] + var flpMetrics []api.MetricsItem + for i := range allMetrics { + fm := &allMetrics[i] m, err := flowMetricToFLP(&fm.Spec) if err != nil { return fmt.Errorf("error reading FlowMetric definition '%s': %w", fm.Name, err) } - promMetrics = append(promMetrics, *m) + flpMetrics = append(flpMetrics, *m) } - if len(promMetrics) > 0 { + if len(flpMetrics) > 0 { // prometheus stage (encode) configuration promEncode := api.PromEncode{ Prefix: "netobserv_", - Metrics: promMetrics, + Metrics: 
flpMetrics, } enrichedStage.EncodePrometheus("prometheus", promEncode) } @@ -226,23 +226,16 @@ func (b *PipelineBuilder) AddProcessorStages() error { func flowMetricToFLP(flowMetric *metricslatest.FlowMetricSpec) (*api.MetricsItem, error) { m := &api.MetricsItem{ - Name: flowMetric.MetricName, - Type: api.MetricEncodeOperationEnum(strings.ToLower(string(flowMetric.Type))), - Filters: []api.MetricsFilter{}, - Labels: flowMetric.Labels, - ValueKey: flowMetric.ValueField, + Name: flowMetric.MetricName, + Type: api.MetricEncodeOperationEnum(strings.ToLower(string(flowMetric.Type))), + Filters: []api.MetricsFilter{}, + Labels: flowMetric.Labels, + ValueKey: flowMetric.ValueField, + ValueScale: flowMetric.Divider, } - for _, f := range flowMetric.Filters { + for _, f := range metrics.GetFilters(flowMetric) { m.Filters = append(m.Filters, api.MetricsFilter{Key: f.Field, Value: f.Value, Type: api.MetricFilterEnum(conversion.PascalToLower(string(f.MatchType), '_'))}) } - if !flowMetric.IncludeDuplicates { - m.Filters = append(m.Filters, api.MetricsFilter{Key: "Duplicate", Value: "true", Type: api.MetricFilterNotEqual}) - } - if flowMetric.Direction == metricslatest.Egress { - m.Filters = append(m.Filters, api.MetricsFilter{Key: "FlowDirection", Value: "1|2", Type: api.MetricFilterRegex}) - } else if flowMetric.Direction == metricslatest.Ingress { - m.Filters = append(m.Filters, api.MetricsFilter{Key: "FlowDirection", Value: "0|2", Type: api.MetricFilterRegex}) - } for _, b := range flowMetric.Buckets { f, err := strconv.ParseFloat(b, 64) if err != nil { diff --git a/controllers/monitoring/monitoring_controller.go b/controllers/monitoring/monitoring_controller.go index 1f3b6cf19..d7e80c51f 100644 --- a/controllers/monitoring/monitoring_controller.go +++ b/controllers/monitoring/monitoring_controller.go @@ -9,9 +9,13 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + 
"sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" flowslatest "github.com/netobserv/network-observability-operator/apis/flowcollector/v1beta2" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" + "github.com/netobserv/network-observability-operator/controllers/constants" "github.com/netobserv/network-observability-operator/controllers/reconcilers" "github.com/netobserv/network-observability-operator/pkg/helper" "github.com/netobserv/network-observability-operator/pkg/manager" @@ -21,8 +25,9 @@ import ( type Reconciler struct { client.Client - mgr *manager.Manager - status status.Instance + mgr *manager.Manager + status status.Instance + currentNamespace string } func Start(ctx context.Context, mgr *manager.Manager) error { @@ -37,6 +42,15 @@ func Start(ctx context.Context, mgr *manager.Manager) error { For(&flowslatest.FlowCollector{}, reconcilers.IgnoreStatusChange). Named("monitoring"). Owns(&corev1.Namespace{}). + Watches( + &metricslatest.FlowMetric{}, + handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []reconcile.Request { + if o.GetNamespace() == r.currentNamespace { + return []reconcile.Request{{NamespacedName: constants.FlowCollectorName}} + } + return []reconcile.Request{} + }), + ). 
Complete(&r) } @@ -74,6 +88,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired *flowslatest.FlowCollector) error { ns := helper.GetNamespace(&desired.Spec) + r.currentNamespace = ns // If namespace does not exist, we create it nsExist, err := r.namespaceExist(ctx, ns) @@ -104,8 +119,14 @@ func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired } if r.mgr.HasSvcMonitor() { - names := metrics.GetIncludeList(&desired.Spec) - desiredFlowDashboardCM, del, err := buildFlowMetricsDashboard(ns, names) + // List custom metrics + fm := metricslatest.FlowMetricList{} + if err := r.Client.List(ctx, &fm, &client.ListOptions{Namespace: ns}); err != nil { + return r.status.Error("CantListFlowMetrics", err) + } + + allMetrics := metrics.MergePredefined(fm.Items, &desired.Spec) + desiredFlowDashboardCM, del, err := buildFlowMetricsDashboard(allMetrics) if err != nil { return err } else if err = reconcilers.ReconcileConfigMap(ctx, clh, desiredFlowDashboardCM, del); err != nil { @@ -119,6 +140,7 @@ func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired return err } } + return nil } diff --git a/controllers/monitoring/monitoring_objects.go b/controllers/monitoring/monitoring_objects.go index 08b35284c..3b77b191e 100644 --- a/controllers/monitoring/monitoring_objects.go +++ b/controllers/monitoring/monitoring_objects.go @@ -1,6 +1,7 @@ package monitoring import ( + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" "github.com/netobserv/network-observability-operator/controllers/constants" "github.com/netobserv/network-observability-operator/pkg/dashboards" corev1 "k8s.io/api/core/v1" @@ -74,11 +75,8 @@ func buildRoleBindingMonitoringReader(ns string) *rbacv1.ClusterRoleBinding { } } -func buildFlowMetricsDashboard(namespace string, metrics []string) (*corev1.ConfigMap, bool, error) 
{ - dashboard, err := dashboards.CreateFlowMetricsDashboard(namespace, metrics) - if err != nil { - return nil, false, err - } +func buildFlowMetricsDashboard(metrics []metricslatest.FlowMetric) (*corev1.ConfigMap, bool, error) { + dashboard := dashboards.CreateFlowMetricsDashboards(metrics) configMap := corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/dashboards/dashboard.go b/pkg/dashboards/dashboard.go index d938c8a36..59c2fd528 100644 --- a/pkg/dashboards/dashboard.go +++ b/pkg/dashboards/dashboard.go @@ -4,246 +4,102 @@ import ( "fmt" "strings" - "k8s.io/utils/strings/slices" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" ) -const ( - layerApps = "Applications" - layerInfra = "Infrastructure" - appsFilters1 = `SrcK8S_Namespace!~"|$NETOBSERV_NS|openshift.*"` - appsFilters2 = `SrcK8S_Namespace=~"$NETOBSERV_NS|openshift.*",DstK8S_Namespace!~"|$NETOBSERV_NS|openshift.*"` - infraFilters1 = `SrcK8S_Namespace=~"$NETOBSERV_NS|openshift.*"` - infraFilters2 = `SrcK8S_Namespace!~"$NETOBSERV_NS|openshift.*",DstK8S_Namespace=~"$NETOBSERV_NS|openshift.*"` - metricTagIngress = "ingress" - metricTagEgress = "egress" - metricTagBytes = "bytes" - metricTagPackets = "packets" -) - -var allRows []*Row +type chart struct { + metricslatest.Chart + mptr *metricslatest.FlowMetric +} -func init() { - for _, scope := range []metricScope{srcDstNodeScope, srcDstNamespaceScope, srcDstWorkloadScope} { - // byte/pkt rates - for _, valueType := range []string{metricTagBytes, metricTagPackets} { - valueTypeText := valueTypeToText(valueType) - for _, dir := range []string{metricTagEgress, metricTagIngress} { - title := fmt.Sprintf( - "%s rate %s %s", - valueTypeText, - dirToVerb(dir), - scope.titlePart, - ) - metric := fmt.Sprintf("%s_%s_%s_total", scope.metricPart, dir, valueType) - allRows = append(allRows, row( - metric, - title, - topRatePanels(&scope, metric, scope.joinLabels(), scope.legendPart), - )) - } - // drops - 
title := fmt.Sprintf( - "%s drop rate %s", - valueTypeText, - scope.titlePart, - ) - metric := fmt.Sprintf("%s_drop_%s_total", scope.metricPart, valueType) - allRows = append(allRows, row( - metric, - title, - topRatePanels(&scope, metric, scope.joinLabels(), scope.legendPart), - )) +func createSingleStatPanels(c *chart) []Panel { + var panels []Panel + for _, q := range c.Queries { + title := c.Title + if q.Legend != "" { + title += ", " + q.Legend } - // RTT - title := fmt.Sprintf("Round-trip time %s (milliseconds - p99 and p50)", scope.titlePart) - metric := fmt.Sprintf("%s_rtt_seconds", scope.metricPart) - allRows = append(allRows, row( - metric, - title, - histogramPanels(&scope, metric, scope.joinLabels(), scope.legendPart, "*1000"), - )) - // DNS latency - title = fmt.Sprintf("DNS latency %s (milliseconds - p99 and p50)", scope.titlePart) - metric = fmt.Sprintf("%s_dns_latency_seconds", scope.metricPart) - allRows = append(allRows, row( - metric, - title, - histogramPanels(&scope, metric, scope.joinLabels(), scope.legendPart, "*1000"), - )) - // DNS errors - title = fmt.Sprintf("DNS request rate per code and %s", scope.titlePart) - metric = fmt.Sprintf("%s_dns_latency_seconds", scope.metricPart) - labels := scope.joinLabels() + ",DnsFlagsResponseCode" - legend := scope.legendPart + ", {{DnsFlagsResponseCode}}" - allRows = append(allRows, row( - metric, - title, - topRatePanels(&scope, metric+"_count", labels, legend), - )) + query := strings.ReplaceAll(q.PromQL, "$METRIC", "netobserv_"+c.mptr.Spec.MetricName) + newPanel := NewPanel(title, metricslatest.ChartTypeSingleStat, c.Unit, 3, NewTarget(query, "")) + panels = append(panels, newPanel) } + return panels } -func row(metrics string, title string, panels []Panel) *Row { - r := NewRow(title, false, "250px", panels) - r.Metric = metrics - return r +func createGraphPanel(c *chart) Panel { + var targets []Target + for _, q := range c.Queries { + query := strings.ReplaceAll(q.PromQL, "$METRIC", 
"netobserv_"+c.mptr.Spec.MetricName) + query = fmt.Sprintf("topk(7, %s)", query) + targets = append(targets, NewTarget(query, q.Legend)) + } + return NewPanel(c.Title, c.Type, c.Unit, 4, targets...) } -func topRatePanels(scope *metricScope, metric, labels, legend string) []Panel { - if scope.splitAppInfra { - return []Panel{ - // App - NewPanel( - layerApps, PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - NewTarget( - scope.labelReplace( - fmt.Sprintf( - "topk(10,sum(rate(netobserv_%s{%s}[2m]) or rate(netobserv_%s{%s}[2m])) by (%s))", - metric, - appsFilters1, - metric, - appsFilters2, - labels, - ), - ), - legend, - ), - }, - ), - // Infra - NewPanel( - layerInfra, PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - NewTarget( - scope.labelReplace( - fmt.Sprintf( - "topk(10,sum(rate(netobserv_%s{%s}[2m]) or rate(netobserv_%s{%s}[2m])) by (%s))", - metric, - infraFilters1, - metric, - infraFilters2, - labels, - ), - ), - legend, - ), - }, - ), +func rearrangeRows(rows []*Row, mapTopPanels, mapBodyPanels map[string][]Panel) { + for i, row := range rows { + topPanels := mapTopPanels[row.Title] + bodyPanels := mapBodyPanels[row.Title] + // Most of the time, panels are correctly arranged within a section. + // Excepted when there are 4 panels (or 3*rows+1), it shows 3 on first row then 1 on the second row + // We'll change that to 2 + 2 + count := len(bodyPanels) + if count > 3 && count%3 == 1 { + // Set Span=6 (half page) for the two last panels + bodyPanels[count-1].Span = 6 + bodyPanels[count-2].Span = 6 } + rows[i].Panels = topPanels + rows[i].Panels = append(rows[i].Panels, bodyPanels...) 
} - // No split - return []Panel{NewPanel( - "", PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - NewTarget( - scope.labelReplace( - fmt.Sprintf("topk(10,sum(rate(netobserv_%s[2m])) by (%s))", metric, labels), - ), - legend, - ), - }, - )} } -func histogramPanels(scope *metricScope, metric, labels, legend, scaler string) []Panel { - if scope.splitAppInfra { - appRateExpr := fmt.Sprintf( - "rate(netobserv_%s_bucket{%s}[2m]) or rate(netobserv_%s_bucket{%s}[2m])", - metric, - appsFilters1, - metric, - appsFilters2, - ) - infraRateExpr := fmt.Sprintf( - "rate(netobserv_%s_bucket{%s}[2m]) or rate(netobserv_%s_bucket{%s}[2m])", - metric, - infraFilters1, - metric, - infraFilters2, - ) - return []Panel{ - // App - NewPanel( - layerApps, PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - histogramTarget(scope, "0.99", appRateExpr, labels, legend, scaler), - histogramTarget(scope, "0.50", appRateExpr, labels, legend, scaler), - }, - ), - // Infra - NewPanel( - layerInfra, PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - histogramTarget(scope, "0.99", infraRateExpr, labels, legend, scaler), - histogramTarget(scope, "0.50", infraRateExpr, labels, legend, scaler), - }, - ), +func createFlowMetricsDashboard(dashboardName string, charts []chart) string { + mapRows := make(map[string]*Row) + mapTopPanels := make(map[string][]Panel) + mapBodyPanels := make(map[string][]Panel) + var orderedRows []*Row + chartsDedupMap := make(map[string]any) + for i := range charts { + chart := charts[i] + // A chart might be provided by several metrics, e.g. 
Total ingress bps can be provided by node_ingress_bytes_total and namespace_ingress_bytes_total + // Dedup them, assuming they have the same title+unit + dedupKey := chart.Title + "/" + string(chart.Unit) + if _, exists := chartsDedupMap[dedupKey]; exists { + continue } - } - // No split - rateExpr := fmt.Sprintf("rate(netobserv_%s_bucket[2m])", metric) - return []Panel{ - NewPanel( - "", PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - histogramTarget(scope, "0.99", rateExpr, labels, legend, scaler), - histogramTarget(scope, "0.50", rateExpr, labels, legend, scaler), - }, - ), - } -} + chartsDedupMap[dedupKey] = true -func histogramTarget(scope *metricScope, quantile, rateExpr, labels, legend, scaler string) Target { - return NewTarget( - scope.labelReplace( - fmt.Sprintf( - "topk(10,histogram_quantile(%s, sum(%s) by (le,%s))%s > 0)", - quantile, - rateExpr, - labels, - scaler, - ), - ), - legend+", q="+quantile, - ) -} + if chart.Type == metricslatest.ChartTypeSingleStat { + mapTopPanels[chart.SectionName] = append(mapTopPanels[chart.SectionName], createSingleStatPanels(&chart)...) 
+ } else { + mapBodyPanels[chart.SectionName] = append(mapBodyPanels[chart.SectionName], createGraphPanel(&chart)) + } -func dirToVerb(dir string) string { - switch dir { - case metricTagEgress: - return "sent" - case metricTagIngress: - return "received" + if _, exists := mapRows[chart.SectionName]; !exists { + row := NewRow(chart.SectionName, false, "250px", nil) + mapRows[chart.SectionName] = row + orderedRows = append(orderedRows, row) + } } - return "" -} -func valueTypeToText(t string) string { - switch t { - case metricTagBytes: - return "Byte" - case metricTagPackets: - return "Packet" - } - return "" + rearrangeRows(orderedRows, mapTopPanels, mapBodyPanels) + d := Dashboard{Rows: orderedRows, Title: dashboardName} + return d.ToGrafanaJSON() } -func CreateFlowMetricsDashboard(netobsNs string, metrics []string) (string, error) { - var rows []*Row - for _, ri := range allRows { - if slices.Contains(metrics, ri.Metric) { - rows = append(rows, ri) - } else if strings.Contains(ri.Metric, "namespace_") { - // namespace-based panels can also be displayed using workload-based metrics - // Try again, replacing *_namespace_* with *_workload_* - equivalentMetric := strings.Replace(ri.Metric, "namespace_", "workload_", 1) - if slices.Contains(metrics, equivalentMetric) { - clone := ri.replaceMetric(equivalentMetric) - rows = append(rows, clone) +func CreateFlowMetricsDashboards(metrics []metricslatest.FlowMetric) string { + chartsPerDashboard := make(map[string][]chart) + for i := range metrics { + metric := &metrics[i] + for j := range metric.Spec.Charts { + c := chart{ + Chart: metric.Spec.Charts[j], + mptr: metric, } + chartsPerDashboard[c.DashboardName] = append(chartsPerDashboard[c.DashboardName], c) } } - d := Dashboard{Rows: rows, Title: "NetObserv"} - return d.ToGrafanaJSON(netobsNs), nil + // TODO: handle more dashboards + return createFlowMetricsDashboard("NetObserv", chartsPerDashboard["NetObserv"]) } diff --git a/pkg/dashboards/dashboard_test.go 
b/pkg/dashboards/dashboard_test.go index d0bbcf876..a260e3375 100644 --- a/pkg/dashboards/dashboard_test.go +++ b/pkg/dashboards/dashboard_test.go @@ -10,74 +10,80 @@ import ( func TestCreateFlowMetricsDashboard_All(t *testing.T) { assert := assert.New(t) - js, err := CreateFlowMetricsDashboard("netobserv", metrics.GetAllNames()) - assert.NoError(err) + defs := metrics.GetDefinitions(metrics.GetAllNames()) + js := CreateFlowMetricsDashboards(defs) d, err := FromBytes([]byte(js)) assert.NoError(err) assert.Equal("NetObserv", d.Title) - assert.Len(d.Rows, 27) - row := d.FindRow("Byte rate sent per node") - assert.NotNil(row) - assert.Len(row.Panels, 1) - assert.Equal("", row.Panels[0].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(10,sum(rate(netobserv_node_egress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") + assert.Equal([]string{"Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) - row = d.FindRow("DNS latency per node") - assert.NotNil(row) - assert.Len(row.Panels, 1) - assert.Equal("", row.Panels[0].Title) - assert.Len(row.Panels[0].Targets, 2) - assert.Contains(row.Panels[0].Targets[0].Expr, "histogram_quantile(0.99, sum(rate(netobserv_node_dns_latency_seconds_bucket[2m])) by (le,SrcK8S_HostName,DstK8S_HostName))") + assert.Len(d.Rows[0].Panels, 32) - row = d.FindRow("Byte rate received per namespace") - assert.NotNil(row) - assert.Len(row.Panels, 2) - assert.Equal("Applications", row.Panels[0].Title) - assert.Equal("Infrastructure", row.Panels[1].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by 
(SrcK8S_Namespace,DstK8S_Namespace))`, + p := d.FindPanel("Top egress traffic per node") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal("topk(7, sum(rate(netobserv_node_egress_bytes_total{}[2m])) by (SrcK8S_HostName,DstK8S_HostName))", p.Targets[0].Expr) + + p = d.FindPanel("Top P50 DNS latency per node (ms)") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal("topk(7, histogram_quantile(0.5, sum(rate(netobserv_node_dns_latency_seconds_bucket{}[2m])) by (le,SrcK8S_HostName,DstK8S_HostName))*1000 > 0)", p.Targets[0].Expr) + + p = d.FindPanel("Top P99 DNS latency per node (ms)") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal("topk(7, histogram_quantile(0.99, sum(rate(netobserv_node_dns_latency_seconds_bucket{}[2m])) by (le,SrcK8S_HostName,DstK8S_HostName))*1000 > 0)", p.Targets[0].Expr) + + p = d.FindPanel("Top ingress traffic per app namespace") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, (sum(rate(netobserv_namespace_ingress_bytes_total{K8S_FlowLayer="app",SrcK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`+ + ` or (sum(rate(netobserv_namespace_ingress_bytes_total{K8S_FlowLayer="app",DstK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace)))`, + p.Targets[0].Expr, ) - assert.Contains(row.Panels[1].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, + p = d.FindPanel("Top ingress traffic per infra namespace") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, (sum(rate(netobserv_namespace_ingress_bytes_total{K8S_FlowLayer="infra",SrcK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`+ + ` or 
(sum(rate(netobserv_namespace_ingress_bytes_total{K8S_FlowLayer="infra",DstK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace)))`, + p.Targets[0].Expr, ) - row = d.FindRow("Round-trip time per namespace") - assert.NotNil(row) - assert.Len(row.Panels, 2) - assert.Equal("Applications", row.Panels[0].Title) - assert.Equal("Infrastructure", row.Panels[1].Title) - assert.Len(row.Panels[0].Targets, 2) - assert.Contains(row.Panels[0].Targets[0].Expr, - `histogram_quantile(0.99, sum(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (le,SrcK8S_Namespace,DstK8S_Namespace))`, - ) - assert.Contains(row.Panels[1].Targets[1].Expr, - `histogram_quantile(0.50, sum(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (le,SrcK8S_Namespace,DstK8S_Namespace))`, + p = d.FindPanel("Top P50 sRTT per infra namespace (ms)") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, (histogram_quantile(0.5, sum(rate(netobserv_namespace_rtt_seconds_bucket{K8S_FlowLayer="infra",SrcK8S_Namespace!=""}[2m])) by (le,SrcK8S_Namespace,DstK8S_Namespace))*1000 > 0)`+ + ` or (histogram_quantile(0.5, sum(rate(netobserv_namespace_rtt_seconds_bucket{K8S_FlowLayer="infra",DstK8S_Namespace!=""}[2m])) by (le,SrcK8S_Namespace,DstK8S_Namespace))*1000 > 0))`, + p.Targets[0].Expr, ) - row = d.FindRow("Packet rate received per workload") - assert.NotNil(row) - assert.Len(row.Panels, 2) - assert.Equal("Applications", row.Panels[0].Title) - assert.Equal("Infrastructure", row.Panels[1].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, - 
`label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_packets_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_packets_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + p = d.FindPanel("Top ingress traffic per app workload") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="app"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + p.Targets[0].Expr, ) - assert.Contains(row.Panels[1].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_packets_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_packets_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + p = d.FindPanel("Top ingress traffic per infra workload") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="infra"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + p.Targets[0].Expr, ) } func TestCreateFlowMetricsDashboard_OnlyNodeIngressBytes(t *testing.T) { assert := assert.New(t) - js, err := CreateFlowMetricsDashboard("netobserv", []string{"node_ingress_bytes_total"}) - assert.NoError(err) + defs := metrics.GetDefinitions([]string{"node_ingress_bytes_total"}) + js := CreateFlowMetricsDashboards(defs) d, err := FromBytes([]byte(js)) assert.NoError(err) @@ -90,14 +96,14 @@ func TestCreateFlowMetricsDashboard_OnlyNodeIngressBytes(t *testing.T) { assert.Len(row.Panels, 1) assert.Equal("", row.Panels[0].Title) assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, 
"label_replace(label_replace(topk(10,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") + assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(7,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") } func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert := assert.New(t) - js, err := CreateFlowMetricsDashboard("netobserv", metrics.DefaultIncludeList) - assert.NoError(err) + defs := metrics.GetDefinitions(metrics.DefaultIncludeList) + js := CreateFlowMetricsDashboards(defs) d, err := FromBytes([]byte(js)) assert.NoError(err) @@ -110,7 +116,7 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert.Len(row.Panels, 1) assert.Equal("", row.Panels[0].Title) assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(10,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") + assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(7,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") row = d.FindRow("Byte rate received per namespace") assert.NotNil(row) @@ -120,10 +126,10 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert.Len(row.Panels[0].Targets, 1) // Make sure netobserv_namespace_ingress_bytes_total was replaced with netobserv_workload_ingress_bytes_total assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, + `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or 
rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, ) assert.Contains(row.Panels[1].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, + `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, ) row = d.FindRow("Byte rate received per workload") @@ -133,10 +139,10 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert.Equal("Infrastructure", row.Panels[1].Title) assert.Len(row.Panels[0].Targets, 1) assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, ) assert.Contains(row.Panels[1].Targets[0].Expr, - 
`label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, ) } diff --git a/pkg/dashboards/health.go b/pkg/dashboards/health.go index ad5e16c4c..f395a9fad 100644 --- a/pkg/dashboards/health.go +++ b/pkg/dashboards/health.go @@ -2,6 +2,8 @@ package dashboards import ( "fmt" + + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" ) func CreateHealthDashboard(netobsNs string) (string, error) { @@ -10,18 +12,18 @@ func CreateHealthDashboard(netobsNs string) (string, error) { // Global stats // TODO after direct-FLP: if Direct mode, get flow rate from loki if enabled, else from agent d.Rows = append(d.Rows, NewRow("", false, "100px", []Panel{ - NewSingleStatPanel("Flows per second", PanelUnitShort, 3, NewTarget( + NewPanel("Flows per second", metricslatest.ChartTypeSingleStat, "", 3, NewTarget( `sum(rate(netobserv_ingest_flows_processed[1m]))`, "")), - NewSingleStatPanel("Sampling", PanelUnitShort, 3, NewTarget( + NewPanel("Sampling", metricslatest.ChartTypeSingleStat, "", 3, NewTarget( "avg(netobserv_agent_sampling_rate)", "")), - NewSingleStatPanel("Errors last minute", PanelUnitShort, 3, NewTarget( + NewPanel("Errors last minute", metricslatest.ChartTypeSingleStat, "", 3, NewTarget( `(sum(increase(netobserv_agent_errors_total[1m])) OR on() vector(0)) + (sum(increase(netobserv_ingest_errors[1m])) OR 
on() vector(0)) + (sum(increase(netobserv_encode_prom_errors[1m])) OR on() vector(0)) + (sum(increase(netobserv_loki_batch_retries_total[1m])) OR on() vector(0)) + (sum(increase(controller_runtime_reconcile_errors_total{job="netobserv-metrics-service"}[1m])) OR on() vector(0)) `, "")), - NewSingleStatPanel("Dropped flows per second", PanelUnitShort, 3, NewTarget( + NewPanel("Dropped flows per second", metricslatest.ChartTypeSingleStat, "", 3, NewTarget( `(sum(rate(netobserv_loki_dropped_entries_total[1m])) OR on() vector(0)) + (sum(rate(netobserv_agent_dropped_flows_total[1m])) OR on() vector(0)) `, "")), @@ -29,94 +31,93 @@ func CreateHealthDashboard(netobsNs string) (string, error) { // FLP stats overheadQuery := fmt.Sprintf("100 * sum(rate(netobserv_namespace_flows_total{SrcK8S_Namespace='%s'}[1m]) or rate(netobserv_namespace_flows_total{SrcK8S_Namespace!='%s',DstK8S_Namespace='%s'}[1m])) / sum(rate(netobserv_namespace_flows_total[1m]))", netobsNs, netobsNs, netobsNs) - // TODO: add FLP error d.Rows = append(d.Rows, NewRow("Flowlogs-pipeline statistics", false, "250px", []Panel{ - NewGraphPanel("Flows per second", PanelUnitShort, 4, false, []Target{ + NewPanel("Flows per second", metricslatest.ChartTypeLine, "", 4, NewTarget("sum(rate(netobserv_ingest_flows_processed[1m]))", "Flows ingested"), NewTarget("sum(rate(netobserv_loki_sent_entries_total[1m]))", "Flows sent to Loki"), NewTarget("sum(rate(netobserv_loki_dropped_entries_total[1m]))", "Flows dropped due to Loki error"), - }), - NewGraphPanel("Flows overhead (% generated by NetObserv own traffic)", PanelUnitShort, 4, false, []Target{ + ), + NewPanel("Flows overhead (% generated by NetObserv own traffic)", metricslatest.ChartTypeLine, "", 4, NewTarget(overheadQuery, "% overhead"), - }), - NewGraphPanel("Errors per minute", PanelUnitShort, 4, true, []Target{ + ), + NewPanel("Errors per minute", metricslatest.ChartTypeStackArea, "", 4, NewTarget(`sum(increase(netobserv_ingest_errors[1m])) by (stage,code)`, 
"{{stage}} {{code}}"), NewTarget(`sum(increase(netobserv_encode_prom_errors[1m])) by (error)`, "metrics {{error}}"), NewTarget(`sum(increase(netobserv_loki_batch_retries_total[1m]))`, "loki retries"), - }), - NewGraphPanel("By namespace", PanelUnitShort, 6, false, []Target{ + ), + NewPanel("By namespace", metricslatest.ChartTypeLine, "", 6, NewTarget(`topk(10,sum(rate(netobserv_namespace_flows_total{SrcK8S_Namespace!=""}[1m])) by (SrcK8S_Namespace))`, "From {{SrcK8S_Namespace}}"), NewTarget(`topk(10,sum(rate(netobserv_namespace_flows_total{DstK8S_Namespace!=""}[1m])) by (DstK8S_Namespace))`, "To {{DstK8S_Namespace}}"), - }), - NewGraphPanel("By node", PanelUnitShort, 6, false, []Target{ + ), + NewPanel("By node", metricslatest.ChartTypeLine, "", 6, NewTarget(`topk(10,sum(rate(netobserv_node_flows_total{SrcK8S_HostName!=""}[1m])) by (SrcK8S_HostName))`, "From {{SrcK8S_HostName}}"), NewTarget(`topk(10,sum(rate(netobserv_node_flows_total{DstK8S_HostName!=""}[1m])) by (DstK8S_HostName))`, "To {{DstK8S_HostName}}"), - }), + ), }), ) // Agent stats d.Rows = append(d.Rows, NewRow("eBPF agent statistics", true, "250px", []Panel{ - NewGraphPanel("Eviction rate", PanelUnitShort, 4, false, []Target{ + NewPanel("Eviction rate", metricslatest.ChartTypeLine, "", 4, NewTarget("sum(rate(netobserv_agent_evictions_total[1m])) by (source, reason)", "{{source}} {{reason}}"), - }), - NewGraphPanel("Evicted flows rate", PanelUnitShort, 4, false, []Target{ + ), + NewPanel("Evicted flows rate", metricslatest.ChartTypeLine, "", 4, NewTarget("sum(rate(netobserv_agent_evicted_flows_total[1m])) by (source, reason)", "{{source}} {{reason}}"), - }), - NewGraphPanel("Dropped flows rate", PanelUnitShort, 4, true, []Target{ + ), + NewPanel("Dropped flows rate", metricslatest.ChartTypeStackArea, "", 4, NewTarget(`sum(rate(netobserv_agent_dropped_flows_total[1m])) by (source, reason)`, "{{source}} {{reason}}"), - }), - NewGraphPanel("Ringbuffer / HashMap ratio", PanelUnitShort, 4, false, []Target{ + 
), + NewPanel("Ringbuffer / HashMap ratio", metricslatest.ChartTypeLine, "", 4, NewTarget(`(sum(rate(netobserv_agent_evicted_flows_total{source="accounter"}[1m])) OR on() vector(0)) / sum(rate(netobserv_agent_evicted_flows_total{source="hashmap"}[1m]))`, "ratio"), - }), - NewGraphPanel("Buffer size", PanelUnitShort, 4, false, []Target{ + ), + NewPanel("Buffer size", metricslatest.ChartTypeLine, "", 4, NewTarget(`sum(netobserv_agent_buffer_size) by (name)`, "{{name}}"), - }), - NewGraphPanel("Errors per minute", PanelUnitShort, 4, true, []Target{ + ), + NewPanel("Errors per minute", metricslatest.ChartTypeStackArea, "", 4, NewTarget(`sum(increase(netobserv_agent_errors_total[1m])) by (component, error)`, "{{component}} {{error}}"), - }), - NewGraphPanel("Filtered flows rate", PanelUnitShort, 4, false, []Target{ + ), + NewPanel("Filtered flows rate", metricslatest.ChartTypeStackArea, "", 4, NewTarget("sum(rate(netobserv_agent_filtered_flows_total[1m])) by (source, reason)", "{{source}} {{reason}}"), - }), + ), })) // Operator stats d.Rows = append(d.Rows, NewRow("Operator statistics", true, "250px", []Panel{ - NewGraphPanel("Reconcile events per minute", PanelUnitShort, 6, true, []Target{ + NewPanel("Reconcile events per minute", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`sum(increase(controller_runtime_reconcile_total{job="netobserv-metrics-service"}[1m])) by (controller,result)`, "{{controller}}: {{result}}"), - }), - NewGraphPanel("Average and P99 reconcile time", PanelUnitSeconds, 6, false, []Target{ + ), + NewPanel("Average and P99 reconcile time", metricslatest.ChartTypeLine, metricslatest.UnitSeconds, 6, NewTarget(`sum(rate(controller_runtime_reconcile_time_seconds_sum{job="netobserv-metrics-service"}[1m])) / sum(rate(controller_runtime_reconcile_time_seconds_count{job="netobserv-metrics-service"}[1m]))`, "average"), NewTarget(`histogram_quantile(0.99, sum by(le) 
(rate(controller_runtime_reconcile_time_seconds_bucket{job="netobserv-metrics-service"}[1m])))`, "p99"), - }), + ), })) // CPU and memory d.Rows = append(d.Rows, NewRow("Resource usage", true, "250px", []Panel{ - NewGraphPanel("Overall CPU", PanelUnitShort, 6, true, []Target{ + NewPanel("Overall CPU", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container="netobserv-ebpf-agent"})`, "eBPF agent"), NewTarget(`sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container="flowlogs-pipeline"})`, "flowlogs-pipeline"), NewTarget(`sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="",pod=~"netobserv-controller-manager.*"})`, "operator"), - }), - NewGraphPanel("Overall memory", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("Overall memory", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`sum(container_memory_rss{container="netobserv-ebpf-agent"})`, "eBPF agent"), NewTarget(`sum(container_memory_rss{container="flowlogs-pipeline"})`, "flowlogs-pipeline"), NewTarget(`sum(container_memory_rss{container!="",pod=~"netobserv-controller-manager.*"})`, "operator"), - }), - NewGraphPanel("eBPF agent CPU - top 10 pods", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("eBPF agent CPU - top 10 pods", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`topk(10, node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container="netobserv-ebpf-agent"})`, "{{pod}}"), - }), - NewGraphPanel("eBPF agent memory - top 10 pods", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("eBPF agent memory - top 10 pods", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`topk(10, container_memory_rss{container="netobserv-ebpf-agent"})`, "{{pod}}"), - }), - NewGraphPanel("Flowlogs-pipeline CPU - top 10 pods", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("Flowlogs-pipeline CPU - top 10 pods", metricslatest.ChartTypeStackArea, 
"", 6, NewTarget(`topk(10, node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container="flowlogs-pipeline"})`, "{{pod}}"), - }), - NewGraphPanel("Flowlogs-pipeline memory - top 10 pods", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("Flowlogs-pipeline memory - top 10 pods", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`topk(10, container_memory_rss{container="flowlogs-pipeline"})`, "{{pod}}"), - }), + ), })) - return d.ToGrafanaJSON(netobsNs), nil + return d.ToGrafanaJSON(), nil } diff --git a/pkg/dashboards/model.go b/pkg/dashboards/model.go index 032a123c3..004bbcce4 100644 --- a/pkg/dashboards/model.go +++ b/pkg/dashboards/model.go @@ -4,6 +4,8 @@ import ( "encoding/json" "fmt" "strings" + + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" ) type Dashboard struct { @@ -28,47 +30,24 @@ func NewRow(title string, collapse bool, height string, panels []Panel) *Row { } } -type PanelType string -type PanelUnit string - -const ( - PanelTypeSingleStat PanelType = "singlestat" - PanelTypeGraph PanelType = "graph" - PanelUnitBytes PanelUnit = "bytes" - PanelUnitShort PanelUnit = "short" - PanelUnitSeconds PanelUnit = "seconds" - PanelUnitBPS PanelUnit = "Bps" - PanelUnitPPS PanelUnit = "pps" -) - type Panel struct { Title string - Type PanelType + Type metricslatest.ChartType Targets []Target Span int - Stacked bool - Unit PanelUnit + Unit metricslatest.Unit } -func NewPanel(title string, t PanelType, unit PanelUnit, span int, stacked bool, targets []Target) Panel { +func NewPanel(title string, t metricslatest.ChartType, unit metricslatest.Unit, span int, targets ...Target) Panel { return Panel{ Title: title, Type: t, Unit: unit, Span: span, - Stacked: stacked, Targets: targets, } } -func NewGraphPanel(title string, unit PanelUnit, span int, stacked bool, targets []Target) Panel { - return NewPanel(title, PanelTypeGraph, unit, span, stacked, targets) -} - -func NewSingleStatPanel(title string, 
unit PanelUnit, span int, target Target) Panel { - return NewPanel(title, PanelTypeSingleStat, unit, span, false, []Target{target}) -} - type Target struct { Expr string Legend string @@ -110,7 +89,18 @@ func (d *Dashboard) FindRow(titleSubstr string) *Row { return nil } -func (d *Dashboard) ToGrafanaJSON(netobsNs string) string { +func (d *Dashboard) FindPanel(titleSubstr string) *Panel { + for _, r := range d.Rows { + for _, p := range r.Panels { + if strings.Contains(p.Title, titleSubstr) { + return &p + } + } + } + return nil +} + +func (d *Dashboard) ToGrafanaJSON() string { // return empty if dashboard doesn't contains rows if len(d.Rows) == 0 { return "" @@ -118,7 +108,7 @@ func (d *Dashboard) ToGrafanaJSON(netobsNs string) string { var rows []string for _, ri := range d.Rows { - rows = append(rows, ri.ToGrafanaJSON(netobsNs)) + rows = append(rows, ri.ToGrafanaJSON()) } rowsStr := strings.Join(rows, ",") @@ -181,10 +171,18 @@ func (d *Dashboard) ToGrafanaJSON(netobsNs string) string { `, rowsStr, d.Title) } -func (r *Row) ToGrafanaJSON(netobsNs string) string { +func (r *Row) Titles() []string { + var titles []string + for _, p := range r.Panels { + titles = append(titles, p.Title) + } + return titles +} + +func (r *Row) ToGrafanaJSON() string { var panels []string for _, panel := range r.Panels { - panels = append(panels, panel.ToGrafanaJSON(netobsNs)) + panels = append(panels, panel.ToGrafanaJSON()) } showTitle := true if r.Title == "" { @@ -202,19 +200,31 @@ func (r *Row) ToGrafanaJSON(netobsNs string) string { `, r.Collapse, r.Height, strings.Join(panels, ","), showTitle, r.Title) } -func (r *Row) replaceMetric(newName string) *Row { - clone := NewRow(r.Title, r.Collapse, r.Height, nil) - clone.Metric = r.Metric - for _, p := range r.Panels { - clone.Panels = append(clone.Panels, p.replaceMetric(r.Metric, newName)) - } - return clone -} - -func (p *Panel) ToGrafanaJSON(netobsNs string) string { +func (p *Panel) ToGrafanaJSON() string { var targets 
[]string for _, target := range p.Targets { - targets = append(targets, target.ToGrafanaJSON(netobsNs)) + targets = append(targets, target.ToGrafanaJSON()) + } + unit := string(p.Unit) + if unit == "" { + unit = "short" + } + var singleStatFormat string + if p.Unit == metricslatest.UnitSeconds { + singleStatFormat = "s" + } else { + singleStatFormat = unit + } + var t string + stacked := false + switch p.Type { + case metricslatest.ChartTypeSingleStat: + t = "singlestat" + case metricslatest.ChartTypeLine: + t = "graph" + case metricslatest.ChartTypeStackArea: + t = "graph" + stacked = true } return fmt.Sprintf(` { @@ -225,6 +235,7 @@ func (p *Panel) ToGrafanaJSON(netobsNs string) string { "datasource": "prometheus", "fill": 1, "fillGradient": 0, + "format": "%s", "gridPos": {}, "id": 1, "legend": { @@ -282,22 +293,11 @@ func (p *Panel) ToGrafanaJSON(netobsNs string) string { } ] } - `, p.Span, p.Stacked, strings.Join(targets, ","), p.Title, string(p.Type), string(p.Unit)) -} - -func (p *Panel) replaceMetric(oldName, newName string) Panel { - clone := NewPanel(p.Title, p.Type, p.Unit, p.Span, p.Stacked, nil) - for _, t := range p.Targets { - clone.Targets = append( - clone.Targets, - NewTarget(strings.ReplaceAll(t.Expr, oldName, newName), t.Legend), - ) - } - return clone + `, singleStatFormat, p.Span, stacked, strings.Join(targets, ","), p.Title, t, unit) } -func (t *Target) ToGrafanaJSON(netobsNs string) string { - expr := formatCleaner.Replace(strings.ReplaceAll(t.Expr, "$NETOBSERV_NS", netobsNs)) +func (t *Target) ToGrafanaJSON() string { + expr := formatCleaner.Replace(t.Expr) return fmt.Sprintf(` { "expr": "%s", diff --git a/pkg/dashboards/scopes.go b/pkg/dashboards/scopes.go deleted file mode 100644 index 6cddc0f0b..000000000 --- a/pkg/dashboards/scopes.go +++ /dev/null @@ -1,68 +0,0 @@ -package dashboards - -import ( - "fmt" - "strings" -) - -type metricScope struct { - metricPart string - titlePart string - labels []string - legendPart string - 
labelsReplacementTemplate string - splitAppInfra bool -} - -var ( - srcDstNodeScope = metricScope{ - metricPart: "node", - titlePart: "per node", - labels: []string{"SrcK8S_HostName", "DstK8S_HostName"}, - legendPart: "{{SrcK8S_HostName}} -> {{DstK8S_HostName}}", - labelsReplacementTemplate: `label_replace( - label_replace( - %s, - "SrcK8S_HostName", "(not namespaced)", "SrcK8S_HostName", "()" - ), - "DstK8S_HostName", "(not namespaced)", "DstK8S_HostName", "()" - )`, - splitAppInfra: false, - } - srcDstNamespaceScope = metricScope{ - metricPart: "namespace", - titlePart: "per namespace", - labels: []string{"SrcK8S_Namespace", "DstK8S_Namespace"}, - legendPart: "{{SrcK8S_Namespace}} -> {{DstK8S_Namespace}}", - labelsReplacementTemplate: `label_replace( - label_replace( - %s, - "SrcK8S_Namespace", "(not namespaced)", "SrcK8S_Namespace", "()" - ), - "DstK8S_Namespace", "(not namespaced)", "DstK8S_Namespace", "()" - )`, - splitAppInfra: true, - } - srcDstWorkloadScope = metricScope{ - metricPart: "workload", - titlePart: "per workload", - labels: []string{"SrcK8S_Namespace", "SrcK8S_OwnerName", "DstK8S_Namespace", "DstK8S_OwnerName"}, - legendPart: "{{SrcK8S_OwnerName}} ({{SrcK8S_Namespace}}) -> {{DstK8S_OwnerName}} ({{DstK8S_Namespace}})", - labelsReplacementTemplate: `label_replace( - label_replace( - %s, - "SrcK8S_Namespace", "non pods", "SrcK8S_Namespace", "()" - ), - "DstK8S_Namespace", "non pods", "DstK8S_Namespace", "()" - )`, - splitAppInfra: true, - } -) - -func (s *metricScope) joinLabels() string { - return strings.Join(s.labels, ",") -} - -func (s *metricScope) labelReplace(q string) string { - return fmt.Sprintf(s.labelsReplacementTemplate, q) -} diff --git a/pkg/metrics/helper.go b/pkg/metrics/helper.go new file mode 100644 index 000000000..2a548f135 --- /dev/null +++ b/pkg/metrics/helper.go @@ -0,0 +1,30 @@ +package metrics + +import ( + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" +) + +func 
GetFilters(fm *metricslatest.FlowMetricSpec) []metricslatest.MetricFilter { + var filters []metricslatest.MetricFilter + if !fm.IncludeDuplicates { + filters = append(filters, metricslatest.MetricFilter{ + Field: "Duplicate", + Value: "true", + MatchType: metricslatest.MatchNotEqual, + }) + } + if fm.Direction == metricslatest.Egress { + filters = append(filters, metricslatest.MetricFilter{ + Field: "FlowDirection", + Value: "1|2", + MatchType: metricslatest.MatchRegex, + }) + } else if fm.Direction == metricslatest.Ingress { + filters = append(filters, metricslatest.MetricFilter{ + Field: "FlowDirection", + Value: "0|2", + MatchType: metricslatest.MatchRegex, + }) + } + return append(fm.Filters, filters...) +} diff --git a/pkg/metrics/predefined_charts.go b/pkg/metrics/predefined_charts.go new file mode 100644 index 000000000..8a21778f2 --- /dev/null +++ b/pkg/metrics/predefined_charts.go @@ -0,0 +1,285 @@ +package metrics + +import ( + "fmt" + "strings" + + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" +) + +const ( + netobservDashboard = "NetObserv" +) + +func trafficCharts(group, vt, dir string) []metricslatest.Chart { + sectionName := "Traffic rates" + var unit metricslatest.Unit + switch vt { + case tagBytes: + unit = metricslatest.UnitBPS + case tagPackets: + unit = metricslatest.UnitPPS + } + + totalSingleStat := metricslatest.Chart{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: fmt.Sprintf("Total %s traffic", dir), + Unit: unit, + Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC[2m]))"}}, + } + + appSingleStat := metricslatest.Chart{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: fmt.Sprintf("Apps %s traffic", dir), + Unit: unit, + Queries: []metricslatest.Query{{PromQL: `sum(rate($METRIC{K8S_FlowLayer="app"}[2m]))`}}, + } + + infraSingleStat := metricslatest.Chart{ + Type: 
metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: fmt.Sprintf("Infra %s traffic", dir), + Unit: unit, + Queries: []metricslatest.Query{{PromQL: `sum(rate($METRIC{K8S_FlowLayer="infra"}[2m]))`}}, + } + + var charts []metricslatest.Chart + switch group { + case tagNodes: + charts = []metricslatest.Chart{ + totalSingleStat, + } + case tagNamespaces, tagWorkloads: + charts = []metricslatest.Chart{ + totalSingleStat, + infraSingleStat, + appSingleStat, + } + } + + return append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeStackArea, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: fmt.Sprintf("Top %s traffic", dir), + Unit: unit, + Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC{$FILTERS}[2m])) by ($LABELS)", Legend: "$LEGEND"}}, + }, group, "")...) +} + +func rttCharts(group string) []metricslatest.Chart { + sectionName := "TCP latencies" + charts := []metricslatest.Chart{{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: "TCP latency", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0", + Legend: "p99", + }, + }, + }} + charts = append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeLine, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top P50 sRTT", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{$FILTERS}[2m])) by (le,$LABELS))*1000 > 0", + Legend: "$LEGEND", + }, + }, + }, group, "ms")...) 
+ charts = append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeLine, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top P99 sRTT", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket{$FILTERS}[2m])) by (le,$LABELS))*1000 > 0", + Legend: "$LEGEND", + }, + }, + }, group, "ms")...) + + return charts +} + +func dropCharts(group string, unit metricslatest.Unit) []metricslatest.Chart { + sectionName := "Byte and packet drops" + var charts []metricslatest.Chart + if unit == "pps" { + charts = append(charts, metricslatest.Chart{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: "Drops", + Unit: unit, + Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC[2m]))"}}, + }) + } + return append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeStackArea, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top drops", + Unit: unit, + Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC{$FILTERS}[2m])) by ($LABELS)", Legend: "$LEGEND"}}, + }, group, string(unit))...) 
+} + +func dnsCharts(group string) []metricslatest.Chart { + sectionName := "DNS" + charts := []metricslatest.Chart{ + { + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: "DNS latency", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0", + Legend: "p99", + }, + }, + }, + { + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: "DNS error rate", + Queries: []metricslatest.Query{{PromQL: `sum(rate($METRIC_count{DnsFlagsResponseCode!="NoError"}[2m]))`}}, + }, + } + charts = append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeLine, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top P50 DNS latency", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{$FILTERS}[2m])) by (le,$LABELS))*1000 > 0", + Legend: "$LEGEND", + }, + }, + }, group, "ms")...) + charts = append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeLine, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top P99 DNS latency", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket{$FILTERS}[2m])) by (le,$LABELS))*1000 > 0", + Legend: "$LEGEND", + }, + }, + }, group, "ms")...) + + return append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeStackArea, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "DNS error rate", + Queries: []metricslatest.Query{{ + PromQL: `sum(rate($METRIC_count{DnsFlagsResponseCode!="NoError",$FILTERS}[2m])) by (DnsFlagsResponseCode,$LABELS)`, + Legend: "$LEGEND, {{ DnsFlagsResponseCode }}", + }}, + }, group, "")...) 
+} + +func chartVariantsFor(chart *metricslatest.Chart, group, unit string) []metricslatest.Chart { + switch group { + case tagNodes: + return []metricslatest.Chart{ + chartVariantFor(chart, group, "", unit), + } + case tagNamespaces: + return []metricslatest.Chart{ + chartVariantFor(chart, group, "infra", unit), + chartVariantFor(chart, group, "app", unit), + } + case tagWorkloads: + return []metricslatest.Chart{ + chartVariantFor(chart, tagNamespaces, "infra", unit), + chartVariantFor(chart, tagNamespaces, "app", unit), + chartVariantFor(chart, group, "infra", unit), + chartVariantFor(chart, group, "app", unit), + } + } + return nil +} + +func chartVariantFor(c *metricslatest.Chart, group, layer, unit string) metricslatest.Chart { + chart := *c + var flowLayerFilter, labels, legend string + chart.Title += " per " + if layer != "" { + chart.Title += layer + " " + flowLayerFilter = `K8S_FlowLayer="` + layer + `",` + } + var orFilters []string + switch group { + case tagNodes: + chart.Title += "node" + labels = "SrcK8S_HostName,DstK8S_HostName" + legend = "source:{{SrcK8S_HostName}}, dest:{{DstK8S_HostName}}" + case tagNamespaces: + chart.Title += "namespace" + labels = "SrcK8S_Namespace,DstK8S_Namespace" + legend = "source:{{SrcK8S_Namespace}}, dest:{{DstK8S_Namespace}}" + // orFilters aim to eliminate node-to-node traffic when looking at namespace-based metrics + orFilters = []string{ + flowLayerFilter + `SrcK8S_Namespace!=""`, + flowLayerFilter + `DstK8S_Namespace!=""`, + } + case tagWorkloads: + chart.Title += "workload" + labels = "SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName" + legend = "source:{{SrcK8S_OwnerName}}/{{SrcK8S_Namespace}}, dest:{{DstK8S_OwnerName}}/{{DstK8S_Namespace}}" + // orFilters aim to eliminate node-to-node traffic when looking at workload-based metrics + orFilters = []string{ + flowLayerFilter + `SrcK8S_Namespace!=""`, + flowLayerFilter + `DstK8S_Namespace!=""`, + } + } + if unit != "" { + chart.Title += " (" + unit 
+ ")" + } + queriesReplaceAll(&chart, labels, legend, orFilters) + return chart +} + +func queriesReplaceAll(c *metricslatest.Chart, labels, legend string, orFilters []string) { + var queries []metricslatest.Query + for _, q := range c.Queries { + q.PromQL = strings.ReplaceAll(q.PromQL, "$LABELS", labels) + q.Legend = strings.ReplaceAll(q.Legend, "$LEGEND", legend) + if len(orFilters) == 0 { + q.PromQL = strings.ReplaceAll(q.PromQL, "$FILTERS", "") + } else { + var parts []string + for _, filter := range orFilters { + parts = append(parts, "("+strings.ReplaceAll(q.PromQL, "$FILTERS", filter)+")") + } + q.PromQL = strings.Join(parts, " or ") + } + queries = append(queries, q) + } + c.Queries = queries +} diff --git a/pkg/metrics/predefined_metrics.go b/pkg/metrics/predefined_metrics.go index 0d9710f08..85b33f4d9 100644 --- a/pkg/metrics/predefined_metrics.go +++ b/pkg/metrics/predefined_metrics.go @@ -5,8 +5,8 @@ import ( "reflect" "strings" - flpapi "github.com/netobserv/flowlogs-pipeline/pkg/api" flowslatest "github.com/netobserv/network-observability-operator/apis/flowcollector/v1beta2" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" "github.com/netobserv/network-observability-operator/pkg/helper" ) @@ -21,19 +21,16 @@ const ( ) var ( - mapLabels = map[string][]string{ + latencyBuckets = []string{".005", ".01", ".02", ".03", ".04", ".05", ".075", ".1", ".25", "1"} + mapLabels = map[string][]string{ tagNodes: {"SrcK8S_HostName", "DstK8S_HostName"}, - tagNamespaces: {"SrcK8S_Namespace", "DstK8S_Namespace"}, - tagWorkloads: {"SrcK8S_Namespace", "DstK8S_Namespace", "SrcK8S_OwnerName", "DstK8S_OwnerName", "SrcK8S_OwnerType", "DstK8S_OwnerType"}, + tagNamespaces: {"SrcK8S_Namespace", "DstK8S_Namespace", "K8S_FlowLayer"}, + tagWorkloads: {"SrcK8S_Namespace", "DstK8S_Namespace", "K8S_FlowLayer", "SrcK8S_OwnerName", "DstK8S_OwnerName", "SrcK8S_OwnerType", "DstK8S_OwnerType"}, } mapValueFields = map[string]string{ tagBytes: 
"Bytes", tagPackets: "Packets", } - mapDirection = map[string]string{ - tagIngress: "0|2", - tagEgress: "1|2", - } predefinedMetrics []taggedMetricDefinition // Note that we set default in-code rather than in CRD, in order to keep track of value being unset or set intentionnally in FlowCollector DefaultIncludeList = []string{ @@ -51,7 +48,7 @@ var ( ) type taggedMetricDefinition struct { - flpapi.MetricsItem + metricslatest.FlowMetricSpec tags []string } @@ -62,84 +59,93 @@ func init() { // Bytes / packets metrics for _, vt := range []string{tagBytes, tagPackets} { valueField := mapValueFields[vt] - for _, dir := range []string{tagEgress, tagIngress} { + for _, dir := range []metricslatest.FlowDirection{metricslatest.Egress, metricslatest.Ingress} { + lowDir := strings.ToLower(string(dir)) predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_%s_%s_total", groupTrimmed, dir, vt), - Type: "counter", - ValueKey: valueField, - Filters: []flpapi.MetricsFilter{ - {Key: "Duplicate", Value: "true", Type: flpapi.MetricFilterNotEqual}, - {Key: "FlowDirection", Value: mapDirection[dir], Type: flpapi.MetricFilterRegex}, - }, - Labels: labels, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_%s_%s_total", groupTrimmed, lowDir, vt), + Type: metricslatest.CounterMetric, + ValueField: valueField, + IncludeDuplicates: false, + Direction: dir, + Labels: labels, + Charts: trafficCharts(group, vt, lowDir), }, - tags: []string{group, vt, dir}, + tags: []string{group, vt, lowDir}, }) } } // Flows metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_flows_total", groupTrimmed), - Type: "counter", - Labels: labels, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_flows_total", groupTrimmed), + Type: "counter", + Labels: labels, + IncludeDuplicates: true, }, tags: 
[]string{group, group + "-flows", "flows"}, }) // RTT metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_rtt_seconds", groupTrimmed), - Type: "histogram", - ValueKey: "TimeFlowRttNs", - Filters: []flpapi.MetricsFilter{ - {Key: "TimeFlowRttNs", Type: flpapi.MetricFilterPresence}, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_rtt_seconds", groupTrimmed), + Type: metricslatest.HistogramMetric, + ValueField: "TimeFlowRttNs", + IncludeDuplicates: true, + Filters: []metricslatest.MetricFilter{ + {Field: "TimeFlowRttNs", MatchType: metricslatest.MatchPresence}, }, - Labels: labels, - ValueScale: 1_000_000_000, // ns => s + Labels: labels, + Divider: 1_000_000_000, // ns => s + Buckets: latencyBuckets, + Charts: rttCharts(group), }, tags: []string{group, "rtt"}, }) // Drops metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_drop_packets_total", groupTrimmed), - Type: "counter", - ValueKey: "PktDropPackets", - Filters: []flpapi.MetricsFilter{ - {Key: "Duplicate", Value: "true", Type: flpapi.MetricFilterNotEqual}, - {Key: "PktDropPackets", Type: flpapi.MetricFilterPresence}, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_drop_packets_total", groupTrimmed), + Type: metricslatest.CounterMetric, + ValueField: "PktDropPackets", + IncludeDuplicates: false, + Filters: []metricslatest.MetricFilter{ + {Field: "PktDropPackets", MatchType: metricslatest.MatchPresence}, }, Labels: labels, + Charts: dropCharts(group, "pps"), }, tags: []string{group, tagPackets, "drops"}, }) predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_drop_bytes_total", groupTrimmed), - Type: "counter", - ValueKey: "PktDropBytes", - Filters: []flpapi.MetricsFilter{ - {Key: "Duplicate", Value: "true", Type: 
flpapi.MetricFilterNotEqual}, - {Key: "PktDropBytes", Type: flpapi.MetricFilterPresence}, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_drop_bytes_total", groupTrimmed), + Type: metricslatest.CounterMetric, + ValueField: "PktDropBytes", + IncludeDuplicates: false, + Filters: []metricslatest.MetricFilter{ + {Field: "PktDropBytes", MatchType: metricslatest.MatchPresence}, }, Labels: labels, + Charts: dropCharts(group, "Bps"), }, tags: []string{group, tagBytes, "drop"}, }) // DNS metrics - dnsLabels := append(labels, "DnsFlagsResponseCode") + dnsLabels := labels + dnsLabels = append(dnsLabels, "DnsFlagsResponseCode") predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_dns_latency_seconds", groupTrimmed), - Type: "histogram", - ValueKey: "DnsLatencyMs", - Filters: []flpapi.MetricsFilter{ - {Key: "DnsId", Type: flpapi.MetricFilterPresence}, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_dns_latency_seconds", groupTrimmed), + Type: metricslatest.HistogramMetric, + ValueField: "DnsLatencyMs", + Filters: []metricslatest.MetricFilter{ + {Field: "DnsId", MatchType: metricslatest.MatchPresence}, }, - Labels: dnsLabels, - ValueScale: 1000, // ms => s + Labels: dnsLabels, + Divider: 1000, // ms => s + Buckets: latencyBuckets, + Charts: dnsCharts(group), }, tags: []string{group, "dns"}, }) @@ -161,7 +167,7 @@ func convertIgnoreTagsToIncludeList(ignoreTags []string) []flowslatest.FLPMetric ret := []flowslatest.FLPMetric{} for i := range predefinedMetrics { if !isIgnored(&predefinedMetrics[i], ignoreTags) { - ret = append(ret, flowslatest.FLPMetric(predefinedMetrics[i].Name)) + ret = append(ret, flowslatest.FLPMetric(predefinedMetrics[i].MetricName)) } } return ret @@ -181,24 +187,24 @@ func GetAsIncludeList(ignoreTags []string, includeList *[]flowslatest.FLPMetric) func GetAllNames() []string { names := []string{} for i := range 
predefinedMetrics { - names = append(names, predefinedMetrics[i].Name) + names = append(names, predefinedMetrics[i].MetricName) } return names } -func GetDefinitions(names []string) []flpapi.MetricsItem { - ret := []flpapi.MetricsItem{} +func GetDefinitions(names []string) []metricslatest.FlowMetric { + ret := []metricslatest.FlowMetric{} for i := range predefinedMetrics { for _, name := range names { - if predefinedMetrics[i].Name == name { - ret = append(ret, predefinedMetrics[i].MetricsItem) + if predefinedMetrics[i].MetricName == name { + ret = append(ret, metricslatest.FlowMetric{Spec: predefinedMetrics[i].FlowMetricSpec}) } } } return ret } -func GetIncludeList(spec *flowslatest.FlowCollectorSpec) []string { +func getIncludeList(spec *flowslatest.FlowCollectorSpec) []string { var list []string if spec.Processor.Metrics.IncludeList == nil { list = DefaultIncludeList @@ -228,3 +234,9 @@ func removeMetricsByPattern(list []string, search string) []string { } return filtered } + +func MergePredefined(fm []metricslatest.FlowMetric, fc *flowslatest.FlowCollectorSpec) []metricslatest.FlowMetric { + names := getIncludeList(fc) + predefined := GetDefinitions(names) + return append(fm, predefined...) 
+} diff --git a/pkg/metrics/predefined_metrics_test.go b/pkg/metrics/predefined_metrics_test.go index c0f531f1f..d43a50970 100644 --- a/pkg/metrics/predefined_metrics_test.go +++ b/pkg/metrics/predefined_metrics_test.go @@ -45,13 +45,13 @@ func TestGetDefinitions(t *testing.T) { res := GetDefinitions([]string{"namespace_flows_total", "node_ingress_bytes_total", "workload_egress_packets_total"}) assert.Len(res, 3) - assert.Equal("node_ingress_bytes_total", res[0].Name) - assert.Equal("Bytes", res[0].ValueKey) - assert.Equal([]string{"SrcK8S_HostName", "DstK8S_HostName"}, res[0].Labels) - assert.Equal("namespace_flows_total", res[1].Name) - assert.Empty(res[1].ValueKey) - assert.Equal([]string{"SrcK8S_Namespace", "DstK8S_Namespace"}, res[1].Labels) - assert.Equal("workload_egress_packets_total", res[2].Name) - assert.Equal("Packets", res[2].ValueKey) - assert.Equal([]string{"SrcK8S_Namespace", "DstK8S_Namespace", "SrcK8S_OwnerName", "DstK8S_OwnerName", "SrcK8S_OwnerType", "DstK8S_OwnerType"}, res[2].Labels) + assert.Equal("node_ingress_bytes_total", res[0].Spec.MetricName) + assert.Equal("Bytes", res[0].Spec.ValueField) + assert.Equal([]string{"SrcK8S_HostName", "DstK8S_HostName"}, res[0].Spec.Labels) + assert.Equal("namespace_flows_total", res[1].Spec.MetricName) + assert.Empty(res[1].Spec.ValueField) + assert.Equal([]string{"SrcK8S_Namespace", "DstK8S_Namespace", "K8S_FlowLayer"}, res[1].Spec.Labels) + assert.Equal("workload_egress_packets_total", res[2].Spec.MetricName) + assert.Equal("Packets", res[2].Spec.ValueField) + assert.Equal([]string{"SrcK8S_Namespace", "DstK8S_Namespace", "K8S_FlowLayer", "SrcK8S_OwnerName", "DstK8S_OwnerName", "SrcK8S_OwnerType", "DstK8S_OwnerType"}, res[2].Spec.Labels) } From 2325b29695563a6614e34315fd996fef7c435b8e Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Thu, 4 Apr 2024 14:11:00 +0200 Subject: [PATCH 02/13] Document API, gen bundle, fix tests, update examples --- apis/flowmetrics/v1alpha1/flowmetric_types.go | 48 ++++- 
.../v1alpha1/zz_generated.deepcopy.go | 42 ++++ .../flows.netobserv.io_flowmetrics.yaml | 85 +++++++- ...observ-operator.clusterserviceversion.yaml | 49 +++-- .../bases/flows.netobserv.io_flowmetrics.yaml | 85 +++++++- .../az_aware_worloads_traffic.yaml | 1 + .../flowmetrics/cluster_egress_traffic.yaml | 30 +++ .../flowmetrics/cluster_ingress_traffic.yaml | 30 +++ .../flowmetrics/pods_incoming_bytes.yaml | 1 + .../flowmetrics/pods_openshift_ingress.yaml | 1 + .../flowmetrics/pods_outgoing_bytes.yaml | 1 + config/samples/flowmetrics/service_ports.yaml | 20 ++ .../flowmetrics/services_incoming_bytes.yaml | 1 + .../flowmetrics/traffic_across_az.yaml | 38 ++++ config/samples/flows_v1alpha1_flowmetric.yaml | 36 ++-- controllers/flp/flp_controller.go | 2 +- controllers/flp/flp_pipeline_builder.go | 22 ++- controllers/flp/flp_test.go | 8 +- .../monitoring/monitoring_controller.go | 6 +- pkg/dashboards/dashboard_test.go | 185 +++++++++++++----- pkg/dashboards/model.go | 29 ++- pkg/metrics/predefined_charts.go | 2 +- pkg/metrics/predefined_metrics.go | 6 +- 23 files changed, 617 insertions(+), 111 deletions(-) create mode 100644 config/samples/flowmetrics/cluster_egress_traffic.yaml create mode 100644 config/samples/flowmetrics/cluster_ingress_traffic.yaml create mode 100644 config/samples/flowmetrics/service_ports.yaml create mode 100644 config/samples/flowmetrics/traffic_across_az.yaml diff --git a/apis/flowmetrics/v1alpha1/flowmetric_types.go b/apis/flowmetrics/v1alpha1/flowmetric_types.go index f50f05153..9d04728a2 100644 --- a/apis/flowmetrics/v1alpha1/flowmetric_types.go +++ b/apis/flowmetrics/v1alpha1/flowmetric_types.go @@ -45,7 +45,7 @@ type MetricFilter struct { // +required Field string `json:"field"` - // Value to filter on + // Value to filter on. When `matchType` is `Equal` or `NotEqual`, you can use field injection with `$(SomeField)` to refer to any other field of the flow. 
// +optional Value string `json:"value"` @@ -112,7 +112,7 @@ type FlowMetricSpec struct { // When non-zero, scale factor (divider) of the value. Metric value = Flow value / Divider. // +optional - Divider float64 `json:"divider"` + Divider string `json:"divider"` // Charts configuration // +optional @@ -127,6 +127,7 @@ const ( UnitSeconds Unit = "seconds" UnitBPS Unit = "Bps" UnitPPS Unit = "pps" + UnitPercent Unit = "percent" ChartTypeSingleStat ChartType = "SingleStat" ChartTypeLine ChartType = "Line" ChartTypeStackArea ChartType = "StackArea" @@ -134,17 +135,48 @@ const ( // Configures charts / dashboard generation associated to a metric type Chart struct { - DashboardName string `json:"dashboardName"` - SectionName string `json:"sectionName"` - Title string `json:"title"` - Unit Unit `json:"unit"` - Type ChartType `json:"type"` - Queries []Query `json:"queries"` + // Name of the containing dashboard. If this name does not refer to an existing dashboard, a new dashboard is created. + // +kubebuilder:default:="NetObserv" + DashboardName string `json:"dashboardName"` + + // Name of the containing dashboard section. If this name does not refer to an existing section, a new section is created. + // If `sectionName` is omitted or empty, the chart is placed in the global top section. + // +optional + SectionName string `json:"sectionName"` + + // Title of the chart. + // +required + Title string `json:"title"` + + // Unit of this chart. Only a few units are currently supported. Leave empty to use generic number. + // +kubebuilder:validation:Enum:="bytes";"seconds";"Bps";"pps";"percent" + // +optional + Unit Unit `json:"unit"` + + // Type of the chart. + // +kubebuilder:validation:Enum:="SingleStat";"Line";"StackArea" + // +required + Type ChartType `json:"type"` + + // List of queries to be displayed on this chart. If `type` is `SingleStat` and multiple queries are provided, + // this chart will be automatically expanded in several panels (one per query). 
+ // +required + Queries []Query `json:"queries"` } // Configures PromQL queries type Query struct { + // The `promQL` query to be run against Prometheus. If the chart `type` is `SingleStat`, this query should only return + // a single timeseries. For other types, a top 7 is displayed. + // You can use `$METRIC` to refer to the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. + // To learn more about `promQL`, refer to the Prometheus documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/ + // +required PromQL string `json:"promQL"` + + // The query legend that applies to each timeseries represented in this chart. When multiple timeseries are displayed, you should set a legend + // that distinguishes each of them. It can be done with the following format: `{{ Label }}`. For example, if the `promQL` groups timeseries per + // label such as: `sum(rate($METRIC[2m])) by (Label1, Label2)`, you may write as the legend: `Label1={{ Label1 }}, Label2={{ Label2 }}`. + // +required Legend string `json:"legend"` } diff --git a/apis/flowmetrics/v1alpha1/zz_generated.deepcopy.go b/apis/flowmetrics/v1alpha1/zz_generated.deepcopy.go index 34b5e6c21..b0a2a88ae 100644 --- a/apis/flowmetrics/v1alpha1/zz_generated.deepcopy.go +++ b/apis/flowmetrics/v1alpha1/zz_generated.deepcopy.go @@ -24,6 +24,26 @@ import ( runtime "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Chart) DeepCopyInto(out *Chart) { + *out = *in + if in.Queries != nil { + in, out := &in.Queries, &out.Queries + *out = make([]Query, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Chart. 
+func (in *Chart) DeepCopy() *Chart { + if in == nil { + return nil + } + out := new(Chart) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FlowMetric) DeepCopyInto(out *FlowMetric) { *out = *in @@ -101,6 +121,13 @@ func (in *FlowMetricSpec) DeepCopyInto(out *FlowMetricSpec) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.Charts != nil { + in, out := &in.Charts, &out.Charts + *out = make([]Chart, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FlowMetricSpec. @@ -142,3 +169,18 @@ func (in *MetricFilter) DeepCopy() *MetricFilter { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Query) DeepCopyInto(out *Query) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Query. +func (in *Query) DeepCopy() *Query { + if in == nil { + return nil + } + out := new(Query) + in.DeepCopyInto(out) + return out +} diff --git a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml index a36936c50..b9a995eb3 100644 --- a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml +++ b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml @@ -51,6 +51,83 @@ spec: items: type: string type: array + charts: + description: Charts configuration + items: + description: Configures charts / dashboard generation associated + to a metric + properties: + dashboardName: + default: NetObserv + description: Name of the containing dashboard. If this name + does not refer to an existing dashboard, a new dashboard is + created. + type: string + queries: + description: List of queries to be displayed on this chart. 
+ If `type` is `SingleStat` and multiple queries are provided, + this chart will be automatically expanded in several panels + (one per query). + items: + description: Configures PromQL queries + properties: + legend: + description: 'The query legend that applies to each timeseries + represented in this chart. When multiple timeseries + are displayed, you should set a legend that distinguishes + each of them. It can be done with the following format: + `{{ Label }}`. For example, if the `promQL` groups timeseries + per label such as: `sum(rate($METRIC[2m])) by (Label1, + Label2)`, you may write as the legend: `Label1={{ Label1 + }}, Label2={{ Label2 }}`.' + type: string + promQL: + description: 'The `promQL` query to be run against Prometheus. + If the chart `type` is `SingleStat`, this query should + only return a single timeseries. For other types, a + top 7 is displayed. You can use `$METRIC` to refer to + the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. + To learn more about `promQL`, refer to the Prometheus + documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/' + type: string + required: + - legend + - promQL + type: object + type: array + sectionName: + description: Name of the containing dashboard section. If this + name does not refer to an existing section, a new section + is created. If `sectionName` is omitted or empty, the chart + is placed in the global top section. + type: string + title: + description: Title of the chart. + type: string + type: + description: Type of the chart. + enum: + - SingleStat + - Line + - StackArea + type: string + unit: + description: Unit of this chart. Only a few units are currently + supported. Leave empty to use generic number. 
+ enum: + - bytes + - seconds + - Bps + - pps + - percent + type: string + required: + - dashboardName + - queries + - title + - type + type: object + type: array direction: default: Any description: |- @@ -62,6 +139,10 @@ spec: - Egress - Ingress type: string + divider: + description: When non-zero, scale factor (divider) of the value. Metric + value = Flow value / Divider. + type: string filters: description: |- `filters` is a list of fields and values used to restrict which flows are taken into account. Oftentimes, these filters must @@ -84,7 +165,9 @@ spec: - NotMatchRegex type: string value: - description: Value to filter on + description: Value to filter on. When `matchType` is `Equal` + or `NotEqual`, you can use field injection with `$(SomeField)` + to refer to any other field of the flow. type: string required: - field diff --git a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml index 75ad03082..2f6db2724 100644 --- a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml +++ b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml @@ -18,28 +18,49 @@ metadata: "name": "flowmetric-sample" }, "spec": { - "filters": [ + "charts": [ { - "field": "DstPort", - "matchType": "Regex", - "value": "^\\d\\d?\\d?\\d?$" + "dashboardName": "NetObserv", + "queries": [ + { + "legend": "", + "promQL": "sum(rate($METRIC[2m]))" + } + ], + "title": "Cluster ingress traffic", + "type": "SingleStat", + "unit": "Bps" }, { - "field": "Duplicate", - "matchType": "NotEqual", - "value": "true" - }, + "dashboardName": "NetObserv", + "queries": [ + { + "legend": "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}", + "promQL": "sum(rate($METRIC{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace, DstK8S_OwnerName)" + } + ], + "sectionName": "Cluster", + "title": "Top cluster ingress traffic per workload", + "type": "StackArea", + "unit": "Bps" + } + ], + "direction": "Ingress", + "filters": [ { - 
"field": "FlowDirection", - "matchType": "Regex", - "value": "1|2" + "field": "SrcSubnetLabel", + "matchType": "Absence" } ], "labels": [ - "DstPort" + "DstK8S_HostName", + "DstK8S_Namespace", + "DstK8S_OwnerName", + "DstK8S_OwnerType" ], - "metricName": "service_ports_total", - "type": "Counter" + "metricName": "cluster_ingress_bytes_total", + "type": "Counter", + "valueField": "Bytes" } }, { diff --git a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml index a869ae7ba..86eeb7745 100644 --- a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml +++ b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml @@ -51,6 +51,83 @@ spec: items: type: string type: array + charts: + description: Charts configuration + items: + description: Configures charts / dashboard generation associated + to a metric + properties: + dashboardName: + default: NetObserv + description: Name of the containing dashboard. If this name + does not refer to an existing dashboard, a new dashboard is + created. + type: string + queries: + description: List of queries to be displayed on this chart. + If `type` is `SingleStat` and multiple queries are provided, + this chart will be automatically expanded in several panels + (one per query). + items: + description: Configures PromQL queries + properties: + legend: + description: 'The query legend that applies to each timeseries + represented in this chart. When multiple timeseries + are displayed, you should set a legend that distinguishes + each of them. It can be done with the following format: + `{{ Label }}`. For example, if the `promQL` groups timeseries + per label such as: `sum(rate($METRIC[2m])) by (Label1, + Label2)`, you may write as the legend: `Label1={{ Label1 + }}, Label2={{ Label2 }}`.' + type: string + promQL: + description: 'The `promQL` query to be run against Prometheus. + If the chart `type` is `SingleStat`, this query should + only return a single timeseries. 
For other types, a + top 7 is displayed. You can use `$METRIC` to refer to + the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. + To learn more about `promQL`, refer to the Prometheus + documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/' + type: string + required: + - legend + - promQL + type: object + type: array + sectionName: + description: Name of the containing dashboard section. If this + name does not refer to an existing section, a new section + is created. If `sectionName` is omitted or empty, the chart + is placed in the global top section. + type: string + title: + description: Title of the chart. + type: string + type: + description: Type of the chart. + enum: + - SingleStat + - Line + - StackArea + type: string + unit: + description: Unit of this chart. Only a few units are currently + supported. Leave empty to use generic number. + enum: + - bytes + - seconds + - Bps + - pps + - percent + type: string + required: + - dashboardName + - queries + - title + - type + type: object + type: array direction: default: Any description: |- @@ -62,6 +139,10 @@ spec: - Egress - Ingress type: string + divider: + description: When non-zero, scale factor (divider) of the value. Metric + value = Flow value / Divider. + type: string filters: description: |- `filters` is a list of fields and values used to restrict which flows are taken into account. Oftentimes, these filters must @@ -84,7 +165,9 @@ spec: - NotMatchRegex type: string value: - description: Value to filter on + description: Value to filter on. When `matchType` is `Equal` + or `NotEqual`, you can use field injection with `$(SomeField)` + to refer to any other field of the flow. 
type: string required: - field diff --git a/config/samples/flowmetrics/az_aware_worloads_traffic.yaml b/config/samples/flowmetrics/az_aware_worloads_traffic.yaml index 4bfac10ab..34397799e 100644 --- a/config/samples/flowmetrics/az_aware_worloads_traffic.yaml +++ b/config/samples/flowmetrics/az_aware_worloads_traffic.yaml @@ -1,3 +1,4 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics apiVersion: flows.netobserv.io/v1alpha1 kind: FlowMetric metadata: diff --git a/config/samples/flowmetrics/cluster_egress_traffic.yaml b/config/samples/flowmetrics/cluster_egress_traffic.yaml new file mode 100644 index 000000000..a265e6065 --- /dev/null +++ b/config/samples/flowmetrics/cluster_egress_traffic.yaml @@ -0,0 +1,30 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics +apiVersion: flows.netobserv.io/v1alpha1 +kind: FlowMetric +metadata: + name: flowmetric-cluster-egress-traffic +spec: + metricName: cluster_egress_bytes_total + type: Counter + valueField: Bytes + direction: Egress + labels: [SrcK8S_HostName,SrcK8S_Namespace,SrcK8S_OwnerName,SrcK8S_OwnerType] + filters: + - field: DstSubnetLabel + matchType: Absence + charts: + - dashboardName: NetObserv + title: Cluster egress traffic + unit: Bps + type: SingleStat + queries: + - promQL: "sum(rate($METRIC[2m]))" + legend: "" + - dashboardName: NetObserv + sectionName: Cluster + title: Top cluster egress traffic per workload + unit: Bps + type: StackArea + queries: + - promQL: "sum(rate($METRIC{SrcK8S_Namespace!=\"\"}[2m])) by (SrcK8S_Namespace, SrcK8S_OwnerName)" + legend: "{{SrcK8S_Namespace}} / {{SrcK8S_OwnerName}}" diff --git a/config/samples/flowmetrics/cluster_ingress_traffic.yaml b/config/samples/flowmetrics/cluster_ingress_traffic.yaml new file mode 100644 index 000000000..c1983bb31 --- /dev/null +++ b/config/samples/flowmetrics/cluster_ingress_traffic.yaml @@ -0,0 +1,30 @@ +# More 
examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics +apiVersion: flows.netobserv.io/v1alpha1 +kind: FlowMetric +metadata: + name: flowmetric-cluster-ingress-traffic +spec: + metricName: cluster_ingress_bytes_total + type: Counter + valueField: Bytes + direction: Ingress + labels: [DstK8S_HostName,DstK8S_Namespace,DstK8S_OwnerName,DstK8S_OwnerType] + filters: + - field: SrcSubnetLabel + matchType: Absence + charts: + - dashboardName: NetObserv + title: Cluster ingress traffic + unit: Bps + type: SingleStat + queries: + - promQL: "sum(rate($METRIC[2m]))" + legend: "" + - dashboardName: NetObserv + sectionName: Cluster + title: Top cluster ingress traffic per workload + unit: Bps + type: StackArea + queries: + - promQL: "sum(rate($METRIC{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace, DstK8S_OwnerName)" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" diff --git a/config/samples/flowmetrics/pods_incoming_bytes.yaml b/config/samples/flowmetrics/pods_incoming_bytes.yaml index e7bbfee39..fa5c3425e 100644 --- a/config/samples/flowmetrics/pods_incoming_bytes.yaml +++ b/config/samples/flowmetrics/pods_incoming_bytes.yaml @@ -1,3 +1,4 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics apiVersion: flows.netobserv.io/v1alpha1 kind: FlowMetric metadata: diff --git a/config/samples/flowmetrics/pods_openshift_ingress.yaml b/config/samples/flowmetrics/pods_openshift_ingress.yaml index 1d5b04a71..fc3d15d61 100644 --- a/config/samples/flowmetrics/pods_openshift_ingress.yaml +++ b/config/samples/flowmetrics/pods_openshift_ingress.yaml @@ -1,3 +1,4 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics apiVersion: flows.netobserv.io/v1alpha1 kind: FlowMetric metadata: diff --git a/config/samples/flowmetrics/pods_outgoing_bytes.yaml b/config/samples/flowmetrics/pods_outgoing_bytes.yaml 
index 16f57c568..cf94a35a9 100644 --- a/config/samples/flowmetrics/pods_outgoing_bytes.yaml +++ b/config/samples/flowmetrics/pods_outgoing_bytes.yaml @@ -1,3 +1,4 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics apiVersion: flows.netobserv.io/v1alpha1 kind: FlowMetric metadata: diff --git a/config/samples/flowmetrics/service_ports.yaml b/config/samples/flowmetrics/service_ports.yaml new file mode 100644 index 000000000..85608df74 --- /dev/null +++ b/config/samples/flowmetrics/service_ports.yaml @@ -0,0 +1,20 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics +apiVersion: flows.netobserv.io/v1alpha1 +kind: FlowMetric +metadata: + name: flowmetric-service-ports +spec: + # Example, counting flows per port <= 9999 + metricName: service_ports_total + type: Counter + labels: [DstPort] + filters: + - field: DstPort + value: "^\\d\\d?\\d?\\d?$" + matchType: Regex + - field: Duplicate + value: "true" + matchType: NotEqual + - field: FlowDirection + value: "1|2" + matchType: Regex diff --git a/config/samples/flowmetrics/services_incoming_bytes.yaml b/config/samples/flowmetrics/services_incoming_bytes.yaml index 1328721fd..d14991864 100644 --- a/config/samples/flowmetrics/services_incoming_bytes.yaml +++ b/config/samples/flowmetrics/services_incoming_bytes.yaml @@ -1,3 +1,4 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics apiVersion: flows.netobserv.io/v1alpha1 kind: FlowMetric metadata: diff --git a/config/samples/flowmetrics/traffic_across_az.yaml b/config/samples/flowmetrics/traffic_across_az.yaml new file mode 100644 index 000000000..69b373a86 --- /dev/null +++ b/config/samples/flowmetrics/traffic_across_az.yaml @@ -0,0 +1,38 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics +apiVersion: 
flows.netobserv.io/v1alpha1 +kind: FlowMetric +metadata: + name: flowmetric-traffic-across-az +spec: + metricName: cross_az_ingress_bytes_total + type: Counter + valueField: Bytes + direction: Ingress + labels: [SrcK8S_Namespace,SrcK8S_OwnerName,SrcK8S_OwnerType,SrcK8S_Zone,DstK8S_Namespace,DstK8S_OwnerName,DstK8S_OwnerType,DstK8S_Zone] + filters: + - field: SrcK8S_Zone + value: $(DstK8S_Zone) + matchType: NotEqual + charts: + - dashboardName: NetObserv + title: Traffic across availability zones + unit: Bps + type: SingleStat + queries: + - promQL: "sum(rate($METRIC[2m]))" + legend: "" + - dashboardName: NetObserv + title: Ratio across AZ + unit: percent + type: SingleStat + queries: + - promQL: "sum(rate($METRIC[2m])) / sum(rate(netobserv_node_ingress_bytes_total[2m]))" + legend: "" + - dashboardName: NetObserv + sectionName: Traffic rates + title: Top traffic across availability zones per namespace + unit: Bps + type: StackArea + queries: + - promQL: "sum(rate($METRIC[2m])) by (SrcK8S_Namespace, SrcK8S_Zone, DstK8S_Namespace, DstK8S_Zone)" + legend: "{{SrcK8S_Namespace}}, {{ SrcK8S_Zone }} -> {{DstK8S_Namespace}}, {{ DstK8S_Zone }}" diff --git a/config/samples/flows_v1alpha1_flowmetric.yaml b/config/samples/flows_v1alpha1_flowmetric.yaml index a17861053..7af084dbf 100644 --- a/config/samples/flows_v1alpha1_flowmetric.yaml +++ b/config/samples/flows_v1alpha1_flowmetric.yaml @@ -9,18 +9,28 @@ metadata: app.kubernetes.io/created-by: netobserv-operator name: flowmetric-sample spec: - # Example, counting flows per port <= 9999 - # More examples in config/samples/flowmetrics - metricName: service_ports_total + # More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics + metricName: cluster_ingress_bytes_total type: Counter - labels: [DstPort] + valueField: Bytes + direction: Ingress + labels: [DstK8S_HostName,DstK8S_Namespace,DstK8S_OwnerName,DstK8S_OwnerType] filters: - - field: DstPort - value: 
"^\\d\\d?\\d?\\d?$" - matchType: Regex - - field: Duplicate - value: "true" - matchType: NotEqual - - field: FlowDirection - value: "1|2" - matchType: Regex + - field: SrcSubnetLabel + matchType: Absence + charts: + - dashboardName: NetObserv + title: Cluster ingress traffic + unit: Bps + type: SingleStat + queries: + - promQL: "sum(rate($METRIC[2m]))" + legend: "" + - dashboardName: NetObserv + sectionName: Cluster + title: Top cluster ingress traffic per workload + unit: Bps + type: StackArea + queries: + - promQL: "sum(rate($METRIC{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace, DstK8S_OwnerName)" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" diff --git a/controllers/flp/flp_controller.go b/controllers/flp/flp_controller.go index bc1e1ed71..4208a2020 100644 --- a/controllers/flp/flp_controller.go +++ b/controllers/flp/flp_controller.go @@ -56,7 +56,7 @@ func Start(ctx context.Context, mgr *manager.Manager) error { Owns(&corev1.ServiceAccount{}). Watches( &metricslatest.FlowMetric{}, - handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []reconcile.Request { + handler.EnqueueRequestsFromMapFunc(func(_ context.Context, o client.Object) []reconcile.Request { if o.GetNamespace() == r.currentNamespace { return []reconcile.Request{{NamespacedName: constants.FlowCollectorName}} } diff --git a/controllers/flp/flp_pipeline_builder.go b/controllers/flp/flp_pipeline_builder.go index 2e0941384..45e7ae38c 100644 --- a/controllers/flp/flp_pipeline_builder.go +++ b/controllers/flp/flp_pipeline_builder.go @@ -226,12 +226,11 @@ func (b *PipelineBuilder) AddProcessorStages() error { func flowMetricToFLP(flowMetric *metricslatest.FlowMetricSpec) (*api.MetricsItem, error) { m := &api.MetricsItem{ - Name: flowMetric.MetricName, - Type: api.MetricEncodeOperationEnum(strings.ToLower(string(flowMetric.Type))), - Filters: []api.MetricsFilter{}, - Labels: flowMetric.Labels, - ValueKey: flowMetric.ValueField, - ValueScale: flowMetric.Divider, + 
Name: flowMetric.MetricName, + Type: api.MetricEncodeOperationEnum(strings.ToLower(string(flowMetric.Type))), + Filters: []api.MetricsFilter{}, + Labels: flowMetric.Labels, + ValueKey: flowMetric.ValueField, } for _, f := range metrics.GetFilters(flowMetric) { m.Filters = append(m.Filters, api.MetricsFilter{Key: f.Field, Value: f.Value, Type: api.MetricFilterEnum(conversion.PascalToLower(string(f.MatchType), '_'))}) @@ -243,6 +242,13 @@ func flowMetricToFLP(flowMetric *metricslatest.FlowMetricSpec) (*api.MetricsItem } m.Buckets = append(m.Buckets, f) } + if flowMetric.Divider != "" { + f, err := strconv.ParseFloat(flowMetric.Divider, 64) + if err != nil { + return nil, fmt.Errorf("could not parse metric divider as float: '%s'; error was: %w", flowMetric.Divider, err) + } + m.ValueScale = f + } return m, nil } @@ -398,7 +404,7 @@ func (b *PipelineBuilder) addTransformFilter(lastStage config.PipelineBuilderSta if b.desired.Processor.ClusterName != "" { clusterName = b.desired.Processor.ClusterName } else { - //take clustername from openshift + // Take clustername from openshift clusterName = string(b.clusterID) } if clusterName != "" { @@ -460,7 +466,7 @@ func getIPFIXTransport(transport string) string { case "UDP": return "udp" default: - return "tcp" //always fallback on tcp + return "tcp" // Always fallback on tcp } } diff --git a/controllers/flp/flp_test.go b/controllers/flp/flp_test.go index 0fdebd218..455b36dbd 100644 --- a/controllers/flp/flp_test.go +++ b/controllers/flp/flp_test.go @@ -722,27 +722,27 @@ func TestConfigMapShouldDeserializeAsJSONWithLokiStack(t *testing.T) { func TestAutoScalerUpdateCheck(t *testing.T) { assert := assert.New(t) - //equals specs + // Equals specs autoScalerSpec, hpa := getAutoScalerSpecs() report := helper.NewChangeReport("") assert.False(helper.AutoScalerChanged(&autoScalerSpec, hpa, &report)) assert.Contains(report.String(), "no change") - //wrong max replicas + // Wrong max replicas autoScalerSpec, hpa = getAutoScalerSpecs() 
autoScalerSpec.Spec.MaxReplicas = 10 report = helper.NewChangeReport("") assert.True(helper.AutoScalerChanged(&autoScalerSpec, hpa, &report)) assert.Contains(report.String(), "Max replicas changed") - //missing min replicas + // Missing min replicas autoScalerSpec, hpa = getAutoScalerSpecs() autoScalerSpec.Spec.MinReplicas = nil report = helper.NewChangeReport("") assert.True(helper.AutoScalerChanged(&autoScalerSpec, hpa, &report)) assert.Contains(report.String(), "Min replicas changed") - //missing metrics + // Missing metrics autoScalerSpec, hpa = getAutoScalerSpecs() autoScalerSpec.Spec.Metrics = []ascv2.MetricSpec{} report = helper.NewChangeReport("") diff --git a/controllers/monitoring/monitoring_controller.go b/controllers/monitoring/monitoring_controller.go index d7e80c51f..e0567b924 100644 --- a/controllers/monitoring/monitoring_controller.go +++ b/controllers/monitoring/monitoring_controller.go @@ -44,7 +44,7 @@ func Start(ctx context.Context, mgr *manager.Manager) error { Owns(&corev1.Namespace{}). 
Watches( &metricslatest.FlowMetric{}, - handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []reconcile.Request { + handler.EnqueueRequestsFromMapFunc(func(_ context.Context, o client.Object) []reconcile.Request { if o.GetNamespace() == r.currentNamespace { return []reconcile.Request{{NamespacedName: constants.FlowCollectorName}} } @@ -87,6 +87,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result } func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired *flowslatest.FlowCollector) error { + log := log.FromContext(ctx) ns := helper.GetNamespace(&desired.Spec) r.currentNamespace = ns @@ -124,8 +125,11 @@ func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired if err := r.Client.List(ctx, &fm, &client.ListOptions{Namespace: ns}); err != nil { return r.status.Error("CantListFlowMetrics", err) } + log.WithValues("items count", len(fm.Items)).Info("FlowMetrics loaded") allMetrics := metrics.MergePredefined(fm.Items, &desired.Spec) + log.WithValues("metrics count", len(allMetrics)).Info("Merged metrics") + desiredFlowDashboardCM, del, err := buildFlowMetricsDashboard(allMetrics) if err != nil { return err diff --git a/pkg/dashboards/dashboard_test.go b/pkg/dashboards/dashboard_test.go index a260e3375..5d45a6d46 100644 --- a/pkg/dashboards/dashboard_test.go +++ b/pkg/dashboards/dashboard_test.go @@ -3,6 +3,7 @@ package dashboards import ( "testing" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" "github.com/netobserv/network-observability-operator/pkg/metrics" "github.com/stretchr/testify/assert" ) @@ -18,9 +19,10 @@ func TestCreateFlowMetricsDashboard_All(t *testing.T) { assert.Equal("NetObserv", d.Title) - assert.Equal([]string{"Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) + assert.Equal([]string{"", "Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) - 
assert.Len(d.Rows[0].Panels, 32) + assert.Len(d.Rows[0].Panels, 16) + assert.Len(d.Rows[1].Panels, 20) p := d.FindPanel("Top egress traffic per node") assert.NotNil(p) @@ -63,18 +65,20 @@ func TestCreateFlowMetricsDashboard_All(t *testing.T) { p.Targets[0].Expr, ) - p = d.FindPanel("Top ingress traffic per app workload") + p = d.FindNthPanel("Top ingress traffic per app workload", 2) // pps variant assert.NotNil(p) assert.Len(p.Targets, 1) assert.Equal( - `topk(7, sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="app"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + `topk(7, (sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="app",SrcK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`+ + ` or (sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="app",DstK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName)))`, p.Targets[0].Expr, ) - p = d.FindPanel("Top ingress traffic per infra workload") + p = d.FindNthPanel("Top ingress traffic per infra workload", 2) // pps variant assert.NotNil(p) assert.Len(p.Targets, 1) assert.Equal( - `topk(7, sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="infra"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + `topk(7, (sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="infra",SrcK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`+ + ` or (sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="infra",DstK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName)))`, p.Targets[0].Expr, ) } @@ -89,14 +93,13 @@ func TestCreateFlowMetricsDashboard_OnlyNodeIngressBytes(t *testing.T) { assert.NoError(err) assert.Equal("NetObserv", d.Title) - assert.Len(d.Rows, 1) - - row := d.FindRow("Byte rate received per node") - 
assert.NotNil(row) - assert.Len(row.Panels, 1) - assert.Equal("", row.Panels[0].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(7,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") + assert.Equal([]string{"", "Traffic rates"}, d.Titles()) + + topRow := d.FindRow("") + assert.Equal([]string{"Total ingress traffic"}, topRow.Titles()) + + trafficRow := d.FindRow("Traffic rates") + assert.Equal([]string{"Top ingress traffic per node"}, trafficRow.Titles()) } func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { @@ -109,41 +112,51 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert.NoError(err) assert.Equal("NetObserv", d.Title) - assert.Len(d.Rows, 7) - - row := d.FindRow("Byte rate received per node") - assert.NotNil(row) - assert.Len(row.Panels, 1) - assert.Equal("", row.Panels[0].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(7,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") - - row = d.FindRow("Byte rate received per namespace") - assert.NotNil(row) - assert.Len(row.Panels, 2) - assert.Equal("Applications", row.Panels[0].Title) - assert.Equal("Infrastructure", row.Panels[1].Title) - assert.Len(row.Panels[0].Targets, 1) - // Make sure netobserv_namespace_ingress_bytes_total was replaced with netobserv_workload_ingress_bytes_total - assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, - ) - assert.Contains(row.Panels[1].Targets[0].Expr, - 
`label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, - ) + assert.Equal([]string{"", "Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) - row = d.FindRow("Byte rate received per workload") - assert.NotNil(row) - assert.Len(row.Panels, 2) - assert.Equal("Applications", row.Panels[0].Title) - assert.Equal("Infrastructure", row.Panels[1].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, - ) - assert.Contains(row.Panels[1].Targets[0].Expr, - `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, - ) + topRow := d.FindRow("") + assert.Equal([]string{ + "Total ingress traffic", + "TCP latency, p99", + "Drops", + "DNS latency, p99", + "DNS error rate", + "Infra ingress traffic", + "Apps ingress traffic", + }, topRow.Titles()) + + trafficRow := d.FindRow("Traffic rates") + assert.Equal([]string{ + "Top ingress traffic per node", + "Top ingress traffic per infra namespace", + "Top ingress traffic per app namespace", + "Top ingress traffic per infra workload", + "Top ingress traffic per app workload", + }, 
trafficRow.Titles()) + + rttRow := d.FindRow("TCP latencies") + assert.Equal([]string{ + "Top P50 sRTT per infra namespace (ms)", + "Top P50 sRTT per app namespace (ms)", + "Top P99 sRTT per infra namespace (ms)", + "Top P99 sRTT per app namespace (ms)", + }, rttRow.Titles()) + + dropsRow := d.FindRow("Byte and packet drops") + assert.Equal([]string{ + "Top drops per infra namespace", + "Top drops per app namespace", + }, dropsRow.Titles()) + + dnsRow := d.FindRow("DNS") + assert.Equal([]string{ + "Top P50 DNS latency per infra namespace (ms)", + "Top P50 DNS latency per app namespace (ms)", + "Top P99 DNS latency per infra namespace (ms)", + "Top P99 DNS latency per app namespace (ms)", + "DNS error rate per infra namespace", + "DNS error rate per app namespace", + }, dnsRow.Titles()) } func TestCreateHealthDashboard_Default(t *testing.T) { @@ -165,3 +178,79 @@ func TestCreateHealthDashboard_Default(t *testing.T) { assert.Len(d.Rows[row].Panels[0].Targets, 1) assert.Contains(d.Rows[row].Panels[0].Targets[0].Expr, "netobserv_ingest_flows_processed") } + +func TestCreateCustomDashboard(t *testing.T) { + assert := assert.New(t) + + js := CreateFlowMetricsDashboards([]metricslatest.FlowMetric{ + { + Spec: metricslatest.FlowMetricSpec{ + MetricName: "my_metric", + Charts: []metricslatest.Chart{ + { + DashboardName: "NetObserv", + SectionName: "My section", + Title: "My chart", + Unit: metricslatest.UnitBPS, + Type: metricslatest.ChartTypeSingleStat, + Queries: []metricslatest.Query{ + { + PromQL: `sum(rate($METRIC{label="foo"}[5m]))`, + Legend: "", + }, + }, + }, + { + DashboardName: "NetObserv", + SectionName: "My next section", + Title: "My next chart", + Unit: metricslatest.UnitBPS, + Type: metricslatest.ChartTypeLine, + Queries: []metricslatest.Query{ + { + PromQL: `sum(rate($METRIC{label="foo"}[5m])) by (lbl1,lbl2)`, + Legend: "{{lbl1}}: {{lbl2}}", + }, + }, + }, + }, + }, + }, + }) + + d, err := FromBytes([]byte(js)) + assert.NoError(err) + + 
assert.Equal("NetObserv", d.Title) + assert.Equal([]string{"My section", "My next section"}, d.Titles()) + + r1 := d.FindRow("My section") + assert.Equal([]string{"My chart"}, r1.Titles()) + assert.Equal(Panel{ + Title: "My chart", + Type: "singlestat", + Span: 3, + Format: "Bps", + Targets: []Target{ + { + Expr: "sum(rate(netobserv_my_metric{label=\"foo\"}[5m]))", + LegendFormat: "", + }, + }, + }, r1.Panels[0]) + + r2 := d.FindRow("My next section") + assert.Equal([]string{"My next chart"}, r2.Titles()) + assert.Equal(Panel{ + Title: "My next chart", + Type: "graph", + Span: 4, + Format: "Bps", + Targets: []Target{ + { + Expr: "topk(7, sum(rate(netobserv_my_metric{label=\"foo\"}[5m])) by (lbl1,lbl2))", + LegendFormat: "{{lbl1}}: {{lbl2}}", + }, + }, + }, r2.Panels[0]) +} diff --git a/pkg/dashboards/model.go b/pkg/dashboards/model.go index 004bbcce4..ee8ad09da 100644 --- a/pkg/dashboards/model.go +++ b/pkg/dashboards/model.go @@ -36,6 +36,7 @@ type Panel struct { Targets []Target Span int Unit metricslatest.Unit + Format string // only used for unmarshalling grafana json in tests } func NewPanel(title string, t metricslatest.ChartType, unit metricslatest.Unit, span int, targets ...Target) Panel { @@ -49,14 +50,14 @@ func NewPanel(title string, t metricslatest.ChartType, unit metricslatest.Unit, } type Target struct { - Expr string - Legend string + Expr string + LegendFormat string } func NewTarget(expr, legend string) Target { return Target{ - Expr: expr, - Legend: legend, + Expr: expr, + LegendFormat: legend, } } @@ -90,10 +91,17 @@ func (d *Dashboard) FindRow(titleSubstr string) *Row { } func (d *Dashboard) FindPanel(titleSubstr string) *Panel { + return d.FindNthPanel(titleSubstr, 1) +} + +func (d *Dashboard) FindNthPanel(titleSubstr string, n int) *Panel { for _, r := range d.Rows { for _, p := range r.Panels { if strings.Contains(p.Title, titleSubstr) { - return &p + if n <= 1 { + return &p + } + n-- } } } @@ -210,9 +218,14 @@ func (p *Panel) ToGrafanaJSON() 
string { unit = "short" } var singleStatFormat string - if p.Unit == metricslatest.UnitSeconds { + //nolint:exhaustive + switch p.Unit { + case metricslatest.UnitSeconds: singleStatFormat = "s" - } else { + case metricslatest.UnitPercent: + singleStatFormat = "percentunit" + unit = "short" + default: singleStatFormat = unit } var t string @@ -306,5 +319,5 @@ func (t *Target) ToGrafanaJSON() string { "legendFormat": "%s", "refId": "A" } - `, expr, t.Legend) + `, expr, t.LegendFormat) } diff --git a/pkg/metrics/predefined_charts.go b/pkg/metrics/predefined_charts.go index 8a21778f2..416eb7047 100644 --- a/pkg/metrics/predefined_charts.go +++ b/pkg/metrics/predefined_charts.go @@ -137,7 +137,7 @@ func dropCharts(group string, unit metricslatest.Unit) []metricslatest.Chart { Title: "Top drops", Unit: unit, Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC{$FILTERS}[2m])) by ($LABELS)", Legend: "$LEGEND"}}, - }, group, string(unit))...) + }, group, "")...) } func dnsCharts(group string) []metricslatest.Chart { diff --git a/pkg/metrics/predefined_metrics.go b/pkg/metrics/predefined_metrics.go index 85b33f4d9..a5e49a652 100644 --- a/pkg/metrics/predefined_metrics.go +++ b/pkg/metrics/predefined_metrics.go @@ -96,7 +96,7 @@ func init() { {Field: "TimeFlowRttNs", MatchType: metricslatest.MatchPresence}, }, Labels: labels, - Divider: 1_000_000_000, // ns => s + Divider: "1000000000", // ns => s Buckets: latencyBuckets, Charts: rttCharts(group), }, @@ -143,7 +143,7 @@ func init() { {Field: "DnsId", MatchType: metricslatest.MatchPresence}, }, Labels: dnsLabels, - Divider: 1000, // ms => s + Divider: "1000", // ms => s Buckets: latencyBuckets, Charts: dnsCharts(group), }, @@ -238,5 +238,5 @@ func removeMetricsByPattern(list []string, search string) []string { func MergePredefined(fm []metricslatest.FlowMetric, fc *flowslatest.FlowCollectorSpec) []metricslatest.FlowMetric { names := getIncludeList(fc) predefined := GetDefinitions(names) - return append(fm, predefined...) 
+ return append(predefined, fm...) } From 5df217ada7c45bbcb4eb2e05efdba5b21f242b9f Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Mon, 8 Apr 2024 11:52:35 +0200 Subject: [PATCH 03/13] Manage multiple dashboards - Allow users to reference/create new dashboards (this requires the monitoring controller to fetch all dashboards from the dashboards namespace) All dashboard names are prefixed "NetObserv / " ; so, rename the main dashboard as "NetObserv / Main" - Add/update tests --- apis/flowmetrics/v1alpha1/flowmetric_types.go | 6 +- .../flows.netobserv.io_flowmetrics.yaml | 2 +- ...observ-operator.clusterserviceversion.yaml | 4 +- .../bases/flows.netobserv.io_flowmetrics.yaml | 2 +- .../flowmetrics/cluster_egress_traffic.yaml | 4 +- .../flowmetrics/cluster_ingress_traffic.yaml | 4 +- .../flowmetrics/traffic_across_az.yaml | 6 +- config/samples/flows_v1alpha1_flowmetric.yaml | 4 +- .../monitoring/monitoring_controller.go | 59 ++++++-- .../monitoring/monitoring_controller_test.go | 126 +++++++++++++++++- controllers/monitoring/monitoring_objects.go | 37 +++-- controllers/reconcilers/common.go | 4 +- controllers/reconcilers/reconcilers.go | 39 +++--- pkg/cleanup/cleanup.go | 5 + pkg/cleanup/cleanup_test.go | 20 ++- pkg/dashboards/dashboard.go | 11 +- pkg/dashboards/dashboard_test.go | 61 +++++++-- pkg/metrics/predefined_charts.go | 30 ++--- 18 files changed, 328 insertions(+), 96 deletions(-) diff --git a/apis/flowmetrics/v1alpha1/flowmetric_types.go b/apis/flowmetrics/v1alpha1/flowmetric_types.go index 9d04728a2..b28aceed1 100644 --- a/apis/flowmetrics/v1alpha1/flowmetric_types.go +++ b/apis/flowmetrics/v1alpha1/flowmetric_types.go @@ -136,13 +136,13 @@ const ( // Configures charts / dashboard generation associated to a metric type Chart struct { // Name of the containing dashboard. If this name does not refer to an existing dashboard, a new dashboard is created. 
- // +kubebuilder:default:="NetObserv" + // +kubebuilder:default:="Main" DashboardName string `json:"dashboardName"` // Name of the containing dashboard section. If this name does not refer to an existing section, a new section is created. // If `sectionName` is omitted or empty, the chart is placed in the global top section. // +optional - SectionName string `json:"sectionName"` + SectionName string `json:"sectionName,omitempty"` // Title of the chart. // +required @@ -151,7 +151,7 @@ type Chart struct { // Unit of this chart. Only a few units are currently supported. Leave empty to use generic number. // +kubebuilder:validation:Enum:="bytes";"seconds";"Bps";"pps";"percent" // +optional - Unit Unit `json:"unit"` + Unit Unit `json:"unit,omitempty"` // Type of the chart. // +kubebuilder:validation:Enum:="SingleStat";"Line";"StackArea" diff --git a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml index b9a995eb3..0773a0cba 100644 --- a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml +++ b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml @@ -58,7 +58,7 @@ spec: to a metric properties: dashboardName: - default: NetObserv + default: Main description: Name of the containing dashboard. If this name does not refer to an existing dashboard, a new dashboard is created. 
diff --git a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml index 2f6db2724..0800a71a9 100644 --- a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml +++ b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml @@ -20,7 +20,7 @@ metadata: "spec": { "charts": [ { - "dashboardName": "NetObserv", + "dashboardName": "Main", "queries": [ { "legend": "", @@ -32,7 +32,7 @@ metadata: "unit": "Bps" }, { - "dashboardName": "NetObserv", + "dashboardName": "Main", "queries": [ { "legend": "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}", diff --git a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml index 86eeb7745..a367e0441 100644 --- a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml +++ b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml @@ -58,7 +58,7 @@ spec: to a metric properties: dashboardName: - default: NetObserv + default: Main description: Name of the containing dashboard. If this name does not refer to an existing dashboard, a new dashboard is created. 
diff --git a/config/samples/flowmetrics/cluster_egress_traffic.yaml b/config/samples/flowmetrics/cluster_egress_traffic.yaml index a265e6065..cc4ced4ad 100644 --- a/config/samples/flowmetrics/cluster_egress_traffic.yaml +++ b/config/samples/flowmetrics/cluster_egress_traffic.yaml @@ -13,14 +13,14 @@ spec: - field: DstSubnetLabel matchType: Absence charts: - - dashboardName: NetObserv + - dashboardName: Main title: Cluster egress traffic unit: Bps type: SingleStat queries: - promQL: "sum(rate($METRIC[2m]))" legend: "" - - dashboardName: NetObserv + - dashboardName: Main sectionName: Cluster title: Top cluster egress traffic per workload unit: Bps diff --git a/config/samples/flowmetrics/cluster_ingress_traffic.yaml b/config/samples/flowmetrics/cluster_ingress_traffic.yaml index c1983bb31..ae8287b0f 100644 --- a/config/samples/flowmetrics/cluster_ingress_traffic.yaml +++ b/config/samples/flowmetrics/cluster_ingress_traffic.yaml @@ -13,14 +13,14 @@ spec: - field: SrcSubnetLabel matchType: Absence charts: - - dashboardName: NetObserv + - dashboardName: Main title: Cluster ingress traffic unit: Bps type: SingleStat queries: - promQL: "sum(rate($METRIC[2m]))" legend: "" - - dashboardName: NetObserv + - dashboardName: Main sectionName: Cluster title: Top cluster ingress traffic per workload unit: Bps diff --git a/config/samples/flowmetrics/traffic_across_az.yaml b/config/samples/flowmetrics/traffic_across_az.yaml index 69b373a86..fa89d1fcd 100644 --- a/config/samples/flowmetrics/traffic_across_az.yaml +++ b/config/samples/flowmetrics/traffic_across_az.yaml @@ -14,21 +14,21 @@ spec: value: $(DstK8S_Zone) matchType: NotEqual charts: - - dashboardName: NetObserv + - dashboardName: Main title: Traffic across availability zones unit: Bps type: SingleStat queries: - promQL: "sum(rate($METRIC[2m]))" legend: "" - - dashboardName: NetObserv + - dashboardName: Main title: Ratio across AZ unit: percent type: SingleStat queries: - promQL: "sum(rate($METRIC[2m])) / 
sum(rate(netobserv_node_ingress_bytes_total[2m]))" legend: "" - - dashboardName: NetObserv + - dashboardName: Main sectionName: Traffic rates title: Top traffic across availability zones per namespace unit: Bps diff --git a/config/samples/flows_v1alpha1_flowmetric.yaml b/config/samples/flows_v1alpha1_flowmetric.yaml index 7af084dbf..957b8bfb0 100644 --- a/config/samples/flows_v1alpha1_flowmetric.yaml +++ b/config/samples/flows_v1alpha1_flowmetric.yaml @@ -19,14 +19,14 @@ spec: - field: SrcSubnetLabel matchType: Absence charts: - - dashboardName: NetObserv + - dashboardName: Main title: Cluster ingress traffic unit: Bps type: SingleStat queries: - promQL: "sum(rate($METRIC[2m]))" legend: "" - - dashboardName: NetObserv + - dashboardName: Main sectionName: Cluster title: Top cluster ingress traffic per workload unit: Bps diff --git a/controllers/monitoring/monitoring_controller.go b/controllers/monitoring/monitoring_controller.go index e0567b924..a1e26ccd5 100644 --- a/controllers/monitoring/monitoring_controller.go +++ b/controllers/monitoring/monitoring_controller.go @@ -130,24 +130,65 @@ func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired allMetrics := metrics.MergePredefined(fm.Items, &desired.Spec) log.WithValues("metrics count", len(allMetrics)).Info("Merged metrics") - desiredFlowDashboardCM, del, err := buildFlowMetricsDashboard(allMetrics) - if err != nil { - return err - } else if err = reconcilers.ReconcileConfigMap(ctx, clh, desiredFlowDashboardCM, del); err != nil { - return err + // List existing dashboards + currentDashboards := corev1.ConfigMapList{} + if err := r.Client.List(ctx, ¤tDashboards, &client.ListOptions{Namespace: dashboardCMNamespace}); err != nil { + return r.status.Error("CantListDashboards", err) } + filterOwned(¤tDashboards) - desiredHealthDashboardCM, del, err := buildHealthDashboard(ns) - if err != nil { - return err - } else if err = reconcilers.ReconcileConfigMap(ctx, clh, desiredHealthDashboardCM, del); 
err != nil { + // Build desired dashboards + cms := buildFlowMetricsDashboards(allMetrics) + if desiredHealthDashboardCM, del, err := buildHealthDashboard(ns); err != nil { return err + } else if !del { + cms = append(cms, desiredHealthDashboardCM) + } + + for _, cm := range cms { + current := findAndRemoveConfigMapFromList(¤tDashboards, cm.Name) + if err := reconcilers.ReconcileConfigMap(ctx, clh, current, cm); err != nil { + return err + } + } + + // Delete any CM that remained in currentDashboards list + for i := range currentDashboards.Items { + if err := reconcilers.ReconcileConfigMap(ctx, clh, ¤tDashboards.Items[i], nil); err != nil { + return err + } + } + } + + return nil +} + +func filterOwned(list *corev1.ConfigMapList) { + for i := len(list.Items) - 1; i >= 0; i-- { + if !helper.IsOwned(&list.Items[i]) { + removeFromList(list, i) } } +} +func findAndRemoveConfigMapFromList(list *corev1.ConfigMapList, name string) *corev1.ConfigMap { + for i := len(list.Items) - 1; i >= 0; i-- { + if list.Items[i].Name == name { + cm := list.Items[i] + // Remove that element from the list, so the list ends up with elements to delete + removeFromList(list, i) + return &cm + } + } return nil } +func removeFromList(list *corev1.ConfigMapList, i int) { + // (quickest removal as order doesn't matter) + list.Items[i] = list.Items[len(list.Items)-1] + list.Items = list.Items[:len(list.Items)-1] +} + func (r *Reconciler) namespaceExist(ctx context.Context, nsName string) (*corev1.Namespace, error) { ns := &corev1.Namespace{} err := r.Get(ctx, types.NamespacedName{Name: nsName}, ns) diff --git a/controllers/monitoring/monitoring_controller_test.go b/controllers/monitoring/monitoring_controller_test.go index 78ccf9b70..cc1deaa03 100644 --- a/controllers/monitoring/monitoring_controller_test.go +++ b/controllers/monitoring/monitoring_controller_test.go @@ -10,6 +10,7 @@ import ( "k8s.io/apimachinery/pkg/types" flowslatest 
"github.com/netobserv/network-observability-operator/apis/flowcollector/v1beta2" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" . "github.com/netobserv/network-observability-operator/controllers/controllerstest" "github.com/netobserv/network-observability-operator/pkg/dashboards" "github.com/netobserv/network-observability-operator/pkg/test" @@ -52,6 +53,14 @@ func ControllerSpecs() { }) Context("Installing CR", func() { + It("Create control-cm", func() { + // control-cm is a control object installed in openshift-config-managed, aiming to make sure we never delete configmaps that we don't own + Expect(k8sClient.Create(ctx, &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: "control-cm", Namespace: "openshift-config-managed"}, + Data: map[string]string{}, + })).Should(Succeed()) + }) + It("Should create successfully", func() { created := &flowslatest.FlowCollector{ ObjectMeta: metav1.ObjectMeta{ @@ -70,7 +79,7 @@ func ControllerSpecs() { Eventually(func() interface{} { cm := v1.ConfigMap{} return k8sClient.Get(ctx, types.NamespacedName{ - Name: "grafana-dashboard-netobserv-flow-metrics", + Name: "netobserv-main", Namespace: "openshift-config-managed", }, &cm) }, timeout, interval).Should(Succeed()) @@ -90,6 +99,11 @@ func ControllerSpecs() { } return d.Titles() }, timeout, interval).Should(Equal([]string{"", "Flowlogs-pipeline statistics", "eBPF agent statistics", "Operator statistics", "Resource usage"})) + + By("Expecting control-cm to remain") + Eventually(func() interface{} { + return k8sClient.Get(ctx, types.NamespacedName{Name: "control-cm", Namespace: "openshift-config-managed"}, &v1.ConfigMap{}) + }, timeout, interval).Should(Succeed()) }) It("Should update successfully", func() { @@ -105,10 +119,10 @@ func ControllerSpecs() { By("Expecting the flow dashboards configmap to be deleted") Eventually(func() interface{} { return k8sClient.Get(ctx, types.NamespacedName{ - Name: 
"grafana-dashboard-netobserv-flow-metrics", + Name: "netobserv-main", Namespace: "openshift-config-managed", }, &v1.ConfigMap{}) - }, timeout, interval).Should(MatchError(`configmaps "grafana-dashboard-netobserv-flow-metrics" not found`)) + }, timeout, interval).Should(MatchError(`configmaps "netobserv-main" not found`)) By("Expecting the health dashboard to remain") Eventually(func() interface{} { @@ -125,6 +139,107 @@ func ControllerSpecs() { } return d.Titles() }, timeout, interval).Should(Equal([]string{"", "Flowlogs-pipeline statistics", "eBPF agent statistics", "Operator statistics", "Resource usage"})) + + By("Expecting control-cm to remain") + Eventually(func() interface{} { + return k8sClient.Get(ctx, types.NamespacedName{Name: "control-cm", Namespace: "openshift-config-managed"}, &v1.ConfigMap{}) + }, timeout, interval).Should(Succeed()) + }) + }) + + Context("Installing custom dashboards", func() { + It("Should create FlowMetric 1 successfully", func() { + Expect(k8sClient.Create(ctx, &metricslatest.FlowMetric{ + ObjectMeta: metav1.ObjectMeta{Name: "metric1", Namespace: operatorNamespace}, + Spec: metricslatest.FlowMetricSpec{ + MetricName: "my-metric", + Type: metricslatest.CounterMetric, + Charts: []metricslatest.Chart{ + { + DashboardName: "My dashboard 01", + Title: "title", + Type: metricslatest.ChartTypeSingleStat, + Queries: []metricslatest.Query{{PromQL: "(query)", Legend: "-"}}, + }, + }, + }, + })).Should(Succeed()) + }) + + It("Should create FlowMetric 2 successfully", func() { + Expect(k8sClient.Create(ctx, &metricslatest.FlowMetric{ + ObjectMeta: metav1.ObjectMeta{Name: "metric2", Namespace: operatorNamespace}, + Spec: metricslatest.FlowMetricSpec{ + MetricName: "my-metric", + Type: metricslatest.CounterMetric, + Charts: []metricslatest.Chart{ + { + DashboardName: "My dashboard 02", + Title: "title", + Type: metricslatest.ChartTypeSingleStat, + Queries: []metricslatest.Query{{PromQL: "(query)", Legend: "-"}}, + }, + }, + }, + 
})).Should(Succeed()) + }) + + It("Should create corresponding dashboards", func() { + Eventually(func() interface{} { + cm := v1.ConfigMap{} + return k8sClient.Get(ctx, types.NamespacedName{ + Name: "netobserv-my-dashboard-01", + Namespace: "openshift-config-managed", + }, &cm) + }, timeout, interval).Should(Succeed()) + + Eventually(func() interface{} { + cm := v1.ConfigMap{} + return k8sClient.Get(ctx, types.NamespacedName{ + Name: "netobserv-my-dashboard-02", + Namespace: "openshift-config-managed", + }, &cm) + }, timeout, interval).Should(Succeed()) + }) + + It("Should delete dashboard 2", func() { + By("Getting FlowMetric 2") + fm := metricslatest.FlowMetric{} + Eventually(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: "metric2", Namespace: operatorNamespace}, &fm) + }).Should(Succeed()) + + By("Deleting FlowMetric 2") + Eventually(func() error { return k8sClient.Delete(ctx, &fm) }, timeout, interval).Should(Succeed()) + + By("Expecting dashboard 2 to be deleted") + Eventually(func() interface{} { + return k8sClient.Get(ctx, types.NamespacedName{ + Name: "netobserv-my-dashboard-02", + Namespace: "openshift-config-managed", + }, &v1.ConfigMap{}) + }, timeout, interval).Should(MatchError(`configmaps "netobserv-my-dashboard-02" not found`)) + + By("Expecting dashboard 1 to remain") + Eventually(func() interface{} { + return k8sClient.Get(ctx, types.NamespacedName{ + Name: "netobserv-my-dashboard-01", + Namespace: "openshift-config-managed", + }, &v1.ConfigMap{}) + }, timeout, interval).Should(Succeed()) + + By("Expecting the health dashboard to remain") + Eventually(func() interface{} { + return k8sClient.Get(ctx, types.NamespacedName{ + Name: "grafana-dashboard-netobserv-health", + Namespace: "openshift-config-managed", + }, &v1.ConfigMap{}) + }, timeout, interval).Should(Succeed()) + + By("Expecting control-cm to remain") + Eventually(func() interface{} { + return k8sClient.Get(ctx, types.NamespacedName{Name: "control-cm", Namespace: 
"openshift-config-managed"}, &v1.ConfigMap{}) + }, timeout, interval).Should(Succeed()) }) }) @@ -149,6 +264,11 @@ func ControllerSpecs() { Context("Cleanup", func() { It("Should delete CR", func() { cleanupCR(crKey) + + By("Expecting control-cm to remain") + Eventually(func() interface{} { + return k8sClient.Get(ctx, types.NamespacedName{Name: "control-cm", Namespace: "openshift-config-managed"}, &v1.ConfigMap{}) + }, timeout, interval).Should(Succeed()) }) }) } diff --git a/controllers/monitoring/monitoring_objects.go b/controllers/monitoring/monitoring_objects.go index 3b77b191e..f8beaa7fa 100644 --- a/controllers/monitoring/monitoring_objects.go +++ b/controllers/monitoring/monitoring_objects.go @@ -1,6 +1,9 @@ package monitoring import ( + "regexp" + "strings" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" "github.com/netobserv/network-observability-operator/controllers/constants" "github.com/netobserv/network-observability-operator/pkg/dashboards" @@ -16,13 +19,14 @@ const ( dashboardCMNamespace = "openshift-config-managed" dashboardCMAnnotation = "console.openshift.io/dashboard" - flowDashboardCMName = "grafana-dashboard-netobserv-flow-metrics" flowDashboardCMFile = "netobserv-flow-metrics.json" healthDashboardCMName = "grafana-dashboard-netobserv-health" healthDashboardCMFile = "netobserv-health-metrics.json" ) +var k8sInvalidChar = regexp.MustCompile(`[^a-z0-9\-]`) + func buildNamespace(ns string, isDownstream bool) *corev1.Namespace { labels := map[string]string{} if isDownstream { @@ -75,22 +79,27 @@ func buildRoleBindingMonitoringReader(ns string) *rbacv1.ClusterRoleBinding { } } -func buildFlowMetricsDashboard(metrics []metricslatest.FlowMetric) (*corev1.ConfigMap, bool, error) { - dashboard := dashboards.CreateFlowMetricsDashboards(metrics) +func buildFlowMetricsDashboards(metrics []metricslatest.FlowMetric) []*corev1.ConfigMap { + var cms []*corev1.ConfigMap + dash := 
dashboards.CreateFlowMetricsDashboards(metrics) - configMap := corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: flowDashboardCMName, - Namespace: dashboardCMNamespace, - Labels: map[string]string{ - dashboardCMAnnotation: "true", + for name, json := range dash { + k8sName := "netobserv-" + k8sInvalidChar.ReplaceAllString(strings.ToLower(name), "-") + configMap := corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: k8sName, + Namespace: dashboardCMNamespace, + Labels: map[string]string{ + dashboardCMAnnotation: "true", + }, }, - }, - Data: map[string]string{ - flowDashboardCMFile: dashboard, - }, + Data: map[string]string{ + flowDashboardCMFile: json, + }, + } + cms = append(cms, &configMap) } - return &configMap, len(dashboard) == 0, nil + return cms } func buildHealthDashboard(namespace string) (*corev1.ConfigMap, bool, error) { diff --git a/controllers/reconcilers/common.go b/controllers/reconcilers/common.go index 944fdbd4a..dabfe9bdc 100644 --- a/controllers/reconcilers/common.go +++ b/controllers/reconcilers/common.go @@ -65,8 +65,8 @@ func (c *Common) ReconcileRole(ctx context.Context, desired *rbacv1.Role) error return ReconcileRole(ctx, &c.Client, desired) } -func (c *Common) ReconcileConfigMap(ctx context.Context, desired *corev1.ConfigMap, delete bool) error { - return ReconcileConfigMap(ctx, &c.Client, desired, delete) +func (c *Common) ReconcileConfigMap(ctx context.Context, old, new *corev1.ConfigMap) error { + return ReconcileConfigMap(ctx, &c.Client, old, new) } func (i *Instance) ReconcileService(ctx context.Context, old, new *corev1.Service, report *helper.ChangeReport) error { diff --git a/controllers/reconcilers/reconcilers.go b/controllers/reconcilers/reconcilers.go index a2305a052..d1900b9b7 100644 --- a/controllers/reconcilers/reconcilers.go +++ b/controllers/reconcilers/reconcilers.go @@ -29,9 +29,9 @@ var ( !equality.Semantic.DeepEqual(e.ObjectNew.GetAnnotations(), e.ObjectOld.GetAnnotations()) || 
!equality.Semantic.DeepEqual(e.ObjectNew.GetLabels(), e.ObjectOld.GetLabels()) }, - CreateFunc: func(e event.CreateEvent) bool { return true }, - DeleteFunc: func(e event.DeleteEvent) bool { return true }, - GenericFunc: func(e event.GenericEvent) bool { return false }, + CreateFunc: func(_ event.CreateEvent) bool { return true }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + GenericFunc: func(_ event.GenericEvent) bool { return false }, }) ) @@ -47,7 +47,7 @@ func ReconcileClusterRoleBinding(ctx context.Context, cl *helper.Client, desired actual.RoleRef == desired.RoleRef && reflect.DeepEqual(actual.Subjects, desired.Subjects) { if actual.RoleRef != desired.RoleRef { - //Roleref cannot be updated deleting and creating a new rolebinding + // Roleref cannot be updated deleting and creating a new rolebinding log := log.FromContext(ctx) log.Info("Deleting old ClusterRoleBinding", "Namespace", actual.GetNamespace(), "Name", actual.GetName()) err := cl.Delete(ctx, &actual) @@ -74,7 +74,7 @@ func ReconcileRoleBinding(ctx context.Context, cl *helper.Client, desired *rbacv actual.RoleRef == desired.RoleRef && reflect.DeepEqual(actual.Subjects, desired.Subjects) { if actual.RoleRef != desired.RoleRef { - //Roleref cannot be updated deleting and creating a new rolebinding + // Roleref cannot be updated deleting and creating a new rolebinding log := log.FromContext(ctx) log.Info("Deleting old RoleBinding", "Namespace", actual.GetNamespace(), "Name", actual.GetName()) err := cl.Delete(ctx, &actual) @@ -125,29 +125,24 @@ func ReconcileRole(ctx context.Context, cl *helper.Client, desired *rbacv1.Role) return cl.UpdateIfOwned(ctx, &actual, desired) } -func ReconcileConfigMap(ctx context.Context, cl *helper.Client, desired *corev1.ConfigMap, delete bool) error { - actual := corev1.ConfigMap{} - if err := cl.Get(ctx, types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}, &actual); err != nil { - if errors.IsNotFound(err) { - if delete { - return 
nil - } - return cl.CreateOwned(ctx, desired) +func ReconcileConfigMap(ctx context.Context, cl *helper.Client, current, desired *corev1.ConfigMap) error { + if current == nil { + if desired == nil { + return nil } - return fmt.Errorf("can't reconcile Configmap %s: %w", desired.Name, err) + return cl.CreateOwned(ctx, desired) } - - if delete { - return cl.Delete(ctx, desired) + if desired == nil { + if helper.IsOwned(current) { + return cl.Delete(ctx, current) + } + return nil } - - if helper.IsSubSet(actual.Labels, desired.Labels) && - reflect.DeepEqual(actual.Data, desired.Data) { + if helper.IsSubSet(current.Labels, desired.Labels) && reflect.DeepEqual(current.Data, desired.Data) { // configmap already reconciled. Exiting return nil } - - return cl.UpdateIfOwned(ctx, &actual, desired) + return cl.UpdateIfOwned(ctx, current, desired) } func ReconcileDaemonSet(ctx context.Context, ci *Instance, old, new *appsv1.DaemonSet, containerName string, report *helper.ChangeReport) error { diff --git a/pkg/cleanup/cleanup.go b/pkg/cleanup/cleanup.go index 3f535ded7..a2ac5941c 100644 --- a/pkg/cleanup/cleanup.go +++ b/pkg/cleanup/cleanup.go @@ -20,6 +20,11 @@ var ( ref: client.ObjectKey{Name: "grafana-dashboard-netobserv", Namespace: "openshift-config-managed"}, placeholder: &corev1.ConfigMap{}, }, + { + // Old name of NetObserv grafana dashboard / configmap (noo 1.5) + ref: client.ObjectKey{Name: "grafana-dashboard-netobserv-flow-metrics", Namespace: "openshift-config-managed"}, + placeholder: &corev1.ConfigMap{}, + }, } // Need to run only once, at operator startup, this is not part of the reconcile loop didRun = false diff --git a/pkg/cleanup/cleanup_test.go b/pkg/cleanup/cleanup_test.go index 68a6f75df..e17ccc222 100644 --- a/pkg/cleanup/cleanup_test.go +++ b/pkg/cleanup/cleanup_test.go @@ -26,11 +26,26 @@ var oldDashboard = corev1.ConfigMap{ Data: map[string]string{}, } +var oldDashboard2 = corev1.ConfigMap{ + ObjectMeta: v1.ObjectMeta{ + Name: 
"grafana-dashboard-netobserv-flow-metrics", + Namespace: "openshift-config-managed", + OwnerReferences: []v1.OwnerReference{{ + APIVersion: "flows.netobserv.io/v1beta2", + Kind: "FlowCollector", + Name: "cluster", + Controller: ptr.To(true), + }}, + }, + Data: map[string]string{}, +} + func TestCleanPastReferences(t *testing.T) { assert := assert.New(t) clientMock := test.NewClient() clientMock.MockConfigMap(&oldDashboard) - assert.Equal(1, clientMock.Len()) + clientMock.MockConfigMap(&oldDashboard2) + assert.Equal(2, clientMock.Len()) didRun = false err := CleanPastReferences(context.Background(), clientMock, "netobserv") @@ -44,6 +59,7 @@ func TestCleanPastReferences_Empty(t *testing.T) { assert := assert.New(t) clientMock := test.NewClient() clientMock.MockNonExisting(types.NamespacedName{Name: "grafana-dashboard-netobserv", Namespace: "openshift-config-managed"}) + clientMock.MockNonExisting(types.NamespacedName{Name: "grafana-dashboard-netobserv-flow-metrics", Namespace: "openshift-config-managed"}) assert.Equal(0, clientMock.Len()) didRun = false @@ -59,6 +75,7 @@ func TestCleanPastReferences_NotManaged(t *testing.T) { unmanaged := oldDashboard unmanaged.OwnerReferences = nil clientMock.MockConfigMap(&unmanaged) + clientMock.MockNonExisting(types.NamespacedName{Name: "grafana-dashboard-netobserv-flow-metrics", Namespace: "openshift-config-managed"}) assert.Equal(1, clientMock.Len()) didRun = false @@ -79,6 +96,7 @@ func TestCleanPastReferences_DifferentOwner(t *testing.T) { Name: "SomethingElse", }} clientMock.MockConfigMap(&unmanaged) + clientMock.MockNonExisting(types.NamespacedName{Name: "grafana-dashboard-netobserv-flow-metrics", Namespace: "openshift-config-managed"}) assert.Equal(1, clientMock.Len()) didRun = false diff --git a/pkg/dashboards/dashboard.go b/pkg/dashboards/dashboard.go index 59c2fd528..cd5a8ac04 100644 --- a/pkg/dashboards/dashboard.go +++ b/pkg/dashboards/dashboard.go @@ -84,11 +84,12 @@ func createFlowMetricsDashboard(dashboardName 
string, charts []chart) string { } rearrangeRows(orderedRows, mapTopPanels, mapBodyPanels) - d := Dashboard{Rows: orderedRows, Title: dashboardName} + d := Dashboard{Rows: orderedRows, Title: "NetObserv / " + dashboardName} return d.ToGrafanaJSON() } -func CreateFlowMetricsDashboards(metrics []metricslatest.FlowMetric) string { +func CreateFlowMetricsDashboards(metrics []metricslatest.FlowMetric) map[string]string { + dashboardsJSON := make(map[string]string) chartsPerDashboard := make(map[string][]chart) for i := range metrics { metric := &metrics[i] @@ -100,6 +101,8 @@ func CreateFlowMetricsDashboards(metrics []metricslatest.FlowMetric) string { chartsPerDashboard[c.DashboardName] = append(chartsPerDashboard[c.DashboardName], c) } } - // TODO: handle more dashboards - return createFlowMetricsDashboard("NetObserv", chartsPerDashboard["NetObserv"]) + for name, charts := range chartsPerDashboard { + dashboardsJSON[name] = createFlowMetricsDashboard(name, charts) + } + return dashboardsJSON } diff --git a/pkg/dashboards/dashboard_test.go b/pkg/dashboards/dashboard_test.go index 5d45a6d46..38a8effed 100644 --- a/pkg/dashboards/dashboard_test.go +++ b/pkg/dashboards/dashboard_test.go @@ -14,10 +14,10 @@ func TestCreateFlowMetricsDashboard_All(t *testing.T) { defs := metrics.GetDefinitions(metrics.GetAllNames()) js := CreateFlowMetricsDashboards(defs) - d, err := FromBytes([]byte(js)) + d, err := FromBytes([]byte(js["Main"])) assert.NoError(err) - assert.Equal("NetObserv", d.Title) + assert.Equal("NetObserv / Main", d.Title) assert.Equal([]string{"", "Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) @@ -89,10 +89,10 @@ func TestCreateFlowMetricsDashboard_OnlyNodeIngressBytes(t *testing.T) { defs := metrics.GetDefinitions([]string{"node_ingress_bytes_total"}) js := CreateFlowMetricsDashboards(defs) - d, err := FromBytes([]byte(js)) + d, err := FromBytes([]byte(js["Main"])) assert.NoError(err) - assert.Equal("NetObserv", d.Title) + 
assert.Equal("NetObserv / Main", d.Title) assert.Equal([]string{"", "Traffic rates"}, d.Titles()) topRow := d.FindRow("") @@ -108,10 +108,10 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { defs := metrics.GetDefinitions(metrics.DefaultIncludeList) js := CreateFlowMetricsDashboards(defs) - d, err := FromBytes([]byte(js)) + d, err := FromBytes([]byte(js["Main"])) assert.NoError(err) - assert.Equal("NetObserv", d.Title) + assert.Equal("NetObserv / Main", d.Title) assert.Equal([]string{"", "Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) topRow := d.FindRow("") @@ -188,7 +188,7 @@ func TestCreateCustomDashboard(t *testing.T) { MetricName: "my_metric", Charts: []metricslatest.Chart{ { - DashboardName: "NetObserv", + DashboardName: "Main", SectionName: "My section", Title: "My chart", Unit: metricslatest.UnitBPS, @@ -201,7 +201,7 @@ func TestCreateCustomDashboard(t *testing.T) { }, }, { - DashboardName: "NetObserv", + DashboardName: "Main", SectionName: "My next section", Title: "My next chart", Unit: metricslatest.UnitBPS, @@ -216,12 +216,32 @@ func TestCreateCustomDashboard(t *testing.T) { }, }, }, + { + Spec: metricslatest.FlowMetricSpec{ + MetricName: "my_metric", + Charts: []metricslatest.Chart{ + { + DashboardName: "My other dashboard", + SectionName: "Other section", + Title: "Other chart", + Unit: metricslatest.UnitBPS, + Type: metricslatest.ChartTypeLine, + Queries: []metricslatest.Query{ + { + PromQL: `sum(rate($METRIC{label="foo"}[5m])) by (lbl1,lbl2)`, + Legend: "{{lbl1}}: {{lbl2}}", + }, + }, + }, + }, + }, + }, }) - d, err := FromBytes([]byte(js)) + d, err := FromBytes([]byte(js["Main"])) assert.NoError(err) - assert.Equal("NetObserv", d.Title) + assert.Equal("NetObserv / Main", d.Title) assert.Equal([]string{"My section", "My next section"}, d.Titles()) r1 := d.FindRow("My section") @@ -253,4 +273,25 @@ func TestCreateCustomDashboard(t *testing.T) { }, }, }, r2.Panels[0]) + + d, err = FromBytes([]byte(js["My 
other dashboard"])) + assert.NoError(err) + + assert.Equal("NetObserv / My other dashboard", d.Title) + assert.Equal([]string{"Other section"}, d.Titles()) + + r1 = d.FindRow("Other section") + assert.Equal([]string{"Other chart"}, r1.Titles()) + assert.Equal(Panel{ + Title: "Other chart", + Type: "graph", + Span: 4, + Format: "Bps", + Targets: []Target{ + { + Expr: "topk(7, sum(rate(netobserv_my_metric{label=\"foo\"}[5m])) by (lbl1,lbl2))", + LegendFormat: "{{lbl1}}: {{lbl2}}", + }, + }, + }, r1.Panels[0]) } diff --git a/pkg/metrics/predefined_charts.go b/pkg/metrics/predefined_charts.go index 416eb7047..057313f77 100644 --- a/pkg/metrics/predefined_charts.go +++ b/pkg/metrics/predefined_charts.go @@ -8,7 +8,7 @@ import ( ) const ( - netobservDashboard = "NetObserv" + mainDashboard = "Main" ) func trafficCharts(group, vt, dir string) []metricslatest.Chart { @@ -24,7 +24,7 @@ func trafficCharts(group, vt, dir string) []metricslatest.Chart { totalSingleStat := metricslatest.Chart{ Type: metricslatest.ChartTypeSingleStat, SectionName: "", - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: fmt.Sprintf("Total %s traffic", dir), Unit: unit, Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC[2m]))"}}, @@ -33,7 +33,7 @@ func trafficCharts(group, vt, dir string) []metricslatest.Chart { appSingleStat := metricslatest.Chart{ Type: metricslatest.ChartTypeSingleStat, SectionName: "", - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: fmt.Sprintf("Apps %s traffic", dir), Unit: unit, Queries: []metricslatest.Query{{PromQL: `sum(rate($METRIC{K8S_FlowLayer="app"}[2m]))`}}, @@ -42,7 +42,7 @@ func trafficCharts(group, vt, dir string) []metricslatest.Chart { infraSingleStat := metricslatest.Chart{ Type: metricslatest.ChartTypeSingleStat, SectionName: "", - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: fmt.Sprintf("Infra %s traffic", dir), Unit: unit, Queries: []metricslatest.Query{{PromQL: 
`sum(rate($METRIC{K8S_FlowLayer="infra"}[2m]))`}}, @@ -65,7 +65,7 @@ func trafficCharts(group, vt, dir string) []metricslatest.Chart { return append(charts, chartVariantsFor(&metricslatest.Chart{ Type: metricslatest.ChartTypeStackArea, SectionName: sectionName, - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: fmt.Sprintf("Top %s traffic", dir), Unit: unit, Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC{$FILTERS}[2m])) by ($LABELS)", Legend: "$LEGEND"}}, @@ -77,7 +77,7 @@ func rttCharts(group string) []metricslatest.Chart { charts := []metricslatest.Chart{{ Type: metricslatest.ChartTypeSingleStat, SectionName: "", - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "TCP latency", Unit: metricslatest.UnitSeconds, Queries: []metricslatest.Query{ @@ -90,7 +90,7 @@ func rttCharts(group string) []metricslatest.Chart { charts = append(charts, chartVariantsFor(&metricslatest.Chart{ Type: metricslatest.ChartTypeLine, SectionName: sectionName, - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "Top P50 sRTT", Unit: metricslatest.UnitSeconds, Queries: []metricslatest.Query{ @@ -103,7 +103,7 @@ func rttCharts(group string) []metricslatest.Chart { charts = append(charts, chartVariantsFor(&metricslatest.Chart{ Type: metricslatest.ChartTypeLine, SectionName: sectionName, - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "Top P99 sRTT", Unit: metricslatest.UnitSeconds, Queries: []metricslatest.Query{ @@ -124,7 +124,7 @@ func dropCharts(group string, unit metricslatest.Unit) []metricslatest.Chart { charts = append(charts, metricslatest.Chart{ Type: metricslatest.ChartTypeSingleStat, SectionName: "", - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "Drops", Unit: unit, Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC[2m]))"}}, @@ -133,7 +133,7 @@ func dropCharts(group string, unit metricslatest.Unit) []metricslatest.Chart { return 
append(charts, chartVariantsFor(&metricslatest.Chart{ Type: metricslatest.ChartTypeStackArea, SectionName: sectionName, - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "Top drops", Unit: unit, Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC{$FILTERS}[2m])) by ($LABELS)", Legend: "$LEGEND"}}, @@ -146,7 +146,7 @@ func dnsCharts(group string) []metricslatest.Chart { { Type: metricslatest.ChartTypeSingleStat, SectionName: "", - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "DNS latency", Unit: metricslatest.UnitSeconds, Queries: []metricslatest.Query{ @@ -159,7 +159,7 @@ func dnsCharts(group string) []metricslatest.Chart { { Type: metricslatest.ChartTypeSingleStat, SectionName: "", - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "DNS error rate", Queries: []metricslatest.Query{{PromQL: `sum(rate($METRIC_count{DnsFlagsResponseCode!="NoError"}[2m]))`}}, }, @@ -167,7 +167,7 @@ func dnsCharts(group string) []metricslatest.Chart { charts = append(charts, chartVariantsFor(&metricslatest.Chart{ Type: metricslatest.ChartTypeLine, SectionName: sectionName, - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "Top P50 DNS latency", Unit: metricslatest.UnitSeconds, Queries: []metricslatest.Query{ @@ -180,7 +180,7 @@ func dnsCharts(group string) []metricslatest.Chart { charts = append(charts, chartVariantsFor(&metricslatest.Chart{ Type: metricslatest.ChartTypeLine, SectionName: sectionName, - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "Top P99 DNS latency", Unit: metricslatest.UnitSeconds, Queries: []metricslatest.Query{ @@ -194,7 +194,7 @@ func dnsCharts(group string) []metricslatest.Chart { return append(charts, chartVariantsFor(&metricslatest.Chart{ Type: metricslatest.ChartTypeStackArea, SectionName: sectionName, - DashboardName: netobservDashboard, + DashboardName: mainDashboard, Title: "DNS error rate", Queries: 
[]metricslatest.Query{{ PromQL: `sum(rate($METRIC_count{DnsFlagsResponseCode!="NoError",$FILTERS}[2m])) by (DnsFlagsResponseCode,$LABELS)`, From 2ecbff318a9786ee75a7b436c99837493fe4530a Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Mon, 8 Apr 2024 13:20:20 +0200 Subject: [PATCH 04/13] update bundle --- .../flows.netobserv.io_flowmetrics.yaml | 38 ++++++++----------- .../bases/flows.netobserv.io_flowmetrics.yaml | 38 ++++++++----------- 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml index 0773a0cba..b3da33f38 100644 --- a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml +++ b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml @@ -64,31 +64,24 @@ spec: created. type: string queries: - description: List of queries to be displayed on this chart. - If `type` is `SingleStat` and multiple queries are provided, - this chart will be automatically expanded in several panels - (one per query). + description: |- + List of queries to be displayed on this chart. If `type` is `SingleStat` and multiple queries are provided, + this chart will be automatically expanded in several panels (one per query). items: description: Configures PromQL queries properties: legend: - description: 'The query legend that applies to each timeseries - represented in this chart. When multiple timeseries - are displayed, you should set a legend that distinguishes - each of them. It can be done with the following format: - `{{ Label }}`. For example, if the `promQL` groups timeseries - per label such as: `sum(rate($METRIC[2m])) by (Label1, - Label2)`, you may write as the legend: `Label1={{ Label1 - }}, Label2={{ Label2 }}`.' + description: |- + The query legend that applies to each timeseries represented in this chart. When multiple timeseries are displayed, you should set a legend + that distinguishes each of them. 
It can be done with the following format: `{{ Label }}`. For example, if the `promQL` groups timeseries per + label such as: `sum(rate($METRIC[2m])) by (Label1, Label2)`, you may write as the legend: `Label1={{ Label1 }}, Label2={{ Label2 }}`. type: string promQL: - description: 'The `promQL` query to be run against Prometheus. - If the chart `type` is `SingleStat`, this query should - only return a single timeseries. For other types, a - top 7 is displayed. You can use `$METRIC` to refer to - the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. - To learn more about `promQL`, refer to the Prometheus - documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/' + description: |- + The `promQL` query to be run against Prometheus. If the chart `type` is `SingleStat`, this query should only return + a single timeseries. For other types, a top 7 is displayed. + You can use `$METRIC` to refer to the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. + To learn more about `promQL`, refer to the Prometheus documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/ type: string required: - legend @@ -96,10 +89,9 @@ spec: type: object type: array sectionName: - description: Name of the containing dashboard section. If this - name does not refer to an existing section, a new section - is created. If `sectionName` is omitted or empty, the chart - is placed in the global top section. + description: |- + Name of the containing dashboard section. If this name does not refer to an existing section, a new section is created. + If `sectionName` is omitted or empty, the chart is placed in the global top section. type: string title: description: Title of the chart. 
diff --git a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml index a367e0441..b3e6f456f 100644 --- a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml +++ b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml @@ -64,31 +64,24 @@ spec: created. type: string queries: - description: List of queries to be displayed on this chart. - If `type` is `SingleStat` and multiple queries are provided, - this chart will be automatically expanded in several panels - (one per query). + description: |- + List of queries to be displayed on this chart. If `type` is `SingleStat` and multiple queries are provided, + this chart will be automatically expanded in several panels (one per query). items: description: Configures PromQL queries properties: legend: - description: 'The query legend that applies to each timeseries - represented in this chart. When multiple timeseries - are displayed, you should set a legend that distinguishes - each of them. It can be done with the following format: - `{{ Label }}`. For example, if the `promQL` groups timeseries - per label such as: `sum(rate($METRIC[2m])) by (Label1, - Label2)`, you may write as the legend: `Label1={{ Label1 - }}, Label2={{ Label2 }}`.' + description: |- + The query legend that applies to each timeseries represented in this chart. When multiple timeseries are displayed, you should set a legend + that distinguishes each of them. It can be done with the following format: `{{ Label }}`. For example, if the `promQL` groups timeseries per + label such as: `sum(rate($METRIC[2m])) by (Label1, Label2)`, you may write as the legend: `Label1={{ Label1 }}, Label2={{ Label2 }}`. type: string promQL: - description: 'The `promQL` query to be run against Prometheus. - If the chart `type` is `SingleStat`, this query should - only return a single timeseries. For other types, a - top 7 is displayed. You can use `$METRIC` to refer to - the metric defined in this resource. 
For example: `sum(rate($METRIC[2m]))`. - To learn more about `promQL`, refer to the Prometheus - documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/' + description: |- + The `promQL` query to be run against Prometheus. If the chart `type` is `SingleStat`, this query should only return + a single timeseries. For other types, a top 7 is displayed. + You can use `$METRIC` to refer to the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. + To learn more about `promQL`, refer to the Prometheus documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/ type: string required: - legend @@ -96,10 +89,9 @@ spec: type: object type: array sectionName: - description: Name of the containing dashboard section. If this - name does not refer to an existing section, a new section - is created. If `sectionName` is omitted or empty, the chart - is placed in the global top section. + description: |- + Name of the containing dashboard section. If this name does not refer to an existing section, a new section is created. + If `sectionName` is omitted or empty, the chart is placed in the global top section. type: string title: description: Title of the chart. 
From 54352248728bc8e57b1e1ecc4356cc1be15ff822 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Mon, 8 Apr 2024 14:22:54 +0200 Subject: [PATCH 05/13] enforce consistent charts order --- pkg/dashboards/dashboard.go | 3 +++ pkg/dashboards/dashboard_test.go | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/pkg/dashboards/dashboard.go b/pkg/dashboards/dashboard.go index cd5a8ac04..ba5f4fc58 100644 --- a/pkg/dashboards/dashboard.go +++ b/pkg/dashboards/dashboard.go @@ -2,6 +2,7 @@ package dashboards import ( "fmt" + "sort" "strings" metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" @@ -91,6 +92,8 @@ func createFlowMetricsDashboard(dashboardName string, charts []chart) string { func CreateFlowMetricsDashboards(metrics []metricslatest.FlowMetric) map[string]string { dashboardsJSON := make(map[string]string) chartsPerDashboard := make(map[string][]chart) + // Sort alphabetically to enforce consistent ordering + sort.Slice(metrics, func(i, j int) bool { return metrics[i].Name < metrics[j].Name }) for i := range metrics { metric := &metrics[i] for j := range metric.Spec.Charts { diff --git a/pkg/dashboards/dashboard_test.go b/pkg/dashboards/dashboard_test.go index 38a8effed..0d1bb1706 100644 --- a/pkg/dashboards/dashboard_test.go +++ b/pkg/dashboards/dashboard_test.go @@ -6,6 +6,7 @@ import ( metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" "github.com/netobserv/network-observability-operator/pkg/metrics" "github.com/stretchr/testify/assert" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestCreateFlowMetricsDashboard_All(t *testing.T) { @@ -295,3 +296,43 @@ func TestCreateCustomDashboard(t *testing.T) { }, }, r1.Panels[0]) } + +func TestSortedCharts(t *testing.T) { + assert := assert.New(t) + + var quickChart = func(title string) metricslatest.Chart { + return metricslatest.Chart{ + DashboardName: "Main", + SectionName: "S0", + Title: title, + 
Type: metricslatest.ChartTypeSingleStat, + Queries: []metricslatest.Query{{PromQL: `(query)`, Legend: ""}}, + } + } + + js := CreateFlowMetricsDashboards([]metricslatest.FlowMetric{ + { + ObjectMeta: v1.ObjectMeta{Name: "z"}, + Spec: metricslatest.FlowMetricSpec{ + Charts: []metricslatest.Chart{ + quickChart("C0"), + quickChart("C1"), + }, + }, + }, + { + ObjectMeta: v1.ObjectMeta{Name: "a"}, + Spec: metricslatest.FlowMetricSpec{ + Charts: []metricslatest.Chart{ + quickChart("C2"), + }, + }, + }, + }) + + d, err := FromBytes([]byte(js["Main"])) + assert.NoError(err) + + r := d.FindRow("S0") + assert.Equal([]string{"C2", "C0", "C1"}, r.Titles()) +} From f7ee31f29bf4e35b2bf7d592a727fe315c838c70 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Mon, 8 Apr 2024 14:42:31 +0200 Subject: [PATCH 06/13] When too many panels are in top row, put them in an Overview section --- pkg/dashboards/dashboard.go | 4 ++++ pkg/dashboards/dashboard_test.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/dashboards/dashboard.go b/pkg/dashboards/dashboard.go index ba5f4fc58..e45cc6f71 100644 --- a/pkg/dashboards/dashboard.go +++ b/pkg/dashboards/dashboard.go @@ -52,6 +52,10 @@ func rearrangeRows(rows []*Row, mapTopPanels, mapBodyPanels map[string][]Panel) } rows[i].Panels = topPanels rows[i].Panels = append(rows[i].Panels, bodyPanels...) 
+ if rows[i].Title == "" && len(rows[i].Panels) > 8 { + // When top row has many panels, create a collapsable section + rows[i].Title = "Overview" + } } } diff --git a/pkg/dashboards/dashboard_test.go b/pkg/dashboards/dashboard_test.go index 0d1bb1706..e92c05f6c 100644 --- a/pkg/dashboards/dashboard_test.go +++ b/pkg/dashboards/dashboard_test.go @@ -20,7 +20,7 @@ func TestCreateFlowMetricsDashboard_All(t *testing.T) { assert.Equal("NetObserv / Main", d.Title) - assert.Equal([]string{"", "Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) + assert.Equal([]string{"Overview", "Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) assert.Len(d.Rows[0].Panels, 16) assert.Len(d.Rows[1].Panels, 20) From 336b9d1563b411fbdbfb02bebb445ac96f28da92 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Mon, 15 Apr 2024 13:46:22 +0200 Subject: [PATCH 07/13] Add configurable top --- apis/flowmetrics/v1alpha1/flowmetric_types.go | 6 ++++++ bundle/manifests/flows.netobserv.io_flowmetrics.yaml | 7 +++++++ config/crd/bases/flows.netobserv.io_flowmetrics.yaml | 7 +++++++ pkg/dashboards/dashboard.go | 6 +++++- pkg/dashboards/dashboard_test.go | 3 ++- 5 files changed, 27 insertions(+), 2 deletions(-) diff --git a/apis/flowmetrics/v1alpha1/flowmetric_types.go b/apis/flowmetrics/v1alpha1/flowmetric_types.go index b28aceed1..b206f69a0 100644 --- a/apis/flowmetrics/v1alpha1/flowmetric_types.go +++ b/apis/flowmetrics/v1alpha1/flowmetric_types.go @@ -178,6 +178,12 @@ type Query struct { // label such as: `sum(rate($METRIC[2m])) by (Label1, Label2)`, you may write as the legend: `Label1={{ Label1 }}, Label2={{ Label2 }}`. // +required Legend string `json:"legend"` + + // Top N series to display per timestamp. Does not apply to `SingleStat` chart type. 
+ // +kubebuilder:default:=7 + // +kubebuilder:validation:Minimum=1 + // +required + Top int `json:"top"` } // FlowMetricStatus defines the observed state of FlowMetric diff --git a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml index b3da33f38..59da80cb4 100644 --- a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml +++ b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml @@ -83,9 +83,16 @@ spec: You can use `$METRIC` to refer to the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. To learn more about `promQL`, refer to the Prometheus documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/ type: string + top: + default: 7 + description: Top N series to display per timestamp. Does + not apply to `SingleStat` chart type. + minimum: 1 + type: integer required: - legend - promQL + - top type: object type: array sectionName: diff --git a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml index b3e6f456f..87dab78f9 100644 --- a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml +++ b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml @@ -83,9 +83,16 @@ spec: You can use `$METRIC` to refer to the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. To learn more about `promQL`, refer to the Prometheus documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/ type: string + top: + default: 7 + description: Top N series to display per timestamp. Does + not apply to `SingleStat` chart type. 
+ minimum: 1 + type: integer required: - legend - promQL + - top type: object type: array sectionName: diff --git a/pkg/dashboards/dashboard.go b/pkg/dashboards/dashboard.go index e45cc6f71..f40690d32 100644 --- a/pkg/dashboards/dashboard.go +++ b/pkg/dashboards/dashboard.go @@ -30,8 +30,12 @@ func createSingleStatPanels(c *chart) []Panel { func createGraphPanel(c *chart) Panel { var targets []Target for _, q := range c.Queries { + top := 7 + if q.Top > 0 { + top = q.Top + } query := strings.ReplaceAll(q.PromQL, "$METRIC", "netobserv_"+c.mptr.Spec.MetricName) - query = fmt.Sprintf("topk(7, %s)", query) + query = fmt.Sprintf("topk(%d, %s)", top, query) targets = append(targets, NewTarget(query, q.Legend)) } return NewPanel(c.Title, c.Type, c.Unit, 4, targets...) diff --git a/pkg/dashboards/dashboard_test.go b/pkg/dashboards/dashboard_test.go index e92c05f6c..c595d265a 100644 --- a/pkg/dashboards/dashboard_test.go +++ b/pkg/dashboards/dashboard_test.go @@ -211,6 +211,7 @@ func TestCreateCustomDashboard(t *testing.T) { { PromQL: `sum(rate($METRIC{label="foo"}[5m])) by (lbl1,lbl2)`, Legend: "{{lbl1}}: {{lbl2}}", + Top: 10, }, }, }, @@ -269,7 +270,7 @@ func TestCreateCustomDashboard(t *testing.T) { Format: "Bps", Targets: []Target{ { - Expr: "topk(7, sum(rate(netobserv_my_metric{label=\"foo\"}[5m])) by (lbl1,lbl2))", + Expr: "topk(10, sum(rate(netobserv_my_metric{label=\"foo\"}[5m])) by (lbl1,lbl2))", LegendFormat: "{{lbl1}}: {{lbl2}}", }, }, From e0a3ce2601bc8f3a68def0dd93affb465bf38f97 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Mon, 15 Apr 2024 13:53:43 +0200 Subject: [PATCH 08/13] fix test --- controllers/monitoring/monitoring_controller_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controllers/monitoring/monitoring_controller_test.go b/controllers/monitoring/monitoring_controller_test.go index cc1deaa03..6d088b2ad 100644 --- a/controllers/monitoring/monitoring_controller_test.go +++ 
b/controllers/monitoring/monitoring_controller_test.go @@ -159,7 +159,7 @@ func ControllerSpecs() { DashboardName: "My dashboard 01", Title: "title", Type: metricslatest.ChartTypeSingleStat, - Queries: []metricslatest.Query{{PromQL: "(query)", Legend: "-"}}, + Queries: []metricslatest.Query{{PromQL: "(query)", Legend: "-", Top: 7}}, }, }, }, @@ -177,7 +177,7 @@ func ControllerSpecs() { DashboardName: "My dashboard 02", Title: "title", Type: metricslatest.ChartTypeSingleStat, - Queries: []metricslatest.Query{{PromQL: "(query)", Legend: "-"}}, + Queries: []metricslatest.Query{{PromQL: "(query)", Legend: "-", Top: 7}}, }, }, }, From ebf5897158aa724a5430dd4b4a4d5bef38c73633 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Tue, 16 Apr 2024 16:58:30 +0200 Subject: [PATCH 09/13] More examples --- .../flowmetrics/cluster_egress_traffic.yaml | 30 ------------- .../cluster_external_egress_rtt_seconds.yaml | 43 +++++++++++++++++++ .../cluster_external_egress_traffic.yaml | 16 +++++++ .../cluster_external_ingress_rtt_seconds.yaml | 43 +++++++++++++++++++ .../cluster_external_ingress_traffic.yaml | 16 +++++++ .../flowmetrics/cluster_ingress_traffic.yaml | 30 ------------- 6 files changed, 118 insertions(+), 60 deletions(-) delete mode 100644 config/samples/flowmetrics/cluster_egress_traffic.yaml create mode 100644 config/samples/flowmetrics/cluster_external_egress_rtt_seconds.yaml create mode 100644 config/samples/flowmetrics/cluster_external_ingress_rtt_seconds.yaml delete mode 100644 config/samples/flowmetrics/cluster_ingress_traffic.yaml diff --git a/config/samples/flowmetrics/cluster_egress_traffic.yaml b/config/samples/flowmetrics/cluster_egress_traffic.yaml deleted file mode 100644 index cc4ced4ad..000000000 --- a/config/samples/flowmetrics/cluster_egress_traffic.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics -apiVersion: flows.netobserv.io/v1alpha1 -kind: 
FlowMetric -metadata: - name: flowmetric-cluster-egress-traffic -spec: - metricName: cluster_egress_bytes_total - type: Counter - valueField: Bytes - direction: Egress - labels: [SrcK8S_HostName,SrcK8S_Namespace,SrcK8S_OwnerName,SrcK8S_OwnerType] - filters: - - field: DstSubnetLabel - matchType: Absence - charts: - - dashboardName: Main - title: Cluster egress traffic - unit: Bps - type: SingleStat - queries: - - promQL: "sum(rate($METRIC[2m]))" - legend: "" - - dashboardName: Main - sectionName: Cluster - title: Top cluster egress traffic per workload - unit: Bps - type: StackArea - queries: - - promQL: "sum(rate($METRIC{SrcK8S_Namespace!=\"\"}[2m])) by (SrcK8S_Namespace, SrcK8S_OwnerName)" - legend: "{{SrcK8S_Namespace}} / {{SrcK8S_OwnerName}}" diff --git a/config/samples/flowmetrics/cluster_external_egress_rtt_seconds.yaml b/config/samples/flowmetrics/cluster_external_egress_rtt_seconds.yaml new file mode 100644 index 000000000..c2d929028 --- /dev/null +++ b/config/samples/flowmetrics/cluster_external_egress_rtt_seconds.yaml @@ -0,0 +1,43 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics +apiVersion: flows.netobserv.io/v1alpha1 +kind: FlowMetric +metadata: + name: flowmetric-cluster-external-egress-rtt-seconds +spec: + metricName: cluster_external_egress_rtt_seconds + type: Histogram + valueField: TimeFlowRttNs + direction: Egress + includeDuplicates: true + labels: [SrcK8S_HostName,SrcK8S_Namespace,SrcK8S_OwnerName,SrcK8S_OwnerType] + filters: + - field: DstSubnetLabel + matchType: Absence + - field: TimeFlowRttNs + matchType: Presence + divider: "1000000000" + buckets: [".001", ".005", ".01", ".02", ".03", ".04", ".05", ".075", ".1", ".25", "1"] + charts: + - dashboardName: Main + title: External egress TCP latency + unit: seconds + type: SingleStat + queries: + - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0" + legend: "p99" + - dashboardName: Main + 
sectionName: Cluster + title: "Top external egress sRTT per workload, p50 (ms)" + unit: seconds + type: Line + queries: + - promQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{SrcK8S_Namespace!=\"\"}[2m])) by (le,SrcK8S_Namespace,SrcK8S_OwnerName))*1000 > 0" + legend: "{{SrcK8S_Namespace}} / {{SrcK8S_OwnerName}}" + - dashboardName: Main + sectionName: Cluster + title: "Top external egress sRTT per workload, p99 (ms)" + unit: seconds + type: Line + queries: + - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket{SrcK8S_Namespace!=\"\"}[2m])) by (le,SrcK8S_Namespace,SrcK8S_OwnerName))*1000 > 0" + legend: "{{SrcK8S_Namespace}} / {{SrcK8S_OwnerName}}" diff --git a/config/samples/flowmetrics/cluster_external_egress_traffic.yaml b/config/samples/flowmetrics/cluster_external_egress_traffic.yaml index 3fbc933cf..076d6f007 100644 --- a/config/samples/flowmetrics/cluster_external_egress_traffic.yaml +++ b/config/samples/flowmetrics/cluster_external_egress_traffic.yaml @@ -12,3 +12,19 @@ spec: filters: - field: DstSubnetLabel matchType: Absence + charts: + - dashboardName: Main + title: Cluster egress traffic + unit: Bps + type: SingleStat + queries: + - promQL: "sum(rate($METRIC[2m]))" + legend: "" + - dashboardName: Main + sectionName: Cluster + title: Top cluster egress traffic per workload + unit: Bps + type: StackArea + queries: + - promQL: "sum(rate($METRIC{SrcK8S_Namespace!=\"\"}[2m])) by (SrcK8S_Namespace, SrcK8S_OwnerName)" + legend: "{{SrcK8S_Namespace}} / {{SrcK8S_OwnerName}}" diff --git a/config/samples/flowmetrics/cluster_external_ingress_rtt_seconds.yaml b/config/samples/flowmetrics/cluster_external_ingress_rtt_seconds.yaml new file mode 100644 index 000000000..8d8192bab --- /dev/null +++ b/config/samples/flowmetrics/cluster_external_ingress_rtt_seconds.yaml @@ -0,0 +1,43 @@ +# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics +apiVersion: flows.netobserv.io/v1alpha1 +kind: FlowMetric 
+metadata: + name: flowmetric-cluster-external-ingress-rtt-seconds +spec: + metricName: cluster_external_ingress_rtt_seconds + type: Histogram + valueField: TimeFlowRttNs + direction: Ingress + includeDuplicates: true + labels: [DstK8S_HostName,DstK8S_Namespace,DstK8S_OwnerName,DstK8S_OwnerType] + filters: + - field: SrcSubnetLabel + matchType: Absence + - field: TimeFlowRttNs + matchType: Presence + divider: "1000000000" + buckets: [".001", ".005", ".01", ".02", ".03", ".04", ".05", ".075", ".1", ".25", "1"] + charts: + - dashboardName: Main + title: External ingress TCP latency + unit: seconds + type: SingleStat + queries: + - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0" + legend: "p99" + - dashboardName: Main + sectionName: Cluster + title: "Top external ingress sRTT per workload, p50 (ms)" + unit: seconds + type: Line + queries: + - promQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{DstK8S_Namespace!=\"\"}[2m])) by (le,DstK8S_Namespace,DstK8S_OwnerName))*1000 > 0" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" + - dashboardName: Main + sectionName: Cluster + title: "Top external ingress sRTT per workload, p99 (ms)" + unit: seconds + type: Line + queries: + - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket{DstK8S_Namespace!=\"\"}[2m])) by (le,DstK8S_Namespace,DstK8S_OwnerName))*1000 > 0" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" diff --git a/config/samples/flowmetrics/cluster_external_ingress_traffic.yaml b/config/samples/flowmetrics/cluster_external_ingress_traffic.yaml index 7c6dfb322..bb75ff2fc 100644 --- a/config/samples/flowmetrics/cluster_external_ingress_traffic.yaml +++ b/config/samples/flowmetrics/cluster_external_ingress_traffic.yaml @@ -12,3 +12,19 @@ spec: filters: - field: SrcSubnetLabel matchType: Absence + charts: + - dashboardName: Main + title: Cluster ingress traffic + unit: Bps + type: SingleStat + queries: + - promQL: "sum(rate($METRIC[2m]))" + legend: "" + - 
dashboardName: Main + sectionName: Cluster + title: Top cluster ingress traffic per workload + unit: Bps + type: StackArea + queries: + - promQL: "sum(rate($METRIC{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace, DstK8S_OwnerName)" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" diff --git a/config/samples/flowmetrics/cluster_ingress_traffic.yaml b/config/samples/flowmetrics/cluster_ingress_traffic.yaml deleted file mode 100644 index ae8287b0f..000000000 --- a/config/samples/flowmetrics/cluster_ingress_traffic.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics -apiVersion: flows.netobserv.io/v1alpha1 -kind: FlowMetric -metadata: - name: flowmetric-cluster-ingress-traffic -spec: - metricName: cluster_ingress_bytes_total - type: Counter - valueField: Bytes - direction: Ingress - labels: [DstK8S_HostName,DstK8S_Namespace,DstK8S_OwnerName,DstK8S_OwnerType] - filters: - - field: SrcSubnetLabel - matchType: Absence - charts: - - dashboardName: Main - title: Cluster ingress traffic - unit: Bps - type: SingleStat - queries: - - promQL: "sum(rate($METRIC[2m]))" - legend: "" - - dashboardName: Main - sectionName: Cluster - title: Top cluster ingress traffic per workload - unit: Bps - type: StackArea - queries: - - promQL: "sum(rate($METRIC{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace, DstK8S_OwnerName)" - legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" From 623b58e80f82703ac72eb9b0ab2c322d77187b3e Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Wed, 17 Apr 2024 12:10:31 +0200 Subject: [PATCH 10/13] Remove includeDuplicates, and some minor doc/samples changes --- apis/flowmetrics/v1alpha1/flowmetric_types.go | 13 ++---- .../flows.netobserv.io_flowmetrics.yaml | 16 +++----- ...observ-operator.clusterserviceversion.yaml | 8 ++-- .../bases/flows.netobserv.io_flowmetrics.yaml | 16 +++----- .../cluster_external_egress_rtt_seconds.yaml | 7 ++-- 
.../cluster_external_egress_traffic.yaml | 6 +-- .../cluster_external_ingress_rtt_seconds.yaml | 16 ++++++-- .../cluster_external_ingress_traffic.yaml | 6 +-- config/samples/flows_v1alpha1_flowmetric.yaml | 8 ++-- controllers/flp/metrics_api_test.go | 10 ++--- pkg/metrics/helper.go | 7 ---- pkg/metrics/predefined_metrics.go | 41 ++++++++----------- 12 files changed, 67 insertions(+), 87 deletions(-) diff --git a/apis/flowmetrics/v1alpha1/flowmetric_types.go b/apis/flowmetrics/v1alpha1/flowmetric_types.go index b206f69a0..e4ee7952e 100644 --- a/apis/flowmetrics/v1alpha1/flowmetric_types.go +++ b/apis/flowmetrics/v1alpha1/flowmetric_types.go @@ -61,7 +61,7 @@ type MetricFilter struct { // usage of Prometheus workloads as this could potentially have a high impact. Cf https://rhobs-handbook.netlify.app/products/openshiftmonitoring/telemetry.md/#what-is-the-cardinality-of-a-metric
// To check the cardinality of all NetObserv metrics, run as `promql`: `count({__name__=~"netobserv.*"}) by (__name__)`. type FlowMetricSpec struct { - // Name of the metric in Prometheus. It will be automatically prefixed with "netobserv_". + // Name of the metric. In Prometheus, it is automatically prefixed with "netobserv_". // +required MetricName string `json:"metricName"` @@ -93,11 +93,6 @@ type FlowMetricSpec struct { // +optional Labels []string `json:"labels"` - // When set to `true`, flows duplicated across several interfaces will add up in the generated metrics. - // When set to `false` (default), it is equivalent to adding the exact filter on `Duplicate` != `true`. - // +optional - IncludeDuplicates bool `json:"includeDuplicates,omitempty"` - // Filter for ingress, egress or any direction flows. // When set to `Ingress`, it is equivalent to adding the regex filter on `FlowDirection`: `0|2`. // When set to `Egress`, it is equivalent to adding the regex filter on `FlowDirection`: `1|2`. @@ -106,7 +101,7 @@ type FlowMetricSpec struct { // +optional Direction FlowDirection `json:"direction,omitempty"` - // A list of buckets to use when `type` is "Histogram". The list must be parseable as floats. Prometheus default buckets will be used if unset. + // A list of buckets to use when `type` is "Histogram". The list must be parseable as floats. When not set, Prometheus default buckets are used. // +optional Buckets []string `json:"buckets,omitempty"` @@ -114,7 +109,7 @@ type FlowMetricSpec struct { // +optional Divider string `json:"divider"` - // Charts configuration + // Charts configuration, for the OpenShift Console in the administrator view, Dashboards menu. // +optional Charts []Chart `json:"charts,omitempty"` } @@ -159,7 +154,7 @@ type Chart struct { Type ChartType `json:"type"` // List of queries to be displayed on this chart. 
If `type` is `SingleStat` and multiple queries are provided, - // this chart will be automatically expanded in several panels (one per query). + // this chart is automatically expanded in several panels (one per query). // +required Queries []Query `json:"queries"` } diff --git a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml index 59da80cb4..3ff468e8e 100644 --- a/bundle/manifests/flows.netobserv.io_flowmetrics.yaml +++ b/bundle/manifests/flows.netobserv.io_flowmetrics.yaml @@ -46,13 +46,14 @@ spec: properties: buckets: description: A list of buckets to use when `type` is "Histogram". - The list must be parseable as floats. Prometheus default buckets - will be used if unset. + The list must be parseable as floats. When not set, Prometheus default + buckets are used. items: type: string type: array charts: - description: Charts configuration + description: Charts configuration, for the OpenShift Console in the + administrator view, Dashboards menu. items: description: Configures charts / dashboard generation associated to a metric @@ -66,7 +67,7 @@ spec: queries: description: |- List of queries to be displayed on this chart. If `type` is `SingleStat` and multiple queries are provided, - this chart will be automatically expanded in several panels (one per query). + this chart is automatically expanded in several panels (one per query). items: description: Configures PromQL queries properties: @@ -173,11 +174,6 @@ spec: - matchType type: object type: array - includeDuplicates: - description: |- - When set to `true`, flows duplicated across several interfaces will add up in the generated metrics. - When set to `false` (default), it is equivalent to adding the exact filter on `Duplicate` != `true`. - type: boolean labels: description: |- `labels` is a list of fields that should be used as Prometheus labels, also known as dimensions. 
@@ -190,7 +186,7 @@ spec: type: string type: array metricName: - description: Name of the metric in Prometheus. It will be automatically + description: Name of the metric. In Prometheus, it is automatically prefixed with "netobserv_". type: string type: diff --git a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml index 0800a71a9..2fe977d98 100644 --- a/bundle/manifests/netobserv-operator.clusterserviceversion.yaml +++ b/bundle/manifests/netobserv-operator.clusterserviceversion.yaml @@ -27,7 +27,7 @@ metadata: "promQL": "sum(rate($METRIC[2m]))" } ], - "title": "Cluster ingress traffic", + "title": "External ingress traffic", "type": "SingleStat", "unit": "Bps" }, @@ -39,8 +39,8 @@ metadata: "promQL": "sum(rate($METRIC{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace, DstK8S_OwnerName)" } ], - "sectionName": "Cluster", - "title": "Top cluster ingress traffic per workload", + "sectionName": "External", + "title": "Top external ingress traffic per workload", "type": "StackArea", "unit": "Bps" } @@ -58,7 +58,7 @@ metadata: "DstK8S_OwnerName", "DstK8S_OwnerType" ], - "metricName": "cluster_ingress_bytes_total", + "metricName": "cluster_external_ingress_bytes_total", "type": "Counter", "valueField": "Bytes" } diff --git a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml index 87dab78f9..008fe3d8f 100644 --- a/config/crd/bases/flows.netobserv.io_flowmetrics.yaml +++ b/config/crd/bases/flows.netobserv.io_flowmetrics.yaml @@ -46,13 +46,14 @@ spec: properties: buckets: description: A list of buckets to use when `type` is "Histogram". - The list must be parseable as floats. Prometheus default buckets - will be used if unset. + The list must be parseable as floats. When not set, Prometheus default + buckets are used. 
items: type: string type: array charts: - description: Charts configuration + description: Charts configuration, for the OpenShift Console in the + administrator view, Dashboards menu. items: description: Configures charts / dashboard generation associated to a metric @@ -66,7 +67,7 @@ spec: queries: description: |- List of queries to be displayed on this chart. If `type` is `SingleStat` and multiple queries are provided, - this chart will be automatically expanded in several panels (one per query). + this chart is automatically expanded in several panels (one per query). items: description: Configures PromQL queries properties: @@ -173,11 +174,6 @@ spec: - matchType type: object type: array - includeDuplicates: - description: |- - When set to `true`, flows duplicated across several interfaces will add up in the generated metrics. - When set to `false` (default), it is equivalent to adding the exact filter on `Duplicate` != `true`. - type: boolean labels: description: |- `labels` is a list of fields that should be used as Prometheus labels, also known as dimensions. @@ -190,7 +186,7 @@ spec: type: string type: array metricName: - description: Name of the metric in Prometheus. It will be automatically + description: Name of the metric. In Prometheus, it is automatically prefixed with "netobserv_". 
type: string type: diff --git a/config/samples/flowmetrics/cluster_external_egress_rtt_seconds.yaml b/config/samples/flowmetrics/cluster_external_egress_rtt_seconds.yaml index c2d929028..2625d1b80 100644 --- a/config/samples/flowmetrics/cluster_external_egress_rtt_seconds.yaml +++ b/config/samples/flowmetrics/cluster_external_egress_rtt_seconds.yaml @@ -2,13 +2,12 @@ apiVersion: flows.netobserv.io/v1alpha1 kind: FlowMetric metadata: - name: flowmetric-cluster-external-egress-rtt-seconds + name: flowmetric-cluster-external-egress-rtt spec: metricName: cluster_external_egress_rtt_seconds type: Histogram valueField: TimeFlowRttNs direction: Egress - includeDuplicates: true labels: [SrcK8S_HostName,SrcK8S_Namespace,SrcK8S_OwnerName,SrcK8S_OwnerType] filters: - field: DstSubnetLabel @@ -26,7 +25,7 @@ spec: - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0" legend: "p99" - dashboardName: Main - sectionName: Cluster + sectionName: External title: "Top external egress sRTT per workload, p50 (ms)" unit: seconds type: Line @@ -34,7 +33,7 @@ spec: - promQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{SrcK8S_Namespace!=\"\"}[2m])) by (le,SrcK8S_Namespace,SrcK8S_OwnerName))*1000 > 0" legend: "{{SrcK8S_Namespace}} / {{SrcK8S_OwnerName}}" - dashboardName: Main - sectionName: Cluster + sectionName: External title: "Top external egress sRTT per workload, p99 (ms)" unit: seconds type: Line diff --git a/config/samples/flowmetrics/cluster_external_egress_traffic.yaml b/config/samples/flowmetrics/cluster_external_egress_traffic.yaml index 076d6f007..e1785e0d6 100644 --- a/config/samples/flowmetrics/cluster_external_egress_traffic.yaml +++ b/config/samples/flowmetrics/cluster_external_egress_traffic.yaml @@ -14,15 +14,15 @@ spec: matchType: Absence charts: - dashboardName: Main - title: Cluster egress traffic + title: External egress traffic unit: Bps type: SingleStat queries: - promQL: "sum(rate($METRIC[2m]))" legend: "" - dashboardName: Main - 
sectionName: Cluster - title: Top cluster egress traffic per workload + sectionName: External + title: Top external egress traffic per workload unit: Bps type: StackArea queries: diff --git a/config/samples/flowmetrics/cluster_external_ingress_rtt_seconds.yaml b/config/samples/flowmetrics/cluster_external_ingress_rtt_seconds.yaml index 8d8192bab..6fc0a2d95 100644 --- a/config/samples/flowmetrics/cluster_external_ingress_rtt_seconds.yaml +++ b/config/samples/flowmetrics/cluster_external_ingress_rtt_seconds.yaml @@ -2,13 +2,12 @@ apiVersion: flows.netobserv.io/v1alpha1 kind: FlowMetric metadata: - name: flowmetric-cluster-external-ingress-rtt-seconds + name: flowmetric-cluster-external-ingress-rtt spec: metricName: cluster_external_ingress_rtt_seconds type: Histogram valueField: TimeFlowRttNs direction: Ingress - includeDuplicates: true labels: [DstK8S_HostName,DstK8S_Namespace,DstK8S_OwnerName,DstK8S_OwnerType] filters: - field: SrcSubnetLabel @@ -26,7 +25,7 @@ spec: - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0" legend: "p99" - dashboardName: Main - sectionName: Cluster + sectionName: External title: "Top external ingress sRTT per workload, p50 (ms)" unit: seconds type: Line @@ -34,10 +33,19 @@ spec: - promQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{DstK8S_Namespace!=\"\"}[2m])) by (le,DstK8S_Namespace,DstK8S_OwnerName))*1000 > 0" legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" - dashboardName: Main - sectionName: Cluster + sectionName: External title: "Top external ingress sRTT per workload, p99 (ms)" unit: seconds type: Line queries: - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket{DstK8S_Namespace!=\"\"}[2m])) by (le,DstK8S_Namespace,DstK8S_OwnerName))*1000 > 0" legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" + + - dashboardName: Main + sectionName: External + title: "Top external ingress sRTT per workload, avg (ms)" + unit: seconds + type: Line + queries: + - promQL: 
"(sum(rate($METRIC_sum{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace,DstK8S_OwnerName) / sum(rate($METRIC_count{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace,DstK8S_OwnerName))*1000" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" diff --git a/config/samples/flowmetrics/cluster_external_ingress_traffic.yaml b/config/samples/flowmetrics/cluster_external_ingress_traffic.yaml index bb75ff2fc..a542aae2c 100644 --- a/config/samples/flowmetrics/cluster_external_ingress_traffic.yaml +++ b/config/samples/flowmetrics/cluster_external_ingress_traffic.yaml @@ -14,15 +14,15 @@ spec: matchType: Absence charts: - dashboardName: Main - title: Cluster ingress traffic + title: External ingress traffic unit: Bps type: SingleStat queries: - promQL: "sum(rate($METRIC[2m]))" legend: "" - dashboardName: Main - sectionName: Cluster - title: Top cluster ingress traffic per workload + sectionName: External + title: Top external ingress traffic per workload unit: Bps type: StackArea queries: diff --git a/config/samples/flows_v1alpha1_flowmetric.yaml b/config/samples/flows_v1alpha1_flowmetric.yaml index 957b8bfb0..d048bd32f 100644 --- a/config/samples/flows_v1alpha1_flowmetric.yaml +++ b/config/samples/flows_v1alpha1_flowmetric.yaml @@ -10,7 +10,7 @@ metadata: name: flowmetric-sample spec: # More examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics - metricName: cluster_ingress_bytes_total + metricName: cluster_external_ingress_bytes_total type: Counter valueField: Bytes direction: Ingress @@ -20,15 +20,15 @@ spec: matchType: Absence charts: - dashboardName: Main - title: Cluster ingress traffic + title: External ingress traffic unit: Bps type: SingleStat queries: - promQL: "sum(rate($METRIC[2m]))" legend: "" - dashboardName: Main - sectionName: Cluster - title: Top cluster ingress traffic per workload + sectionName: External + title: Top external ingress traffic per workload unit: Bps type: StackArea queries: diff 
--git a/controllers/flp/metrics_api_test.go b/controllers/flp/metrics_api_test.go index 011bc749a..3ad60638e 100644 --- a/controllers/flp/metrics_api_test.go +++ b/controllers/flp/metrics_api_test.go @@ -59,11 +59,10 @@ func TestFlowMetricToFLP(t *testing.T) { Filters: []metricslatest.MetricFilter{{Field: "f", Value: "v", MatchType: metricslatest.MatchEqual}}, }}, {Spec: metricslatest.FlowMetricSpec{ - MetricName: "m_2", - Type: metricslatest.HistogramMetric, - Labels: []string{"by_field"}, - IncludeDuplicates: true, - Direction: metricslatest.Egress, + MetricName: "m_2", + Type: metricslatest.HistogramMetric, + Labels: []string{"by_field"}, + Direction: metricslatest.Egress, Filters: []metricslatest.MetricFilter{ {Field: "f", Value: "v", MatchType: metricslatest.MatchRegex}, {Field: "f2", MatchType: metricslatest.MatchAbsence}, @@ -92,7 +91,6 @@ func TestFlowMetricToFLP(t *testing.T) { Type: "counter", Filters: []api.MetricsFilter{ {Key: "f", Value: "v", Type: api.MetricFilterEqual}, - {Key: "Duplicate", Value: "true", Type: api.MetricFilterNotEqual}, }, ValueKey: "val", Labels: []string{"by_field"}, diff --git a/pkg/metrics/helper.go b/pkg/metrics/helper.go index 2a548f135..4054cb16e 100644 --- a/pkg/metrics/helper.go +++ b/pkg/metrics/helper.go @@ -6,13 +6,6 @@ import ( func GetFilters(fm *metricslatest.FlowMetricSpec) []metricslatest.MetricFilter { var filters []metricslatest.MetricFilter - if !fm.IncludeDuplicates { - filters = append(filters, metricslatest.MetricFilter{ - Field: "Duplicate", - Value: "true", - MatchType: metricslatest.MatchNotEqual, - }) - } if fm.Direction == metricslatest.Egress { filters = append(filters, metricslatest.MetricFilter{ Field: "FlowDirection", diff --git a/pkg/metrics/predefined_metrics.go b/pkg/metrics/predefined_metrics.go index a5e49a652..7f16b8c5d 100644 --- a/pkg/metrics/predefined_metrics.go +++ b/pkg/metrics/predefined_metrics.go @@ -63,13 +63,12 @@ func init() { lowDir := strings.ToLower(string(dir)) predefinedMetrics 
= append(predefinedMetrics, taggedMetricDefinition{ FlowMetricSpec: metricslatest.FlowMetricSpec{ - MetricName: fmt.Sprintf("%s_%s_%s_total", groupTrimmed, lowDir, vt), - Type: metricslatest.CounterMetric, - ValueField: valueField, - IncludeDuplicates: false, - Direction: dir, - Labels: labels, - Charts: trafficCharts(group, vt, lowDir), + MetricName: fmt.Sprintf("%s_%s_%s_total", groupTrimmed, lowDir, vt), + Type: metricslatest.CounterMetric, + ValueField: valueField, + Direction: dir, + Labels: labels, + Charts: trafficCharts(group, vt, lowDir), }, tags: []string{group, vt, lowDir}, }) @@ -78,20 +77,18 @@ func init() { // Flows metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ FlowMetricSpec: metricslatest.FlowMetricSpec{ - MetricName: fmt.Sprintf("%s_flows_total", groupTrimmed), - Type: "counter", - Labels: labels, - IncludeDuplicates: true, + MetricName: fmt.Sprintf("%s_flows_total", groupTrimmed), + Type: "counter", + Labels: labels, }, tags: []string{group, group + "-flows", "flows"}, }) // RTT metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ FlowMetricSpec: metricslatest.FlowMetricSpec{ - MetricName: fmt.Sprintf("%s_rtt_seconds", groupTrimmed), - Type: metricslatest.HistogramMetric, - ValueField: "TimeFlowRttNs", - IncludeDuplicates: true, + MetricName: fmt.Sprintf("%s_rtt_seconds", groupTrimmed), + Type: metricslatest.HistogramMetric, + ValueField: "TimeFlowRttNs", Filters: []metricslatest.MetricFilter{ {Field: "TimeFlowRttNs", MatchType: metricslatest.MatchPresence}, }, @@ -105,10 +102,9 @@ func init() { // Drops metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ FlowMetricSpec: metricslatest.FlowMetricSpec{ - MetricName: fmt.Sprintf("%s_drop_packets_total", groupTrimmed), - Type: metricslatest.CounterMetric, - ValueField: "PktDropPackets", - IncludeDuplicates: false, + MetricName: fmt.Sprintf("%s_drop_packets_total", groupTrimmed), + Type: metricslatest.CounterMetric, + 
ValueField: "PktDropPackets", Filters: []metricslatest.MetricFilter{ {Field: "PktDropPackets", MatchType: metricslatest.MatchPresence}, }, @@ -119,10 +115,9 @@ func init() { }) predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ FlowMetricSpec: metricslatest.FlowMetricSpec{ - MetricName: fmt.Sprintf("%s_drop_bytes_total", groupTrimmed), - Type: metricslatest.CounterMetric, - ValueField: "PktDropBytes", - IncludeDuplicates: false, + MetricName: fmt.Sprintf("%s_drop_bytes_total", groupTrimmed), + Type: metricslatest.CounterMetric, + ValueField: "PktDropBytes", Filters: []metricslatest.MetricFilter{ {Field: "PktDropBytes", MatchType: metricslatest.MatchPresence}, }, From 88a08ae409965ceb5d60b4564136db41e2d18e39 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Wed, 17 Apr 2024 12:11:24 +0200 Subject: [PATCH 11/13] Add FlowMetrics upstream doc - generate markdown API reference - document in Metrics.md --- Makefile | 1 + docs/FlowMetric.md | 339 +++++++++++++++++++++++++++++++++++++++++++++ docs/Metrics.md | 144 +++++++++++++++++++ 3 files changed, 484 insertions(+) create mode 100644 docs/FlowMetric.md diff --git a/Makefile b/Makefile index ac4abd188..331e90c5c 100644 --- a/Makefile +++ b/Makefile @@ -267,6 +267,7 @@ gencode: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and doc: crdoc ## Generate markdown documentation $(CRDOC) --resources config/crd/bases/flows.netobserv.io_flowcollectors.yaml --output docs/FlowCollector.md + $(CRDOC) --resources config/crd/bases/flows.netobserv.io_flowmetrics.yaml --output docs/FlowMetric.md generate-go-conversions: $(CONVERSION_GEN) ## Run all generate-go-conversions $(MAKE) clean-generated-conversions SRC_DIRS="./apis/flowcollector/v1beta1" diff --git a/docs/FlowMetric.md b/docs/FlowMetric.md new file mode 100644 index 000000000..9f5c2e9d0 --- /dev/null +++ b/docs/FlowMetric.md @@ -0,0 +1,339 @@ +# API Reference + +Packages: + +- [flows.netobserv.io/v1alpha1](#flowsnetobserviov1alpha1) + +# 
flows.netobserv.io/v1alpha1 + +Resource Types: + +- [FlowMetric](#flowmetric) + + + + +## FlowMetric +[↩ Parent](#flowsnetobserviov1alpha1 ) + + + + + + +FlowMetric is the Schema for the flowmetrics API + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
apiVersionstringflows.netobserv.io/v1alpha1true
kindstringFlowMetrictrue
metadataobjectRefer to the Kubernetes API documentation for the fields of the `metadata` field.true
specobject + FlowMetricSpec defines the desired state of FlowMetric +The provided API allows you to customize these metrics according to your needs.
+When adding new metrics or modifying existing labels, you must carefully monitor the memory +usage of Prometheus workloads as this could potentially have a high impact. Cf https://rhobs-handbook.netlify.app/products/openshiftmonitoring/telemetry.md/#what-is-the-cardinality-of-a-metric
+To check the cardinality of all NetObserv metrics, run as `promql`: `count({__name__=~"netobserv.*"}) by (__name__)`.
+
false
statusobject + FlowMetricStatus defines the observed state of FlowMetric
+
false
+ + +### FlowMetric.spec +[↩ Parent](#flowmetric) + + + +FlowMetricSpec defines the desired state of FlowMetric +The provided API allows you to customize these metrics according to your needs.
+When adding new metrics or modifying existing labels, you must carefully monitor the memory +usage of Prometheus workloads as this could potentially have a high impact. Cf https://rhobs-handbook.netlify.app/products/openshiftmonitoring/telemetry.md/#what-is-the-cardinality-of-a-metric
+To check the cardinality of all NetObserv metrics, run as `promql`: `count({__name__=~"netobserv.*"}) by (__name__)`. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
metricNamestring + Name of the metric. In Prometheus, it is automatically prefixed with "netobserv_".
+
true
typeenum + Metric type: "Counter" or "Histogram". +Use "Counter" for any value that increases over time and on which you can compute a rate, such as Bytes or Packets. +Use "Histogram" for any value that must be sampled independently, such as latencies.
+
+ Enum: Counter, Histogram
+
true
buckets[]string + A list of buckets to use when `type` is "Histogram". The list must be parseable as floats. When not set, Prometheus default buckets are used.
+
false
charts[]object + Charts configuration, for the OpenShift Console in the administrator view, Dashboards menu.
+
false
directionenum + Filter for ingress, egress or any direction flows. +When set to `Ingress`, it is equivalent to adding the regex filter on `FlowDirection`: `0|2`. +When set to `Egress`, it is equivalent to adding the regex filter on `FlowDirection`: `1|2`.
+
+ Enum: Any, Egress, Ingress
+ Default: Any
+
false
dividerstring + When non-zero, scale factor (divider) of the value. Metric value = Flow value / Divider.
+
false
filters[]object + `filters` is a list of fields and values used to restrict which flows are taken into account. Oftentimes, these filters must +be used to eliminate duplicates: `Duplicate != "true"` and `FlowDirection = "0"`. +Refer to the documentation for the list of available fields: https://docs.openshift.com/container-platform/latest/networking/network_observability/json-flows-format-reference.html.
+
false
labels[]string + `labels` is a list of fields that should be used as Prometheus labels, also known as dimensions. +From choosing labels results the level of granularity of this metric, as well as the available aggregations at query time. +It must be done carefully as it impacts the metric cardinality (cf https://rhobs-handbook.netlify.app/products/openshiftmonitoring/telemetry.md/#what-is-the-cardinality-of-a-metric). +In general, avoid setting very high cardinality labels such as IP or MAC addresses. +"SrcK8S_OwnerName" or "DstK8S_OwnerName" should be preferred over "SrcK8S_Name" or "DstK8S_Name" as much as possible. +Refer to the documentation for the list of available fields: https://docs.openshift.com/container-platform/latest/network_observability/json-flows-format-reference.html.
+
false
valueFieldstring + `valueField` is the flow field that must be used as a value for this metric. This field must hold numeric values. +Leave empty to count flows rather than a specific value per flow. +Refer to the documentation for the list of available fields: https://docs.openshift.com/container-platform/latest/networking/network_observability/json-flows-format-reference.html.
+
false
+ + +### FlowMetric.spec.charts[index] +[↩ Parent](#flowmetricspec) + + + +Configures charts / dashboard generation associated to a metric + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
dashboardNamestring + Name of the containing dashboard. If this name does not refer to an existing dashboard, a new dashboard is created.
+
+ Default: Main
+
true
queries[]object + List of queries to be displayed on this chart. If `type` is `SingleStat` and multiple queries are provided, +this chart is automatically expanded in several panels (one per query).
+
true
titlestring + Title of the chart.
+
true
typeenum + Type of the chart.
+
+ Enum: SingleStat, Line, StackArea
+
true
sectionNamestring + Name of the containing dashboard section. If this name does not refer to an existing section, a new section is created. +If `sectionName` is omitted or empty, the chart is placed in the global top section.
+
false
unitenum + Unit of this chart. Only a few units are currently supported. Leave empty to use generic number.
+
+ Enum: bytes, seconds, Bps, pps, percent
+
false
+ + +### FlowMetric.spec.charts[index].queries[index] +[↩ Parent](#flowmetricspecchartsindex) + + + +Configures PromQL queries + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
legendstring + The query legend that applies to each timeseries represented in this chart. When multiple timeseries are displayed, you should set a legend +that distinguishes each of them. It can be done with the following format: `{{ Label }}`. For example, if the `promQL` groups timeseries per +label such as: `sum(rate($METRIC[2m])) by (Label1, Label2)`, you may write as the legend: `Label1={{ Label1 }}, Label2={{ Label2 }}`.
+
true
promQLstring + The `promQL` query to be run against Prometheus. If the chart `type` is `SingleStat`, this query should only return +a single timeseries. For other types, a top 7 is displayed. +You can use `$METRIC` to refer to the metric defined in this resource. For example: `sum(rate($METRIC[2m]))`. +To learn more about `promQL`, refer to the Prometheus documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/
+
true
topinteger + Top N series to display per timestamp. Does not apply to `SingleStat` chart type.
+
+ Default: 7
+ Minimum: 1
+
true
+ + +### FlowMetric.spec.filters[index] +[↩ Parent](#flowmetricspec) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
fieldstring + Name of the field to filter on
+
true
matchTypeenum + Type of matching to apply
+
+ Enum: Equal, NotEqual, Presence, Absence, MatchRegex, NotMatchRegex
+ Default: Equal
+
true
valuestring + Value to filter on. When `matchType` is `Equal` or `NotEqual`, you can use field injection with `$(SomeField)` to refer to any other field of the flow.
+
false
\ No newline at end of file diff --git a/docs/Metrics.md b/docs/Metrics.md index 13e9d1df8..68736f10b 100644 --- a/docs/Metrics.md +++ b/docs/Metrics.md @@ -1,6 +1,14 @@ # Metrics in the NetObserv Operator The NetObserv operator uses [flowlogs-pipeline](https://github.com/netobserv/flowlogs-pipeline/) to generate metrics out of flow logs. +These metrics are meant to be collected by a Prometheus instance (not part of NetObserv deployment). In OpenShift, they are collected either by Cluster Monitoring or User Workload Monitoring. + +There are two ways to configure metrics: + +- By enabling or disabling any of the predefined metrics +- Using the FlowMetrics API to create custom metrics + +## Predefined metrics They can be configured in the `FlowCollector` custom resource, via `spec.processor.metrics.includeList`. It is a list of metric names that tells which ones to generate. @@ -42,3 +50,139 @@ When the `DNSTracking` feature is enabled in `spec.agent.ebpf.features`, additio - `namespace_dns_latency_seconds` `*` - `node_dns_latency_seconds` - `workload_dns_latency_seconds` + +## Custom metrics using the FlowMetrics API + +The FlowMetrics API ([spec reference](./FlowMetric.md)) has been designed to give you full control on the metrics generation out of the NetObserv' enriched NetFlow data. +It allows to create counters or histograms with any set of fields as Prometheus labels, and using any filters from the fields. Just a recommendation: be careful about the [metrics cardinality](https://www.robustperception.io/cardinality-is-key/) when creating new metrics. High cardinality metrics can stress the Prometheus instance. Don't hesitate to [reach out](https://github.com/netobserv/network-observability-operator/discussions/new/choose) if you need some guidance. + +The full list of fields is [available there](./flows-format.adoc). 
+Some of those fields require special features to be enabled in `FlowCollector`, such as `TimeFlowRttNs` via `spec.agent.ebpf.features` or `Src/DstK8S_Zone` via `spec.processor.addZone`.
+
+Currently, `FlowMetric` resources need to be created in the namespace defined in `FlowCollector` `spec.namespace`, which is `netobserv` by default. This may change in the future.
+
+### Counter example
+
+Here is an example of a FlowMetric resource that generates a metric tracking ingress bytes received from cluster external sources, labelled by destination host and workload:
+
+```yaml
+apiVersion: flows.netobserv.io/v1alpha1
+kind: FlowMetric
+metadata:
+  name: flowmetric-cluster-external-ingress-traffic
+spec:
+  metricName: cluster_external_ingress_bytes_total
+  type: Counter
+  valueField: Bytes
+  direction: Ingress
+  labels: [DstK8S_HostName,DstK8S_Namespace,DstK8S_OwnerName,DstK8S_OwnerType]
+  filters:
+  - field: SrcSubnetLabel
+    matchType: Absence
+```
+
+In this example, selecting just the cluster external traffic is done by matching only flows where `SrcSubnetLabel` is absent. This assumes the subnet labels feature is enabled (via `spec.processor.subnetLabels`) and configured to recognize IP ranges used in the cluster. In OpenShift, this is enabled and configured by default.
+
+Refer to the [spec reference](./FlowMetric.md) for more information about each field.
+
+### Histogram example
+
+Here is a similar example for a histogram. Histograms are typically used for latencies. This example shows RTT latency for cluster external ingress traffic.
+ +```yaml +apiVersion: flows.netobserv.io/v1alpha1 +kind: FlowMetric +metadata: + name: flowmetric-cluster-external-ingress-rtt +spec: + metricName: cluster_external_ingress_rtt_seconds + type: Histogram + valueField: TimeFlowRttNs + direction: Ingress + labels: [DstK8S_HostName,DstK8S_Namespace,DstK8S_OwnerName,DstK8S_OwnerType] + filters: + - field: SrcSubnetLabel + matchType: Absence + - field: TimeFlowRttNs + matchType: Presence + divider: "1000000000" + buckets: [".001", ".005", ".01", ".02", ".03", ".04", ".05", ".075", ".1", ".25", "1"] +``` + +`type` here is `Histogram` since it looks for a latency value (`TimeFlowRttNs`), +and we define custom buckets that should offer a decent precision on RTT ranging roughly between 5ms and 250ms. +Since the RTT is provided as nanos in flows, we use a divider of 1 billion to convert into seconds (which is standard in Prometheus guidelines). + +### More examples + +You can find more examples in https://github.com/netobserv/network-observability-operator/tree/main/config/samples/flowmetrics. + +### Charts (OpenShift only) + +Optionally, you can generate charts for dashboards in the OpenShift Console (administrator view, Dashboard menu), by filling the `charts` section of the `FlowMetric` resources. + +Here is an example for the `flowmetric-cluster-external-ingress-traffic` resource described above: + +```yaml +# ... 
+ charts: + - dashboardName: Main + title: External ingress traffic + unit: Bps + type: SingleStat + queries: + - promQL: "sum(rate($METRIC[2m]))" + legend: "" + - dashboardName: Main + sectionName: External + title: Top external ingress traffic per workload + unit: Bps + type: StackArea + queries: + - promQL: "sum(rate($METRIC{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace, DstK8S_OwnerName)" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" +``` + +This creates two panels: +- a textual "single stat" that shows global external ingress rate summed across all dimensions +- a timeseries graph showing the same metric per destination workload + +For more information about the query language, refer to the [Prometheus documentation](https://prometheus.io/docs/prometheus/latest/querying/basics/). +And again, refer to the [spec reference](./FlowMetric.md) for more information about each field. + +Another example for histograms: + +```yaml +# ... + charts: + - dashboardName: Main + title: External ingress TCP latency + unit: seconds + type: SingleStat + queries: + - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0" + legend: "p99" + - dashboardName: Main + sectionName: External + title: "Top external ingress sRTT per workload, p50 (ms)" + unit: seconds + type: Line + queries: + - promQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{DstK8S_Namespace!=\"\"}[2m])) by (le,DstK8S_Namespace,DstK8S_OwnerName))*1000 > 0" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" + - dashboardName: Main + sectionName: External + title: "Top external ingress sRTT per workload, p99 (ms)" + unit: seconds + type: Line + queries: + - promQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket{DstK8S_Namespace!=\"\"}[2m])) by (le,DstK8S_Namespace,DstK8S_OwnerName))*1000 > 0" + legend: "{{DstK8S_Namespace}} / {{DstK8S_OwnerName}}" +``` + +This example uses the `histogram_quantile` function, to show p50 and p99. 
+You may also be interested in showing averages of histograms: this is done by dividing `$METRIC_sum` by `$METRIC_count` metrics, which are automatically generated when you create an histogram. With the above example, it would be: + +```yaml +promQL: "(sum(rate($METRIC_sum{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace,DstK8S_OwnerName) / sum(rate($METRIC_count{DstK8S_Namespace!=\"\"}[2m])) by (DstK8S_Namespace,DstK8S_OwnerName))*1000" +``` From 515c7e53c46f05ecf4e2616fdc63f47046e87a9d Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Fri, 19 Apr 2024 10:24:53 +0200 Subject: [PATCH 12/13] Apply suggestions from code review Co-authored-by: Mehul Modi --- config/samples/flowmetrics/service_ports.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/samples/flowmetrics/service_ports.yaml b/config/samples/flowmetrics/service_ports.yaml index 85608df74..70350add8 100644 --- a/config/samples/flowmetrics/service_ports.yaml +++ b/config/samples/flowmetrics/service_ports.yaml @@ -11,10 +11,10 @@ spec: filters: - field: DstPort value: "^\\d\\d?\\d?\\d?$" - matchType: Regex + matchType: MatchRegex - field: Duplicate value: "true" matchType: NotEqual - field: FlowDirection value: "1|2" - matchType: Regex + matchType: MatchRegex From fe3a9e6814a40b033f73eb1e816b3b356ed4965e Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Fri, 19 Apr 2024 10:52:57 +0200 Subject: [PATCH 13/13] Add more info on cardinality --- docs/Metrics.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/Metrics.md b/docs/Metrics.md index 68736f10b..97beb928d 100644 --- a/docs/Metrics.md +++ b/docs/Metrics.md @@ -54,9 +54,14 @@ When the `DNSTracking` feature is enabled in `spec.agent.ebpf.features`, additio ## Custom metrics using the FlowMetrics API The FlowMetrics API ([spec reference](./FlowMetric.md)) has been designed to give you full control on the metrics generation out of the NetObserv' enriched NetFlow data. 
-It allows to create counters or histograms with any set of fields as Prometheus labels, and using any filters from the fields. Just a recommendation: be careful about the [metrics cardinality](https://www.robustperception.io/cardinality-is-key/) when creating new metrics. High cardinality metrics can stress the Prometheus instance. Don't hesitate to [reach out](https://github.com/netobserv/network-observability-operator/discussions/new/choose) if you need some guidance.
+It allows you to create counters or histograms with any set of fields as Prometheus labels, and using any filters from the fields. Just a recommendation: be careful about the [metrics cardinality](https://www.robustperception.io/cardinality-is-key/) when creating new metrics. High cardinality metrics can stress the Prometheus instance.
+
+The full list of fields is [available there](./flows-format.adoc). The "Cardinality" column gives information about the implied metrics cardinality. Fields flagged as `fine` are safe to use as labels. Fields flagged as `careful` need some extra attention: if you want to use them as labels, it is recommended to narrow down the cardinality with filters. For example, you may safely use `DstPort` as a label if you also restrict which `DstPort` are allowed with a `MatchRegex` filter.
+
+Also be aware that for each field used as a label, the fields cardinality is potentially multiplied - and this is especially true when mixing Source and Destination fields. For instance, using `SrcK8S_Name` or `DstK8S_Name` (i.e. Pod/Node/Service names) alone as a label might be reasonable, but using both `SrcK8S_Name` and `DstK8S_Name` in the same metric potentially generates the square of the cardinality of Pods/Nodes/Services.
+
+Don't hesitate to [reach out](https://github.com/netobserv/network-observability-operator/discussions/new/choose) if you need more guidance.
-The full list of fields is [available there](./flows-format.adoc). 
Some of those fields require special features to be enabled in `FlowCollector`, such as `TimeFlowRttNs` via `spec.agent.ebpf.features` or `Src/DstK8S_Zone` via `spec.processor.addZone`. Currently, `FlowMetric` resources need to be created in the namespace defined in `FlowCollector` `spec.namespace`, which is `netobserv` by default. This may change in the future.