From 7d96d0fd996895d0cba9df2bfd56f9c72578fad5 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Wed, 3 Apr 2024 17:40:30 +0200 Subject: [PATCH] NETOBSERV-1343: generate dashboards from metrics API - Add dashboard config to metrics API - Use this API internally for predefined dashboards - Allow using SingleStats - New dedicated buckets for latency histograms --- apis/flowmetrics/v1alpha1/flowmetric_types.go | 37 +++ controllers/flp/flp_pipeline_builder.go | 35 +-- .../monitoring/monitoring_controller.go | 30 +- controllers/monitoring/monitoring_objects.go | 8 +- pkg/dashboards/dashboard.go | 294 +++++------------- pkg/dashboards/dashboard_test.go | 118 +++---- pkg/dashboards/health.go | 89 +++--- pkg/dashboards/model.go | 110 +++---- pkg/dashboards/scopes.go | 68 ---- pkg/metrics/helper.go | 30 ++ pkg/metrics/predefined_charts.go | 285 +++++++++++++++++ pkg/metrics/predefined_metrics.go | 136 ++++---- pkg/metrics/predefined_metrics_test.go | 18 +- 13 files changed, 715 insertions(+), 543 deletions(-) delete mode 100644 pkg/dashboards/scopes.go create mode 100644 pkg/metrics/helper.go create mode 100644 pkg/metrics/predefined_charts.go diff --git a/apis/flowmetrics/v1alpha1/flowmetric_types.go b/apis/flowmetrics/v1alpha1/flowmetric_types.go index 9206e89ad..f50f05153 100644 --- a/apis/flowmetrics/v1alpha1/flowmetric_types.go +++ b/apis/flowmetrics/v1alpha1/flowmetric_types.go @@ -109,6 +109,43 @@ type FlowMetricSpec struct { // A list of buckets to use when `type` is "Histogram". The list must be parseable as floats. Prometheus default buckets will be used if unset. // +optional Buckets []string `json:"buckets,omitempty"` + + // When non-zero, scale factor (divider) of the value. Metric value = Flow value / Divider. + // +optional + Divider float64 `json:"divider"` + + // Charts configuration + // +optional + Charts []Chart `json:"charts,omitempty"` +} + +type Unit string +type ChartType string + +const ( + UnitBytes Unit = "bytes" + UnitSeconds Unit = "seconds" + UnitBPS Unit = "Bps" + UnitPPS Unit = "pps" + ChartTypeSingleStat ChartType = "SingleStat" + ChartTypeLine ChartType = "Line" + ChartTypeStackArea ChartType = "StackArea" +) + +// Configures charts / dashboard generation associated to a metric +type Chart struct { + DashboardName string `json:"dashboardName"` + SectionName string `json:"sectionName"` + Title string `json:"title"` + Unit Unit `json:"unit"` + Type ChartType `json:"type"` + Queries []Query `json:"queries"` +} + +// Configures PromQL queries +type Query struct { + PromQL string `json:"promQL"` + Legend string `json:"legend"` } // FlowMetricStatus defines the observed state of FlowMetric diff --git a/controllers/flp/flp_pipeline_builder.go b/controllers/flp/flp_pipeline_builder.go index c2c405e61..7d2b32919 100644 --- a/controllers/flp/flp_pipeline_builder.go +++ b/controllers/flp/flp_pipeline_builder.go @@ -165,23 +165,23 @@ func (b *PipelineBuilder) AddProcessorStages() error { } // obtain encode_prometheus stage from metrics_definitions - names := metrics.GetIncludeList(b.desired) - promMetrics := metrics.GetDefinitions(names) + allMetrics := metrics.MergePredefined(b.flowMetrics.Items, b.desired) - for i := range b.flowMetrics.Items { - fm := &b.flowMetrics.Items[i] + var flpMetrics []api.MetricsItem + for i := range allMetrics { + fm := &allMetrics[i] m, err := flowMetricToFLP(&fm.Spec) if err != nil { return fmt.Errorf("error reading FlowMetric definition '%s': %w", fm.Name, err) } - promMetrics = append(promMetrics, *m) + flpMetrics = append(flpMetrics, *m) } - if len(promMetrics) > 0 { + if len(flpMetrics) > 0 { // prometheus stage (encode) configuration promEncode := api.PromEncode{ Prefix: "netobserv_", - Metrics: promMetrics, + Metrics: flpMetrics, } enrichedStage.EncodePrometheus("prometheus", promEncode) } @@ -192,23 +192,16 @@ func (b *PipelineBuilder) AddProcessorStages() error { func flowMetricToFLP(flowMetric *metricslatest.FlowMetricSpec) (*api.MetricsItem, error) { m := &api.MetricsItem{ - Name: flowMetric.MetricName, - Type: api.MetricEncodeOperationEnum(strings.ToLower(string(flowMetric.Type))), - Filters: []api.MetricsFilter{}, - Labels: flowMetric.Labels, - ValueKey: flowMetric.ValueField, + Name: flowMetric.MetricName, + Type: api.MetricEncodeOperationEnum(strings.ToLower(string(flowMetric.Type))), + Filters: []api.MetricsFilter{}, + Labels: flowMetric.Labels, + ValueKey: flowMetric.ValueField, + ValueScale: flowMetric.Divider, } - for _, f := range flowMetric.Filters { + for _, f := range metrics.GetFilters(flowMetric) { m.Filters = append(m.Filters, api.MetricsFilter{Key: f.Field, Value: f.Value, Type: api.MetricFilterEnum(conversion.PascalToLower(string(f.MatchType), '_'))}) } - if !flowMetric.IncludeDuplicates { - m.Filters = append(m.Filters, api.MetricsFilter{Key: "Duplicate", Value: "true", Type: api.MetricFilterNotEqual}) - } - if flowMetric.Direction == metricslatest.Egress { - m.Filters = append(m.Filters, api.MetricsFilter{Key: "FlowDirection", Value: "1|2", Type: api.MetricFilterRegex}) - } else if flowMetric.Direction == metricslatest.Ingress { - m.Filters = append(m.Filters, api.MetricsFilter{Key: "FlowDirection", Value: "0|2", Type: api.MetricFilterRegex}) - } for _, b := range flowMetric.Buckets { f, err := strconv.ParseFloat(b, 64) if err != nil { diff --git a/controllers/monitoring/monitoring_controller.go b/controllers/monitoring/monitoring_controller.go index 1f3b6cf19..d7e80c51f 100644 --- a/controllers/monitoring/monitoring_controller.go +++ b/controllers/monitoring/monitoring_controller.go @@ -9,9 +9,13 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" flowslatest "github.com/netobserv/network-observability-operator/apis/flowcollector/v1beta2" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" + "github.com/netobserv/network-observability-operator/controllers/constants" "github.com/netobserv/network-observability-operator/controllers/reconcilers" "github.com/netobserv/network-observability-operator/pkg/helper" "github.com/netobserv/network-observability-operator/pkg/manager" @@ -21,8 +25,9 @@ import ( type Reconciler struct { client.Client - mgr *manager.Manager - status status.Instance + mgr *manager.Manager + status status.Instance + currentNamespace string } func Start(ctx context.Context, mgr *manager.Manager) error { @@ -37,6 +42,15 @@ func Start(ctx context.Context, mgr *manager.Manager) error { For(&flowslatest.FlowCollector{}, reconcilers.IgnoreStatusChange). Named("monitoring"). Owns(&corev1.Namespace{}). + Watches( + &metricslatest.FlowMetric{}, + handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []reconcile.Request { + if o.GetNamespace() == r.currentNamespace { + return []reconcile.Request{{NamespacedName: constants.FlowCollectorName}} + } + return []reconcile.Request{} + }), + ). Complete(&r) } @@ -74,6 +88,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired *flowslatest.FlowCollector) error { ns := helper.GetNamespace(&desired.Spec) + r.currentNamespace = ns // If namespace does not exist, we create it nsExist, err := r.namespaceExist(ctx, ns) @@ -104,8 +119,14 @@ func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired } if r.mgr.HasSvcMonitor() { - names := metrics.GetIncludeList(&desired.Spec) - desiredFlowDashboardCM, del, err := buildFlowMetricsDashboard(ns, names) + // List custom metrics + fm := metricslatest.FlowMetricList{} + if err := r.Client.List(ctx, &fm, &client.ListOptions{Namespace: ns}); err != nil { + return r.status.Error("CantListFlowMetrics", err) + } + + allMetrics := metrics.MergePredefined(fm.Items, &desired.Spec) + desiredFlowDashboardCM, del, err := buildFlowMetricsDashboard(allMetrics) if err != nil { return err } else if err = reconcilers.ReconcileConfigMap(ctx, clh, desiredFlowDashboardCM, del); err != nil { @@ -119,6 +140,7 @@ func (r *Reconciler) reconcile(ctx context.Context, clh *helper.Client, desired return err } } + return nil } diff --git a/controllers/monitoring/monitoring_objects.go b/controllers/monitoring/monitoring_objects.go index 08b35284c..3b77b191e 100644 --- a/controllers/monitoring/monitoring_objects.go +++ b/controllers/monitoring/monitoring_objects.go @@ -1,6 +1,7 @@ package monitoring import ( + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" "github.com/netobserv/network-observability-operator/controllers/constants" "github.com/netobserv/network-observability-operator/pkg/dashboards" corev1 "k8s.io/api/core/v1" @@ -74,11 +75,8 @@ func buildRoleBindingMonitoringReader(ns string) *rbacv1.ClusterRoleBinding { } } -func buildFlowMetricsDashboard(namespace string, metrics []string) (*corev1.ConfigMap, bool, error) { - dashboard, err := dashboards.CreateFlowMetricsDashboard(namespace, metrics) - if err != nil { - return nil, false, err - } +func buildFlowMetricsDashboard(metrics []metricslatest.FlowMetric) (*corev1.ConfigMap, bool, error) { + dashboard := dashboards.CreateFlowMetricsDashboards(metrics) configMap := corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/dashboards/dashboard.go b/pkg/dashboards/dashboard.go index d938c8a36..59c2fd528 100644 --- a/pkg/dashboards/dashboard.go +++ b/pkg/dashboards/dashboard.go @@ -4,246 +4,102 @@ import ( "fmt" "strings" - "k8s.io/utils/strings/slices" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" ) -const ( - layerApps = "Applications" - layerInfra = "Infrastructure" - appsFilters1 = `SrcK8S_Namespace!~"|$NETOBSERV_NS|openshift.*"` - appsFilters2 = `SrcK8S_Namespace=~"$NETOBSERV_NS|openshift.*",DstK8S_Namespace!~"|$NETOBSERV_NS|openshift.*"` - infraFilters1 = `SrcK8S_Namespace=~"$NETOBSERV_NS|openshift.*"` - infraFilters2 = `SrcK8S_Namespace!~"$NETOBSERV_NS|openshift.*",DstK8S_Namespace=~"$NETOBSERV_NS|openshift.*"` - metricTagIngress = "ingress" - metricTagEgress = "egress" - metricTagBytes = "bytes" - metricTagPackets = "packets" -) - -var allRows []*Row +type chart struct { + metricslatest.Chart + mptr *metricslatest.FlowMetric +} -func init() { - for _, scope := range []metricScope{srcDstNodeScope, srcDstNamespaceScope, srcDstWorkloadScope} { - // byte/pkt rates - for _, valueType := range []string{metricTagBytes, metricTagPackets} { - valueTypeText := valueTypeToText(valueType) - for _, dir := range []string{metricTagEgress, metricTagIngress} { - title := fmt.Sprintf( - "%s rate %s %s", - valueTypeText, - dirToVerb(dir), - scope.titlePart, - ) - metric := fmt.Sprintf("%s_%s_%s_total", scope.metricPart, dir, valueType) - allRows = append(allRows, row( - metric, - title, - topRatePanels(&scope, metric, scope.joinLabels(), scope.legendPart), - )) - } - // drops - title := fmt.Sprintf( - "%s drop rate %s", - valueTypeText, - scope.titlePart, - ) - metric := fmt.Sprintf("%s_drop_%s_total", scope.metricPart, valueType) - allRows = append(allRows, row( - metric, - title, - topRatePanels(&scope, metric, scope.joinLabels(), scope.legendPart), - )) +func createSingleStatPanels(c *chart) []Panel { + var panels []Panel + for _, q := range c.Queries { + title := c.Title + if q.Legend != "" { + title += ", " + q.Legend } - // RTT - title := fmt.Sprintf("Round-trip time %s (milliseconds - p99 and p50)", scope.titlePart) - metric := fmt.Sprintf("%s_rtt_seconds", scope.metricPart) - allRows = append(allRows, row( - metric, - title, - histogramPanels(&scope, metric, scope.joinLabels(), scope.legendPart, "*1000"), - )) - // DNS latency - title = fmt.Sprintf("DNS latency %s (milliseconds - p99 and p50)", scope.titlePart) - metric = fmt.Sprintf("%s_dns_latency_seconds", scope.metricPart) - allRows = append(allRows, row( - metric, - title, - histogramPanels(&scope, metric, scope.joinLabels(), scope.legendPart, "*1000"), - )) - // DNS errors - title = fmt.Sprintf("DNS request rate per code and %s", scope.titlePart) - metric = fmt.Sprintf("%s_dns_latency_seconds", scope.metricPart) - labels := scope.joinLabels() + ",DnsFlagsResponseCode" - legend := scope.legendPart + ", {{DnsFlagsResponseCode}}" - allRows = append(allRows, row( - metric, - title, - topRatePanels(&scope, metric+"_count", labels, legend), - )) + query := strings.ReplaceAll(q.PromQL, "$METRIC", "netobserv_"+c.mptr.Spec.MetricName) + newPanel := NewPanel(title, metricslatest.ChartTypeSingleStat, c.Unit, 3, NewTarget(query, "")) + panels = append(panels, newPanel) } + return panels } -func row(metrics string, title string, panels []Panel) *Row { - r := NewRow(title, false, "250px", panels) - r.Metric = metrics - return r +func createGraphPanel(c *chart) Panel { + var targets []Target + for _, q := range c.Queries { + query := strings.ReplaceAll(q.PromQL, "$METRIC", "netobserv_"+c.mptr.Spec.MetricName) + query = fmt.Sprintf("topk(7, %s)", query) + targets = append(targets, NewTarget(query, q.Legend)) + } + return NewPanel(c.Title, c.Type, c.Unit, 4, targets...) } -func topRatePanels(scope *metricScope, metric, labels, legend string) []Panel { - if scope.splitAppInfra { - return []Panel{ - // App - NewPanel( - layerApps, PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - NewTarget( - scope.labelReplace( - fmt.Sprintf( - "topk(10,sum(rate(netobserv_%s{%s}[2m]) or rate(netobserv_%s{%s}[2m])) by (%s))", - metric, - appsFilters1, - metric, - appsFilters2, - labels, - ), - ), - legend, - ), - }, - ), - // Infra - NewPanel( - layerInfra, PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - NewTarget( - scope.labelReplace( - fmt.Sprintf( - "topk(10,sum(rate(netobserv_%s{%s}[2m]) or rate(netobserv_%s{%s}[2m])) by (%s))", - metric, - infraFilters1, - metric, - infraFilters2, - labels, - ), - ), - legend, - ), - }, - ), +func rearrangeRows(rows []*Row, mapTopPanels, mapBodyPanels map[string][]Panel) { + for i, row := range rows { + topPanels := mapTopPanels[row.Title] + bodyPanels := mapBodyPanels[row.Title] + // Most of the time, panels are correctly arranged within a section. + // Excepted when there are 4 panels (or 3*rows+1), it shows 3 on first row then 1 on the second row + // We'll change that to 2 + 2 + count := len(bodyPanels) + if count > 3 && count%3 == 1 { + // Set Span=6 (half page) for the two last panels + bodyPanels[count-1].Span = 6 + bodyPanels[count-2].Span = 6 } + rows[i].Panels = topPanels + rows[i].Panels = append(rows[i].Panels, bodyPanels...) } - // No split - return []Panel{NewPanel( - "", PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - NewTarget( - scope.labelReplace( - fmt.Sprintf("topk(10,sum(rate(netobserv_%s[2m])) by (%s))", metric, labels), - ), - legend, - ), - }, - )} } -func histogramPanels(scope *metricScope, metric, labels, legend, scaler string) []Panel { - if scope.splitAppInfra { - appRateExpr := fmt.Sprintf( - "rate(netobserv_%s_bucket{%s}[2m]) or rate(netobserv_%s_bucket{%s}[2m])", - metric, - appsFilters1, - metric, - appsFilters2, - ) - infraRateExpr := fmt.Sprintf( - "rate(netobserv_%s_bucket{%s}[2m]) or rate(netobserv_%s_bucket{%s}[2m])", - metric, - infraFilters1, - metric, - infraFilters2, - ) - return []Panel{ - // App - NewPanel( - layerApps, PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - histogramTarget(scope, "0.99", appRateExpr, labels, legend, scaler), - histogramTarget(scope, "0.50", appRateExpr, labels, legend, scaler), - }, - ), - // Infra - NewPanel( - layerInfra, PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - histogramTarget(scope, "0.99", infraRateExpr, labels, legend, scaler), - histogramTarget(scope, "0.50", infraRateExpr, labels, legend, scaler), - }, - ), +func createFlowMetricsDashboard(dashboardName string, charts []chart) string { + mapRows := make(map[string]*Row) + mapTopPanels := make(map[string][]Panel) + mapBodyPanels := make(map[string][]Panel) + var orderedRows []*Row + chartsDedupMap := make(map[string]any) + for i := range charts { + chart := charts[i] + // A chart might be provided by several metrics, e.g. Total ingress bps can be provided by node_ingress_bytes_total and namespace_ingress_bytes_total + // Dedup them, assuming they have the same title+unit + dedupKey := chart.Title + "/" + string(chart.Unit) + if _, exists := chartsDedupMap[dedupKey]; exists { + continue } - } - // No split - rateExpr := fmt.Sprintf("rate(netobserv_%s_bucket[2m])", metric) - return []Panel{ - NewPanel( - "", PanelTypeGraph, PanelUnitShort, 6, false, - []Target{ - histogramTarget(scope, "0.99", rateExpr, labels, legend, scaler), - histogramTarget(scope, "0.50", rateExpr, labels, legend, scaler), - }, - ), - } -} + chartsDedupMap[dedupKey] = true -func histogramTarget(scope *metricScope, quantile, rateExpr, labels, legend, scaler string) Target { - return NewTarget( - scope.labelReplace( - fmt.Sprintf( - "topk(10,histogram_quantile(%s, sum(%s) by (le,%s))%s > 0)", - quantile, - rateExpr, - labels, - scaler, - ), - ), - legend+", q="+quantile, - ) -} + if chart.Type == metricslatest.ChartTypeSingleStat { + mapTopPanels[chart.SectionName] = append(mapTopPanels[chart.SectionName], createSingleStatPanels(&chart)...) + } else { + mapBodyPanels[chart.SectionName] = append(mapBodyPanels[chart.SectionName], createGraphPanel(&chart)) + } -func dirToVerb(dir string) string { - switch dir { - case metricTagEgress: - return "sent" - case metricTagIngress: - return "received" + if _, exists := mapRows[chart.SectionName]; !exists { + row := NewRow(chart.SectionName, false, "250px", nil) + mapRows[chart.SectionName] = row + orderedRows = append(orderedRows, row) + } } - return "" -} -func valueTypeToText(t string) string { - switch t { - case metricTagBytes: - return "Byte" - case metricTagPackets: - return "Packet" - } - return "" + rearrangeRows(orderedRows, mapTopPanels, mapBodyPanels) + d := Dashboard{Rows: orderedRows, Title: dashboardName} + return d.ToGrafanaJSON() } -func CreateFlowMetricsDashboard(netobsNs string, metrics []string) (string, error) { - var rows []*Row - for _, ri := range allRows { - if slices.Contains(metrics, ri.Metric) { - rows = append(rows, ri) - } else if strings.Contains(ri.Metric, "namespace_") { - // namespace-based panels can also be displayed using workload-based metrics - // Try again, replacing *_namespace_* with *_workload_* - equivalentMetric := strings.Replace(ri.Metric, "namespace_", "workload_", 1) - if slices.Contains(metrics, equivalentMetric) { - clone := ri.replaceMetric(equivalentMetric) - rows = append(rows, clone) +func CreateFlowMetricsDashboards(metrics []metricslatest.FlowMetric) string { + chartsPerDashboard := make(map[string][]chart) + for i := range metrics { + metric := &metrics[i] + for j := range metric.Spec.Charts { + c := chart{ + Chart: metric.Spec.Charts[j], + mptr: metric, } + chartsPerDashboard[c.DashboardName] = append(chartsPerDashboard[c.DashboardName], c) } } - d := Dashboard{Rows: rows, Title: "NetObserv"} - return d.ToGrafanaJSON(netobsNs), nil + // TODO: handle more dashboards + return createFlowMetricsDashboard("NetObserv", chartsPerDashboard["NetObserv"]) } diff --git a/pkg/dashboards/dashboard_test.go b/pkg/dashboards/dashboard_test.go index d0bbcf876..a260e3375 100644 --- a/pkg/dashboards/dashboard_test.go +++ b/pkg/dashboards/dashboard_test.go @@ -10,74 +10,80 @@ import ( func TestCreateFlowMetricsDashboard_All(t *testing.T) { assert := assert.New(t) - js, err := CreateFlowMetricsDashboard("netobserv", metrics.GetAllNames()) - assert.NoError(err) + defs := metrics.GetDefinitions(metrics.GetAllNames()) + js := CreateFlowMetricsDashboards(defs) d, err := FromBytes([]byte(js)) assert.NoError(err) assert.Equal("NetObserv", d.Title) - assert.Len(d.Rows, 27) - row := d.FindRow("Byte rate sent per node") - assert.NotNil(row) - assert.Len(row.Panels, 1) - assert.Equal("", row.Panels[0].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(10,sum(rate(netobserv_node_egress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") + assert.Equal([]string{"Traffic rates", "TCP latencies", "Byte and packet drops", "DNS"}, d.Titles()) - row = d.FindRow("DNS latency per node") - assert.NotNil(row) - assert.Len(row.Panels, 1) - assert.Equal("", row.Panels[0].Title) - assert.Len(row.Panels[0].Targets, 2) - assert.Contains(row.Panels[0].Targets[0].Expr, "histogram_quantile(0.99, sum(rate(netobserv_node_dns_latency_seconds_bucket[2m])) by (le,SrcK8S_HostName,DstK8S_HostName))") + assert.Len(d.Rows[0].Panels, 32) - row = d.FindRow("Byte rate received per namespace") - assert.NotNil(row) - assert.Len(row.Panels, 2) - assert.Equal("Applications", row.Panels[0].Title) - assert.Equal("Infrastructure", row.Panels[1].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, + p := d.FindPanel("Top egress traffic per node") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal("topk(7, sum(rate(netobserv_node_egress_bytes_total{}[2m])) by (SrcK8S_HostName,DstK8S_HostName))", p.Targets[0].Expr) + + p = d.FindPanel("Top P50 DNS latency per node (ms)") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal("topk(7, histogram_quantile(0.5, sum(rate(netobserv_node_dns_latency_seconds_bucket{}[2m])) by (le,SrcK8S_HostName,DstK8S_HostName))*1000 > 0)", p.Targets[0].Expr) + + p = d.FindPanel("Top P99 DNS latency per node (ms)") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal("topk(7, histogram_quantile(0.99, sum(rate(netobserv_node_dns_latency_seconds_bucket{}[2m])) by (le,SrcK8S_HostName,DstK8S_HostName))*1000 > 0)", p.Targets[0].Expr) + + p = d.FindPanel("Top ingress traffic per app namespace") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, (sum(rate(netobserv_namespace_ingress_bytes_total{K8S_FlowLayer="app",SrcK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`+ + ` or (sum(rate(netobserv_namespace_ingress_bytes_total{K8S_FlowLayer="app",DstK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace)))`, + p.Targets[0].Expr, ) - assert.Contains(row.Panels[1].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_namespace_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, + p = d.FindPanel("Top ingress traffic per infra namespace") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, (sum(rate(netobserv_namespace_ingress_bytes_total{K8S_FlowLayer="infra",SrcK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`+ + ` or (sum(rate(netobserv_namespace_ingress_bytes_total{K8S_FlowLayer="infra",DstK8S_Namespace!=""}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace)))`, + p.Targets[0].Expr, ) - row = d.FindRow("Round-trip time per namespace") - assert.NotNil(row) - assert.Len(row.Panels, 2) - assert.Equal("Applications", row.Panels[0].Title) - assert.Equal("Infrastructure", row.Panels[1].Title) - assert.Len(row.Panels[0].Targets, 2) - assert.Contains(row.Panels[0].Targets[0].Expr, - `histogram_quantile(0.99, sum(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (le,SrcK8S_Namespace,DstK8S_Namespace))`, - ) - assert.Contains(row.Panels[1].Targets[1].Expr, - `histogram_quantile(0.50, sum(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (le,SrcK8S_Namespace,DstK8S_Namespace))`, + p = d.FindPanel("Top P50 sRTT per infra namespace (ms)") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, (histogram_quantile(0.5, sum(rate(netobserv_namespace_rtt_seconds_bucket{K8S_FlowLayer="infra",SrcK8S_Namespace!=""}[2m])) by (le,SrcK8S_Namespace,DstK8S_Namespace))*1000 > 0)`+ + ` or (histogram_quantile(0.5, sum(rate(netobserv_namespace_rtt_seconds_bucket{K8S_FlowLayer="infra",DstK8S_Namespace!=""}[2m])) by (le,SrcK8S_Namespace,DstK8S_Namespace))*1000 > 0))`, + p.Targets[0].Expr, ) - row = d.FindRow("Packet rate received per workload") - assert.NotNil(row) - assert.Len(row.Panels, 2) - assert.Equal("Applications", row.Panels[0].Title) - assert.Equal("Infrastructure", row.Panels[1].Title) - assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_packets_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_packets_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + p = d.FindPanel("Top ingress traffic per app workload") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="app"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + p.Targets[0].Expr, ) - assert.Contains(row.Panels[1].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_packets_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_packets_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + p = d.FindPanel("Top ingress traffic per infra workload") + assert.NotNil(p) + assert.Len(p.Targets, 1) + assert.Equal( + `topk(7, sum(rate(netobserv_workload_ingress_packets_total{K8S_FlowLayer="infra"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + p.Targets[0].Expr, ) } func TestCreateFlowMetricsDashboard_OnlyNodeIngressBytes(t *testing.T) { assert := assert.New(t) - js, err := CreateFlowMetricsDashboard("netobserv", []string{"node_ingress_bytes_total"}) - assert.NoError(err) + defs := metrics.GetDefinitions([]string{"node_ingress_bytes_total"}) + js := CreateFlowMetricsDashboards(defs) d, err := FromBytes([]byte(js)) assert.NoError(err) @@ -90,14 +96,14 @@ func TestCreateFlowMetricsDashboard_OnlyNodeIngressBytes(t *testing.T) { assert.Len(row.Panels, 1) assert.Equal("", row.Panels[0].Title) assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(10,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") + assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(7,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") } func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert := assert.New(t) - js, err := CreateFlowMetricsDashboard("netobserv", metrics.DefaultIncludeList) - assert.NoError(err) + defs := metrics.GetDefinitions(metrics.DefaultIncludeList) + js := CreateFlowMetricsDashboards(defs) d, err := FromBytes([]byte(js)) assert.NoError(err) @@ -110,7 +116,7 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert.Len(row.Panels, 1) assert.Equal("", row.Panels[0].Title) assert.Len(row.Panels[0].Targets, 1) - assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(10,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") + assert.Contains(row.Panels[0].Targets[0].Expr, "label_replace(label_replace(topk(7,sum(rate(netobserv_node_ingress_bytes_total[2m])) by (SrcK8S_HostName,DstK8S_HostName))") row = d.FindRow("Byte rate received per namespace") assert.NotNil(row) @@ -120,10 +126,10 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert.Len(row.Panels[0].Targets, 1) // Make sure netobserv_namespace_ingress_bytes_total was replaced with netobserv_workload_ingress_bytes_total assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, + `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, ) assert.Contains(row.Panels[1].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, + `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,DstK8S_Namespace))`, ) row = d.FindRow("Byte rate received per workload") @@ -133,10 +139,10 @@ func TestCreateFlowMetricsDashboard_DefaultList(t *testing.T) { assert.Equal("Infrastructure", row.Panels[1].Title) assert.Len(row.Panels[0].Targets, 1) assert.Contains(row.Panels[0].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"|netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*",DstK8S_Namespace!~"|netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, ) assert.Contains(row.Panels[1].Targets[0].Expr, - `label_replace(label_replace(topk(10,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, + `label_replace(label_replace(topk(7,sum(rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace=~"netobserv|openshift.*"}[2m]) or rate(netobserv_workload_ingress_bytes_total{SrcK8S_Namespace!~"netobserv|openshift.*",DstK8S_Namespace=~"netobserv|openshift.*"}[2m])) by (SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName))`, ) } diff --git a/pkg/dashboards/health.go b/pkg/dashboards/health.go index ec127b376..c24758474 100644 --- a/pkg/dashboards/health.go +++ b/pkg/dashboards/health.go @@ -2,6 +2,8 @@ package dashboards import ( "fmt" + + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" ) func CreateHealthDashboard(netobsNs string) (string, error) { @@ -10,18 +12,18 @@ func CreateHealthDashboard(netobsNs string) (string, error) { // Global stats // TODO after direct-FLP: if Direct mode, get flow rate from loki if enabled, else from agent d.Rows = append(d.Rows, NewRow("", false, "100px", []Panel{ - NewSingleStatPanel("Flows per second", PanelUnitShort, 3, NewTarget( + NewPanel("Flows per second", metricslatest.ChartTypeSingleStat, "", 3, NewTarget( `sum(rate(netobserv_ingest_flows_processed[1m]))`, "")), - NewSingleStatPanel("Sampling", PanelUnitShort, 3, NewTarget( + NewPanel("Sampling", metricslatest.ChartTypeSingleStat, "", 3, NewTarget( "avg(netobserv_agent_sampling_rate)", "")), - NewSingleStatPanel("Errors last minute", PanelUnitShort, 3, NewTarget( + NewPanel("Errors last minute", metricslatest.ChartTypeSingleStat, "", 3, NewTarget( `(sum(increase(netobserv_agent_errors_total[1m])) OR on() vector(0)) + (sum(increase(netobserv_ingest_errors[1m])) OR on() vector(0)) + (sum(increase(netobserv_encode_prom_errors[1m])) OR on() vector(0)) + (sum(increase(netobserv_loki_batch_retries_total[1m])) OR on() vector(0)) + (sum(increase(controller_runtime_reconcile_errors_total{job="netobserv-metrics-service"}[1m])) OR on() vector(0)) `, "")), - NewSingleStatPanel("Dropped flows per second", PanelUnitShort, 3, NewTarget( + NewPanel("Dropped flows per second", metricslatest.ChartTypeSingleStat, "", 3, NewTarget( `(sum(rate(netobserv_loki_dropped_entries_total[1m])) OR on() vector(0)) + (sum(rate(netobserv_agent_dropped_flows_total[1m])) OR on() vector(0)) `, "")), @@ -29,91 +31,90 @@ func CreateHealthDashboard(netobsNs string) (string, error) { // FLP stats overheadQuery := fmt.Sprintf("100 * sum(rate(netobserv_namespace_flows_total{SrcK8S_Namespace='%s'}[1m]) or rate(netobserv_namespace_flows_total{SrcK8S_Namespace!='%s',DstK8S_Namespace='%s'}[1m])) / sum(rate(netobserv_namespace_flows_total[1m]))", netobsNs, netobsNs, netobsNs) - // TODO: add FLP error d.Rows = append(d.Rows, NewRow("Flowlogs-pipeline statistics", false, "250px", []Panel{ - NewGraphPanel("Flows per second", PanelUnitShort, 4, false, []Target{ + NewPanel("Flows per second", metricslatest.ChartTypeLine, "", 4, NewTarget("sum(rate(netobserv_ingest_flows_processed[1m]))", "Flows ingested"), NewTarget("sum(rate(netobserv_loki_sent_entries_total[1m]))", "Flows sent to Loki"), NewTarget("sum(rate(netobserv_loki_dropped_entries_total[1m]))", "Flows dropped due to Loki error"), - }), - NewGraphPanel("Flows overhead (% generated by NetObserv own traffic)", PanelUnitShort, 4, false, []Target{ + ), + NewPanel("Flows overhead (% generated by NetObserv own traffic)", metricslatest.ChartTypeLine, "", 4, NewTarget(overheadQuery, "% overhead"), - }), - NewGraphPanel("Errors per minute", PanelUnitShort, 4, true, []Target{ + ), + NewPanel("Errors per minute", metricslatest.ChartTypeStackArea, "", 4, NewTarget(`sum(increase(netobserv_ingest_errors[1m])) by (stage,code)`, "{{stage}} {{code}}"), NewTarget(`sum(increase(netobserv_encode_prom_errors[1m])) by (error)`, "metrics {{error}}"), NewTarget(`sum(increase(netobserv_loki_batch_retries_total[1m]))`, "loki retries"), - }), - NewGraphPanel("By namespace", PanelUnitShort, 6, false, []Target{ + ), + NewPanel("By namespace", metricslatest.ChartTypeLine, "", 6, NewTarget(`topk(10,sum(rate(netobserv_namespace_flows_total{SrcK8S_Namespace!=""}[1m])) by (SrcK8S_Namespace))`, "From {{SrcK8S_Namespace}}"), NewTarget(`topk(10,sum(rate(netobserv_namespace_flows_total{DstK8S_Namespace!=""}[1m])) by (DstK8S_Namespace))`, "To {{DstK8S_Namespace}}"), - }), - NewGraphPanel("By node", PanelUnitShort, 6, false, []Target{ + ), + NewPanel("By node", metricslatest.ChartTypeLine, "", 6, NewTarget(`topk(10,sum(rate(netobserv_node_flows_total{SrcK8S_HostName!=""}[1m])) by (SrcK8S_HostName))`, "From {{SrcK8S_HostName}}"), NewTarget(`topk(10,sum(rate(netobserv_node_flows_total{DstK8S_HostName!=""}[1m])) by (DstK8S_HostName))`, "To {{DstK8S_HostName}}"), - }), + ), }), ) // Agent stats d.Rows = append(d.Rows, NewRow("eBPF agent statistics", true, "250px", []Panel{ - NewGraphPanel("Eviction rate", PanelUnitShort, 4, false, []Target{ + NewPanel("Eviction rate", metricslatest.ChartTypeLine, "", 4, NewTarget("sum(rate(netobserv_agent_evictions_total[1m])) by (source, reason)", "{{source}} {{reason}}"), - }), - NewGraphPanel("Evicted flows rate", PanelUnitShort, 4, false, []Target{ + ), + NewPanel("Evicted flows rate", metricslatest.ChartTypeLine, "", 4, NewTarget("sum(rate(netobserv_agent_evicted_flows_total[1m])) by (source, reason)", "{{source}} {{reason}}"), - }), - NewGraphPanel("Dropped flows rate", PanelUnitShort, 4, true, []Target{ + ), + NewPanel("Dropped flows rate", metricslatest.ChartTypeStackArea, "", 4, NewTarget(`sum(rate(netobserv_agent_dropped_flows_total[1m])) by (source, reason)`, "{{source}} {{reason}}"), - }), - NewGraphPanel("Ringbuffer / HashMap ratio", PanelUnitShort, 4, false, []Target{ + ), + NewPanel("Ringbuffer / HashMap ratio", metricslatest.ChartTypeLine, "", 4, NewTarget(`(sum(rate(netobserv_agent_evicted_flows_total{source="accounter"}[1m])) OR on() vector(0)) / sum(rate(netobserv_agent_evicted_flows_total{source="hashmap"}[1m]))`, "ratio"), - }), - NewGraphPanel("Buffer size", PanelUnitShort, 4, false, []Target{ + ), + NewPanel("Buffer size", metricslatest.ChartTypeLine, "", 4, NewTarget(`sum(netobserv_agent_buffer_size) by (name)`, "{{name}}"), - }), - NewGraphPanel("Errors per minute", PanelUnitShort, 4, true, []Target{ + ), + NewPanel("Errors per minute", metricslatest.ChartTypeStackArea, "", 4, NewTarget(`sum(increase(netobserv_agent_errors_total[1m])) by (component, error)`, "{{component}} {{error}}"), - }), + ), })) // Operator stats d.Rows = append(d.Rows, NewRow("Operator statistics", true, "250px", []Panel{ - NewGraphPanel("Reconcile events per minute", PanelUnitShort, 6, true, []Target{ + NewPanel("Reconcile events per minute", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`sum(increase(controller_runtime_reconcile_total{job="netobserv-metrics-service"}[1m])) by (controller,result)`, "{{controller}}: {{result}}"), - }), - NewGraphPanel("Average and P99 reconcile time", PanelUnitSeconds, 6, false, []Target{ + ), + NewPanel("Average and P99 reconcile time", metricslatest.ChartTypeLine, metricslatest.UnitSeconds, 6, NewTarget(`sum(rate(controller_runtime_reconcile_time_seconds_sum{job="netobserv-metrics-service"}[1m])) / sum(rate(controller_runtime_reconcile_time_seconds_count{job="netobserv-metrics-service"}[1m]))`, "average"), NewTarget(`histogram_quantile(0.99, sum by(le) (rate(controller_runtime_reconcile_time_seconds_bucket{job="netobserv-metrics-service"}[1m])))`, "p99"), - }), + ), })) // CPU and memory d.Rows = append(d.Rows, NewRow("Resource usage", true, "250px", []Panel{ - NewGraphPanel("Overall CPU", PanelUnitShort, 6, true, []Target{ + NewPanel("Overall CPU", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container="netobserv-ebpf-agent"})`, "eBPF agent"), NewTarget(`sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container="flowlogs-pipeline"})`, "flowlogs-pipeline"), NewTarget(`sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!="",pod=~"netobserv-controller-manager.*"})`, "operator"), - }), - NewGraphPanel("Overall memory", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("Overall memory", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`sum(container_memory_rss{container="netobserv-ebpf-agent"})`, "eBPF agent"), NewTarget(`sum(container_memory_rss{container="flowlogs-pipeline"})`, "flowlogs-pipeline"), NewTarget(`sum(container_memory_rss{container!="",pod=~"netobserv-controller-manager.*"})`, "operator"), - }), - NewGraphPanel("eBPF agent CPU - top 10 pods", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("eBPF agent CPU - top 10 pods", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`topk(10, node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container="netobserv-ebpf-agent"})`, "{{pod}}"), - }), - NewGraphPanel("eBPF agent memory - top 10 pods", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("eBPF agent memory - top 10 pods", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`topk(10, container_memory_rss{container="netobserv-ebpf-agent"})`, "{{pod}}"), - }), - NewGraphPanel("Flowlogs-pipeline CPU - top 10 pods", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("Flowlogs-pipeline CPU - top 10 pods", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`topk(10, node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container="flowlogs-pipeline"})`, "{{pod}}"), - }), - NewGraphPanel("Flowlogs-pipeline memory - top 10 pods", PanelUnitShort, 6, true, []Target{ + ), + NewPanel("Flowlogs-pipeline memory - top 10 pods", metricslatest.ChartTypeStackArea, "", 6, NewTarget(`topk(10, container_memory_rss{container="flowlogs-pipeline"})`, "{{pod}}"), - }), + ), })) - return d.ToGrafanaJSON(netobsNs), nil + return d.ToGrafanaJSON(), nil } diff --git a/pkg/dashboards/model.go b/pkg/dashboards/model.go index 032a123c3..004bbcce4 100644 --- a/pkg/dashboards/model.go +++ b/pkg/dashboards/model.go @@ -4,6 +4,8 @@ import ( "encoding/json" "fmt" "strings" + + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" ) type Dashboard struct { @@ -28,47 +30,24 @@ func NewRow(title string, collapse bool, height string, panels []Panel) *Row { } } -type PanelType string -type PanelUnit string - -const ( - PanelTypeSingleStat PanelType = "singlestat" - PanelTypeGraph PanelType = "graph" - PanelUnitBytes PanelUnit = "bytes" - PanelUnitShort PanelUnit = "short" - PanelUnitSeconds PanelUnit = "seconds" - PanelUnitBPS PanelUnit = "Bps" - PanelUnitPPS PanelUnit = "pps" -) - type Panel struct { Title string - Type PanelType + Type metricslatest.ChartType Targets []Target Span int - Stacked bool - Unit PanelUnit + Unit metricslatest.Unit } -func NewPanel(title string, t PanelType, unit PanelUnit, span int, stacked bool, targets []Target) Panel { +func NewPanel(title string, t metricslatest.ChartType, unit metricslatest.Unit, span int, targets ...Target) Panel { return Panel{ Title: title, Type: t, Unit: unit, Span: span, - Stacked: stacked, Targets: targets, } } -func NewGraphPanel(title string, unit PanelUnit, span int, stacked bool, targets []Target) Panel { - return NewPanel(title, PanelTypeGraph, unit, span, stacked, targets) -} - -func NewSingleStatPanel(title string, unit PanelUnit, span int, target Target) Panel { - return NewPanel(title, PanelTypeSingleStat, unit, span, false, []Target{target}) -} - type Target struct { Expr string Legend string @@ -110,7 +89,18 @@ func (d *Dashboard) FindRow(titleSubstr string) *Row { return nil } -func (d *Dashboard) ToGrafanaJSON(netobsNs string) string { +func (d *Dashboard) FindPanel(titleSubstr string) *Panel { + for _, r := range d.Rows { + for _, p := range r.Panels { + if strings.Contains(p.Title, titleSubstr) { + return &p + } + } + } + return nil +} + +func (d *Dashboard) ToGrafanaJSON() string { // return empty if dashboard doesn't contains rows if len(d.Rows) == 0 { return "" @@ -118,7 +108,7 @@ func (d *Dashboard) ToGrafanaJSON(netobsNs string) string { var rows []string for _, ri := range d.Rows { - rows = append(rows, ri.ToGrafanaJSON(netobsNs)) + rows = append(rows, ri.ToGrafanaJSON()) } rowsStr := strings.Join(rows, ",") @@ -181,10 +171,18 @@ func (d *Dashboard) ToGrafanaJSON(netobsNs string) string { `, rowsStr, d.Title) } -func (r *Row) ToGrafanaJSON(netobsNs string) string { +func (r *Row) Titles() []string { + var titles []string + for _, p := range r.Panels { + titles = append(titles, p.Title) + } + return titles +} + +func (r *Row) ToGrafanaJSON() string { var panels []string for _, panel := range r.Panels { - panels = append(panels, panel.ToGrafanaJSON(netobsNs)) + panels = append(panels, panel.ToGrafanaJSON()) } showTitle := true if r.Title == "" { @@ -202,19 +200,31 @@ func (r *Row) ToGrafanaJSON(netobsNs string) string { `, r.Collapse, r.Height, strings.Join(panels, ","), showTitle, r.Title) } -func (r *Row) replaceMetric(newName string) *Row { - clone := NewRow(r.Title, r.Collapse, r.Height, nil) - clone.Metric = r.Metric - for _, p := range r.Panels { - clone.Panels = append(clone.Panels, p.replaceMetric(r.Metric, newName)) - } - return clone -} - -func (p *Panel) ToGrafanaJSON(netobsNs string) string { +func (p *Panel) ToGrafanaJSON() string { var targets []string for _, target := range p.Targets { - targets = append(targets, target.ToGrafanaJSON(netobsNs)) + targets = append(targets, target.ToGrafanaJSON()) + } + unit := string(p.Unit) + if unit == "" { + unit = "short" + } + var singleStatFormat string + if p.Unit == metricslatest.UnitSeconds { + singleStatFormat = "s" + } else { + singleStatFormat = unit + } + var t string + stacked := false + switch p.Type { + case metricslatest.ChartTypeSingleStat: + t = "singlestat" + case metricslatest.ChartTypeLine: + t = "graph" + case metricslatest.ChartTypeStackArea: + t = "graph" + stacked = true } return fmt.Sprintf(` { @@ -225,6 +235,7 @@ func (p *Panel) ToGrafanaJSON(netobsNs string) string { "datasource": "prometheus", "fill": 1, "fillGradient": 0, + "format": "%s", "gridPos": {}, "id": 1, "legend": { @@ -282,22 +293,11 @@ func (p *Panel) ToGrafanaJSON(netobsNs string) string { } ] } - `, p.Span, p.Stacked, strings.Join(targets, ","), p.Title, string(p.Type), string(p.Unit)) -} - -func (p *Panel) replaceMetric(oldName, newName string) Panel { - clone := NewPanel(p.Title, p.Type, p.Unit, p.Span, p.Stacked, nil) - for _, t := range p.Targets { - clone.Targets = append( - clone.Targets, - NewTarget(strings.ReplaceAll(t.Expr, oldName, newName), t.Legend), - ) - } - return clone + `, singleStatFormat, p.Span, stacked, strings.Join(targets, ","), p.Title, t, unit) } -func (t *Target) ToGrafanaJSON(netobsNs string) string { - expr := formatCleaner.Replace(strings.ReplaceAll(t.Expr, "$NETOBSERV_NS", netobsNs)) +func (t *Target) ToGrafanaJSON() string { + expr := formatCleaner.Replace(t.Expr) return fmt.Sprintf(` { "expr": "%s", diff --git a/pkg/dashboards/scopes.go b/pkg/dashboards/scopes.go deleted file mode 100644 index 6cddc0f0b..000000000 --- a/pkg/dashboards/scopes.go +++ /dev/null @@ -1,68 +0,0 @@ -package dashboards - -import ( - "fmt" - "strings" -) - -type metricScope struct { - metricPart string - titlePart string - labels []string - legendPart string - labelsReplacementTemplate string - splitAppInfra bool -} - -var ( - srcDstNodeScope = metricScope{ - metricPart: "node", - titlePart: "per node", - labels: []string{"SrcK8S_HostName", "DstK8S_HostName"}, - legendPart: "{{SrcK8S_HostName}} -> {{DstK8S_HostName}}", - labelsReplacementTemplate: `label_replace( - label_replace( - %s, - "SrcK8S_HostName", "(not namespaced)", "SrcK8S_HostName", "()" - ), - "DstK8S_HostName", "(not namespaced)", "DstK8S_HostName", "()" - )`, - splitAppInfra: false, - } - srcDstNamespaceScope = metricScope{ - metricPart: "namespace", - titlePart: "per namespace", - labels: []string{"SrcK8S_Namespace", "DstK8S_Namespace"}, - legendPart: "{{SrcK8S_Namespace}} -> {{DstK8S_Namespace}}", - labelsReplacementTemplate: `label_replace( - label_replace( - %s, - "SrcK8S_Namespace", "(not namespaced)", "SrcK8S_Namespace", "()" - ), - "DstK8S_Namespace", "(not namespaced)", "DstK8S_Namespace", "()" - )`, - splitAppInfra: true, - } - srcDstWorkloadScope = metricScope{ - metricPart: "workload", - titlePart: "per workload", - labels: []string{"SrcK8S_Namespace", "SrcK8S_OwnerName", "DstK8S_Namespace", "DstK8S_OwnerName"}, - legendPart: "{{SrcK8S_OwnerName}} ({{SrcK8S_Namespace}}) -> {{DstK8S_OwnerName}} ({{DstK8S_Namespace}})", - labelsReplacementTemplate: `label_replace( - label_replace( - %s, - "SrcK8S_Namespace", "non pods", "SrcK8S_Namespace", "()" - ), - "DstK8S_Namespace", "non pods", "DstK8S_Namespace", "()" - )`, - splitAppInfra: true, - } -) - -func (s *metricScope) joinLabels() string { - return strings.Join(s.labels, ",") -} - -func (s *metricScope) labelReplace(q string) string { - return fmt.Sprintf(s.labelsReplacementTemplate, q) -} diff --git a/pkg/metrics/helper.go b/pkg/metrics/helper.go new file mode 100644 index 000000000..2a548f135 --- /dev/null +++ b/pkg/metrics/helper.go @@ -0,0 +1,30 @@ +package metrics + +import ( + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" +) + +func GetFilters(fm *metricslatest.FlowMetricSpec) []metricslatest.MetricFilter { + var filters []metricslatest.MetricFilter + if !fm.IncludeDuplicates { + filters = append(filters, metricslatest.MetricFilter{ + Field: "Duplicate", + Value: "true", + MatchType: metricslatest.MatchNotEqual, + }) + } + if fm.Direction == metricslatest.Egress { + filters = append(filters, metricslatest.MetricFilter{ + Field: "FlowDirection", + Value: "1|2", + MatchType: metricslatest.MatchRegex, + }) + } else if fm.Direction == metricslatest.Ingress { + filters = append(filters, metricslatest.MetricFilter{ + Field: "FlowDirection", + Value: "0|2", + MatchType: metricslatest.MatchRegex, + }) + } + return append(fm.Filters, filters...) +} diff --git a/pkg/metrics/predefined_charts.go b/pkg/metrics/predefined_charts.go new file mode 100644 index 000000000..8a21778f2 --- /dev/null +++ b/pkg/metrics/predefined_charts.go @@ -0,0 +1,285 @@ +package metrics + +import ( + "fmt" + "strings" + + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" +) + +const ( + netobservDashboard = "NetObserv" +) + +func trafficCharts(group, vt, dir string) []metricslatest.Chart { + sectionName := "Traffic rates" + var unit metricslatest.Unit + switch vt { + case tagBytes: + unit = metricslatest.UnitBPS + case tagPackets: + unit = metricslatest.UnitPPS + } + + totalSingleStat := metricslatest.Chart{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: fmt.Sprintf("Total %s traffic", dir), + Unit: unit, + Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC[2m]))"}}, + } + + appSingleStat := metricslatest.Chart{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: fmt.Sprintf("Apps %s traffic", dir), + Unit: unit, + Queries: []metricslatest.Query{{PromQL: `sum(rate($METRIC{K8S_FlowLayer="app"}[2m]))`}}, + } + + infraSingleStat := metricslatest.Chart{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: fmt.Sprintf("Infra %s traffic", dir), + Unit: unit, + Queries: []metricslatest.Query{{PromQL: `sum(rate($METRIC{K8S_FlowLayer="infra"}[2m]))`}}, + } + + var charts []metricslatest.Chart + switch group { + case tagNodes: + charts = []metricslatest.Chart{ + totalSingleStat, + } + case tagNamespaces, tagWorkloads: + charts = []metricslatest.Chart{ + totalSingleStat, + infraSingleStat, + appSingleStat, + } + } + + return append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeStackArea, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: fmt.Sprintf("Top %s traffic", dir), + Unit: unit, + Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC{$FILTERS}[2m])) by ($LABELS)", Legend: "$LEGEND"}}, + }, group, "")...) +} + +func rttCharts(group string) []metricslatest.Chart { + sectionName := "TCP latencies" + charts := []metricslatest.Chart{{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: "TCP latency", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0", + Legend: "p99", + }, + }, + }} + charts = append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeLine, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top P50 sRTT", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{$FILTERS}[2m])) by (le,$LABELS))*1000 > 0", + Legend: "$LEGEND", + }, + }, + }, group, "ms")...) + charts = append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeLine, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top P99 sRTT", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket{$FILTERS}[2m])) by (le,$LABELS))*1000 > 0", + Legend: "$LEGEND", + }, + }, + }, group, "ms")...) + + return charts +} + +func dropCharts(group string, unit metricslatest.Unit) []metricslatest.Chart { + sectionName := "Byte and packet drops" + var charts []metricslatest.Chart + if unit == "pps" { + charts = append(charts, metricslatest.Chart{ + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: "Drops", + Unit: unit, + Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC[2m]))"}}, + }) + } + return append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeStackArea, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top drops", + Unit: unit, + Queries: []metricslatest.Query{{PromQL: "sum(rate($METRIC{$FILTERS}[2m])) by ($LABELS)", Legend: "$LEGEND"}}, + }, group, string(unit))...) +} + +func dnsCharts(group string) []metricslatest.Chart { + sectionName := "DNS" + charts := []metricslatest.Chart{ + { + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: "DNS latency", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket[2m])) by (le)) > 0", + Legend: "p99", + }, + }, + }, + { + Type: metricslatest.ChartTypeSingleStat, + SectionName: "", + DashboardName: netobservDashboard, + Title: "DNS error rate", + Queries: []metricslatest.Query{{PromQL: `sum(rate($METRIC_count{DnsFlagsResponseCode!="NoError"}[2m]))`}}, + }, + } + charts = append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeLine, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top P50 DNS latency", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.5, sum(rate($METRIC_bucket{$FILTERS}[2m])) by (le,$LABELS))*1000 > 0", + Legend: "$LEGEND", + }, + }, + }, group, "ms")...) + charts = append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeLine, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "Top P99 DNS latency", + Unit: metricslatest.UnitSeconds, + Queries: []metricslatest.Query{ + { + PromQL: "histogram_quantile(0.99, sum(rate($METRIC_bucket{$FILTERS}[2m])) by (le,$LABELS))*1000 > 0", + Legend: "$LEGEND", + }, + }, + }, group, "ms")...) + + return append(charts, chartVariantsFor(&metricslatest.Chart{ + Type: metricslatest.ChartTypeStackArea, + SectionName: sectionName, + DashboardName: netobservDashboard, + Title: "DNS error rate", + Queries: []metricslatest.Query{{ + PromQL: `sum(rate($METRIC_count{DnsFlagsResponseCode!="NoError",$FILTERS}[2m])) by (DnsFlagsResponseCode,$LABELS)`, + Legend: "$LEGEND, {{ DnsFlagsResponseCode }}", + }}, + }, group, "")...) +} + +func chartVariantsFor(chart *metricslatest.Chart, group, unit string) []metricslatest.Chart { + switch group { + case tagNodes: + return []metricslatest.Chart{ + chartVariantFor(chart, group, "", unit), + } + case tagNamespaces: + return []metricslatest.Chart{ + chartVariantFor(chart, group, "infra", unit), + chartVariantFor(chart, group, "app", unit), + } + case tagWorkloads: + return []metricslatest.Chart{ + chartVariantFor(chart, tagNamespaces, "infra", unit), + chartVariantFor(chart, tagNamespaces, "app", unit), + chartVariantFor(chart, group, "infra", unit), + chartVariantFor(chart, group, "app", unit), + } + } + return nil +} + +func chartVariantFor(c *metricslatest.Chart, group, layer, unit string) metricslatest.Chart { + chart := *c + var flowLayerFilter, labels, legend string + chart.Title += " per " + if layer != "" { + chart.Title += layer + " " + flowLayerFilter = `K8S_FlowLayer="` + layer + `",` + } + var orFilters []string + switch group { + case tagNodes: + chart.Title += "node" + labels = "SrcK8S_HostName,DstK8S_HostName" + legend = "source:{{SrcK8S_HostName}}, dest:{{DstK8S_HostName}}" + case tagNamespaces: + chart.Title += "namespace" + labels = "SrcK8S_Namespace,DstK8S_Namespace" + legend = "source:{{SrcK8S_Namespace}}, dest:{{DstK8S_Namespace}}" + // orFilters aim to eliminate node-to-node traffic when looking at namespace-based metrics + orFilters = []string{ + flowLayerFilter + `SrcK8S_Namespace!=""`, + flowLayerFilter + `DstK8S_Namespace!=""`, + } + case tagWorkloads: + chart.Title += "workload" + labels = "SrcK8S_Namespace,SrcK8S_OwnerName,DstK8S_Namespace,DstK8S_OwnerName" + legend = "source:{{SrcK8S_OwnerName}}/{{SrcK8S_Namespace}}, dest:{{DstK8S_OwnerName}}/{{DstK8S_Namespace}}" + // orFilters aim to eliminate node-to-node traffic when looking at workload-based metrics + orFilters = []string{ + flowLayerFilter + `SrcK8S_Namespace!=""`, + flowLayerFilter + `DstK8S_Namespace!=""`, + } + } + if unit != "" { + chart.Title += " (" + unit + ")" + } + queriesReplaceAll(&chart, labels, legend, orFilters) + return chart +} + +func queriesReplaceAll(c *metricslatest.Chart, labels, legend string, orFilters []string) { + var queries []metricslatest.Query + for _, q := range c.Queries { + q.PromQL = strings.ReplaceAll(q.PromQL, "$LABELS", labels) + q.Legend = strings.ReplaceAll(q.Legend, "$LEGEND", legend) + if len(orFilters) == 0 { + q.PromQL = strings.ReplaceAll(q.PromQL, "$FILTERS", "") + } else { + var parts []string + for _, filter := range orFilters { + parts = append(parts, "("+strings.ReplaceAll(q.PromQL, "$FILTERS", filter)+")") + } + q.PromQL = strings.Join(parts, " or ") + } + queries = append(queries, q) + } + c.Queries = queries +} diff --git a/pkg/metrics/predefined_metrics.go b/pkg/metrics/predefined_metrics.go index 0d9710f08..85b33f4d9 100644 --- a/pkg/metrics/predefined_metrics.go +++ b/pkg/metrics/predefined_metrics.go @@ -5,8 +5,8 @@ import ( "reflect" "strings" - flpapi "github.com/netobserv/flowlogs-pipeline/pkg/api" flowslatest "github.com/netobserv/network-observability-operator/apis/flowcollector/v1beta2" + metricslatest "github.com/netobserv/network-observability-operator/apis/flowmetrics/v1alpha1" "github.com/netobserv/network-observability-operator/pkg/helper" ) @@ -21,19 +21,16 @@ const ( ) var ( - mapLabels = map[string][]string{ + latencyBuckets = []string{".005", ".01", ".02", ".03", ".04", ".05", ".075", ".1", ".25", "1"} + mapLabels = map[string][]string{ tagNodes: {"SrcK8S_HostName", "DstK8S_HostName"}, - tagNamespaces: {"SrcK8S_Namespace", "DstK8S_Namespace"}, - tagWorkloads: {"SrcK8S_Namespace", "DstK8S_Namespace", "SrcK8S_OwnerName", "DstK8S_OwnerName", "SrcK8S_OwnerType", "DstK8S_OwnerType"}, + tagNamespaces: {"SrcK8S_Namespace", "DstK8S_Namespace", "K8S_FlowLayer"}, + tagWorkloads: {"SrcK8S_Namespace", "DstK8S_Namespace", "K8S_FlowLayer", "SrcK8S_OwnerName", "DstK8S_OwnerName", "SrcK8S_OwnerType", "DstK8S_OwnerType"}, } mapValueFields = map[string]string{ tagBytes: "Bytes", tagPackets: "Packets", } - mapDirection = map[string]string{ - tagIngress: "0|2", - tagEgress: "1|2", - } predefinedMetrics []taggedMetricDefinition // Note that we set default in-code rather than in CRD, in order to keep track of value being unset or set intentionnally in FlowCollector DefaultIncludeList = []string{ @@ -51,7 +48,7 @@ var ( ) type taggedMetricDefinition struct { - flpapi.MetricsItem + metricslatest.FlowMetricSpec tags []string } @@ -62,84 +59,93 @@ func init() { // Bytes / packets metrics for _, vt := range []string{tagBytes, tagPackets} { valueField := mapValueFields[vt] - for _, dir := range []string{tagEgress, tagIngress} { + for _, dir := range []metricslatest.FlowDirection{metricslatest.Egress, metricslatest.Ingress} { + lowDir := strings.ToLower(string(dir)) predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_%s_%s_total", groupTrimmed, dir, vt), - Type: "counter", - ValueKey: valueField, - Filters: []flpapi.MetricsFilter{ - {Key: "Duplicate", Value: "true", Type: flpapi.MetricFilterNotEqual}, - {Key: "FlowDirection", Value: mapDirection[dir], Type: flpapi.MetricFilterRegex}, - }, - Labels: labels, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_%s_%s_total", groupTrimmed, lowDir, vt), + Type: metricslatest.CounterMetric, + ValueField: valueField, + IncludeDuplicates: false, + Direction: dir, + Labels: labels, + Charts: trafficCharts(group, vt, lowDir), }, - tags: []string{group, vt, dir}, + tags: []string{group, vt, lowDir}, }) } } // Flows metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_flows_total", groupTrimmed), - Type: "counter", - Labels: labels, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_flows_total", groupTrimmed), + Type: "counter", + Labels: labels, + IncludeDuplicates: true, }, tags: []string{group, group + "-flows", "flows"}, }) // RTT metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_rtt_seconds", groupTrimmed), - Type: "histogram", - ValueKey: "TimeFlowRttNs", - Filters: []flpapi.MetricsFilter{ - {Key: "TimeFlowRttNs", Type: flpapi.MetricFilterPresence}, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_rtt_seconds", groupTrimmed), + Type: metricslatest.HistogramMetric, + ValueField: "TimeFlowRttNs", + IncludeDuplicates: true, + Filters: []metricslatest.MetricFilter{ + {Field: "TimeFlowRttNs", MatchType: metricslatest.MatchPresence}, }, - Labels: labels, - ValueScale: 1_000_000_000, // ns => s + Labels: labels, + Divider: 1_000_000_000, // ns => s + Buckets: latencyBuckets, + Charts: rttCharts(group), }, tags: []string{group, "rtt"}, }) // Drops metrics predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_drop_packets_total", groupTrimmed), - Type: "counter", - ValueKey: "PktDropPackets", - Filters: []flpapi.MetricsFilter{ - {Key: "Duplicate", Value: "true", Type: flpapi.MetricFilterNotEqual}, - {Key: "PktDropPackets", Type: flpapi.MetricFilterPresence}, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_drop_packets_total", groupTrimmed), + Type: metricslatest.CounterMetric, + ValueField: "PktDropPackets", + IncludeDuplicates: false, + Filters: []metricslatest.MetricFilter{ + {Field: "PktDropPackets", MatchType: metricslatest.MatchPresence}, }, Labels: labels, + Charts: dropCharts(group, "pps"), }, tags: []string{group, tagPackets, "drops"}, }) predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_drop_bytes_total", groupTrimmed), - Type: "counter", - ValueKey: "PktDropBytes", - Filters: []flpapi.MetricsFilter{ - {Key: "Duplicate", Value: "true", Type: flpapi.MetricFilterNotEqual}, - {Key: "PktDropBytes", Type: flpapi.MetricFilterPresence}, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_drop_bytes_total", groupTrimmed), + Type: metricslatest.CounterMetric, + ValueField: "PktDropBytes", + IncludeDuplicates: false, + Filters: []metricslatest.MetricFilter{ + {Field: "PktDropBytes", MatchType: metricslatest.MatchPresence}, }, Labels: labels, + Charts: dropCharts(group, "Bps"), }, tags: []string{group, tagBytes, "drop"}, }) // DNS metrics - dnsLabels := append(labels, "DnsFlagsResponseCode") + dnsLabels := labels + dnsLabels = append(dnsLabels, "DnsFlagsResponseCode") predefinedMetrics = append(predefinedMetrics, taggedMetricDefinition{ - MetricsItem: flpapi.MetricsItem{ - Name: fmt.Sprintf("%s_dns_latency_seconds", groupTrimmed), - Type: "histogram", - ValueKey: "DnsLatencyMs", - Filters: []flpapi.MetricsFilter{ - {Key: "DnsId", Type: flpapi.MetricFilterPresence}, + FlowMetricSpec: metricslatest.FlowMetricSpec{ + MetricName: fmt.Sprintf("%s_dns_latency_seconds", groupTrimmed), + Type: metricslatest.HistogramMetric, + ValueField: "DnsLatencyMs", + Filters: []metricslatest.MetricFilter{ + {Field: "DnsId", MatchType: metricslatest.MatchPresence}, }, - Labels: dnsLabels, - ValueScale: 1000, // ms => s + Labels: dnsLabels, + Divider: 1000, // ms => s + Buckets: latencyBuckets, + Charts: dnsCharts(group), }, tags: []string{group, "dns"}, }) @@ -161,7 +167,7 @@ func convertIgnoreTagsToIncludeList(ignoreTags []string) []flowslatest.FLPMetric ret := []flowslatest.FLPMetric{} for i := range predefinedMetrics { if !isIgnored(&predefinedMetrics[i], ignoreTags) { - ret = append(ret, flowslatest.FLPMetric(predefinedMetrics[i].Name)) + ret = append(ret, flowslatest.FLPMetric(predefinedMetrics[i].MetricName)) } } return ret @@ -181,24 +187,24 @@ func GetAsIncludeList(ignoreTags []string, includeList *[]flowslatest.FLPMetric) func GetAllNames() []string { names := []string{} for i := range predefinedMetrics { - names = append(names, predefinedMetrics[i].Name) + names = append(names, predefinedMetrics[i].MetricName) } return names } -func GetDefinitions(names []string) []flpapi.MetricsItem { - ret := []flpapi.MetricsItem{} +func GetDefinitions(names []string) []metricslatest.FlowMetric { + ret := []metricslatest.FlowMetric{} for i := range predefinedMetrics { for _, name := range names { - if predefinedMetrics[i].Name == name { - ret = append(ret, predefinedMetrics[i].MetricsItem) + if predefinedMetrics[i].MetricName == name { + ret = append(ret, metricslatest.FlowMetric{Spec: predefinedMetrics[i].FlowMetricSpec}) } } } return ret } -func GetIncludeList(spec *flowslatest.FlowCollectorSpec) []string { +func getIncludeList(spec *flowslatest.FlowCollectorSpec) []string { var list []string if spec.Processor.Metrics.IncludeList == nil { list = DefaultIncludeList @@ -228,3 +234,9 @@ func removeMetricsByPattern(list []string, search string) []string { } return filtered } + +func MergePredefined(fm []metricslatest.FlowMetric, fc *flowslatest.FlowCollectorSpec) []metricslatest.FlowMetric { + names := getIncludeList(fc) + predefined := GetDefinitions(names) + return append(fm, predefined...) +} diff --git a/pkg/metrics/predefined_metrics_test.go b/pkg/metrics/predefined_metrics_test.go index c0f531f1f..d43a50970 100644 --- a/pkg/metrics/predefined_metrics_test.go +++ b/pkg/metrics/predefined_metrics_test.go @@ -45,13 +45,13 @@ func TestGetDefinitions(t *testing.T) { res := GetDefinitions([]string{"namespace_flows_total", "node_ingress_bytes_total", "workload_egress_packets_total"}) assert.Len(res, 3) - assert.Equal("node_ingress_bytes_total", res[0].Name) - assert.Equal("Bytes", res[0].ValueKey) - assert.Equal([]string{"SrcK8S_HostName", "DstK8S_HostName"}, res[0].Labels) - assert.Equal("namespace_flows_total", res[1].Name) - assert.Empty(res[1].ValueKey) - assert.Equal([]string{"SrcK8S_Namespace", "DstK8S_Namespace"}, res[1].Labels) - assert.Equal("workload_egress_packets_total", res[2].Name) - assert.Equal("Packets", res[2].ValueKey) - assert.Equal([]string{"SrcK8S_Namespace", "DstK8S_Namespace", "SrcK8S_OwnerName", "DstK8S_OwnerName", "SrcK8S_OwnerType", "DstK8S_OwnerType"}, res[2].Labels) + assert.Equal("node_ingress_bytes_total", res[0].Spec.MetricName) + assert.Equal("Bytes", res[0].Spec.ValueField) + assert.Equal([]string{"SrcK8S_HostName", "DstK8S_HostName"}, res[0].Spec.Labels) + assert.Equal("namespace_flows_total", res[1].Spec.MetricName) + assert.Empty(res[1].Spec.ValueField) + assert.Equal([]string{"SrcK8S_Namespace", "DstK8S_Namespace", "K8S_FlowLayer"}, res[1].Spec.Labels) + assert.Equal("workload_egress_packets_total", res[2].Spec.MetricName) + assert.Equal("Packets", res[2].Spec.ValueField) + assert.Equal([]string{"SrcK8S_Namespace", "DstK8S_Namespace", "K8S_FlowLayer", "SrcK8S_OwnerName", "DstK8S_OwnerName", "SrcK8S_OwnerType", "DstK8S_OwnerType"}, res[2].Spec.Labels) }