diff --git a/api/v1beta1/metricsconfig_types.go b/api/v1beta1/metricsconfig_types.go index bdd1857fe..bd0b33a7a 100644 --- a/api/v1beta1/metricsconfig_types.go +++ b/api/v1beta1/metricsconfig_types.go @@ -434,7 +434,7 @@ type PrometheusStatus struct { DisabledMetricsCollectionCostManagement *bool `json:"disabled_metrics_collection_cost_management,omitempty"` // DisabledMetricsCollectionResourceOptimization is a field of KokuMetricsConfigStatus to represent whether or not collecting - // resource-optimzation metrics is disabled. The default is true. + // resource-optimization metrics is disabled. The default is true. // +kubebuilder:default=true DisabledMetricsCollectionResourceOptimization *bool `json:"disabled_metrics_collection_resource_optimization,omitempty"` diff --git a/config/crd/bases/koku-metrics-cfg.openshift.io_kokumetricsconfigs.yaml b/config/crd/bases/koku-metrics-cfg.openshift.io_kokumetricsconfigs.yaml index 0f35a5b31..72cba6798 100644 --- a/config/crd/bases/koku-metrics-cfg.openshift.io_kokumetricsconfigs.yaml +++ b/config/crd/bases/koku-metrics-cfg.openshift.io_kokumetricsconfigs.yaml @@ -865,7 +865,7 @@ spec: default: true description: |- DisabledMetricsCollectionResourceOptimization is a field of KokuMetricsConfigStatus to represent whether or not collecting - resource-optimzation metrics is disabled. The default is true. + resource-optimization metrics is disabled. The default is true. type: boolean last_query_start_time: description: LastQueryStartTime is a field of KokuMetricsConfigStatus diff --git a/internal/collector/collector.go b/internal/collector/collector.go index 1c071706e..994703f96 100644 --- a/internal/collector/collector.go +++ b/internal/collector/collector.go @@ -35,7 +35,8 @@ var ( log = logr.Log.WithName("collector") - ErrNoData = errors.New("no data to collect") + ErrNoData = errors.New("no data to collect") + ErrROSNoEnabledNamespaces = errors.New("no enabled namespaces for ROS") ) type mappedCSVStruct map[string]csvStruct @@ -239,17 +240,23 @@ func GenerateReports(cr *metricscfgv1beta1.MetricsConfig, dirCfg *dirconfig.Dire timeRange := c.TimeSeries start := timeRange.Start.Add(1 * time.Second) end := start.Add(14*time.Minute + 59*time.Second) + var err error for i := 1; i < 5; i++ { timeRange.Start = start timeRange.End = end rosCollector.TimeSeries = timeRange - if err := generateResourceOpimizationReports(log, rosCollector, dirCfg, nodeRows, yearMonth); err != nil { - return err + if err = generateResourceOpimizationReports(log, rosCollector, dirCfg, nodeRows, yearMonth); err != nil { + if !errors.Is(err, ErrROSNoEnabledNamespaces) { + return err + } } start = start.Add(15 * time.Minute) end = end.Add(15 * time.Minute) } + if errors.Is(err, ErrROSNoEnabledNamespaces) { + return ErrROSNoEnabledNamespaces + } } //################################################################################################################ @@ -386,6 +393,15 @@ func generateResourceOpimizationReports(log gologr.Logger, c *PrometheusCollecto ts := c.TimeSeries.End log.Info(fmt.Sprintf("querying for resource-optimization for ts: %+v", ts)) rosResults := mappedResults{} + + namespacesAreEnabled, err := areNamespacesEnabled(c, ts) + if err != nil { + return err + } + if !namespacesAreEnabled { + return ErrROSNoEnabledNamespaces + } + if err := c.getQueryResults(ts, resourceOptimizationQueries, &rosResults, MaxRetries); err != nil { return err } @@ -424,6 +440,21 @@ func generateResourceOpimizationReports(log gologr.Logger, c *PrometheusCollecto return nil } +func 
areNamespacesEnabled(c *PrometheusCollector, ts time.Time) (bool, error) { + vector, err := c.getVectorQuerySimple(rosNamespaceFilter, ts) + if err != nil { + return false, fmt.Errorf("failed to query for namespaces: %v", err) + } + + namespaces := []string{} + for _, sample := range vector { + for _, field := range rosNamespaceFilter.MetricKey { + namespaces = append(namespaces, string(sample.Metric[field])) + } + } + return len(namespaces) > 0, nil +} + func findFields(input model.Metric, str string) string { result := []string{} for name, val := range input { diff --git a/internal/collector/collector_test.go b/internal/collector/collector_test.go index 21ed4f4c2..5dec3710b 100644 --- a/internal/collector/collector_test.go +++ b/internal/collector/collector_test.go @@ -143,18 +143,21 @@ func TestGenerateReports(t *testing.T) { mapResults[query.QueryString] = &mockPromResult{value: *res} } } - for _, query := range *resourceOptimizationQueries { + + qs := append(*resourceOptimizationQueries, rosNamespaceFilter) + for _, query := range qs { res := &model.Vector{} Load(filepath.Join("test_files", "test_data", query.Name), res, t) mapResults[query.QueryString] = &mockPromResult{value: *res} } + copyfakeTimeRange := fakeTimeRange fakeCollector := &PrometheusCollector{ PromConn: mockPrometheusConnection{ mappedResults: &mapResults, t: t, }, - TimeSeries: &fakeTimeRange, + TimeSeries: ©fakeTimeRange, } if err := GenerateReports(fakeCR, fakeDirCfg, fakeCollector); err != nil { t.Errorf("Failed to generate reports: %v", err) @@ -194,18 +197,21 @@ func TestGenerateReportsNoROS(t *testing.T) { mapResults[query.QueryString] = &mockPromResult{value: *res} } } - for _, query := range *resourceOptimizationQueries { + + qs := append(*resourceOptimizationQueries, rosNamespaceFilter) + for _, query := range qs { res := &model.Vector{} Load(filepath.Join("test_files", "test_data", query.Name), res, t) mapResults[query.QueryString] = &mockPromResult{value: *res} } + copyfakeTimeRange := fakeTimeRange fakeCollector := &PrometheusCollector{ PromConn: mockPrometheusConnection{ mappedResults: &mapResults, t: t, }, - TimeSeries: &fakeTimeRange, + TimeSeries: ©fakeTimeRange, } noRosCR := fakeCR.DeepCopy() noRosCR.Spec.PrometheusConfig.DisableMetricsCollectionResourceOptimization = &trueDef @@ -226,6 +232,50 @@ func TestGenerateReportsNoROS(t *testing.T) { } } +func TestGenerateReportsNoEnabledROS(t *testing.T) { + mapResults := make(mappedMockPromResult) + queryList := []*querys{nodeQueries, namespaceQueries, podQueries, volQueries} + for _, q := range queryList { + for _, query := range *q { + res := &model.Matrix{} + Load(filepath.Join("test_files", "test_data", query.Name), res, t) + mapResults[query.QueryString] = &mockPromResult{value: *res} + } + } + + // add the namespace specific query + res := &model.Vector{} + mapResults[rosNamespaceFilter.QueryString] = &mockPromResult{value: *res} + + copyfakeTimeRange := fakeTimeRange + fakeCollector := &PrometheusCollector{ + PromConn: mockPrometheusConnection{ + mappedResults: &mapResults, + t: t, + }, + TimeSeries: ©fakeTimeRange, + } + var err error + if err = GenerateReports(fakeCR, fakeDirCfg, fakeCollector); err == nil { + t.Errorf("Something failed: %v", err) + } + if !errors.Is(err, ErrROSNoEnabledNamespaces) { + t.Errorf("incorrect error returned: %v", err) + } + + // ####### everything below compares the generated reports to the expected reports ####### + expectedMap := getFiles("expected_reports", t) + generatedMap := getFiles("test_reports", t) + 
expectedDiff := 1 // The expected diff is equal to the number of ROS reports we generate. If we add or remove reports, this number should change + + if len(expectedMap)-len(generatedMap) != expectedDiff { + t.Errorf("incorrect number of reports generated") + } + if err := fakeDirCfg.Reports.RemoveContents(); err != nil { + t.Fatal("failed to cleanup reports directory") + } +} + func TestGenerateReportsNoCost(t *testing.T) { mapResults := make(mappedMockPromResult) queryList := []*querys{nodeQueries, namespaceQueries, podQueries, volQueries} @@ -236,18 +286,21 @@ func TestGenerateReportsNoCost(t *testing.T) { mapResults[query.QueryString] = &mockPromResult{value: *res} } } - for _, query := range *resourceOptimizationQueries { + + qs := append(*resourceOptimizationQueries, rosNamespaceFilter) + for _, query := range qs { res := &model.Vector{} Load(filepath.Join("test_files", "test_data", query.Name), res, t) mapResults[query.QueryString] = &mockPromResult{value: *res} } + copyfakeTimeRange := fakeTimeRange fakeCollector := &PrometheusCollector{ PromConn: mockPrometheusConnection{ mappedResults: &mapResults, t: t, }, - TimeSeries: &fakeTimeRange, + TimeSeries: ©fakeTimeRange, } noCostCR := fakeCR.DeepCopy() noCostCR.Spec.PrometheusConfig.DisableMetricsCollectionCostManagement = &trueDef @@ -258,7 +311,7 @@ func TestGenerateReportsNoCost(t *testing.T) { // ####### everything below compares the generated reports to the expected reports ####### expectedMap := getFiles("expected_reports", t) generatedMap := getFiles("test_reports", t) - expectedDiff := 4 // The expected diff is equal to the number of ROS reports we generate. If we add or remove reports, this number should change + expectedDiff := 4 // The expected diff is equal to the number of Cost reports we generate. 
If we add or remove reports, this number should change if len(expectedMap)-len(generatedMap) != expectedDiff { t.Errorf("incorrect number of reports generated") @@ -271,12 +324,13 @@ func TestGenerateReportsNoCost(t *testing.T) { func TestGenerateReportsQueryErrors(t *testing.T) { MaxRetries = 1 mapResults := make(mappedMockPromResult) + copyfakeTimeRange := fakeTimeRange fakeCollector := &PrometheusCollector{ PromConn: mockPrometheusConnection{ mappedResults: &mapResults, t: t, }, - TimeSeries: &fakeTimeRange, + TimeSeries: ©fakeTimeRange, } queryList := []*querys{nodeQueries, podQueries, volQueries, namespaceQueries} @@ -287,11 +341,11 @@ func TestGenerateReportsQueryErrors(t *testing.T) { mapResults[query.QueryString] = &mockPromResult{value: *res} } } - for _, query := range *resourceOptimizationQueries { - res := &model.Vector{} - Load(filepath.Join("test_files", "test_data", query.Name), res, t) - mapResults[query.QueryString] = &mockPromResult{value: *res} - } + + // add the namespace specific query + res := &model.Vector{} + Load(filepath.Join("test_files", "test_data", rosNamespaceFilter.Name), res, t) + mapResults[rosNamespaceFilter.QueryString] = &mockPromResult{value: *res} resourceOptimizationError := "resourceOptimization error" for _, q := range *resourceOptimizationQueries { @@ -349,12 +403,13 @@ func TestGenerateReportsNoNodeData(t *testing.T) { } } + copyfakeTimeRange := fakeTimeRange fakeCollector := &PrometheusCollector{ PromConn: mockPrometheusConnection{ mappedResults: &mapResults, t: t, }, - TimeSeries: &fakeTimeRange, + TimeSeries: ©fakeTimeRange, } if err := GenerateReports(fakeCR, fakeDirCfg, fakeCollector); err != nil && err != ErrNoData { t.Errorf("Failed to generate reports: %v", err) diff --git a/internal/collector/prometheus.go b/internal/collector/prometheus.go index eec69498d..31ab0cad6 100644 --- a/internal/collector/prometheus.go +++ b/internal/collector/prometheus.go @@ -289,3 +289,21 @@ func (c *PrometheusCollector) getQueryResults(ts time.Time, queries *querys, res return nil } + +func (c *PrometheusCollector) getVectorQuerySimple(q query, ts time.Time) (model.Vector, error) { + ctx, cancel := context.WithTimeout(context.Background(), c.ContextTimeout) + defer cancel() + + queryResult, warnings, err := c.PromConn.Query(ctx, q.QueryString, ts) + if err != nil { + return nil, fmt.Errorf("query: %s: error querying prometheus: %v", q.QueryString, err) + } + if len(warnings) > 0 { + log.Info("query warnings", "Warnings", warnings) + } + vector, ok := queryResult.(model.Vector) + if !ok { + return vector, fmt.Errorf("expected a vector in response to query, got a %v", queryResult.Type()) + } + return vector, nil +} diff --git a/internal/collector/prometheus_test.go b/internal/collector/prometheus_test.go index dda618f1a..db37c3bbc 100644 --- a/internal/collector/prometheus_test.go +++ b/internal/collector/prometheus_test.go @@ -44,12 +44,12 @@ func (m mockPrometheusConnection) QueryRange(ctx context.Context, query string, if m.mappedResults != nil { res, ok = (*m.mappedResults)[query] if !ok { - m.t.Fatalf("Could not find test result!") + m.t.Fatalf("Could not find test result for query: %s", query) } } else if m.singleResult != nil { res = m.singleResult } else { - m.t.Fatalf("Could not find test result!") + m.t.Fatalf("Could not find test result for query: %s", query) } return res.value, res.warnings, res.err } @@ -60,12 +60,12 @@ func (m mockPrometheusConnection) Query(ctx context.Context, query string, ts ti if m.mappedResults != nil { res, ok = 
(*m.mappedResults)[query] if !ok { - m.t.Fatalf("Could not find test result!") + m.t.Fatalf("Could not find test result for query: %s", query) } } else if m.singleResult != nil { res = m.singleResult } else { - m.t.Fatalf("Could not find test result!") + m.t.Fatalf("Could not find test result for query: %s", query) } return res.value, res.warnings, res.err } @@ -671,3 +671,148 @@ func TestSetPrometheusConfig(t *testing.T) { }) } } + +func TestGetVectorQuerySimpleSuccess(t *testing.T) { + c := PrometheusCollector{ + TimeSeries: &promv1.Range{}, + } + getVectorQuerySimpleSuccessTests := []struct { + name string + query query + queryResult mappedMockPromResult + wantedResult model.Vector + wantedError error + }{ + { + name: "get query results no errors", + query: query{ + Name: "usage-cpu-cores", + QueryString: "query1", + MetricKey: staticFields{"id": "id"}, + QueryValue: &saveQueryValue{ + ValName: "usage-cpu-cores", + Method: "max", + TransformedName: "usage-cpu-core-seconds", + }, + RowKey: []model.LabelName{"id"}, + }, + queryResult: mappedMockPromResult{ + "query1": &mockPromResult{ + value: model.Vector{ + { + Metric: model.Metric{ + "id": "1", + "random-field": "42", + }, + Value: model.SampleValue(32), + Timestamp: 1604339460, + }}, + warnings: nil, + err: nil, + }, + }, + wantedResult: model.Vector{{ + Metric: model.Metric{ + "id": "1", + "random-field": "42", + }, + Value: model.SampleValue(32), + Timestamp: 1604339460, + }, + }, + wantedError: nil, + }, + } + for _, tt := range getVectorQuerySimpleSuccessTests { + t.Run(tt.name, func(t *testing.T) { + c.PromConn = mockPrometheusConnection{ + mappedResults: &tt.queryResult, + t: t, + } + got, err := c.getVectorQuerySimple(tt.query, c.TimeSeries.End) + if tt.wantedError == nil && err != nil { + t.Errorf("got unexpected error: %v", err) + } + if !reflect.DeepEqual(got, tt.wantedResult) { + t.Errorf("getQueryRangeResults got:\n\t%s\n want:\n\t%s", got, tt.wantedResult) + } + }) + } +} + +func TestGetVectorQuerySimpleError(t *testing.T) { + c := PrometheusCollector{ + ContextTimeout: defaultContextTimeout, + TimeSeries: &promv1.Range{}, + } + getVectorQuerySimpleErrorsTests := []struct { + name string + queryResult *mockPromResult + wantedResult model.Vector + wantedError error + }{ + { + name: "return incorrect type (model.Scalar)", + queryResult: &mockPromResult{value: &model.Scalar{}}, + wantedResult: nil, + wantedError: errTest, + }, + { + name: "return incorrect type (model.Matrix)", + queryResult: &mockPromResult{value: &model.Matrix{}}, + wantedResult: nil, + wantedError: errTest, + }, + { + name: "return incorrect type (model.String)", + queryResult: &mockPromResult{value: &model.String{}}, + wantedResult: nil, + wantedError: errTest, + }, + { + name: "warnings with no error", + queryResult: &mockPromResult{ + value: model.Vector{}, + warnings: promv1.Warnings{"This is a warning."}, + err: nil, + }, + wantedResult: model.Vector{}, + wantedError: nil, + }, + { + name: "error with no warnings", + queryResult: &mockPromResult{ + value: model.Matrix{}, + warnings: nil, + err: errTest, + }, + wantedResult: nil, + wantedError: errTest, + }, + { + name: "error with warnings", + queryResult: &mockPromResult{ + value: model.Matrix{}, + warnings: promv1.Warnings{"This is another warning."}, + err: errTest, + }, + wantedResult: nil, + wantedError: errTest, + }, + } + for _, tt := range getVectorQuerySimpleErrorsTests { + t.Run(tt.name, func(t *testing.T) { + c.PromConn = mockPrometheusConnection{ + singleResult: tt.queryResult, + t: t, + 
} + got, err := c.getVectorQuerySimple(query{QueryString: "fake-query"}, c.TimeSeries.End) + if tt.wantedError != nil && err == nil { + t.Errorf("%s got: nil error, want: error", tt.name) + } + if !reflect.DeepEqual(got, tt.wantedResult) { + t.Errorf("%s got: %s want: %s", tt.name, got, tt.wantedResult) + } + }) + } +} diff --git a/internal/collector/queries.go b/internal/collector/queries.go index 1db1e1b9b..fc0a2048b 100644 --- a/internal/collector/queries.go +++ b/internal/collector/queries.go @@ -28,31 +28,38 @@ var ( "cost:pod_request_memory_bytes": "sum by (pod, namespace, node) (kube_pod_container_resource_requests{pod!='', namespace!='', node!='', resource='memory'} * on(pod, namespace) group_left max by (pod, namespace) (kube_pod_status_phase{phase='Running'}))", "cost:pod_usage_memory_bytes": "sum by (pod, namespace, node) (container_memory_usage_bytes{container!='', container!='POD', pod!='', namespace!='', node!=''})", - "ros:image_owners": "max_over_time(kube_pod_container_info{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]) * on(pod, namespace) group_left(owner_kind, owner_name) max by(pod, namespace, owner_kind, owner_name) (max_over_time(kube_pod_owner{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:image_workloads": "max_over_time(kube_pod_container_info{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]) * on(pod, namespace) group_left(workload, workload_type) max by(pod, namespace, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:cpu_request_container_avg": "avg by(container, pod, namespace, node) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*', resource='cpu', unit='core'} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", - "ros:cpu_request_container_sum": "sum by(container, pod, namespace, node) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*', resource='cpu', unit='core'} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", - "ros:cpu_limit_container_avg": "avg by(container, pod, namespace, node) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*', resource='cpu', unit='core'} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", - "ros:cpu_limit_container_sum": "sum by(container, pod, namespace, node) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*', resource='cpu', unit='core'} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", - "ros:cpu_usage_container_avg": "avg by(container, pod, namespace, node) (avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:cpu_usage_container_min": "min by(container, pod, namespace, node) 
(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:cpu_usage_container_max": "max by(container, pod, namespace, node) (max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:cpu_usage_container_sum": "sum by(container, pod, namespace, node) (avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:cpu_throttle_container_avg": "avg by(container, pod, namespace, node) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:cpu_throttle_container_max": "max by(container, pod, namespace, node) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:cpu_throttle_container_sum": "sum by(container, pod, namespace, node) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:memory_request_container_avg": "avg by(container, pod, namespace, node) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*', resource='memory', unit='byte'} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", - "ros:memory_request_container_sum": "sum by(container, pod, namespace, node) (kube_pod_container_resource_requests{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*', resource='memory', unit='byte'} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", - "ros:memory_limit_container_avg": "avg by(container, pod, namespace, node) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*', resource='memory', unit='byte'} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", - "ros:memory_limit_container_sum": "sum by(container, pod, namespace, node) (kube_pod_container_resource_limits{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*', resource='memory', unit='byte'} * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", - "ros:memory_usage_container_avg": "avg by(container, pod, namespace, node) (avg_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:memory_usage_container_min": "min by(container, pod, namespace, node) (min_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:memory_usage_container_max": "max by(container, pod, namespace, node) (max_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!='', namespace!='', 
namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:memory_usage_container_sum": "sum by(container, pod, namespace, node) (avg_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:memory_rss_usage_container_avg": "avg by(container, pod, namespace, node) (avg_over_time(container_memory_rss{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:memory_rss_usage_container_min": "min by(container, pod, namespace, node) (min_over_time(container_memory_rss{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:memory_rss_usage_container_max": "max by(container, pod, namespace, node) (max_over_time(container_memory_rss{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", - "ros:memory_rss_usage_container_sum": "sum by(container, pod, namespace, node) (avg_over_time(container_memory_rss{container!='', container!='POD', pod!='', namespace!='', namespace!~'kube-.*|openshift|openshift-.*'}[15m]))", + "ros:namespace_filter": "kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}", + "ros:image_owners": "(max_over_time(kube_pod_container_info{container!='', container!='POD'}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left(owner_kind, owner_name) max by(pod, namespace, owner_kind, owner_name) (max_over_time(kube_pod_owner{container!='', container!='POD', pod!=''}[15m]))", + "ros:image_workloads": "(max_over_time(kube_pod_container_info{container!='', container!='POD'}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left(workload, workload_type) max by(pod, namespace, workload, workload_type) (max_over_time(namespace_workload_pod:kube_pod_owner:relabel{pod!=''}[15m]))", + "ros:cpu_request_container_avg": "avg by(container, pod, namespace, node) ((kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='cpu', unit='core'} * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", + "ros:cpu_request_container_sum": "sum by(container, pod, namespace, node) ((kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='cpu', unit='core'} * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", + "ros:cpu_limit_container_avg": "avg by(container, pod, namespace, node) ((kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='cpu', unit='core'} * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", + 
"ros:cpu_limit_container_sum": "sum by(container, pod, namespace, node) ((kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='cpu', unit='core'} * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", + "ros:cpu_usage_container_avg": "avg by(container, pod, namespace, node) (avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:cpu_usage_container_min": "min by(container, pod, namespace, node) (min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:cpu_usage_container_max": "max by(container, pod, namespace, node) (max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:cpu_usage_container_sum": "sum by(container, pod, namespace, node) (avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:cpu_throttle_container_avg": "avg by(container, pod, namespace, node) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:cpu_throttle_container_max": "max by(container, pod, namespace, node) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:cpu_throttle_container_sum": "sum by(container, pod, namespace, node) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:memory_request_container_avg": "avg by(container, pod, namespace, node) ((kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='memory', unit='byte'} * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", + "ros:memory_request_container_sum": "sum by(container, pod, namespace, node) ((kube_pod_container_resource_requests{container!='', container!='POD', pod!='', resource='memory', unit='byte'} * on(namespace) group_left 
kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", + "ros:memory_limit_container_avg": "avg by(container, pod, namespace, node) ((kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='memory', unit='byte'} * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", + "ros:memory_limit_container_sum": "sum by(container, pod, namespace, node) ((kube_pod_container_resource_limits{container!='', container!='POD', pod!='', resource='memory', unit='byte'} * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'}) * on(pod, namespace) group_left max by (container, pod, namespace) (kube_pod_status_phase{phase='Running'}))", + "ros:memory_usage_container_avg": "avg by(container, pod, namespace, node) (avg_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:memory_usage_container_min": "min by(container, pod, namespace, node) (min_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:memory_usage_container_max": "max by(container, pod, namespace, node) (max_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:memory_usage_container_sum": "sum by(container, pod, namespace, node) (avg_over_time(container_memory_working_set_bytes{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:memory_rss_usage_container_avg": "avg by(container, pod, namespace, node) (avg_over_time(container_memory_rss{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:memory_rss_usage_container_min": "min by(container, pod, namespace, node) (min_over_time(container_memory_rss{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:memory_rss_usage_container_max": "max by(container, pod, namespace, node) (max_over_time(container_memory_rss{container!='', container!='POD', pod!=''}[15m]) * on(namespace) group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + "ros:memory_rss_usage_container_sum": "sum by(container, pod, namespace, node) (avg_over_time(container_memory_rss{container!='', container!='POD', pod!=''}[15m]) * on(namespace) 
group_left kube_namespace_labels{label_insights_cost_management_optimizations='true', namespace!~'kube-.*|openshift|openshift-.*'})", + } + + rosNamespaceFilter = query{ + Name: "ros-namespace-filter", + QueryString: QueryMap["ros:namespace_filter"], + MetricKey: staticFields{"namespace": "namespace"}, } nodeQueries = &querys{ diff --git a/internal/collector/test_files/test_data/ros-namespace-filter b/internal/collector/test_files/test_data/ros-namespace-filter new file mode 100644 index 000000000..590a3ce75 --- /dev/null +++ b/internal/collector/test_files/test_data/ros-namespace-filter @@ -0,0 +1,22 @@ +[ + { + "metric": { + "label_insights_cost_management_optimizations": "true", + "namespace": "costmanagement-metrics-operator" + }, + "value": [ + 1677009600, + "1" + ] + }, + { + "metric": { + "label_insights_cost_management_optimizations": "true", + "namespace": "koku-metrics-operator" + }, + "value": [ + 1677009600, + "1" + ] + } +] diff --git a/internal/controller/kokumetricsconfig_controller.go b/internal/controller/kokumetricsconfig_controller.go index 4f3e34649..3ba9f02a3 100644 --- a/internal/controller/kokumetricsconfig_controller.go +++ b/internal/controller/kokumetricsconfig_controller.go @@ -8,6 +8,7 @@ package controller import ( "context" "encoding/json" + "errors" "fmt" "math/rand" "os" @@ -19,7 +20,7 @@ import ( gologr "github.com/go-logr/logr" promv1 "github.com/prometheus/client_golang/api/prometheus/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + k8sErrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -213,10 +214,10 @@ func GetClusterID(r *MetricsConfigReconciler, cr *metricscfgv1beta1.MetricsConfi // LogSecretAccessError evaluates the type of kube secret error and logs the appropriate message. func LogSecretAccessError(err error, msg string) { switch { - case errors.IsNotFound(err): + case k8sErrors.IsNotFound(err): errMsg := fmt.Sprintf("%s does not exist", msg) log.Error(err, errMsg) - case errors.IsForbidden(err): + case k8sErrors.IsForbidden(err): errMsg := fmt.Sprintf("operator does not have permission to check %s", msg) log.Error(err, errMsg) default: @@ -837,7 +838,7 @@ func (r *MetricsConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques Step: time.Minute, } if err := collectPromStats(r, cr, dirCfg, timeRange); err != nil { - if err == collector.ErrNoData && t.Hour() == 0 && t.Day() != endTime.Day() && r.initialDataCollection { + if errors.Is(err, collector.ErrNoData) && t.Hour() == 0 && t.Day() != endTime.Day() && r.initialDataCollection { // if there is no data for the first hour of the day, and we are doing the // initial data collection, skip to the next day so we avoid collecting // partial data for a full day. This ensures we are generating a full daily @@ -872,12 +873,12 @@ func (r *MetricsConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques // Initial returned result -> requeue reconcile after 5 min. // This result is replaced if upload or status update results in error. 
var result = ctrl.Result{RequeueAfter: time.Minute * 5} - var errors []error + var errorSlice []error if cr.Spec.Upload.UploadToggle != nil && *cr.Spec.Upload.UploadToggle { if err := r.setAuthAndUpload(ctx, cr, packager, req); err != nil { result = ctrl.Result{} - errors = append(errors, err) + errorSlice = append(errorSlice, err) } } else { @@ -887,24 +888,24 @@ func (r *MetricsConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques // remove old reports if maximum report count has been exceeded if err := packager.TrimPackages(cr); err != nil { result = ctrl.Result{} - errors = append(errors, err) + errorSlice = append(errorSlice, err) } uploadFiles, err := dirCfg.Upload.GetFilesFullPath() if err != nil { result = ctrl.Result{} - errors = append(errors, err) + errorSlice = append(errorSlice, err) } cr.Status.Packaging.PackagedFiles = uploadFiles if err := r.Status().Update(ctx, cr); err != nil { log.Info("failed to update MetricsConfig status", "error", err) result = ctrl.Result{} - errors = append(errors, err) + errorSlice = append(errorSlice, err) } // Requeue for processing after 5 minutes - return result, concatErrs(errors...) + return result, concatErrs(errorSlice...) } // SetupWithManager Setup reconciliation with manager object diff --git a/internal/controller/kokumetricsconfig_controller_test.go b/internal/controller/kokumetricsconfig_controller_test.go index a665aadb4..7526fd9e6 100644 --- a/internal/controller/kokumetricsconfig_controller_test.go +++ b/internal/controller/kokumetricsconfig_controller_test.go @@ -1409,6 +1409,7 @@ var _ = Describe("MetricsConfigController - CRD Handling", Ordered, func() { }, timeout, interval).Should(BeTrue()) Expect(fetched.Status.Reports.DataCollected).To(BeTrue()) + Expect(fetched.Status.Reports.DataCollectionMessage).To(ContainSubstring("namespaces contain the `insights_cost_management_optimizations=\"true\"`")) }) It("8day retention period - successfully queried but there was no data on first day, but data on all remaining days", func() { // slow test, always run this one last diff --git a/internal/controller/prometheus.go b/internal/controller/prometheus.go index fd745bfa2..f6ef390e2 100644 --- a/internal/controller/prometheus.go +++ b/internal/controller/prometheus.go @@ -2,6 +2,7 @@ package controller import ( "context" + "errors" "fmt" "os" "time" @@ -183,10 +184,15 @@ func collectPromStats(r *MetricsConfigReconciler, cr *metricscfgv1beta1.MetricsC log.Info("generating reports for range", "start", formattedStart, "end", formattedEnd) if err := collector.GenerateReports(cr, dirCfg, r.promCollector); err != nil { cr.Status.Reports.DataCollected = false - if err == collector.ErrNoData { + if errors.Is(err, collector.ErrNoData) { cr.Status.Prometheus.LastQuerySuccessTime = t cr.Status.Reports.DataCollectionMessage = "No data to report for the hour queried." log.Info("no data available to generate reports") + } else if errors.Is(err, collector.ErrROSNoEnabledNamespaces) { + cr.Status.Prometheus.LastQuerySuccessTime = t + cr.Status.Reports.DataCollected = true + cr.Status.Reports.DataCollectionMessage = "No namespaces contain the `insights_cost_management_optimizations=\"true\"` label, so no resource optimization metrics were collected." 
+ log.Info("no namespaces contain the `insights_cost_management_optimizations=\"true\"` label, so no resource optimization metrics were collected") } else { retryTracker[timeRange.Start]++ cr.Status.Reports.DataCollectionMessage = fmt.Sprintf("error: %v", err) diff --git a/internal/packaging/packaging.go b/internal/packaging/packaging.go index 390e21bf8..c2b44dcea 100644 --- a/internal/packaging/packaging.go +++ b/internal/packaging/packaging.go @@ -294,7 +294,7 @@ func (p *FilePackager) writePart(fileName string, csvReader *csv.Reader, csvHead } for { row, err := csvReader.Read() - if err == io.EOF { + if errors.Is(err, io.EOF) { writer.Flush() return splitFile, true, nil } else if err != nil { @@ -535,7 +535,7 @@ func (p *FilePackager) PackageReports(cr *metricscfgv1beta1.MetricsConfig) error // move CSV reports from data directory to staging directory filesToPackage, err := p.moveOrCopyFiles(cr) - if err == ErrNoReports { + if errors.Is(err, ErrNoReports) { log.Info("no payload to generate") return nil } else if err != nil { diff --git a/internal/packaging/packaging_test.go b/internal/packaging/packaging_test.go index 7c4306172..4fbce3ceb 100644 --- a/internal/packaging/packaging_test.go +++ b/internal/packaging/packaging_test.go @@ -787,7 +787,7 @@ func TestWriteTarball(t *testing.T) { var files []string for { hdr, err := tr.Next() - if err == io.EOF { + if errors.Is(err, io.EOF) { break }