diff --git a/CHANGELOG.md b/CHANGELOG.md index 49834300050..82d70976d54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ next release (yyyy-mm-dd) #### ⛔ Breaking Changes +* [SPM] Due to a breaking change in OpenTelemetry's prometheus exporter ([details](https://github.com/open-telemetry/opentelemetry-collector-contrib/releases/tag/v0.80.0)) + metric names will no longer be normalized by default, meaning that the expected metric names would be `calls` and + `duration_*`. Backwards compatibility with older OpenTelemetry Collector versions can be achieved through the following flags: + * `prometheus.query.normalize-calls`: If true, normalizes the "calls" metric name. e.g. "calls_total". + * `prometheus.query.normalize-duration`: If true, normalizes the "duration" metric name to include the duration units. e.g. "duration_milliseconds_bucket". + + #### New Features #### Bug fixes, Minor Improvements diff --git a/docker-compose/monitor/Makefile b/docker-compose/monitor/Makefile index 6b0bfa2aa99..c6948ec1f8c 100644 --- a/docker-compose/monitor/Makefile +++ b/docker-compose/monitor/Makefile @@ -18,7 +18,7 @@ run-dev: _run-connector # _run-connector is the base target to bring up the system required for SPM using the new OTEL spanmetrics connector. .PHONY: _run-connector -_run-connector: export OTEL_IMAGE_TAG = latest +_run-connector: export OTEL_IMAGE_TAG = 0.80.0 _run-connector: export OTEL_CONFIG_SRC = ./otel-collector-config-connector.yml _run-connector: export PROMETHEUS_QUERY_SUPPORT_SPANMETRICS_CONNECTOR = true _run-connector: diff --git a/pkg/prometheus/config/config.go b/pkg/prometheus/config/config.go index c3e709f535c..418f1ba0746 100644 --- a/pkg/prometheus/config/config.go +++ b/pkg/prometheus/config/config.go @@ -30,4 +30,6 @@ type Configuration struct { SupportSpanmetricsConnector bool MetricNamespace string LatencyUnit string + NormalizeCalls bool + NormalizeDuration bool } diff --git a/plugin/metrics/prometheus/metricsstore/reader.go b/plugin/metrics/prometheus/metricsstore/reader.go index 1e0f69e0858..0cf6852183d 100644 --- a/plugin/metrics/prometheus/metricsstore/reader.go +++ b/plugin/metrics/prometheus/metricsstore/reader.go @@ -141,6 +141,10 @@ func buildFullLatencyMetricName(cfg config.Configuration) string { metricName = cfg.MetricNamespace + "_" + metricName } + if !cfg.NormalizeDuration { + return metricName + } + // The long names are automatically appended to the metric name by OTEL's prometheus exporters and are defined in: // https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/pkg/translator/prometheus#metric-name shortToLongName := map[string]string{"ms": "milliseconds", "s": "seconds"} @@ -160,7 +164,7 @@ func (m MetricsReader) GetCallRates(ctx context.Context, requestParams *metricss buildPromQuery: func(p promQueryParams) string { return fmt.Sprintf( // Note: p.spanKindFilter can be ""; trailing commas are okay within a timeseries selection. - `sum(rate(%s_total{service_name =~ "%s", %s}[%s])) by (%s)`, + `sum(rate(%s{service_name =~ "%s", %s}[%s])) by (%s)`, m.callsMetricName, p.serviceFilter, p.spanKindFilter, @@ -181,7 +185,12 @@ func buildFullCallsMetricName(cfg config.Configuration) string { if cfg.MetricNamespace != "" { metricName = cfg.MetricNamespace + "_" + metricName } - return metricName + + if !cfg.NormalizeCalls { + return metricName + } + + return metricName + "_total" } // GetErrorRates gets the error rate metrics for the given set of error rate query parameters. @@ -193,7 +202,7 @@ func (m MetricsReader) GetErrorRates(ctx context.Context, requestParams *metrics buildPromQuery: func(p promQueryParams) string { return fmt.Sprintf( // Note: p.spanKindFilter can be ""; trailing commas are okay within a timeseries selection. - `sum(rate(%s_total{service_name =~ "%s", status_code = "STATUS_CODE_ERROR", %s}[%s])) by (%s) / sum(rate(%s_total{service_name =~ "%s", %s}[%s])) by (%s)`, + `sum(rate(%s{service_name =~ "%s", status_code = "STATUS_CODE_ERROR", %s}[%s])) by (%s) / sum(rate(%s{service_name =~ "%s", %s}[%s])) by (%s)`, m.callsMetricName, p.serviceFilter, p.spanKindFilter, p.rate, p.groupBy, m.callsMetricName, p.serviceFilter, p.spanKindFilter, p.rate, p.groupBy, ) diff --git a/plugin/metrics/prometheus/metricsstore/reader_test.go b/plugin/metrics/prometheus/metricsstore/reader_test.go index 501c664c130..cf027f8d6a0 100644 --- a/plugin/metrics/prometheus/metricsstore/reader_test.go +++ b/plugin/metrics/prometheus/metricsstore/reader_test.go @@ -178,7 +178,7 @@ func TestGetLatencies(t *testing.T) { `span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name,le))`, }, { - name: "override the default latency metric name", + name: "enable support for spanmetrics connector with normalized metric name", serviceNames: []string{"emailservice"}, spanKinds: []string{"SPAN_KIND_SERVER"}, groupByOperation: true, @@ -193,7 +193,26 @@ func TestGetLatencies(t *testing.T) { wantLabels: map[string]string{ "service_name": "emailservice", }, - wantPromQlQuery: `histogram_quantile(0.95, sum(rate(span_metrics_duration_seconds_bucket{service_name =~ "emailservice", ` + + wantPromQlQuery: `histogram_quantile(0.95, sum(rate(span_metrics_duration_bucket{service_name =~ "emailservice", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name,le))`, + }, + { + name: "enable support for spanmetrics connector with normalized metric name", + serviceNames: []string{"emailservice"}, + spanKinds: []string{"SPAN_KIND_SERVER"}, + groupByOperation: true, + updateConfig: func(cfg config.Configuration) config.Configuration { + cfg.SupportSpanmetricsConnector = true + cfg.NormalizeDuration = true + cfg.LatencyUnit = "s" + return cfg + }, + wantName: "service_operation_latencies", + wantDescription: "0.95th quantile latency, grouped by service & operation", + wantLabels: map[string]string{ + "service_name": "emailservice", + }, + wantPromQlQuery: `histogram_quantile(0.95, sum(rate(duration_seconds_bucket{service_name =~ "emailservice", ` + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name,le))`, }, } { @@ -228,7 +247,7 @@ func TestGetCallRates(t *testing.T) { wantLabels: map[string]string{ "service_name": "emailservice", }, - wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", ` + + wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", ` + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`, }, { @@ -242,7 +261,7 @@ func TestGetCallRates(t *testing.T) { "operation": "/OrderResult", "service_name": "emailservice", }, - wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", ` + + wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", ` + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,operation)`, }, { @@ -255,11 +274,11 @@ func TestGetCallRates(t *testing.T) { wantLabels: map[string]string{ "service_name": "emailservice", }, - wantPromQlQuery: `sum(rate(calls_total{service_name =~ "frontend|emailservice", ` + + wantPromQlQuery: `sum(rate(calls{service_name =~ "frontend|emailservice", ` + `span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name)`, }, { - name: "override the default call rate metric name", + name: "enable support for spanmetrics connector with a namespace", serviceNames: []string{"emailservice"}, spanKinds: []string{"SPAN_KIND_SERVER"}, groupByOperation: true, @@ -273,7 +292,25 @@ func TestGetCallRates(t *testing.T) { wantLabels: map[string]string{ "service_name": "emailservice", }, - wantPromQlQuery: `sum(rate(span_metrics_calls_total{service_name =~ "emailservice", ` + + wantPromQlQuery: `sum(rate(span_metrics_calls{service_name =~ "emailservice", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`, + }, + { + name: "enable support for spanmetrics connector with normalized metric name", + serviceNames: []string{"emailservice"}, + spanKinds: []string{"SPAN_KIND_SERVER"}, + groupByOperation: true, + updateConfig: func(cfg config.Configuration) config.Configuration { + cfg.SupportSpanmetricsConnector = true + cfg.NormalizeCalls = true + return cfg + }, + wantName: "service_operation_call_rate", + wantDescription: "calls/sec, grouped by service & operation", + wantLabels: map[string]string{ + "service_name": "emailservice", + }, + wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", ` + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`, }, } { @@ -307,9 +344,9 @@ func TestGetErrorRates(t *testing.T) { wantLabels: map[string]string{ "service_name": "emailservice", }, - wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + + wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` + - `sum(rate(calls_total{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`, + `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`, }, { name: "group by service and operation should be reflected in name/description and query group-by", @@ -322,9 +359,9 @@ func TestGetErrorRates(t *testing.T) { "operation": "/OrderResult", "service_name": "emailservice", }, - wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + + wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,operation) / ` + - `sum(rate(calls_total{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,operation)`, + `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,operation)`, }, { name: "two services and span kinds result in regex 'or' symbol in query", @@ -336,12 +373,32 @@ func TestGetErrorRates(t *testing.T) { wantLabels: map[string]string{ "service_name": "emailservice", }, - wantPromQlQuery: `sum(rate(calls_total{service_name =~ "frontend|emailservice", status_code = "STATUS_CODE_ERROR", ` + + wantPromQlQuery: `sum(rate(calls{service_name =~ "frontend|emailservice", status_code = "STATUS_CODE_ERROR", ` + `span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name) / ` + - `sum(rate(calls_total{service_name =~ "frontend|emailservice", span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name)`, + `sum(rate(calls{service_name =~ "frontend|emailservice", span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name)`, }, { - name: "override the default error rate metric name", + name: "neither metric namespace nor enabling normalized metric names have an impact when spanmetrics connector is not supported", + serviceNames: []string{"emailservice"}, + spanKinds: []string{"SPAN_KIND_SERVER"}, + groupByOperation: false, + updateConfig: func(cfg config.Configuration) config.Configuration { + cfg.SupportSpanmetricsConnector = false + cfg.MetricNamespace = "span_metrics" + cfg.NormalizeCalls = true + return cfg + }, + wantName: "service_error_rate", + wantDescription: "error rate, computed as a fraction of errors/sec over calls/sec, grouped by service", + wantLabels: map[string]string{ + "service_name": "emailservice", + }, + wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` + + `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`, + }, + { + name: "enable support for spanmetrics connector with a metric namespace", serviceNames: []string{"emailservice"}, spanKinds: []string{"SPAN_KIND_SERVER"}, groupByOperation: true, @@ -355,9 +412,28 @@ func TestGetErrorRates(t *testing.T) { wantLabels: map[string]string{ "service_name": "emailservice", }, - wantPromQlQuery: `sum(rate(span_metrics_calls_total{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + + wantPromQlQuery: `sum(rate(span_metrics_calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name) / ` + - `sum(rate(span_metrics_calls_total{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`, + `sum(rate(span_metrics_calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`, + }, + { + name: "enable support for spanmetrics connector with normalized metric name", + serviceNames: []string{"emailservice"}, + spanKinds: []string{"SPAN_KIND_SERVER"}, + groupByOperation: true, + updateConfig: func(cfg config.Configuration) config.Configuration { + cfg.SupportSpanmetricsConnector = true + cfg.NormalizeCalls = true + return cfg + }, + wantName: "service_operation_error_rate", + wantDescription: "error rate, computed as a fraction of errors/sec over calls/sec, grouped by service & operation", + wantLabels: map[string]string{ + "service_name": "emailservice", + }, + wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name) / ` + + `sum(rate(calls_total{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`, }, } { t.Run(tc.name, func(t *testing.T) { @@ -384,7 +460,11 @@ func TestInvalidLatencyUnit(t *testing.T) { t.Errorf("Expected a panic due to invalid latency unit") } }() - cfg := config.Configuration{SupportSpanmetricsConnector: true, LatencyUnit: "something invalid"} + cfg := config.Configuration{ + SupportSpanmetricsConnector: true, + NormalizeDuration: true, + LatencyUnit: "something invalid", + } _, _ = NewMetricsReader(zap.NewNop(), cfg) } diff --git a/plugin/metrics/prometheus/options.go b/plugin/metrics/prometheus/options.go index 1ea90e29756..ae31944230a 100644 --- a/plugin/metrics/prometheus/options.go +++ b/plugin/metrics/prometheus/options.go @@ -34,6 +34,8 @@ const ( suffixSupportSpanmetricsConnector = ".query.support-spanmetrics-connector" suffixMetricNamespace = ".query.namespace" suffixLatencyUnit = ".query.duration-unit" + suffixNormalizeCalls = ".query.normalize-calls" + suffixNormalizeDuration = ".query.normalize-duration" defaultServerURL = "http://localhost:9090" defaultConnectTimeout = 30 * time.Second @@ -42,6 +44,8 @@ const ( defaultSupportSpanmetricsConnector = false defaultMetricNamespace = "" defaultLatencyUnit = "ms" + defaultNormalizeCalls = false + defaultNormalizeDuration = false ) type namespaceConfig struct { @@ -63,6 +67,8 @@ func NewOptions(primaryNamespace string) *Options { SupportSpanmetricsConnector: false, MetricNamespace: defaultMetricNamespace, LatencyUnit: defaultLatencyUnit, + NormalizeCalls: defaultNormalizeCalls, + NormalizeDuration: defaultNormalizeCalls, } return &Options{ @@ -93,6 +99,16 @@ func (opt *Options) AddFlags(flagSet *flag.FlagSet) { `histogram unit value set in the spanmetrics connector (see: `+ `https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/connector/spanmetricsconnector#configurations). `+ `This also helps jaeger-query determine the metric name when querying for "latency" metrics.`) + flagSet.Bool(nsConfig.namespace+suffixNormalizeCalls, defaultNormalizeCalls, + `Whether to normalize the "calls" metric name according to `+ + `https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/translator/prometheus/README.md. `+ + `For example: `+ + `"calls" (not normalized) -> "calls_total" (normalized), `) + flagSet.Bool(nsConfig.namespace+suffixNormalizeDuration, defaultNormalizeDuration, + `Whether to normalize the "duration" metric name according to `+ + `https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/translator/prometheus/README.md. `+ + `For example: `+ + `"duration_bucket" (not normalized) -> "duration_milliseconds_bucket (normalized)"`) nsConfig.getTLSFlagsConfig().AddFlags(flagSet) } @@ -107,6 +123,8 @@ func (opt *Options) InitFromViper(v *viper.Viper) error { cfg.SupportSpanmetricsConnector = v.GetBool(cfg.namespace + suffixSupportSpanmetricsConnector) cfg.MetricNamespace = v.GetString(cfg.namespace + suffixMetricNamespace) cfg.LatencyUnit = v.GetString(cfg.namespace + suffixLatencyUnit) + cfg.NormalizeCalls = v.GetBool(cfg.namespace + suffixNormalizeCalls) + cfg.NormalizeDuration = v.GetBool(cfg.namespace + suffixNormalizeDuration) isValidUnit := map[string]bool{"ms": true, "s": true} if _, ok := isValidUnit[cfg.LatencyUnit]; !ok {