Skip to content

Commit

Permalink
Support normalized metric names (#4555)
Browse files Browse the repository at this point in the history
## Which problem is this PR solving?
- Resolves #4547

## Short description of the changes
- Adds the following boolean parameters to declaratively determine
whether the metric name should be modified to match normalization rules
as defined
[here](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/translator/prometheus/README.md):
  - `prometheus.query.normalize-calls`
  - `prometheus.query.normalize-duration`
- Separate flags intentional to allow support for backwards
compatibility with older OpenTelemetry Collector versions. A single flag
is insufficient.
- Motivated by a breaking change in:
https://github.com/open-telemetry/opentelemetry-collector-contrib/releases/tag/v0.80.0.

---------

Signed-off-by: albertteoh <see.kwang.teoh@gmail.com>
  • Loading branch information
albertteoh authored Jun 29, 2023
1 parent f282d9b commit 855b226
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 21 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ next release (yyyy-mm-dd)

#### ⛔ Breaking Changes

* [SPM] Due to a breaking change in OpenTelemetry's prometheus exporter ([details](https://github.com/open-telemetry/opentelemetry-collector-contrib/releases/tag/v0.80.0))
metric names will no longer be normalized by default, meaning that the expected metric names would be `calls` and
`duration_*`. Backwards compatibility with older OpenTelemetry Collector versions can be achieved through the following flags:
* `prometheus.query.normalize-calls`: If true, normalizes the "calls" metric name. e.g. "calls_total".
* `prometheus.query.normalize-duration`: If true, normalizes the "duration" metric name to include the duration units. e.g. "duration_milliseconds_bucket".


#### New Features

#### Bug fixes, Minor Improvements
Expand Down
2 changes: 1 addition & 1 deletion docker-compose/monitor/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ run-dev: _run-connector

# _run-connector is the base target to bring up the system required for SPM using the new OTEL spanmetrics connector.
.PHONY: _run-connector
_run-connector: export OTEL_IMAGE_TAG = latest
_run-connector: export OTEL_IMAGE_TAG = 0.80.0
_run-connector: export OTEL_CONFIG_SRC = ./otel-collector-config-connector.yml
_run-connector: export PROMETHEUS_QUERY_SUPPORT_SPANMETRICS_CONNECTOR = true
_run-connector:
Expand Down
2 changes: 2 additions & 0 deletions pkg/prometheus/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,6 @@ type Configuration struct {
SupportSpanmetricsConnector bool
MetricNamespace string
LatencyUnit string
NormalizeCalls bool
NormalizeDuration bool
}
15 changes: 12 additions & 3 deletions plugin/metrics/prometheus/metricsstore/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ func buildFullLatencyMetricName(cfg config.Configuration) string {
metricName = cfg.MetricNamespace + "_" + metricName
}

if !cfg.NormalizeDuration {
return metricName
}

// The long names are automatically appended to the metric name by OTEL's prometheus exporters and are defined in:
// https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/pkg/translator/prometheus#metric-name
shortToLongName := map[string]string{"ms": "milliseconds", "s": "seconds"}
Expand All @@ -160,7 +164,7 @@ func (m MetricsReader) GetCallRates(ctx context.Context, requestParams *metricss
buildPromQuery: func(p promQueryParams) string {
return fmt.Sprintf(
// Note: p.spanKindFilter can be ""; trailing commas are okay within a timeseries selection.
`sum(rate(%s_total{service_name =~ "%s", %s}[%s])) by (%s)`,
`sum(rate(%s{service_name =~ "%s", %s}[%s])) by (%s)`,
m.callsMetricName,
p.serviceFilter,
p.spanKindFilter,
Expand All @@ -181,7 +185,12 @@ func buildFullCallsMetricName(cfg config.Configuration) string {
if cfg.MetricNamespace != "" {
metricName = cfg.MetricNamespace + "_" + metricName
}
return metricName

if !cfg.NormalizeCalls {
return metricName
}

return metricName + "_total"
}

// GetErrorRates gets the error rate metrics for the given set of error rate query parameters.
Expand All @@ -193,7 +202,7 @@ func (m MetricsReader) GetErrorRates(ctx context.Context, requestParams *metrics
buildPromQuery: func(p promQueryParams) string {
return fmt.Sprintf(
// Note: p.spanKindFilter can be ""; trailing commas are okay within a timeseries selection.
`sum(rate(%s_total{service_name =~ "%s", status_code = "STATUS_CODE_ERROR", %s}[%s])) by (%s) / sum(rate(%s_total{service_name =~ "%s", %s}[%s])) by (%s)`,
`sum(rate(%s{service_name =~ "%s", status_code = "STATUS_CODE_ERROR", %s}[%s])) by (%s) / sum(rate(%s{service_name =~ "%s", %s}[%s])) by (%s)`,
m.callsMetricName, p.serviceFilter, p.spanKindFilter, p.rate, p.groupBy,
m.callsMetricName, p.serviceFilter, p.spanKindFilter, p.rate, p.groupBy,
)
Expand Down
114 changes: 97 additions & 17 deletions plugin/metrics/prometheus/metricsstore/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ func TestGetLatencies(t *testing.T) {
`span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name,le))`,
},
{
name: "override the default latency metric name",
name: "enable support for spanmetrics connector with normalized metric name",
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
groupByOperation: true,
Expand All @@ -193,7 +193,26 @@ func TestGetLatencies(t *testing.T) {
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `histogram_quantile(0.95, sum(rate(span_metrics_duration_seconds_bucket{service_name =~ "emailservice", ` +
wantPromQlQuery: `histogram_quantile(0.95, sum(rate(span_metrics_duration_bucket{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name,le))`,
},
{
name: "enable support for spanmetrics connector with normalized metric name",
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
groupByOperation: true,
updateConfig: func(cfg config.Configuration) config.Configuration {
cfg.SupportSpanmetricsConnector = true
cfg.NormalizeDuration = true
cfg.LatencyUnit = "s"
return cfg
},
wantName: "service_operation_latencies",
wantDescription: "0.95th quantile latency, grouped by service & operation",
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `histogram_quantile(0.95, sum(rate(duration_seconds_bucket{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name,le))`,
},
} {
Expand Down Expand Up @@ -228,7 +247,7 @@ func TestGetCallRates(t *testing.T) {
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", ` +
wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`,
},
{
Expand All @@ -242,7 +261,7 @@ func TestGetCallRates(t *testing.T) {
"operation": "/OrderResult",
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", ` +
wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,operation)`,
},
{
Expand All @@ -255,11 +274,11 @@ func TestGetCallRates(t *testing.T) {
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls_total{service_name =~ "frontend|emailservice", ` +
wantPromQlQuery: `sum(rate(calls{service_name =~ "frontend|emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name)`,
},
{
name: "override the default call rate metric name",
name: "enable support for spanmetrics connector with a namespace",
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
groupByOperation: true,
Expand All @@ -273,7 +292,25 @@ func TestGetCallRates(t *testing.T) {
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(span_metrics_calls_total{service_name =~ "emailservice", ` +
wantPromQlQuery: `sum(rate(span_metrics_calls{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`,
},
{
name: "enable support for spanmetrics connector with normalized metric name",
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
groupByOperation: true,
updateConfig: func(cfg config.Configuration) config.Configuration {
cfg.SupportSpanmetricsConnector = true
cfg.NormalizeCalls = true
return cfg
},
wantName: "service_operation_call_rate",
wantDescription: "calls/sec, grouped by service & operation",
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`,
},
} {
Expand Down Expand Up @@ -307,9 +344,9 @@ func TestGetErrorRates(t *testing.T) {
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
`sum(rate(calls_total{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`,
`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`,
},
{
name: "group by service and operation should be reflected in name/description and query group-by",
Expand All @@ -322,9 +359,9 @@ func TestGetErrorRates(t *testing.T) {
"operation": "/OrderResult",
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,operation) / ` +
`sum(rate(calls_total{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,operation)`,
`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,operation)`,
},
{
name: "two services and span kinds result in regex 'or' symbol in query",
Expand All @@ -336,12 +373,32 @@ func TestGetErrorRates(t *testing.T) {
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls_total{service_name =~ "frontend|emailservice", status_code = "STATUS_CODE_ERROR", ` +
wantPromQlQuery: `sum(rate(calls{service_name =~ "frontend|emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name) / ` +
`sum(rate(calls_total{service_name =~ "frontend|emailservice", span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name)`,
`sum(rate(calls{service_name =~ "frontend|emailservice", span_kind =~ "SPAN_KIND_SERVER|SPAN_KIND_CLIENT"}[10m])) by (service_name)`,
},
{
name: "override the default error rate metric name",
name: "neither metric namespace nor enabling normalized metric names have an impact when spanmetrics connector is not supported",
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
groupByOperation: false,
updateConfig: func(cfg config.Configuration) config.Configuration {
cfg.SupportSpanmetricsConnector = false
cfg.MetricNamespace = "span_metrics"
cfg.NormalizeCalls = true
return cfg
},
wantName: "service_error_rate",
wantDescription: "error rate, computed as a fraction of errors/sec over calls/sec, grouped by service",
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`,
},
{
name: "enable support for spanmetrics connector with a metric namespace",
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
groupByOperation: true,
Expand All @@ -355,9 +412,28 @@ func TestGetErrorRates(t *testing.T) {
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(span_metrics_calls_total{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
wantPromQlQuery: `sum(rate(span_metrics_calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name) / ` +
`sum(rate(span_metrics_calls_total{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`,
`sum(rate(span_metrics_calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`,
},
{
name: "enable support for spanmetrics connector with normalized metric name",
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
groupByOperation: true,
updateConfig: func(cfg config.Configuration) config.Configuration {
cfg.SupportSpanmetricsConnector = true
cfg.NormalizeCalls = true
return cfg
},
wantName: "service_operation_error_rate",
wantDescription: "error rate, computed as a fraction of errors/sec over calls/sec, grouped by service & operation",
wantLabels: map[string]string{
"service_name": "emailservice",
},
wantPromQlQuery: `sum(rate(calls_total{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name) / ` +
`sum(rate(calls_total{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name,span_name)`,
},
} {
t.Run(tc.name, func(t *testing.T) {
Expand All @@ -384,7 +460,11 @@ func TestInvalidLatencyUnit(t *testing.T) {
t.Errorf("Expected a panic due to invalid latency unit")
}
}()
cfg := config.Configuration{SupportSpanmetricsConnector: true, LatencyUnit: "something invalid"}
cfg := config.Configuration{
SupportSpanmetricsConnector: true,
NormalizeDuration: true,
LatencyUnit: "something invalid",
}
_, _ = NewMetricsReader(zap.NewNop(), cfg)
}

Expand Down
18 changes: 18 additions & 0 deletions plugin/metrics/prometheus/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ const (
suffixSupportSpanmetricsConnector = ".query.support-spanmetrics-connector"
suffixMetricNamespace = ".query.namespace"
suffixLatencyUnit = ".query.duration-unit"
suffixNormalizeCalls = ".query.normalize-calls"
suffixNormalizeDuration = ".query.normalize-duration"

defaultServerURL = "http://localhost:9090"
defaultConnectTimeout = 30 * time.Second
Expand All @@ -42,6 +44,8 @@ const (
defaultSupportSpanmetricsConnector = false
defaultMetricNamespace = ""
defaultLatencyUnit = "ms"
defaultNormalizeCalls = false
defaultNormalizeDuration = false
)

type namespaceConfig struct {
Expand All @@ -63,6 +67,8 @@ func NewOptions(primaryNamespace string) *Options {
SupportSpanmetricsConnector: false,
MetricNamespace: defaultMetricNamespace,
LatencyUnit: defaultLatencyUnit,
NormalizeCalls: defaultNormalizeCalls,
NormalizeDuration: defaultNormalizeCalls,
}

return &Options{
Expand Down Expand Up @@ -93,6 +99,16 @@ func (opt *Options) AddFlags(flagSet *flag.FlagSet) {
`histogram unit value set in the spanmetrics connector (see: `+
`https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/connector/spanmetricsconnector#configurations). `+
`This also helps jaeger-query determine the metric name when querying for "latency" metrics.`)
flagSet.Bool(nsConfig.namespace+suffixNormalizeCalls, defaultNormalizeCalls,
`Whether to normalize the "calls" metric name according to `+
`https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/translator/prometheus/README.md. `+
`For example: `+
`"calls" (not normalized) -> "calls_total" (normalized), `)
flagSet.Bool(nsConfig.namespace+suffixNormalizeDuration, defaultNormalizeDuration,
`Whether to normalize the "duration" metric name according to `+
`https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/pkg/translator/prometheus/README.md. `+
`For example: `+
`"duration_bucket" (not normalized) -> "duration_milliseconds_bucket (normalized)"`)

nsConfig.getTLSFlagsConfig().AddFlags(flagSet)
}
Expand All @@ -107,6 +123,8 @@ func (opt *Options) InitFromViper(v *viper.Viper) error {
cfg.SupportSpanmetricsConnector = v.GetBool(cfg.namespace + suffixSupportSpanmetricsConnector)
cfg.MetricNamespace = v.GetString(cfg.namespace + suffixMetricNamespace)
cfg.LatencyUnit = v.GetString(cfg.namespace + suffixLatencyUnit)
cfg.NormalizeCalls = v.GetBool(cfg.namespace + suffixNormalizeCalls)
cfg.NormalizeDuration = v.GetBool(cfg.namespace + suffixNormalizeDuration)

isValidUnit := map[string]bool{"ms": true, "s": true}
if _, ok := isValidUnit[cfg.LatencyUnit]; !ok {
Expand Down

0 comments on commit 855b226

Please sign in to comment.