-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[exporter/prometheusremotewrite] Fix: Don't drop batch in case of failure to translate metrics #29729
[exporter/prometheusremotewrite] Fix: Don't drop batch in case of failure to translate metrics #29729
Changes from 6 commits
ac29886
883bd39
f2a846b
baa04c6
691a7ec
f897551
b3a6ce6
a128f16
84af853
bffdd8c
faf0fdd
8bf5987
f60d68e
5e953f5
0fad48b
bfdb63a
c178c3b
2bc92ca
9e7ac06
315996a
e8ea846
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Use this changelog template to create an entry for release notes. | ||
|
||
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' | ||
change_type: enhancement | ||
|
||
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) | ||
component: prometheusremotewriteexporter | ||
|
||
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). | ||
note: Publish telemetry about translation of metrics from Otel to Prometheus. Don't drop all data points if some fail translation. | ||
|
||
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. | ||
issues: [29729] | ||
|
||
# (Optional) One or more lines of additional information to render under the primary note. | ||
# These lines will be padded with 2 spaces and then inserted directly into the document. | ||
# Use pipe (|) for multiline entries. | ||
subtext: | ||
|
||
# If your change doesn't affect end users or the exported elements of any package, | ||
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. | ||
# Optional: The change log or logs in which this entry should be included. | ||
# e.g. '[user]' or '[user, api]' | ||
# Include 'user' if the change is relevant to end users. | ||
# Include 'api' if there is a change to a library API. | ||
# Default: '[user]' | ||
change_logs: [user] |
Original file line number | Diff line number | Diff line change | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -25,12 +25,33 @@ import ( | |||||||||||||||
"go.opentelemetry.io/collector/consumer/consumererror" | ||||||||||||||||
"go.opentelemetry.io/collector/exporter" | ||||||||||||||||
"go.opentelemetry.io/collector/pdata/pmetric" | ||||||||||||||||
"go.opentelemetry.io/otel/metric" | ||||||||||||||||
"go.uber.org/multierr" | ||||||||||||||||
"go.uber.org/zap" | ||||||||||||||||
|
||||||||||||||||
"github.com/open-telemetry/opentelemetry-collector-contrib/exporter/prometheusremotewriteexporter/internal/metadata" | ||||||||||||||||
prometheustranslator "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus" | ||||||||||||||||
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheusremotewrite" | ||||||||||||||||
) | ||||||||||||||||
|
||||||||||||||||
type prwTelemetry interface { | ||||||||||||||||
recordTranslationFailure(ctx context.Context) | ||||||||||||||||
recordTranslatedTimeSeries(ctx context.Context, numTS int) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
type prwTelemetryOtel struct { | ||||||||||||||||
failedTranslations metric.Int64Counter | ||||||||||||||||
translatedTimeSeries metric.Int64Counter | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
func (p *prwTelemetryOtel) recordTranslationFailure(ctx context.Context) { | ||||||||||||||||
p.failedTranslations.Add(ctx, 1) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
func (p *prwTelemetryOtel) recordTranslatedTimeSeries(ctx context.Context, numTS int) { | ||||||||||||||||
p.translatedTimeSeries.Add(ctx, int64(numTS)) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// prwExporter converts OTLP metrics to Prometheus remote write TimeSeries and sends them to a remote endpoint. | ||||||||||||||||
type prwExporter struct { | ||||||||||||||||
endpointURL *url.URL | ||||||||||||||||
|
@@ -45,10 +66,33 @@ type prwExporter struct { | |||||||||||||||
retrySettings configretry.BackOffConfig | ||||||||||||||||
wal *prweWAL | ||||||||||||||||
exporterSettings prometheusremotewrite.Settings | ||||||||||||||||
telemetry prwTelemetry | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
func newPRWTelemetry(set exporter.CreateSettings) prwTelemetry { | ||||||||||||||||
|
||||||||||||||||
meter := metadata.Meter(set.TelemetrySettings) | ||||||||||||||||
// TODO: create helper functions similar to the processor helper: BuildCustomMetricName | ||||||||||||||||
prefix := "exporter/" + metadata.Type + "/" | ||||||||||||||||
|
||||||||||||||||
failedTranslations, _ := meter.Int64Counter(prefix+"failed_translations", | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't ignore instrument creation errors. These should be propagated up the call stack if they can't be handled here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know why I thought it was ok to do this. fixed. |
||||||||||||||||
metric.WithDescription("Number of translation operations that failed to translate metrics from OTEL to Prometheus"), | ||||||||||||||||
metric.WithUnit("1"), | ||||||||||||||||
) | ||||||||||||||||
|
||||||||||||||||
translatedTimeSeries, _ := meter.Int64Counter(prefix+"translated_metrics", | ||||||||||||||||
metric.WithDescription("Number of Prometheus time series that were translated from OTEL metrics"), | ||||||||||||||||
metric.WithUnit("1"), | ||||||||||||||||
) | ||||||||||||||||
bryan-aguilar marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||
|
||||||||||||||||
return &prwTelemetryOtel{ | ||||||||||||||||
failedTranslations: failedTranslations, | ||||||||||||||||
translatedTimeSeries: translatedTimeSeries, | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// newPRWExporter initializes a new prwExporter instance and sets fields accordingly. | ||||||||||||||||
func newPRWExporter(cfg *Config, set exporter.CreateSettings) (*prwExporter, error) { | ||||||||||||||||
func newPRWExporter(cfg *Config, set exporter.CreateSettings, telemetry prwTelemetry) (*prwExporter, error) { | ||||||||||||||||
sanitizedLabels, err := validateAndSanitizeExternalLabels(cfg) | ||||||||||||||||
if err != nil { | ||||||||||||||||
return nil, err | ||||||||||||||||
|
@@ -79,6 +123,7 @@ func newPRWExporter(cfg *Config, set exporter.CreateSettings) (*prwExporter, err | |||||||||||||||
AddMetricSuffixes: cfg.AddMetricSuffixes, | ||||||||||||||||
SendMetadata: cfg.SendMetadata, | ||||||||||||||||
}, | ||||||||||||||||
telemetry: telemetry, | ||||||||||||||||
} | ||||||||||||||||
if cfg.WAL == nil { | ||||||||||||||||
return prwe, nil | ||||||||||||||||
|
@@ -134,15 +179,20 @@ func (prwe *prwExporter) PushMetrics(ctx context.Context, md pmetric.Metrics) er | |||||||||||||||
|
||||||||||||||||
tsMap, err := prometheusremotewrite.FromMetrics(md, prwe.exporterSettings) | ||||||||||||||||
if err != nil { | ||||||||||||||||
err = consumererror.NewPermanent(err) | ||||||||||||||||
prwe.telemetry.recordTranslationFailure(ctx) | ||||||||||||||||
prwe.settings.Logger.Debug("failed to translate metrics %s", zap.Error(err)) | ||||||||||||||||
prwe.settings.Logger.Debug("exporting remaining %s metrics", zap.Int("translated", len(tsMap))) | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
The I also wondering if we want to adjust these log lines to be more clear. I think they may confuse users. I think the message intent is
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
If these are to be logged at the same level might as well make it a single logging statement. Is the number of metrics that failed translation available to record? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like your idea of using a single line.
No, this is hidden in the We could also look into the number of errors embedded in the error returned in the FromMetrics function, but this would also rely on assumptions about the |
||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
prwe.telemetry.recordTranslatedTimeSeries(ctx, len(tsMap)) | ||||||||||||||||
|
||||||||||||||||
var m []*prompb.MetricMetadata | ||||||||||||||||
if prwe.exporterSettings.SendMetadata { | ||||||||||||||||
m = prometheusremotewrite.OtelMetricsToMetadata(md, prwe.exporterSettings.AddMetricSuffixes) | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
// Call export even if a conversion error, since there may be points that were successfully converted. | ||||||||||||||||
return multierr.Combine(err, prwe.handleExport(ctx, tsMap, m)) | ||||||||||||||||
return prwe.handleExport(ctx, tsMap, m) | ||||||||||||||||
} | ||||||||||||||||
} | ||||||||||||||||
|
||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,8 @@ func createMetricsExporter(ctx context.Context, set exporter.CreateSettings, | |
return nil, errors.New("invalid configuration") | ||
} | ||
|
||
prwe, err := newPRWExporter(prwCfg, set) | ||
telemetry := newPRWTelemetry(set) | ||
prwe, err := newPRWExporter(prwCfg, set, telemetry) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be done inside There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was done because of the tests. I think it is ok to create a real telemetry and then switch for a mock one. I updated the code. |
||
if err != nil { | ||
return nil, err | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should these include the component ID as an attribute? It can be important to identify which component is failing when multiple exporters of the same type are configured.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good point. done!