From 3f40ffc13619997802d86de0bfc3642a852d7af0 Mon Sep 17 00:00:00 2001 From: Emmanuel T Odeke Date: Wed, 14 Apr 2021 16:24:20 -0700 Subject: [PATCH] exporter/prometheusremotewriter: sort Sample by Timestamp to avoid out of order errors Ensures that before a prompb.WriteRequest is created, that the TimeSeries contained have their Sample values sorted chronologically by Prometheus to avoid out of order errors reported by Prometheus barfing. Thanks to @rakyll for a reproducer and for diagnosing the problem, which helped distill the issue from a very complex setup that required super expensive Kubernetes clusters with many replicas, but essentially the problem became more apparently when the number of TimeSeries grew. It is important to note that the presence of such a bug signifies that with a large number of replicas are being scraped from, this stalls scraping and takes a long time which means that targets scraped in a round-robin fashion experience staleness when many. This might be even more reasons for setups to adopt a push model as opposed to scrape endpoints. Fixes #2315 Fixes https://github.com/open-telemetry/wg-prometheus/issues/10 --- .../prometheusremotewriteexporter/helper.go | 17 ++- .../helper_test.go | 113 ++++++++++++++++++ 2 files changed, 129 insertions(+), 1 deletion(-) diff --git a/exporter/prometheusremotewriteexporter/helper.go b/exporter/prometheusremotewriteexporter/helper.go index ac40651e3549..8c7164ee7b9c 100644 --- a/exporter/prometheusremotewriteexporter/helper.go +++ b/exporter/prometheusremotewriteexporter/helper.go @@ -476,10 +476,25 @@ func addSingleDoubleSummaryDataPoint(pt *otlp.DoubleSummaryDataPoint, metric *ot } } +func orderBySampleTimestamp(tsArray []prompb.TimeSeries) []prompb.TimeSeries { + for i := range tsArray { + sL := tsArray[i].Samples + sort.Slice(sL, func(i, j int) bool { + return sL[i].Timestamp < sL[j].Timestamp + }) + tsArray[i].Samples = sL + } + return tsArray +} + func convertTimeseriesToRequest(tsArray []prompb.TimeSeries) *prompb.WriteRequest { // the remote_write endpoint only requires the timeseries. // otlp defines it's own way to handle metric metadata return &prompb.WriteRequest{ - Timeseries: tsArray, + // Prometheus requires time series to be sorted by Timestamp to avoid out of order problems. + // See: + // * https://github.com/open-telemetry/wg-prometheus/issues/10 + // * https://github.com/open-telemetry/opentelemetry-collector/issues/2315 + Timeseries: orderBySampleTimestamp(tsArray), } } diff --git a/exporter/prometheusremotewriteexporter/helper_test.go b/exporter/prometheusremotewriteexporter/helper_test.go index 86355c15649b..4ed96c0a73d1 100644 --- a/exporter/prometheusremotewriteexporter/helper_test.go +++ b/exporter/prometheusremotewriteexporter/helper_test.go @@ -367,3 +367,116 @@ func Test_batchTimeSeries(t *testing.T) { }) } } + +// Ensure that before a prompb.WriteRequest is created, that the points per TimeSeries +// are sorted by Timestamp value, to prevent Prometheus from barfing when it gets poorly +// sorted values. See issues: +// * https://github.com/open-telemetry/wg-prometheus/issues/10 +// * https://github.com/open-telemetry/opentelemetry-collector/issues/2315 +func TestEnsureTimeseriesPointsAreSortedByTimestamp(t *testing.T) { + outOfOrder := []prompb.TimeSeries{ + { + Samples: []prompb.Sample{ + { + Value: 10.11, + Timestamp: 1000, + }, + { + Value: 7.81, + Timestamp: 2, + }, + { + Value: 987.81, + Timestamp: 1, + }, + { + Value: 18.22, + Timestamp: 999, + }, + }, + }, + { + Samples: []prompb.Sample{ + { + Value: 99.91, + Timestamp: 5, + }, + { + Value: 4.33, + Timestamp: 3, + }, + { + Value: 47.81, + Timestamp: 4, + }, + { + Value: 18.22, + Timestamp: 8, + }, + }, + }, + } + got := convertTimeseriesToRequest(outOfOrder) + + // We must ensure that the resulting Timeseries' sample points are sorted by Timestamp. + want := &prompb.WriteRequest{ + Timeseries: []prompb.TimeSeries{ + { + Samples: []prompb.Sample{ + { + Value: 987.81, + Timestamp: 1, + }, + { + Value: 7.81, + Timestamp: 2, + }, + { + Value: 18.22, + Timestamp: 999, + }, + { + Value: 10.11, + Timestamp: 1000, + }, + }, + }, + { + Samples: []prompb.Sample{ + { + Value: 4.33, + Timestamp: 3, + }, + { + Value: 47.81, + Timestamp: 4, + }, + { + Value: 99.91, + Timestamp: 5, + }, + { + Value: 18.22, + Timestamp: 8, + }, + }, + }, + }, + } + assert.Equal(t, got, want) + + // For a full sanity/logical check, assert that EVERY + // Sample has a Timestamp bigger than its prior values. + for ti, ts := range got.Timeseries { + for i := range ts.Samples { + si := ts.Samples[i] + for j := 0; j < i; j++ { + sj := ts.Samples[j] + if sj.Timestamp > si.Timestamp { + t.Errorf("Timeseries[%d]: Sample[%d].Timestamp(%d) > Sample[%d].Timestamp(%d)", + ti, j, sj.Timestamp, i, si.Timestamp) + } + } + } + } +}