diff --git a/CHANGELOG.md b/CHANGELOG.md index 28d95811f054b..d1bbbbf2ec5ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -92,7 +92,7 @@ Check the history of the branch FIXME. #### Promtail ##### Enhancements - +* [7593](https://github.com/grafana/loki/pull/7593) **chodges15**: Promtail: Add tenant label to client drop metrics and logs * [7101](https://github.com/grafana/loki/pull/7101) **liguozhong**: Promtail: Add support for max stream limit. * [7247](https://github.com/grafana/loki/pull/7247) **liguozhong**: Add config reload endpoint / signal to promtail. * [6708](https://github.com/grafana/loki/pull/6708) **DylanGuedes**: Add compressed files support to Promtail. diff --git a/clients/pkg/promtail/client/client.go b/clients/pkg/promtail/client/client.go index 8f9ec07df3389..0b008377a068a 100644 --- a/clients/pkg/promtail/client/client.go +++ b/clients/pkg/promtail/client/client.go @@ -38,20 +38,22 @@ const ( LatencyLabel = "filename" HostLabel = "host" ClientLabel = "client" + TenantLabel = "tenant" ) var UserAgent = fmt.Sprintf("promtail/%s", build.Version) type Metrics struct { - encodedBytes *prometheus.CounterVec - sentBytes *prometheus.CounterVec - droppedBytes *prometheus.CounterVec - sentEntries *prometheus.CounterVec - droppedEntries *prometheus.CounterVec - requestDuration *prometheus.HistogramVec - batchRetries *prometheus.CounterVec - countersWithHost []*prometheus.CounterVec - streamLag *prometheus.GaugeVec + encodedBytes *prometheus.CounterVec + sentBytes *prometheus.CounterVec + droppedBytes *prometheus.CounterVec + sentEntries *prometheus.CounterVec + droppedEntries *prometheus.CounterVec + requestDuration *prometheus.HistogramVec + batchRetries *prometheus.CounterVec + countersWithHost []*prometheus.CounterVec + countersWithTenant []*prometheus.CounterVec + streamLag *prometheus.GaugeVec } func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics { @@ -71,7 +73,7 @@ func NewMetrics(reg prometheus.Registerer, 
streamLagLabels []string) *Metrics { Namespace: "promtail", Name: "dropped_bytes_total", Help: "Number of bytes dropped because failed to be sent to the ingester after all retries.", - }, []string{HostLabel}) + }, []string{HostLabel, TenantLabel}) m.sentEntries = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "promtail", Name: "sent_entries_total", @@ -81,7 +83,7 @@ func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics { Namespace: "promtail", Name: "dropped_entries_total", Help: "Number of log entries dropped because failed to be sent to the ingester after all retries.", - }, []string{HostLabel}) + }, []string{HostLabel, TenantLabel}) m.requestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "promtail", Name: "request_duration_seconds", @@ -91,10 +93,14 @@ func NewMetrics(reg prometheus.Registerer, streamLagLabels []string) *Metrics { Namespace: "promtail", Name: "batch_retries_total", Help: "Number of times batches has had to be retried.", - }, []string{HostLabel}) + }, []string{HostLabel, TenantLabel}) m.countersWithHost = []*prometheus.CounterVec{ - m.encodedBytes, m.sentBytes, m.droppedBytes, m.sentEntries, m.droppedEntries, m.batchRetries, + m.encodedBytes, m.sentBytes, m.sentEntries, + } + + m.countersWithTenant = []*prometheus.CounterVec{ + m.droppedBytes, m.droppedEntries, m.batchRetries, } streamLagLabelsMerged := []string{HostLabel, ClientLabel} @@ -270,6 +276,11 @@ func (c *client) run() { // If the batch doesn't exist yet, we create a new one with the entry if !ok { batches[tenantID] = newBatch(c.maxStreams, e) + // Initialize the tenant-labelled counters to 0 so the metrics are + // exported before their first increment, avoiding missing series. 
+ for _, counter := range c.metrics.countersWithTenant { + counter.WithLabelValues(c.cfg.URL.Host, tenantID).Add(0) + } break } @@ -285,8 +296,9 @@ func (c *client) run() { // The max size of the batch isn't reached, so we can add the entry err := batch.add(e) if err != nil { - level.Error(c.logger).Log("msg", "batch add err", "error", err) - c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host).Inc() + level.Error(c.logger).Log("msg", "batch add err", "tenant", tenantID, "error", err) + c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID).Add(float64(len(e.Line))) + c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID).Inc() return } case <-maxWaitCheck.C: @@ -376,8 +388,8 @@ func (c *client) sendBatch(tenantID string, batch *batch) { break } - level.Warn(c.logger).Log("msg", "error sending batch, will retry", "status", status, "error", err) - c.metrics.batchRetries.WithLabelValues(c.cfg.URL.Host).Inc() + level.Warn(c.logger).Log("msg", "error sending batch, will retry", "status", status, "tenant", tenantID, "error", err) + c.metrics.batchRetries.WithLabelValues(c.cfg.URL.Host, tenantID).Inc() backoff.Wait() // Make sure it sends at least once before checking for retry. 
@@ -387,9 +399,9 @@ func (c *client) sendBatch(tenantID string, batch *batch) { } if err != nil { - level.Error(c.logger).Log("msg", "final error sending batch", "status", status, "error", err) - c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host).Add(bufBytes) - c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host).Add(float64(entriesCount)) + level.Error(c.logger).Log("msg", "final error sending batch", "status", status, "tenant", tenantID, "error", err) + c.metrics.droppedBytes.WithLabelValues(c.cfg.URL.Host, tenantID).Add(bufBytes) + c.metrics.droppedEntries.WithLabelValues(c.cfg.URL.Host, tenantID).Add(float64(entriesCount)) } } diff --git a/clients/pkg/promtail/client/client_test.go b/clients/pkg/promtail/client/client_test.go index 81d03eaa2c5e7..b1aed0da12e5e 100644 --- a/clients/pkg/promtail/client/client_test.go +++ b/clients/pkg/promtail/client/client_test.go @@ -75,7 +75,7 @@ func TestClient_Handle(t *testing.T) { promtail_sent_entries_total{host="__HOST__"} 3.0 # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. # TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 0 + promtail_dropped_entries_total{host="__HOST__", tenant=""} 0 `, }, "batch log entries together until the batch wait time is reached": { @@ -101,7 +101,7 @@ func TestClient_Handle(t *testing.T) { promtail_sent_entries_total{host="__HOST__"} 2.0 # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. 
# TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 0 + promtail_dropped_entries_total{host="__HOST__", tenant=""} 0 `, }, "retry send a batch up to backoff's max retries in case the server responds with a 5xx": { @@ -127,7 +127,7 @@ func TestClient_Handle(t *testing.T) { expectedMetrics: ` # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. # TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 1.0 + promtail_dropped_entries_total{host="__HOST__", tenant=""} 1.0 # HELP promtail_sent_entries_total Number of log entries sent to the ingester. # TYPE promtail_sent_entries_total counter promtail_sent_entries_total{host="__HOST__"} 0 @@ -148,7 +148,7 @@ func TestClient_Handle(t *testing.T) { expectedMetrics: ` # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. # TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 1.0 + promtail_dropped_entries_total{host="__HOST__", tenant=""} 1.0 # HELP promtail_sent_entries_total Number of log entries sent to the ingester. # TYPE promtail_sent_entries_total counter promtail_sent_entries_total{host="__HOST__"} 0 @@ -177,7 +177,7 @@ func TestClient_Handle(t *testing.T) { expectedMetrics: ` # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. # TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 1.0 + promtail_dropped_entries_total{host="__HOST__", tenant=""} 1.0 # HELP promtail_sent_entries_total Number of log entries sent to the ingester. 
# TYPE promtail_sent_entries_total counter promtail_sent_entries_total{host="__HOST__"} 0 @@ -202,7 +202,7 @@ func TestClient_Handle(t *testing.T) { promtail_sent_entries_total{host="__HOST__"} 2.0 # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. # TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 0 + promtail_dropped_entries_total{host="__HOST__", tenant="tenant-default"} 0 `, }, "batch log entries together honoring the tenant ID overridden while processing the pipeline stages": { @@ -232,7 +232,9 @@ func TestClient_Handle(t *testing.T) { promtail_sent_entries_total{host="__HOST__"} 4.0 # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. # TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 0 + promtail_dropped_entries_total{host="__HOST__", tenant="tenant-1"} 0 + promtail_dropped_entries_total{host="__HOST__", tenant="tenant-2"} 0 + promtail_dropped_entries_total{host="__HOST__", tenant="tenant-default"} 0 `, }, } @@ -343,7 +345,7 @@ func TestClient_StopNow(t *testing.T) { promtail_sent_entries_total{host="__HOST__"} 3.0 # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. # TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 0 + promtail_dropped_entries_total{host="__HOST__", tenant=""} 0 `, }, { @@ -362,7 +364,7 @@ func TestClient_StopNow(t *testing.T) { expectedMetrics: ` # HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries. 
# TYPE promtail_dropped_entries_total counter - promtail_dropped_entries_total{host="__HOST__"} 1.0 + promtail_dropped_entries_total{host="__HOST__", tenant=""} 1.0 # HELP promtail_sent_entries_total Number of log entries sent to the ingester. # TYPE promtail_sent_entries_total counter promtail_sent_entries_total{host="__HOST__"} 0