promtail: ratelimiting by label #7597

Merged · 2 commits · Dec 12, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -34,6 +34,7 @@

* [7462](https://github.com/grafana/loki/pull/7462) **MarNicGit**: Allow excluding event message from Windows Event Log entries.
* [7602](https://github.com/grafana/loki/pull/7602) **vmax**: Add decolorize stage to Promtail to easily parse colored logs.
* [7597](https://github.com/grafana/loki/pull/7597) **redbaron**: Allow ratelimiting by label.

##### Fixes
* [7771](https://github.com/grafana/loki/pull/7771) **GeorgeTsilias**: Handle nil error on target Details() call.
83 changes: 69 additions & 14 deletions clients/pkg/logentry/stages/limit.go
@@ -2,24 +2,34 @@ package stages

import (
"context"
"fmt"

"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"

"github.com/grafana/loki/pkg/util"

"github.com/go-kit/log"
"github.com/mitchellh/mapstructure"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/time/rate"
)

const (
ErrLimitStageInvalidRateOrBurst = "limit stage failed to parse rate or burst"
ErrLimitStageByLabelMustDrop = "When ratelimiting by label, drop must be true"
MinReasonableMaxDistinctLabels = 10000 // 80bytes per rate.Limiter ~ 1MiB memory
)

var ratelimitDropReason = "ratelimit_drop_stage"

type LimitConfig struct {
Rate float64 `mapstructure:"rate"`
Burst int `mapstructure:"burst"`
Drop bool `mapstructure:"drop"`
Rate float64 `mapstructure:"rate"`
Burst int `mapstructure:"burst"`
Drop bool `mapstructure:"drop"`
ByLabelName string `mapstructure:"by_label_name"`
MaxDistinctLabels int `mapstructure:"max_distinct_labels"`
}

func newLimitStage(logger log.Logger, config interface{}, registerer prometheus.Registerer) (Stage, error) {
@@ -34,36 +34,61 @@ func newLimitStage(logger log.Logger, config interface{}, registerer prometheus.
return nil, err
}

logger = log.With(logger, "component", "stage", "type", "limit")
if cfg.ByLabelName != "" && cfg.MaxDistinctLabels < MinReasonableMaxDistinctLabels {
level.Warn(logger).Log(
"msg",
fmt.Sprintf("max_distinct_labels was adjusted up to the minimal reasonable value of %d", MinReasonableMaxDistinctLabels),
Contributor: Is there any rationale for not allowing values below MinReasonableMaxDistinctLabels?

Contributor Author: max_distinct_labels is a safeguard to prevent OOM in case of misconfiguration. There is no preallocation happening, so setting it lower doesn't give you any benefit. The current limit "costs" ~1MiB by my non-scientific calculation (80 bytes per rate.Limiter across 10,000 limiters) once the limit is hit, and the user needs to increase it should they need more.

Contributor Author: Just to clarify that last bit ("the user needs to increase it should they need more"): the user needs to increase the limit if the number of actively hit ratelimiters exceeds it, otherwise ratelimiting becomes inaccurate (more entries go through than should have). If some label values are transient and NOT in active use (say, pod_name), then it is perfectly fine for the system to reach the max limit and have them GCed; there is no need to increase the limit to accommodate them.

)
cfg.MaxDistinctLabels = MinReasonableMaxDistinctLabels
}

r := &limitStage{
logger: log.With(logger, "component", "stage", "type", "limit"),
cfg: cfg,
dropCount: getDropCountMetric(registerer),
rateLimiter: rate.NewLimiter(rate.Limit(cfg.Rate), cfg.Burst),
logger: logger,
cfg: cfg,
dropCount: getDropCountMetric(registerer),
}

if cfg.ByLabelName != "" {
r.dropCountByLabel = getDropCountByLabelMetric(registerer)
newRateLimiter := func() *rate.Limiter { return rate.NewLimiter(rate.Limit(cfg.Rate), cfg.Burst) }
gcCb := func() { r.dropCountByLabel.Reset() }
r.rateLimiterByLabel = util.NewGenMap[model.LabelValue, *rate.Limiter](cfg.MaxDistinctLabels, newRateLimiter, gcCb)
} else {
r.rateLimiter = rate.NewLimiter(rate.Limit(cfg.Rate), cfg.Burst)
}

return r, nil
}

func validateLimitConfig(cfg *LimitConfig) error {
if cfg.Rate <= 0 || cfg.Burst <= 0 {
return errors.Errorf(ErrLimitStageInvalidRateOrBurst)
}

if cfg.ByLabelName != "" && !cfg.Drop {
return errors.Errorf(ErrLimitStageByLabelMustDrop)
}
return nil
}

// limitStage applies Label matchers to determine if the include stages should be run
type limitStage struct {
logger log.Logger
cfg *LimitConfig
rateLimiter *rate.Limiter
dropCount *prometheus.CounterVec
logger log.Logger
cfg *LimitConfig
rateLimiter *rate.Limiter
rateLimiterByLabel util.GenerationalMap[model.LabelValue, *rate.Limiter]
dropCount *prometheus.CounterVec
dropCountByLabel *prometheus.CounterVec
byLabelName model.LabelName
}

func (m *limitStage) Run(in chan Entry) chan Entry {
out := make(chan Entry)
go func() {
defer close(out)
for e := range in {
if !m.shouldThrottle() {
if !m.shouldThrottle(e.Labels) {
out <- e
continue
}
@@ -72,7 +107,21 @@ func (m *limitStage) Run(in chan Entry) chan Entry {
return out
}

func (m *limitStage) shouldThrottle() bool {
func (m *limitStage) shouldThrottle(labels model.LabelSet) bool {
if m.cfg.ByLabelName != "" {
labelValue, ok := labels[model.LabelName(m.cfg.ByLabelName)]
if !ok {
return false // if no label is found, don't ratelimit
}
rl := m.rateLimiterByLabel.GetOrCreate(labelValue)
if rl.Allow() {
return false
}
m.dropCount.WithLabelValues(ratelimitDropReason).Inc()
m.dropCountByLabel.WithLabelValues(m.cfg.ByLabelName, string(labelValue)).Inc()
return true
}

if m.cfg.Drop {
if m.rateLimiter.Allow() {
return false
@@ -88,3 +137,9 @@
func (m *limitStage) Name() string {
return StageTypeLimit
}

func getDropCountByLabelMetric(registerer prometheus.Registerer) *prometheus.CounterVec {
return util.RegisterCounterVec(registerer, "logentry", "dropped_lines_by_label_total",
"A count of all log lines dropped as a result of a pipeline stage",
[]string{"label_name", "label_value"})
}
77 changes: 75 additions & 2 deletions clients/pkg/logentry/stages/limit_test.go
@@ -38,10 +38,31 @@
drop: true
`

var testLimitByLabelYaml = `
pipeline_stages:
- json:
    expressions:
      app:
      msg:
- limit:
    rate: 1
    burst: 1
    drop: true
    by_label_name: app
`

var testNonAppLogLine = `
{
"time":"2012-11-01T22:08:41+00:00",
"msg" : "Non app log line"
}
`

var plName = "testPipeline"

// TestLimitPipeline is used to verify we properly parse the yaml config and create a working pipeline
func TestLimitWaitPipeline(t *testing.T) {
registry := prometheus.NewRegistry()
plName := "testPipeline"
pl, err := NewPipeline(util_log.Logger, loadConfig(testLimitWaitYaml), &plName, registry)
logs := make([]Entry, 0)
logCount := 5
@@ -60,7 +81,6 @@ func TestLimitWaitPipeline(t *testing.T) {
// TestLimitPipeline is used to verify we properly parse the yaml config and create a working pipeline
func TestLimitDropPipeline(t *testing.T) {
registry := prometheus.NewRegistry()
plName := "testPipeline"
pl, err := NewPipeline(util_log.Logger, loadConfig(testLimitDropYaml), &plName, registry)
logs := make([]Entry, 0)
logCount := 10
@@ -75,3 +95,56 @@ func TestLimitDropPipeline(t *testing.T) {
assert.Len(t, out, 1)
assert.Equal(t, out[0].Line, testMatchLogLineApp1)
}

// TestLimitByLabelPipeline is used to verify we properly parse the yaml config and create a working pipeline
func TestLimitByLabelPipeline(t *testing.T) {
registry := prometheus.NewRegistry()
pl, err := NewPipeline(util_log.Logger, loadConfig(testLimitByLabelYaml), &plName, registry)
logs := make([]Entry, 0)
logCount := 5
for i := 0; i < logCount; i++ {
logs = append(logs, newEntry(nil, model.LabelSet{"app": "loki"}, testMatchLogLineApp1, time.Now()))
}
for i := 0; i < logCount; i++ {
logs = append(logs, newEntry(nil, model.LabelSet{"app": "poki"}, testMatchLogLineApp2, time.Now()))
}
for i := 0; i < logCount; i++ {
logs = append(logs, newEntry(nil, model.LabelSet{}, testNonAppLogLine, time.Now()))
}
require.NoError(t, err)
out := processEntries(pl,
logs...,
)
// Only one entry of each app will go through + all log lines without expected label
assert.Len(t, out, 2+logCount)
assert.Equal(t, out[0].Line, testMatchLogLineApp1)
assert.Equal(t, out[1].Line, testMatchLogLineApp2)
assert.Equal(t, out[3].Line, testNonAppLogLine)

var hasTotal, hasByLabel bool
mfs, _ := registry.Gather()
for _, mf := range mfs {
if *mf.Name == "logentry_dropped_lines_total" {
hasTotal = true
assert.Len(t, mf.Metric, 1)
assert.Equal(t, 8, int(mf.Metric[0].Counter.GetValue()))
} else if *mf.Name == "logentry_dropped_lines_by_label_total" {
hasByLabel = true
assert.Len(t, mf.Metric, 2)
assert.Equal(t, 4, int(mf.Metric[0].Counter.GetValue()))
assert.Equal(t, 4, int(mf.Metric[1].Counter.GetValue()))

assert.Equal(t, mf.Metric[0].Label[0].GetName(), "label_name")
assert.Equal(t, mf.Metric[0].Label[0].GetValue(), "app")
assert.Equal(t, mf.Metric[0].Label[1].GetName(), "label_value")
assert.Equal(t, mf.Metric[0].Label[1].GetValue(), "loki")

assert.Equal(t, mf.Metric[1].Label[0].GetName(), "label_name")
assert.Equal(t, mf.Metric[1].Label[0].GetValue(), "app")
assert.Equal(t, mf.Metric[1].Label[1].GetName(), "label_value")
assert.Equal(t, mf.Metric[1].Label[1].GetValue(), "poki")
}
}
assert.True(t, hasTotal)
assert.True(t, hasByLabel)
}
22 changes: 22 additions & 0 deletions docs/sources/clients/promtail/stages/limit.md
@@ -17,6 +17,13 @@ limit:

# The cap in the quantity of burst lines that Promtail will push to Loki
[burst: <int>]

# Ratelimit each label value independently. If the label is not found, the log line is
# not considered for ratelimiting. Drop must be true if this is set.
[by_label_name: <string>]

# When ratelimiting by label is enabled, keep track of at most this many last used label values
[max_distinct_labels: <int> | default = 10000]

# When drop is true, log lines that exceed the current rate limit will be discarded.
# When drop is false, log lines that exceed the current rate limit will only wait
@@ -56,3 +63,18 @@
```

Would throttle any log line and drop logs once the rate limit is hit.

#### Ratelimit by a label
Contributor: I think it's worth mentioning what the difference is between these two:

- limit:
    rate: 10
    burst: 10
    drop: true
    by_label_name: "namespace"

and this:

match:
  selector: '{namespace="foo"}'
  stages:
  - limit:
      rate: 5
      drop: true
match:
  selector: '{namespace!="foo"}'
  stages:
  - limit:
      rate: 100
      drop: false

Like @liguozhong already mentioned, my first thought would also be to use the latter config instead of this new one. I see you explained that with by_label_name we get a separate ratelimiter per label value. Wondering what advantage that serves?

Contributor Author: It allows you to have a rate limit per label without knowing the label values upfront. Let's say you can set a rate limit per namespace or app, but you don't need to reconfigure Loki every time a new app comes into the cluster.


Given the pipeline:

```yaml
- limit:
    rate: 10
    burst: 10
    drop: true
    by_label_name: "namespace"
```

Would ratelimit messages originating from each namespace independently.
Any message without the namespace label will not be ratelimited.
39 changes: 39 additions & 0 deletions pkg/util/map.go
@@ -0,0 +1,39 @@
package util

type GenerationalMap[K comparable, V any] struct {
oldgen map[K]V
newgen map[K]V

maxSize int
newV func() V
gcCb func()
}

// NewGenMap creates a GenerationalMap that maintains at most maxSize recently used entries
func NewGenMap[K comparable, V any](maxSize int, newV func() V, gcCb func()) GenerationalMap[K, V] {
Contributor: This would be our first generic code in Loki, I believe. Will wait for a few more eyes from contributors. Can we make it a bit more readable, maybe? I'm failing to understand what newV and gcCb mean here. Could we have better naming and some descriptive comments on those? If we're going with this generic map, having a few tests for it would be good.

Contributor: Just started to wonder why we need this map in the first place. I see it's only used in one place and could be a simple map[string]*rate.Limiter. Trying to understand the rationale here.

Contributor Author: newV — constructor for a new value; gcCb — garbage-collection callback. I am happy to use any names you suggest for these or any other identifiers. The purpose of this map is to keep at most a maximum number of recently used entries and GC the rest. The difference from an LRU is that it doesn't pessimize the common case of NOT reaching the limit.

return GenerationalMap[K, V]{
newgen: make(map[K]V),
maxSize: maxSize,
newV: newV,
gcCb: gcCb,
}
}

func (m *GenerationalMap[K, T]) GetOrCreate(key K) T {
v, ok := m.newgen[key]
if !ok {
if v, ok = m.oldgen[key]; !ok {
v = m.newV()
}
m.newgen[key] = v

if len(m.newgen) == m.maxSize {
m.oldgen = m.newgen
Contributor: Can we add some mechanism to clean up oldgen after some time if it's untouched? I think there might be a case where we get a spike of values, so we populate oldgen, and after the values stabilise the rate limiters will live in oldgen forever...

Contributor Author: Potentially I could trigger GC on a timer, but it doesn't feel right to do, because:

  • it affects ratelimiting accuracy
  • because of the above, the GC period would likely need to be exposed as a configuration variable for users to tune
  • the net win is 1-2MiB of memory at most, with no win in CPU utilization

Overall it isn't worth it, IMHO.

m.newgen = make(map[K]T)
if m.gcCb != nil {
m.gcCb()
}
}
}
return v
}
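The thread above describes the map's intent: keep at most maxSize recently used entries and GC the rest. A minimal usage sketch of how the limit stage leans on GetOrCreate — assuming the helper is importable as github.com/grafana/loki/pkg/util, and using illustrative rate/burst values rather than the stage's actual configuration:

```go
package main

import (
	"fmt"

	"github.com/prometheus/common/model"
	"golang.org/x/time/rate"

	"github.com/grafana/loki/pkg/util"
)

func main() {
	// One limiter per label value, created lazily on first use.
	newLimiter := func() *rate.Limiter { return rate.NewLimiter(rate.Limit(1), 1) }
	// Called whenever a full generation is discarded; the stage uses this hook
	// to reset its per-label drop counters.
	onGC := func() { fmt.Println("old generation dropped") }

	limiters := util.NewGenMap[model.LabelValue, *rate.Limiter](10000, newLimiter, onGC)

	for _, app := range []model.LabelValue{"loki", "loki", "promtail"} {
		if limiters.GetOrCreate(app).Allow() {
			fmt.Println("pass:", app)
		} else {
			fmt.Println("throttle:", app)
		}
	}
}
```

With rate 1 and burst 1, the second "loki" entry is throttled by the limiter created on the first call, while "promtail" lazily gets its own fresh limiter and passes — which is the per-label-value behaviour the stage relies on.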
20 changes: 20 additions & 0 deletions pkg/util/metrics_helper.go
@@ -805,3 +805,23 @@ type CollectorVec interface {
prometheus.Collector
Delete(labels prometheus.Labels) bool
}

// RegisterCounterVec registers a new CounterVec with the given namespace, name and labels.
// If the metric was already registered, it returns the existing instance.
func RegisterCounterVec(registerer prometheus.Registerer, namespace, name, help string, labels []string) *prometheus.CounterVec {
Contributor: Q: what's the problem with using the existing prometheus.MustRegister helper here? I would let it panic on duplicate registration rather than handling it via prometheus.AlreadyRegisteredError. Am I missing a specific use case?

Contributor Author (redbaron, Dec 6, 2022): This function mimics getDropCountMetric from match.go, which this stage used to use. Now that we need two metrics, I created this helper function, keeping the old behaviour exactly as is.

vec := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Name: name,
Help: help,
}, labels)
err := registerer.Register(vec)
if err != nil {
if existing, ok := err.(prometheus.AlreadyRegisteredError); ok {
vec = existing.ExistingCollector.(*prometheus.CounterVec)
} else {
// Same behavior as MustRegister if the error is not for AlreadyRegistered
panic(err)
}
}
return vec
}
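A small sketch of the duplicate-registration behaviour discussed in the thread above — assuming the helper is importable as github.com/grafana/loki/pkg/util; the metric name and labels simply reuse the ones from the limit stage for illustration:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/loki/pkg/util"
)

func main() {
	reg := prometheus.NewRegistry()

	// First call creates the vector and registers it.
	a := util.RegisterCounterVec(reg, "logentry", "dropped_lines_by_label_total",
		"A count of all log lines dropped as a result of a pipeline stage",
		[]string{"label_name", "label_value"})

	// A second call with an identical descriptor hits AlreadyRegisteredError and
	// returns the collector registered above instead of panicking.
	b := util.RegisterCounterVec(reg, "logentry", "dropped_lines_by_label_total",
		"A count of all log lines dropped as a result of a pipeline stage",
		[]string{"label_name", "label_value"})

	fmt.Println(a == b) // true
}
```

Registering a genuinely conflicting vector (different help text or label set) would not produce an AlreadyRegisteredError, so the helper would still panic, matching MustRegister semantics for real conflicts.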