Skip to content

Commit

Permalink
Implement suggestions; Make naming consistent; Rm/Add metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>
  • Loading branch information
saswatamcode committed Jul 20, 2022
1 parent d8797ca commit b647b54
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 71 deletions.
8 changes: 4 additions & 4 deletions cmd/thanos/receive.go
Original file line number Diff line number Diff line change
Expand Up @@ -831,15 +831,15 @@ func (rc *receiveConfig) registerFlag(cmd extkingpin.FlagClause) {

cmd.Flag("receive.replication-factor", "How many times to replicate incoming write requests.").Default("1").Uint64Var(&rc.replicationFactor)

cmd.Flag("receive.per-tenant-limit", "The total number of active series that a tenant is allowed to have within a hashring topology.").Uint64Var(&rc.maxPerTenantLimit)
cmd.Flag("receive.tenant-limits.max-head-series", "The total number of active or HEAD series that a tenant is allowed to have within a Receive topology.").Uint64Var(&rc.maxPerTenantLimit)

cmd.Flag("receive.limit-meta-monitoring.url", "Meta-monitoring URL which is compatible with Prometheus Query API for active series limiting.").Default("http://localhost:9090").URLVar(&rc.metaMonitoringUrl)
cmd.Flag("receive.tenant-limits.meta-monitoring-url", "Meta-monitoring URL which is compatible with Prometheus Query API for active series limiting.").Default("http://localhost:9090").URLVar(&rc.metaMonitoringUrl)

cmd.Flag("receive.limit-meta-monitoring.query", "PromQL Query to execute against meta-monitoring, to get the current number of active series for each tenant, across Receive replicas.").Default("sum(prometheus_tsdb_head_series) by (tenant)").StringVar(&rc.metaMonitoringLimitQuery)
cmd.Flag("receive.tenant-limits.meta-monitoring-query", "PromQL Query to execute against meta-monitoring, to get the current number of active series for each tenant, across Receive replicas.").Default("sum(prometheus_tsdb_head_series) by (tenant)").StringVar(&rc.metaMonitoringLimitQuery)

rc.metaMonitoringHttpClient = extflag.RegisterPathOrContent(
cmd,
"receive.limit-meta-monitoring.http-client",
"receive.tenant-limits.meta-monitoring-client",
"YAML file or string with http client configs for meta-monitoring.",
)

Expand Down
59 changes: 31 additions & 28 deletions docs/components/receive.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,19 @@ With such configuration any receive listens for remote write on `<ip>10908/api/v

## Limiting

Thanos Receive, in Router mode, supports limiting tenant active series, to maintain stability of the system. It uses any Prometheus Query API compatible meta-monitoring solution to get the current number of active series, and compares that with a configured limit, before ingesting any tenant's remote write request. In case a tenant has gone above the limit, their remote write requests are failed fully.
Thanos Receive, in Router or RouterIngestor mode, supports limiting tenant active or HEAD series, to maintain stability of the system. It uses any Prometheus Query API compatible meta-monitoring solution to get the current number of active series, and compares that with a configured limit, before ingesting any tenant's remote write request. If a tenant exceeds the limit, their remote write requests fail entirely.

This can be used by specifying the following flags,
- `--receive.per-tenant-limit`: Specifies the total number of active series for any tenant, across all replicas (including data replication), allowed by Thanos Receive.
- `--receive.limit-meta-monitoring.url`: Specifies Prometheus Query API compatible meta-monitoring endpoint.
- `--receive.limit-meta-monitoring.query`: Optional flag to specify PromQL query to execute against meta-monitoring.
- `receive.limit-meta-monitoring.http-client`: Optional YAML file/string specifying HTTP client config for meta-monitoring.
Meta-monitoring in this context refers to an external monitoring system scraping all Thanos Receive instances and exposing them in an API compatible with the Prometheus Query API.

To use the feature, one should specify the following flags:
- `--receive.tenant-limits.max-head-series`: Specifies the total number of active or HEAD series for any tenant, across all replicas (including data replication), allowed by Thanos Receive.
- `--receive.tenant-limits.meta-monitoring-url`: Specifies Prometheus Query API compatible meta-monitoring endpoint.
- `--receive.tenant-limits.meta-monitoring-query`: Optional flag to specify PromQL query to execute against meta-monitoring.
- `--receive.tenant-limits.meta-monitoring-client`: Optional YAML file/string specifying HTTP client config for meta-monitoring.

NOTE:
- It is possible that Receive ingests more active series than the specified limit, as it relies on meta-monitoring, which may not have the latest data for current number of active series of a tenant at all times.
- Thanos Receive performs best-effort limting. In case meta-monitoring is down/unreachable, Thanos Receive will not impose limits.
- Thanos Receive performs best-effort limiting. In case meta-monitoring is down/unreachable, Thanos Receive will not impose limits.

## Flags

Expand Down Expand Up @@ -166,33 +168,12 @@ Flags:
--receive.hashrings-file-refresh-interval=5m
Refresh interval to re-read the hashring
configuration file. (used as a fallback)
--receive.limit-meta-monitoring.http-client=<content>
Alternative to
'receive.limit-meta-monitoring.http-client-file'
flag (mutually exclusive). Content of YAML file
or string with http client configs for
meta-monitoring.
--receive.limit-meta-monitoring.http-client-file=<file-path>
Path to YAML file or string with http client
configs for meta-monitoring.
--receive.limit-meta-monitoring.query="sum(prometheus_tsdb_head_series) by (tenant)"
PromQL Query to execute against
meta-monitoring, to get the current number of
active series for each tenant, across Receive
replicas.
--receive.limit-meta-monitoring.url=http://localhost:9090
Meta-monitoring URL which is compatible with
Prometheus Query API for active series
limiting.
--receive.local-endpoint=RECEIVE.LOCAL-ENDPOINT
Endpoint of local receive node. Used to
identify the local node in the hashring
configuration. If it's empty AND hashring
configuration was provided, it means that
receive will run in RoutingOnly mode.
--receive.per-tenant-limit=RECEIVE.PER-TENANT-LIMIT
The total number of active series that a tenant
is allowed to have within a hashring topology.
--receive.relabel-config=<content>
Alternative to 'receive.relabel-config-file'
flag (mutually exclusive). Content of YAML file
Expand All @@ -218,6 +199,28 @@ Flags:
--receive.tenant-label-name="tenant_id"
Label name through which the tenant will be
announced.
--receive.tenant-limits.max-head-series=RECEIVE.TENANT-LIMITS.MAX-HEAD-SERIES
The total number of active or HEAD series that
a tenant is allowed to have within a Receive
topology.
--receive.tenant-limits.meta-monitoring-client=<content>
Alternative to
'receive.tenant-limits.meta-monitoring-client-file'
flag (mutually exclusive). Content of YAML file
or string with http client configs for
meta-monitoring.
--receive.tenant-limits.meta-monitoring-client-file=<file-path>
Path to YAML file or string with http client
configs for meta-monitoring.
--receive.tenant-limits.meta-monitoring-query="sum(prometheus_tsdb_head_series) by (tenant)"
PromQL Query to execute against
meta-monitoring, to get the current number of
active series for each tenant, across Receive
replicas.
--receive.tenant-limits.meta-monitoring-url=http://localhost:9090
Meta-monitoring URL which is compatible with
Prometheus Query API for active series
limiting.
--remote-write.address="0.0.0.0:19291"
Address to listen on for remote write requests.
--remote-write.client-server-name=""
Expand Down
24 changes: 12 additions & 12 deletions pkg/receive/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ type Handler struct {
replications *prometheus.CounterVec
replicationFactor prometheus.Gauge
configuredTenantLimit prometheus.Gauge
aboveLimit *prometheus.GaugeVec
limitedRequests *prometheus.CounterVec
metaMonitoringErr *prometheus.CounterVec

writeSamplesTotal *prometheus.HistogramVec
writeTimeseriesTotal *prometheus.HistogramVec
Expand Down Expand Up @@ -178,20 +178,20 @@ func NewHandler(logger log.Logger, o *Options) *Handler {
),
configuredTenantLimit: promauto.With(registerer).NewGauge(
prometheus.GaugeOpts{
Name: "thanos_receive_tenant_active_series_limit",
Help: "The configured limit for active series of tenants.",
Name: "thanos_receive_tenant_head_series_limit",
Help: "The configured limit for active or HEAD series of tenants.",
},
),
aboveLimit: promauto.With(registerer).NewGaugeVec(
prometheus.GaugeOpts{
Name: "thanos_receive_series_above_limit",
Help: "The difference between current number of active series and set limit.",
limitedRequests: promauto.With(registerer).NewCounterVec(
prometheus.CounterOpts{
Name: "thanos_receive_head_series_limited_requests_total",
Help: "The total number of remote write requests that have been dropped due to head series limiting.",
}, []string{"tenant"},
),
limitedRequests: promauto.With(registerer).NewCounterVec(
metaMonitoringErr: promauto.With(registerer).NewCounterVec(
prometheus.CounterOpts{
Name: "thanos_receive_limited_requests_total",
Help: "The total number of remote write requests that have been dropped due to limiting.",
Name: "thanos_receive_metamonitoring_failed_queries_total",
Help: "The total number of meta-monitoring queries that failed while limiting.",
}, []string{"tenant"},
),
writeTimeseriesTotal: promauto.With(registerer).NewHistogramVec(
Expand Down Expand Up @@ -556,6 +556,7 @@ func (h *Handler) isUnderLimit(ctx context.Context, tenant string, logger log.Lo

vectorRes, _, err := c.QueryInstant(ctx, h.options.MetaMonitoringUrl, h.options.MetaMonitoringLimitQuery, time.Now(), promclient.QueryOptions{})
if err != nil {
h.metaMonitoringErr.WithLabelValues(tenant).Inc()
return true, errors.Wrap(err, "failed to query meta-monitoring")
}

Expand All @@ -570,9 +571,8 @@ func (h *Handler) isUnderLimit(ctx context.Context, tenant string, logger log.Lo
for k, v := range e.Metric {
// Search for metric which has tenant label for a particular tenant.
if k == "tenant" && string(v) == tenant {
h.aboveLimit.WithLabelValues(tenant).Set(float64(e.Value) - float64(h.options.MaxPerTenantLimit))
if float64(e.Value) >= float64(h.options.MaxPerTenantLimit) {
level.Error(logger).Log("msg", "tenant above limit", "currentSeries", float64(e.Value))
level.Error(logger).Log("msg", "tenant above limit", "currentSeries", float64(e.Value), "limit", h.options.MaxPerTenantLimit)
h.limitedRequests.WithLabelValues(tenant).Inc()
return false, nil
}
Expand Down
6 changes: 3 additions & 3 deletions test/e2e/e2ethanos/services.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,10 +489,10 @@ func (r *ReceiveBuilder) Init() e2e.InstrumentedRunnable {
}

if r.limit != 0 && r.metamonitoring != "" {
args["--receive.per-tenant-limit"] = fmt.Sprintf("%v", r.limit)
args["--receive.limit-meta-monitoring.url"] = r.metamonitoring
args["--receive.tenant-limits.max-head-series"] = fmt.Sprintf("%v", r.limit)
args["--receive.tenant-limits.meta-monitoring-url"] = r.metamonitoring
if r.metamonitoringQuery != "" {
args["--receive.limit-meta-monitoring.query"] = r.metamonitoringQuery
args["--receive.tenant-limits.meta-monitoring-query"] = r.metamonitoringQuery
}
}

Expand Down
37 changes: 37 additions & 0 deletions test/e2e/query_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,43 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin
return result
}

// queryWaitAndAssert repeatedly executes the instant query q against the Query
// API at addr (retrying every 5s until ctx is done) and waits until the result
// — sorted and with timestamps zeroed — exactly equals expected, then asserts
// that equality. Warnings or a result-size mismatch cause a retry.
func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected model.Vector) {
	t.Helper()

	fmt.Println("queryWaitAndAssert: Waiting for", len(expected), "results for query", q())
	var result model.Vector

	logger := log.NewLogfmtLogger(os.Stdout)
	logger = log.With(logger, "ts", log.DefaultTimestampUTC)
	testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error {
		res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts)
		if err != nil {
			return err
		}

		if len(warnings) > 0 {
			return errors.Errorf("unexpected warnings %s", warnings)
		}

		if len(res) != len(expected) {
			return errors.Errorf("unexpected result size, expected %d; result %d: %v", len(expected), len(res), res)
		}
		result = res
		sortResults(result)
		// Timestamps vary between retries and are irrelevant for the comparison.
		for _, r := range result {
			r.Timestamp = 0
		}

		// Retry until the result matches the expected vector exactly.
		if reflect.DeepEqual(expected, result) {
			return nil
		}
		// Include both vectors so the retry log shows what is still different.
		return errors.Errorf("unexpected result: expected %v; got %v", expected, result)
	}))

	testutil.Equals(t, expected, result)
}

func queryAndAssertSeries(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected []model.Metric) {
t.Helper()

Expand Down
Loading

0 comments on commit b647b54

Please sign in to comment.