Skip to content

Commit

Permalink
Implement suggestions; Make naming consistent; Rm/Add metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>
  • Loading branch information
saswatamcode committed Jul 20, 2022
1 parent d8797ca commit b647b54
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 71 deletions.
8 changes: 4 additions & 4 deletions cmd/thanos/receive.go
Original file line number Diff line number Diff line change
Expand Up @@ -831,15 +831,15 @@ func (rc *receiveConfig) registerFlag(cmd extkingpin.FlagClause) {

cmd.Flag("receive.replication-factor", "How many times to replicate incoming write requests.").Default("1").Uint64Var(&rc.replicationFactor)

cmd.Flag("receive.per-tenant-limit", "The total number of active series that a tenant is allowed to have within a hashring topology.").Uint64Var(&rc.maxPerTenantLimit)
cmd.Flag("receive.tenant-limits.max-head-series", "The total number of active or HEAD series that a tenant is allowed to have within a Receive topology.").Uint64Var(&rc.maxPerTenantLimit)

cmd.Flag("receive.limit-meta-monitoring.url", "Meta-monitoring URL which is compatible with Prometheus Query API for active series limiting.").Default("http://localhost:9090").URLVar(&rc.metaMonitoringUrl)
cmd.Flag("receive.tenant-limits.meta-monitoring-url", "Meta-monitoring URL which is compatible with Prometheus Query API for active series limiting.").Default("http://localhost:9090").URLVar(&rc.metaMonitoringUrl)

cmd.Flag("receive.limit-meta-monitoring.query", "PromQL Query to execute against meta-monitoring, to get the current number of active series for each tenant, across Receive replicas.").Default("sum(prometheus_tsdb_head_series) by (tenant)").StringVar(&rc.metaMonitoringLimitQuery)
cmd.Flag("receive.tenant-limits.meta-monitoring-query", "PromQL Query to execute against meta-monitoring, to get the current number of active series for each tenant, across Receive replicas.").Default("sum(prometheus_tsdb_head_series) by (tenant)").StringVar(&rc.metaMonitoringLimitQuery)

rc.metaMonitoringHttpClient = extflag.RegisterPathOrContent(
cmd,
"receive.limit-meta-monitoring.http-client",
"receive.tenant-limits.meta-monitoring-client",
"YAML file or string with http client configs for meta-monitoring.",
)

Expand Down
59 changes: 31 additions & 28 deletions docs/components/receive.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,19 @@ With such configuration any receive listens for remote write on `<ip>10908/api/v

## Limiting

Thanos Receive, in Router mode, supports limiting tenant active series, to maintain stability of the system. It uses any Prometheus Query API compatible meta-monitoring solution to get the current number of active series, and compares that with a configured limit, before ingesting any tenant's remote write request. In case a tenant has gone above the limit, their remote write requests are failed fully.
Thanos Receive, in Router or RouterIngestor mode, supports limiting tenant active or HEAD series, to maintain stability of the system. It uses any Prometheus Query API compatible meta-monitoring solution to get the current number of active series, and compares that with a configured limit, before ingesting any tenant's remote write request. If a tenant exceeds the limit, their remote write requests fail entirely.

This can be used by specifying the following flags,
- `--receive.per-tenant-limit`: Specifies the total number of active series for any tenant, across all replicas (including data replication), allowed by Thanos Receive.
- `--receive.limit-meta-monitoring.url`: Specifies Prometheus Query API compatible meta-monitoring endpoint.
- `--receive.limit-meta-monitoring.query`: Optional flag to specify PromQL query to execute against meta-monitoring.
- `receive.limit-meta-monitoring.http-client`: Optional YAML file/string specifying HTTP client config for meta-monitoring.
Meta-monitoring in this context refers to an external monitoring system scraping all Thanos Receive instances and exposing them in an API compatible with the Prometheus Query API.

To use the feature, one should specify the following flags:
- `--receive.tenant-limits.max-head-series`: Specifies the total number of active or HEAD series for any tenant, across all replicas (including data replication), allowed by Thanos Receive.
- `--receive.tenant-limits.meta-monitoring-url`: Specifies Prometheus Query API compatible meta-monitoring endpoint.
- `--receive.tenant-limits.meta-monitoring-query`: Optional flag to specify PromQL query to execute against meta-monitoring.
- `--receive.tenant-limits.meta-monitoring-client`: Optional YAML file/string specifying HTTP client config for meta-monitoring.

NOTE:
- It is possible that Receive ingests more active series than the specified limit, as it relies on meta-monitoring, which may not have the latest data for current number of active series of a tenant at all times.
- Thanos Receive performs best-effort limting. In case meta-monitoring is down/unreachable, Thanos Receive will not impose limits.
- Thanos Receive performs best-effort limiting. In case meta-monitoring is down/unreachable, Thanos Receive will not impose limits.

## Flags

Expand Down Expand Up @@ -166,33 +168,12 @@ Flags:
--receive.hashrings-file-refresh-interval=5m
Refresh interval to re-read the hashring
configuration file. (used as a fallback)
--receive.limit-meta-monitoring.http-client=<content>
Alternative to
'receive.limit-meta-monitoring.http-client-file'
flag (mutually exclusive). Content of YAML file
or string with http client configs for
meta-monitoring.
--receive.limit-meta-monitoring.http-client-file=<file-path>
Path to YAML file or string with http client
configs for meta-monitoring.
--receive.limit-meta-monitoring.query="sum(prometheus_tsdb_head_series) by (tenant)"
PromQL Query to execute against
meta-monitoring, to get the current number of
active series for each tenant, across Receive
replicas.
--receive.limit-meta-monitoring.url=http://localhost:9090
Meta-monitoring URL which is compatible with
Prometheus Query API for active series
limiting.
--receive.local-endpoint=RECEIVE.LOCAL-ENDPOINT
Endpoint of local receive node. Used to
identify the local node in the hashring
configuration. If it's empty AND hashring
configuration was provided, it means that
receive will run in RoutingOnly mode.
--receive.per-tenant-limit=RECEIVE.PER-TENANT-LIMIT
The total number of active series that a tenant
is allowed to have within a hashring topology.
--receive.relabel-config=<content>
Alternative to 'receive.relabel-config-file'
flag (mutually exclusive). Content of YAML file
Expand All @@ -218,6 +199,28 @@ Flags:
--receive.tenant-label-name="tenant_id"
Label name through which the tenant will be
announced.
--receive.tenant-limits.max-head-series=RECEIVE.TENANT-LIMITS.MAX-HEAD-SERIES
The total number of active or HEAD series that
a tenant is allowed to have within a Receive
topology.
--receive.tenant-limits.meta-monitoring-client=<content>
Alternative to
'receive.tenant-limits.meta-monitoring-client-file'
flag (mutually exclusive). Content of YAML file
or string with http client configs for
meta-monitoring.
--receive.tenant-limits.meta-monitoring-client-file=<file-path>
Path to YAML file or string with http client
configs for meta-monitoring.
--receive.tenant-limits.meta-monitoring-query="sum(prometheus_tsdb_head_series) by (tenant)"
PromQL Query to execute against
meta-monitoring, to get the current number of
active series for each tenant, across Receive
replicas.
--receive.tenant-limits.meta-monitoring-url=http://localhost:9090
Meta-monitoring URL which is compatible with
Prometheus Query API for active series
limiting.
--remote-write.address="0.0.0.0:19291"
Address to listen on for remote write requests.
--remote-write.client-server-name=""
Expand Down
24 changes: 12 additions & 12 deletions pkg/receive/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ type Handler struct {
replications *prometheus.CounterVec
replicationFactor prometheus.Gauge
configuredTenantLimit prometheus.Gauge
aboveLimit *prometheus.GaugeVec
limitedRequests *prometheus.CounterVec
metaMonitoringErr *prometheus.CounterVec

writeSamplesTotal *prometheus.HistogramVec
writeTimeseriesTotal *prometheus.HistogramVec
Expand Down Expand Up @@ -178,20 +178,20 @@ func NewHandler(logger log.Logger, o *Options) *Handler {
),
configuredTenantLimit: promauto.With(registerer).NewGauge(
prometheus.GaugeOpts{
Name: "thanos_receive_tenant_active_series_limit",
Help: "The configured limit for active series of tenants.",
Name: "thanos_receive_tenant_head_series_limit",
Help: "The configured limit for active or HEAD series of tenants.",
},
),
aboveLimit: promauto.With(registerer).NewGaugeVec(
prometheus.GaugeOpts{
Name: "thanos_receive_series_above_limit",
Help: "The difference between current number of active series and set limit.",
limitedRequests: promauto.With(registerer).NewCounterVec(
prometheus.CounterOpts{
Name: "thanos_receive_head_series_limited_requests_total",
Help: "The total number of remote write requests that have been dropped due to head series limiting.",
}, []string{"tenant"},
),
limitedRequests: promauto.With(registerer).NewCounterVec(
metaMonitoringErr: promauto.With(registerer).NewCounterVec(
prometheus.CounterOpts{
Name: "thanos_receive_limited_requests_total",
Help: "The total number of remote write requests that have been dropped due to limiting.",
Name: "thanos_receive_metamonitoring_failed_queries_total",
Help: "The total number of meta-monitoring queries that failed while limiting.",
}, []string{"tenant"},
),
writeTimeseriesTotal: promauto.With(registerer).NewHistogramVec(
Expand Down Expand Up @@ -556,6 +556,7 @@ func (h *Handler) isUnderLimit(ctx context.Context, tenant string, logger log.Lo

vectorRes, _, err := c.QueryInstant(ctx, h.options.MetaMonitoringUrl, h.options.MetaMonitoringLimitQuery, time.Now(), promclient.QueryOptions{})
if err != nil {
h.metaMonitoringErr.WithLabelValues(tenant).Inc()
return true, errors.Wrap(err, "failed to query meta-monitoring")
}

Expand All @@ -570,9 +571,8 @@ func (h *Handler) isUnderLimit(ctx context.Context, tenant string, logger log.Lo
for k, v := range e.Metric {
// Search for metric which has tenant label for a particular tenant.
if k == "tenant" && string(v) == tenant {
h.aboveLimit.WithLabelValues(tenant).Set(float64(e.Value) - float64(h.options.MaxPerTenantLimit))
if float64(e.Value) >= float64(h.options.MaxPerTenantLimit) {
level.Error(logger).Log("msg", "tenant above limit", "currentSeries", float64(e.Value))
level.Error(logger).Log("msg", "tenant above limit", "currentSeries", float64(e.Value), "limit", h.options.MaxPerTenantLimit)
h.limitedRequests.WithLabelValues(tenant).Inc()
return false, nil
}
Expand Down
6 changes: 3 additions & 3 deletions test/e2e/e2ethanos/services.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,10 +489,10 @@ func (r *ReceiveBuilder) Init() e2e.InstrumentedRunnable {
}

if r.limit != 0 && r.metamonitoring != "" {
args["--receive.per-tenant-limit"] = fmt.Sprintf("%v", r.limit)
args["--receive.limit-meta-monitoring.url"] = r.metamonitoring
args["--receive.tenant-limits.max-head-series"] = fmt.Sprintf("%v", r.limit)
args["--receive.tenant-limits.meta-monitoring-url"] = r.metamonitoring
if r.metamonitoringQuery != "" {
args["--receive.limit-meta-monitoring.query"] = r.metamonitoringQuery
args["--receive.tenant-limits.meta-monitoring-query"] = r.metamonitoringQuery
}
}

Expand Down
37 changes: 37 additions & 0 deletions test/e2e/query_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,43 @@ func instantQuery(t testing.TB, ctx context.Context, addr string, q func() strin
return result
}

// queryWaitAndAssert repeatedly executes the instant query q against the Query
// API at addr (retrying every 5s until ctx is done) and waits until the result
// — sorted and with timestamps zeroed — exactly equals expected, then asserts
// that equality. Warnings or a result-size mismatch cause a retry.
func queryWaitAndAssert(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected model.Vector) {
	t.Helper()

	fmt.Println("queryWaitAndAssert: Waiting for", len(expected), "results for query", q())
	var result model.Vector

	logger := log.NewLogfmtLogger(os.Stdout)
	logger = log.With(logger, "ts", log.DefaultTimestampUTC)
	testutil.Ok(t, runutil.RetryWithLog(logger, 5*time.Second, ctx.Done(), func() error {
		res, warnings, err := promclient.NewDefaultClient().QueryInstant(ctx, urlParse(t, "http://"+addr), q(), ts(), opts)
		if err != nil {
			return err
		}

		if len(warnings) > 0 {
			return errors.Errorf("unexpected warnings %s", warnings)
		}

		if len(res) != len(expected) {
			return errors.Errorf("unexpected result size, expected %d; result %d: %v", len(expected), len(res), res)
		}
		result = res
		sortResults(result)
		// Timestamps vary between retries and are irrelevant for the comparison.
		for _, r := range result {
			r.Timestamp = 0
		}

		// Retry until the result matches the expected vector exactly.
		if reflect.DeepEqual(expected, result) {
			return nil
		}
		// Include both vectors so the retry log shows what is still different.
		return errors.Errorf("unexpected result: expected %v; got %v", expected, result)
	}))

	testutil.Equals(t, expected, result)
}

func queryAndAssertSeries(t *testing.T, ctx context.Context, addr string, q func() string, ts func() time.Time, opts promclient.QueryOptions, expected []model.Metric) {
t.Helper()

Expand Down
Loading

0 comments on commit b647b54

Please sign in to comment.