Skip to content

Commit

Permalink
Move rules deletion from Purger to Ruler API. (cortexproject#3899)
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>
  • Loading branch information
pstibrany authored and harry671003 committed Mar 11, 2021
1 parent 8a132fe commit e1488b6
Show file tree
Hide file tree
Showing 13 changed files with 152 additions and 204 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
* `cortex_ha_tracker_replicas_cleanup_marked_for_deletion_total`
* `cortex_ha_tracker_replicas_cleanup_deleted_total`
* `cortex_ha_tracker_replicas_cleanup_delete_failed_total`
* [ENHANCEMENT] Tenant deletion endpoints now support deletion of ruler groups. This only works when using rule store that supports deletion. #3750
* [ENHANCEMENT] Ruler now has a new API endpoint `/ruler/delete_tenant_config` that can be used to delete all rule groups for a tenant. It is intended to be used by administrators who wish to clean up state after a user has been removed. Note that this endpoint is enabled regardless of `-experimental.ruler.enable-api`. #3750 #3899
* [ENHANCEMENT] Query-frontend, query-scheduler: cleanup metrics for inactive tenants. #3826
* [ENHANCEMENT] Distributor: Prevent failed ingestion from affecting rate limiting. #3825
* [ENHANCEMENT] Blocks storage: added `-blocks-storage.s3.region` support to S3 client configuration. #3811
Expand Down
13 changes: 13 additions & 0 deletions docs/api/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ For the sake of clarity, in this document we have grouped API endpoints by servi
| [Set rule group](#set-rule-group) | Ruler | `POST /api/v1/rules/{namespace}` |
| [Delete rule group](#delete-rule-group) | Ruler | `DELETE /api/v1/rules/{namespace}/{groupName}` |
| [Delete namespace](#delete-namespace) | Ruler | `DELETE /api/v1/rules/{namespace}` |
| [Delete tenant configuration](#delete-tenant-configuration) | Ruler | `POST /ruler/delete_tenant_config` |
| [Alertmanager status](#alertmanager-status) | Alertmanager | `GET /multitenant_alertmanager/status` |
| [Alertmanager ring status](#alertmanager-ring-status) | Alertmanager | `GET /multitenant_alertmanager/ring` |
| [Alertmanager UI](#alertmanager-ui) | Alertmanager | `GET /<alertmanager-http-prefix>` |
Expand Down Expand Up @@ -636,6 +637,18 @@ _This experimental endpoint is disabled by default and can be enabled via the `-

_Requires [authentication](#authentication)._

### Delete tenant configuration

```
POST /ruler/delete_tenant_config
```

This deletes all rule groups for the tenant and returns `200` on success. Calling the endpoint when no rule groups exist for the tenant also returns `200`. Authentication is used only to identify the tenant.

This is intended as an internal API and should not be exposed to users. This endpoint is enabled regardless of whether `-experimental.ruler.enable-api` is enabled or not.

_Requires [authentication](#authentication)._

## Alertmanager

### Alertmanager status
Expand Down
3 changes: 3 additions & 0 deletions pkg/api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,9 @@ func (a *API) RegisterRuler(r *ruler.Ruler) {
a.indexPage.AddLink(SectionAdminEndpoints, "/ruler/ring", "Ruler Ring Status")
a.RegisterRoute("/ruler/ring", r, false, "GET", "POST")

// Administrative API, uses authentication to inform which user's configuration to delete.
a.RegisterRoute("/ruler/delete_tenant_config", http.HandlerFunc(r.DeleteTenantConfiguration), true, "POST")

// Legacy Ring Route
a.RegisterRoute("/ruler_ring", r, false, "GET", "POST")

Expand Down
63 changes: 8 additions & 55 deletions pkg/chunk/purger/tenant_deletion_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (
"github.com/prometheus/client_golang/prometheus"
"github.com/thanos-io/thanos/pkg/objstore"

"github.com/cortexproject/cortex/pkg/ruler/rulestore"
"github.com/cortexproject/cortex/pkg/storage/bucket"
cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb"
"github.com/cortexproject/cortex/pkg/tenant"
Expand All @@ -22,24 +21,22 @@ import (

type TenantDeletionAPI struct {
bucketClient objstore.Bucket
ruleStore rulestore.RuleStore
logger log.Logger
cfgProvider bucket.TenantConfigProvider
}

func NewTenantDeletionAPI(storageCfg cortex_tsdb.BlocksStorageConfig, cfgProvider bucket.TenantConfigProvider, ruleStore rulestore.RuleStore, logger log.Logger, reg prometheus.Registerer) (*TenantDeletionAPI, error) {
func NewTenantDeletionAPI(storageCfg cortex_tsdb.BlocksStorageConfig, cfgProvider bucket.TenantConfigProvider, logger log.Logger, reg prometheus.Registerer) (*TenantDeletionAPI, error) {
bucketClient, err := createBucketClient(storageCfg, logger, reg)
if err != nil {
return nil, err
}

return newTenantDeletionAPI(bucketClient, cfgProvider, ruleStore, logger), nil
return newTenantDeletionAPI(bucketClient, cfgProvider, logger), nil
}

func newTenantDeletionAPI(bkt objstore.Bucket, cfgProvider bucket.TenantConfigProvider, ruleStore rulestore.RuleStore, logger log.Logger) *TenantDeletionAPI {
func newTenantDeletionAPI(bkt objstore.Bucket, cfgProvider bucket.TenantConfigProvider, logger log.Logger) *TenantDeletionAPI {
return &TenantDeletionAPI{
bucketClient: bkt,
ruleStore: ruleStore,
cfgProvider: cfgProvider,
logger: logger,
}
Expand All @@ -49,7 +46,9 @@ func (api *TenantDeletionAPI) DeleteTenant(w http.ResponseWriter, r *http.Reques
ctx := r.Context()
userID, err := tenant.TenantID(ctx)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
// When Cortex is running, it uses Auth Middleware for checking X-Scope-OrgID and injecting tenant into context.
// Auth Middleware sends http.StatusUnauthorized if X-Scope-OrgID is missing, so we do too here, for consistency.
http.Error(w, err.Error(), http.StatusUnauthorized)
return
}

Expand All @@ -63,38 +62,12 @@ func (api *TenantDeletionAPI) DeleteTenant(w http.ResponseWriter, r *http.Reques

level.Info(api.logger).Log("msg", "tenant deletion mark in blocks storage created", "user", userID)

if api.ruleStore != nil {
err := api.deleteRules(r.Context(), userID)
if err != nil {
level.Error(api.logger).Log("msg", "failed to delete tenant rule groups", "user", userID, "err", err)
http.Error(w, errors.Wrapf(err, "failed to delete tenant rule groups").Error(), http.StatusInternalServerError)
return
}
}

w.WriteHeader(http.StatusOK)
}

// deleteRules removes all rule groups stored for userID from the configured
// rule store.
//
// A rule store that does not support modifications is not an error: a warning
// is logged and nil is returned. rulestore.ErrGroupNamespaceNotFound from the
// store is likewise treated as success. Any other store error is returned
// unchanged.
func (api *TenantDeletionAPI) deleteRules(ctx context.Context, userID string) error {
	store := api.ruleStore

	if !store.SupportsModifications() {
		level.Warn(api.logger).Log("msg", "cannot delete tenant rule groups, using read-only rule store", "user", userID)
		return nil
	}

	// An empty namespace means "delete all rule groups" of the user.
	if err := store.DeleteNamespace(ctx, userID, ""); err != nil && !errors.Is(err, rulestore.ErrGroupNamespaceNotFound) {
		return err
	}

	level.Info(api.logger).Log("msg", "deleted all tenant rule groups", "user", userID)
	return nil
}

type DeleteTenantStatusResponse struct {
TenantID string `json:"tenant_id"`
BlocksDeleted bool `json:"blocks_deleted"`
RuleGroupsDeleted bool `json:"rule_groups_deleted"`
AlertManagerConfigDeleted bool `json:"alert_manager_config_deleted,omitempty"` // Not yet supported.
TenantID string `json:"tenant_id"`
BlocksDeleted bool `json:"blocks_deleted"`
}

func (api *TenantDeletionAPI) DeleteTenantStatus(w http.ResponseWriter, r *http.Request) {
Expand All @@ -113,29 +86,9 @@ func (api *TenantDeletionAPI) DeleteTenantStatus(w http.ResponseWriter, r *http.
return
}

result.RuleGroupsDeleted, err = api.isRulesForUserDeleted(ctx, userID)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}

util.WriteJSONResponse(w, result)
}

// isRulesForUserDeleted reports whether the rule store holds no rule groups
// for userID.
//
// It returns false (and no error) when the API has no rule store, because in
// that case nothing can be said about the rules. A listing failure is returned
// wrapped with additional context.
func (api *TenantDeletionAPI) isRulesForUserDeleted(ctx context.Context, userID string) (bool, error) {
	store := api.ruleStore
	if store == nil {
		// Without access to the rule store we cannot claim the rules are gone.
		return false, nil
	}

	groups, listErr := store.ListRuleGroupsForUserAndNamespace(ctx, userID, "")
	switch {
	case listErr != nil:
		return false, errors.Wrap(listErr, "failed to list rule groups for tenant")
	case len(groups) > 0:
		return false, nil
	default:
		return true, nil
	}
}

func (api *TenantDeletionAPI) isBlocksForUserDeleted(ctx context.Context, userID string) (bool, error) {
var errBlockFound = errors.New("block found")

Expand Down
123 changes: 3 additions & 120 deletions pkg/chunk/purger/tenant_deletion_api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,27 @@ package purger
import (
"bytes"
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"path"
"testing"

"github.com/go-kit/kit/log"
"github.com/prometheus/prometheus/pkg/rulefmt"
"github.com/stretchr/testify/require"
"github.com/thanos-io/thanos/pkg/objstore"
"github.com/weaveworks/common/user"

"github.com/cortexproject/cortex/pkg/chunk"
"github.com/cortexproject/cortex/pkg/ruler/rulespb"
"github.com/cortexproject/cortex/pkg/ruler/rulestore"
"github.com/cortexproject/cortex/pkg/ruler/rulestore/objectclient"
"github.com/cortexproject/cortex/pkg/storage/tsdb"
)

func TestDeleteTenant(t *testing.T) {
bkt := objstore.NewInMemBucket()
api := newTenantDeletionAPI(bkt, nil, nil, log.NewNopLogger())
api := newTenantDeletionAPI(bkt, nil, log.NewNopLogger())

{
resp := httptest.NewRecorder()
api.DeleteTenant(resp, &http.Request{})
require.Equal(t, http.StatusBadRequest, resp.Code)
require.Equal(t, http.StatusUnauthorized, resp.Code)
}

{
Expand Down Expand Up @@ -86,122 +80,11 @@ func TestDeleteTenantStatus(t *testing.T) {
require.NoError(t, bkt.Upload(context.Background(), objName, bytes.NewReader(data)))
}

api := newTenantDeletionAPI(bkt, nil, nil, log.NewNopLogger())
api := newTenantDeletionAPI(bkt, nil, log.NewNopLogger())

res, err := api.isBlocksForUserDeleted(context.Background(), username)
require.NoError(t, err)
require.Equal(t, tc.expectedBlocksDeleted, res)
})
}
}

func TestDeleteTenantRuleGroups(t *testing.T) {
ruleGroups := []ruleGroupKey{
{user: "userA", namespace: "namespace", group: "group"},
{user: "userB", namespace: "namespace1", group: "group"},
{user: "userB", namespace: "namespace2", group: "group"},
}

obj, rs := setupRuleGroupsStore(t, ruleGroups)
require.Equal(t, 3, obj.GetObjectCount())

api := newTenantDeletionAPI(objstore.NewInMemBucket(), nil, rs, log.NewNopLogger())

{
callDeleteTenantAPI(t, api, "user-with-no-rule-groups")
require.Equal(t, 3, obj.GetObjectCount())

verifyExpectedDeletedRuleGroupsForUser(t, api, "user-with-no-rule-groups", true) // Has no rule groups
verifyExpectedDeletedRuleGroupsForUser(t, api, "userA", false)
verifyExpectedDeletedRuleGroupsForUser(t, api, "userB", false)
}

{
callDeleteTenantAPI(t, api, "userA")
require.Equal(t, 2, obj.GetObjectCount())

verifyExpectedDeletedRuleGroupsForUser(t, api, "user-with-no-rule-groups", true) // Has no rule groups
verifyExpectedDeletedRuleGroupsForUser(t, api, "userA", true) // Just deleted.
verifyExpectedDeletedRuleGroupsForUser(t, api, "userB", false)
}

{
callDeleteTenantAPI(t, api, "userB")
require.Equal(t, 0, obj.GetObjectCount())

verifyExpectedDeletedRuleGroupsForUser(t, api, "user-with-no-rule-groups", true) // Has no rule groups
verifyExpectedDeletedRuleGroupsForUser(t, api, "userA", true) // Deleted previously
verifyExpectedDeletedRuleGroupsForUser(t, api, "userB", true) // Just deleted
}
}

func TestDeleteTenantRuleGroupsWithReadOnlyStore(t *testing.T) {
ruleGroups := []ruleGroupKey{
{user: "userA", namespace: "namespace", group: "group"},
{user: "userB", namespace: "namespace1", group: "group"},
{user: "userB", namespace: "namespace2", group: "group"},
}

obj, rs := setupRuleGroupsStore(t, ruleGroups)
require.Equal(t, 3, obj.GetObjectCount())

rs = &readOnlyRuleStore{RuleStore: rs}

api := newTenantDeletionAPI(objstore.NewInMemBucket(), nil, rs, log.NewNopLogger())

// Make sure there is no error reported.
callDeleteTenantAPI(t, api, "userA")
require.Equal(t, 3, obj.GetObjectCount())

verifyExpectedDeletedRuleGroupsForUser(t, api, "userA", false) // Cannot delete from read-only store.
verifyExpectedDeletedRuleGroupsForUser(t, api, "userB", false)
}

// callDeleteTenantAPI invokes api.DeleteTenant with a request whose context
// carries userID as the org ID, and requires an http.StatusOK response.
func callDeleteTenantAPI(t *testing.T, api *TenantDeletionAPI, userID string) {
	recorder := httptest.NewRecorder()
	request := (&http.Request{}).WithContext(user.InjectOrgID(context.Background(), userID))

	api.DeleteTenant(recorder, request)

	require.Equal(t, http.StatusOK, recorder.Code)
}

// verifyExpectedDeletedRuleGroupsForUser calls api.DeleteTenantStatus on
// behalf of userID, requires an http.StatusOK response, and requires the
// RuleGroupsDeleted field of the decoded JSON body to equal expected.
func verifyExpectedDeletedRuleGroupsForUser(t *testing.T, api *TenantDeletionAPI, userID string, expected bool) {
	recorder := httptest.NewRecorder()
	request := (&http.Request{}).WithContext(user.InjectOrgID(context.Background(), userID))

	api.DeleteTenantStatus(recorder, request)
	require.Equal(t, http.StatusOK, recorder.Code)

	var status DeleteTenantStatusResponse
	require.NoError(t, json.Unmarshal(recorder.Body.Bytes(), &status))
	require.Equal(t, expected, status.RuleGroupsDeleted)
}

// setupRuleGroupsStore creates a rule store on top of a fresh mock storage
// and stores one (otherwise empty) rule group for every key in ruleGroups.
// It returns both the mock storage and the rule store, so that callers can
// inspect the underlying objects.
func setupRuleGroupsStore(t *testing.T, ruleGroups []ruleGroupKey) (*chunk.MockStorage, rulestore.RuleStore) {
	storage := chunk.NewMockStorage()
	store := objectclient.NewRuleStore(storage, 5, log.NewNopLogger())

	// "Upload" the rule groups.
	for i := range ruleGroups {
		key := ruleGroups[i]
		group := rulespb.ToProto(key.user, key.namespace, rulefmt.RuleGroup{Name: key.group})
		require.NoError(t, store.SetRuleGroup(context.Background(), key.user, key.namespace, group))
	}

	return storage, store
}

// ruleGroupKey identifies a single rule group used as a test fixture.
type ruleGroupKey struct {
	user      string // tenant owning the rule group
	namespace string // namespace the rule group is stored under
	group     string // name of the rule group
}

// readOnlyRuleStore wraps a rulestore.RuleStore and overrides
// SupportsModifications so that the store reports itself as read-only.
// All other methods come from the embedded store.
type readOnlyRuleStore struct {
	rulestore.RuleStore
}

// SupportsModifications always returns false.
func (*readOnlyRuleStore) SupportsModifications() bool { return false }
11 changes: 2 additions & 9 deletions pkg/cortex/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -639,13 +639,6 @@ func (t *Cortex) initRulerStorage() (serv services.Service, err error) {
return
}

// Purger didn't use ruler storage before, but now it does. However empty configuration just causes error,
// so to preserve previous purger behaviour, we simply disable it.
if t.Cfg.isModuleEnabled(Purger) && t.Cfg.Ruler.StoreConfig.IsDefaults() {
level.Info(util_log.Logger).Log("msg", "Ruler storage is not configured. If you want to use tenant deletion API and delete rule groups, please configure ruler storage.")
return
}

if !t.Cfg.Ruler.StoreConfig.IsDefaults() {
t.RulerStorage, err = ruler.NewLegacyRuleStore(t.Cfg.Ruler.StoreConfig, rules.FileLoader{}, util_log.Logger)
} else {
Expand Down Expand Up @@ -810,7 +803,7 @@ func (t *Cortex) initTenantDeletionAPI() (services.Service, error) {
}

// t.RulerStorage can be nil when running in single-binary mode, and rule storage is not configured.
tenantDeletionAPI, err := purger.NewTenantDeletionAPI(t.Cfg.BlocksStorage, t.Overrides, t.RulerStorage, util_log.Logger, prometheus.DefaultRegisterer)
tenantDeletionAPI, err := purger.NewTenantDeletionAPI(t.Cfg.BlocksStorage, t.Overrides, util_log.Logger, prometheus.DefaultRegisterer)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -895,7 +888,7 @@ func (t *Cortex) setupModuleManager() error {
Compactor: {API, MemberlistKV, Overrides},
StoreGateway: {API, Overrides, MemberlistKV},
ChunksPurger: {Store, DeleteRequestsStore, API},
TenantDeletion: {Store, API, Overrides, RulerStorage},
TenantDeletion: {Store, API, Overrides},
Purger: {ChunksPurger, TenantDeletion},
TenantFederation: {Queryable},
All: {QueryFrontend, Querier, Ingester, Distributor, TableManager, Purger, StoreGateway, Ruler},
Expand Down
22 changes: 22 additions & 0 deletions pkg/ruler/ruler.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/cortexproject/cortex/pkg/util"
"github.com/cortexproject/cortex/pkg/util/flagext"
"github.com/cortexproject/cortex/pkg/util/grpcclient"
util_log "github.com/cortexproject/cortex/pkg/util/log"
"github.com/cortexproject/cortex/pkg/util/services"
"github.com/cortexproject/cortex/pkg/util/validation"
)
Expand Down Expand Up @@ -771,3 +772,24 @@ func (r *Ruler) AssertMaxRulesPerRuleGroup(userID string, rules int) error {
}
return fmt.Errorf(errMaxRulesPerRuleGroupPerUserLimitExceeded, limit, rules)
}

// DeleteTenantConfiguration is an HTTP handler that deletes all rule groups of
// the tenant found in the request context.
//
// It responds with http.StatusUnauthorized when no tenant ID can be resolved
// from the context, and with http.StatusOK when the deletion succeeds or the
// store reports rulestore.ErrGroupNamespaceNotFound. Any other store error is
// reported through respondError.
func (r *Ruler) DeleteTenantConfiguration(w http.ResponseWriter, req *http.Request) {
	ctx := req.Context()
	logger := util_log.WithContext(ctx, r.logger)

	userID, err := tenant.TenantID(ctx)
	if err != nil {
		// When Cortex is running, the Auth Middleware checks X-Scope-OrgID and injects the tenant into the context.
		// It sends http.StatusUnauthorized if X-Scope-OrgID is missing, so the same status is used here for consistency.
		http.Error(w, err.Error(), http.StatusUnauthorized)
		return
	}

	// An empty namespace means "delete all rule groups" of the tenant.
	deleteErr := r.store.DeleteNamespace(ctx, userID, "")
	if deleteErr != nil && !errors.Is(deleteErr, rulestore.ErrGroupNamespaceNotFound) {
		respondError(logger, w, deleteErr.Error())
		return
	}

	level.Info(logger).Log("msg", "deleted all tenant rule groups", "user", userID)
	w.WriteHeader(http.StatusOK)
}
Loading

0 comments on commit e1488b6

Please sign in to comment.