diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml index ace56ce4c7..6f60778e7c 100644 --- a/.github/workflows/test-build-deploy.yml +++ b/.github/workflows/test-build-deploy.yml @@ -45,6 +45,8 @@ jobs: run: make BUILD_IN_CONTAINER=false check-doc - name: Check Mixin run: make BUILD_IN_CONTAINER=false check-mixin + - name: Check Jsonnet Manifests + run: make BUILD_IN_CONTAINER=false check-jsonnet-manifests - name: Check White Noise. run: make BUILD_IN_CONTAINER=false check-white-noise - name: Check License Header diff --git a/CHANGELOG.md b/CHANGELOG.md index a8b40d3efa..e082296211 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -180,19 +180,120 @@ * [BUGFIX] Distributor: fix bug in query-exemplar where some results would get dropped. #583 * [BUGFIX] Azure storage: only create HTTP client once, to reduce memory utilization. #605 -Mixin: - +### Mixin (changes since `grafana/cortex-jsonnet` `1.9.0`) + +* [CHANGE] Update grafana-builder dependency: use $__rate_interval in qpsPanel and latencyPanel. [#372](https://github.com/grafana/cortex-jsonnet/pull/372) +* [CHANGE] `namespace` template variable in dashboards now only selects namespaces for selected clusters. [#311](https://github.com/grafana/cortex-jsonnet/pull/311) +* [CHANGE] `CortexIngesterRestarts` alert severity changed from `critical` to `warning`. [#321](https://github.com/grafana/cortex-jsonnet/pull/321) +* [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. [#319](https://github.com/grafana/cortex-jsonnet/pull/319) +* [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. [#319](https://github.com/grafana/cortex-jsonnet/pull/319) +* [CHANGE] Renamed `CortexCompactorHasNotUploadedBlocksSinceStart` to `CortexCompactorHasNotUploadedBlocks`. [#334](https://github.com/grafana/cortex-jsonnet/pull/334) +* [CHANGE] Renamed `CortexCompactorRunFailed` to `CortexCompactorHasNotSuccessfullyRunCompaction`. [#334](https://github.com/grafana/cortex-jsonnet/pull/334) +* [CHANGE] Renamed `CortexInconsistentConfig` alert to `CortexInconsistentRuntimeConfig` and increased severity to `critical`. [#335](https://github.com/grafana/cortex-jsonnet/pull/335) +* [CHANGE] Increased `CortexBadRuntimeConfig` alert severity to `critical` and removed support for `cortex_overrides_last_reload_successful` metric (was removed in Cortex 1.3.0). [#335](https://github.com/grafana/cortex-jsonnet/pull/335) +* [CHANGE] Grafana 'min step' changed to 15s so dashboard show better detail. [#340](https://github.com/grafana/cortex-jsonnet/pull/340) +* [CHANGE] Replace `CortexRulerFailedEvaluations` with two new alerts: `CortexRulerTooManyFailedPushes` and `CortexRulerTooManyFailedQueries`. [#347](https://github.com/grafana/cortex-jsonnet/pull/347) +* [CHANGE] Removed `CortexCacheRequestErrors` alert. This alert was not working because the legacy Cortex cache client instrumentation doesn't track errors. [#346](https://github.com/grafana/cortex-jsonnet/pull/346) +* [CHANGE] Removed `CortexQuerierCapacityFull` alert. 
[#342](https://github.com/grafana/cortex-jsonnet/pull/342) +* [CHANGE] Changes blocks storage alerts to group metrics by the configured `cluster_labels` (supporting the deprecated `alert_aggregation_labels`). [#351](https://github.com/grafana/cortex-jsonnet/pull/351) +* [CHANGE] Increased `CortexIngesterReachingSeriesLimit` critical alert threshold from 80% to 85%. [#363](https://github.com/grafana/cortex-jsonnet/pull/363) +* [CHANGE] Changed default `job_names` for query-frontend, query-scheduler and querier to match custom deployments too. [#376](https://github.com/grafana/cortex-jsonnet/pull/376) +* [CHANGE] Split `cortex_api` recording rule group into three groups. This is a workaround for large clusters where this group can become slow to evaluate. [#401](https://github.com/grafana/cortex-jsonnet/pull/401) +* [CHANGE] Increased `CortexIngesterReachingSeriesLimit` warning threshold from 70% to 80% and critical threshold from 85% to 90%. [#404](https://github.com/grafana/cortex-jsonnet/pull/404) * [CHANGE] Raised `CortexKVStoreFailure` alert severity from warning to critical. #493 * [CHANGE] Increase `CortexRolloutStuck` alert "for" duration from 15m to 30m. #493 #573 +* [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. [#317](https://github.com/grafana/cortex-jsonnet/pull/317) +* [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. [#328](https://github.com/grafana/cortex-jsonnet/pull/328) +* [ENHANCEMENT] Ruler dashboard: added object storage metrics. [#354](https://github.com/grafana/cortex-jsonnet/pull/354) +* [ENHANCEMENT] Alertmanager dashboard: added object storage metrics. [#354](https://github.com/grafana/cortex-jsonnet/pull/354) +* [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. [#324](https://github.com/grafana/cortex-jsonnet/pull/324) +* [ENHANCEMENT] Dashboards: defined container functions for common resources panels: containerDiskWritesPanel, containerDiskReadsPanel, containerDiskSpaceUtilization. [#331](https://github.com/grafana/cortex-jsonnet/pull/331) +* [ENHANCEMENT] cortex-mixin: Added `alert_excluded_routes` config to exclude specific routes from alerts. [#338](https://github.com/grafana/cortex-jsonnet/pull/338) +* [ENHANCEMENT] Added `CortexMemcachedRequestErrors` alert. [#346](https://github.com/grafana/cortex-jsonnet/pull/346) +* [ENHANCEMENT] Ruler dashboard: added "Per route p99 latency" panel in the "Configuration API" row. [#353](https://github.com/grafana/cortex-jsonnet/pull/353) +* [ENHANCEMENT] Increased the `for` duration of the `CortexIngesterReachingSeriesLimit` warning alert to 3h. [#362](https://github.com/grafana/cortex-jsonnet/pull/362) +* [ENHANCEMENT] Added a new tier (`medium_small_user`) so we have another tier between 100K and 1Mil active series. [#364](https://github.com/grafana/cortex-jsonnet/pull/364) +* [ENHANCEMENT] Extend Alertmanager dashboard: [#313](https://github.com/grafana/cortex-jsonnet/pull/313) + * "Tenants" stat panel - shows number of discovered tenant configurations. + * "Replication" row - information about the replication of tenants/alerts/silences over instances. + * "Tenant Configuration Sync" row - information about the configuration sync procedure. + * "Sharding Initial State Sync" row - information about the initial state sync procedure when sharding is enabled. 
+ * "Sharding Runtime State Sync" row - information about various state operations which occur when sharding is enabled (replication, fetch, marge, persist). +* [ENHANCEMENT] Update gsutil command for `not healthy index found` playbook [#370](https://github.com/grafana/cortex-jsonnet/pull/370) +* [ENHANCEMENT] Added Alertmanager alerts and playbooks covering configuration syncs and sharding operation: [#377 [#378](https://github.com/grafana/cortex-jsonnet/pull/378) + * `CortexAlertmanagerSyncConfigsFailing` + * `CortexAlertmanagerRingCheckFailing` + * `CortexAlertmanagerPartialStateMergeFailing` + * `CortexAlertmanagerReplicationFailing` + * `CortexAlertmanagerPersistStateFailing` + * `CortexAlertmanagerInitialSyncFailed` +* [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. [#387](https://github.com/grafana/cortex-jsonnet/pull/387) +* [ENHANCEMENT] Add `CortexRolloutStuck` alert. [#405](https://github.com/grafana/cortex-jsonnet/pull/405) +* [ENHANCEMENT] Added `CortexKVStoreFailure` alert. [#406](https://github.com/grafana/cortex-jsonnet/pull/406) +* [ENHANCEMENT] Use configured `ruler` jobname for ruler dashboard panels. [#409](https://github.com/grafana/cortex-jsonnet/pull/409) +* [ENHANCEMENT] Add ability to override `datasource` for generated dashboards. [#407](https://github.com/grafana/cortex-jsonnet/pull/407) +* [ENHANCEMENT] Use alertmanager jobname for alertmanager dashboard panels [#411](https://github.com/grafana/cortex-jsonnet/pull/411) +* [ENHANCEMENT] Added `CortexDistributorReachingInflightPushRequestLimit` alert. [#408](https://github.com/grafana/cortex-jsonnet/pull/408) * [ENHANCEMENT] Added `CortexReachingTCPConnectionsLimit` alert. #403 * [ENHANCEMENT] Added "Cortex / Writes Networking" and "Cortex / Reads Networking" dashboards. #405 * [ENHANCEMENT] Improved "Queue length" panel in "Cortex / Queries" dashboard. #408 * [ENHANCEMENT] Add `CortexDistributorReachingInflightPushRequestLimit` alert and playbook. #401 * [ENHANCEMENT] Added "Recover accidentally deleted blocks (Google Cloud specific)" playbook. #475 * [ENHANCEMENT] Added support to multi-zone store-gateway deployments. #608 #615 +* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. [#308](https://github.com/grafana/cortex-jsonnet/pull/308) +* [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. [#335](https://github.com/grafana/cortex-jsonnet/pull/335) +* [BUGFIX] Fixed scaling dashboard to correctly work when a Cortex service deployment spans across multiple zones (a zone is expected to have the `zone-[a-z]` suffix). [#365](https://github.com/grafana/cortex-jsonnet/pull/365) +* [BUGFIX] Fixed rollout progress dashboard to correctly work when a Cortex service deployment spans across multiple zones (a zone is expected to have the `zone-[a-z]` suffix). [#366](https://github.com/grafana/cortex-jsonnet/pull/366) +* [BUGFIX] Fixed rollout progress dashboard to include query-scheduler too. [#376](https://github.com/grafana/cortex-jsonnet/pull/376) +* [BUGFIX] Upstream recording rule `node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate` renamed. [#379](https://github.com/grafana/cortex-jsonnet/pull/379) +* [BUGFIX] Fixed writes/reads/alertmanager resources dashboards to use `$._config.job_names.gateway`. 
[#403](https://github.com/grafana/cortex-jsonnet/pull/403) +* [BUGFIX] Span the annotation.message in alerts as YAML multiline strings. [#412](https://github.com/grafana/cortex-jsonnet/pull/412) * [BUGFIX] Fixed "Instant queries / sec" in "Cortex / Reads" dashboard. #445 * [BUGFIX] Fixed and added missing KV store panels in Writes, Reads, Ruler and Compactor dashboards. #448 +### Jsonnet (changes since `grafana/cortex-jsonnet` `1.9.0`) + +* [CHANGE] Store gateway: set `-blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency`, + `-blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency`, + `-blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency`, + `-blocks-storage.bucket-store.index-cache.memcached.max-idle-connections`, + `-blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections`, + `-blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections` to 100 [#414](https://github.com/grafana/cortex-jsonnet/pull/414) +* [CHANGE] Alertmanager: mounted overrides configmap to alertmanager too. [#315](https://github.com/grafana/cortex-jsonnet/pull/315) +* [CHANGE] Memcached: upgraded memcached from `1.5.17` to `1.6.9`. [#316](https://github.com/grafana/cortex-jsonnet/pull/316) +* [CHANGE] Store-gateway: increased memory request and limit respectively from 6GB / 6GB to 12GB / 18GB. [#322](https://github.com/grafana/cortex-jsonnet/pull/322) +* [CHANGE] Store-gateway: increased `-blocks-storage.bucket-store.max-chunk-pool-bytes` from 2GB (default) to 12GB. [#322](https://github.com/grafana/cortex-jsonnet/pull/322) +* [CHANGE] Ingester/Ruler: set `-server.grpc-max-send-msg-size-bytes` and `-server.grpc-max-send-msg-size-bytes` to sensible default values (10MB). [#326](https://github.com/grafana/cortex-jsonnet/pull/326) +* [CHANGE] Decreased `-server.grpc-max-concurrent-streams` from 100k to 10k. [#369](https://github.com/grafana/cortex-jsonnet/pull/369) +* [CHANGE] Decreased blocks storage ingesters graceful termination period from 80m to 20m. [#369](https://github.com/grafana/cortex-jsonnet/pull/369) +* [CHANGE] Increase the rules per group and rule groups limits on different tiers. [#396](https://github.com/grafana/cortex-jsonnet/pull/396) +* [CHANGE] Removed `max_samples_per_query` limit, since it only works with chunks and only when using `-distributor.shard-by-all-labels=false`. [#397](https://github.com/grafana/cortex-jsonnet/pull/397) +* [CHANGE] Removed chunks storage query sharding config support. The following config options have been removed: [#398](https://github.com/grafana/cortex-jsonnet/pull/398) + * `_config` > `queryFrontend` > `shard_factor` + * `_config` > `queryFrontend` > `sharded_queries_enabled` + * `_config` > `queryFrontend` > `query_split_factor` +* [CHANGE] Rename ruler_s3_bucket_name and ruler_gcs_bucket_name to ruler_storage_bucket_name: [#415](https://github.com/grafana/cortex-jsonnet/pull/415) +* [CHANGE] Fine-tuned rolling update policy for distributor, querier, query-frontend, query-scheduler. [#420](https://github.com/grafana/cortex-jsonnet/pull/420) +* [CHANGE] Increased memcached metadata/chunks/index-queries max connections from 4k to 16k. [#420](https://github.com/grafana/cortex-jsonnet/pull/420) +* [CHANGE] Disabled step alignment in query-frontend to be compliant with PromQL. [#420](https://github.com/grafana/cortex-jsonnet/pull/420) +* [CHANGE] Do not limit compactor CPU and request a number of cores equal to the configured concurrency. 
[#420](https://github.com/grafana/cortex-jsonnet/pull/420) +* [ENHANCEMENT] Add overrides config to compactor. This allows setting retention configs per user. [#386](https://github.com/grafana/cortex-jsonnet/pull/386) +* [ENHANCEMENT] Added 256MB memory ballast to querier. [#369](https://github.com/grafana/cortex-jsonnet/pull/369) +* [ENHANCEMENT] Update `etcd-operator` to latest version (see https://github.com/grafana/jsonnet-libs/pull/480). [#263](https://github.com/grafana/cortex-jsonnet/pull/263) +* [ENHANCEMENT] Add support for Azure storage in Alertmanager configuration. [#381](https://github.com/grafana/cortex-jsonnet/pull/381) +* [ENHANCEMENT] Add support for running Alertmanager in sharding mode. [#394](https://github.com/grafana/cortex-jsonnet/pull/394) +* [ENHANCEMENT] Allow to customize PromQL engine settings via `queryEngineConfig`. [#399](https://github.com/grafana/cortex-jsonnet/pull/399) +* [ENHANCEMENT] Define Azure object storage ruler args. [#416](https://github.com/grafana/cortex-jsonnet/pull/416) +* [ENHANCEMENT] Added the following config options to allow to schedule multiple replicas of the same service on the same node: [#418](https://github.com/grafana/cortex-jsonnet/pull/418) + * `cortex_distributor_allow_multiple_replicas_on_same_node` + * `cortex_ruler_allow_multiple_replicas_on_same_node` + * `cortex_querier_allow_multiple_replicas_on_same_node` + * `cortex_query_frontend_allow_multiple_replicas_on_same_node` +* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. [#329](https://github.com/grafana/cortex-jsonnet/pull/329) +* [BUGFIX] Fixed `-distributor.extend-writes` setting on ruler when `unregister_ingesters_on_shutdown` is disabled. [#369](https://github.com/grafana/cortex-jsonnet/pull/369) +* [BUGFIX] Treat `compactor_blocks_retention_period` type as string rather than int.[#395](https://github.com/grafana/cortex-jsonnet/pull/395) +* [BUGFIX] Pass `-ruler-storage.s3.endpoint` to ruler when using S3. [#421](https://github.com/grafana/cortex-jsonnet/pull/421) + ### Query-tee * [ENHANCEMENT] Added `/api/v1/query_exemplars` API endpoint support (no results comparison). #168 diff --git a/Makefile b/Makefile index 6aec0b99d5..4f9e2ee1d9 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # WARNING: do not commit to a repository! 
-include Makefile.local -.PHONY: all test test-with-race integration-tests cover clean images protos exes dist doc clean-doc check-doc push-multiarch-build-image license check-license format check-mixin check-mixin-jb check-mixin-mixtool checkin-mixin-playbook build-mixin format-mixin push-multiarch-mimir list-image-targets +.PHONY: all test test-with-race integration-tests cover clean images protos exes dist doc clean-doc check-doc push-multiarch-build-image license check-license format check-mixin check-mixin-jb check-mixin-mixtool checkin-mixin-playbook build-mixin format-mixin check-jsonnet-manifests format-jsonnet-manifests push-multiarch-mimir list-image-targets .DEFAULT_GOAL := all # Version number @@ -39,9 +39,12 @@ UPTODATE := .uptodate # path to jsonnetfmt JSONNET_FMT := jsonnetfmt -# path to the mimir/mixin +# path to the mimir-mixin MIXIN_PATH := operations/mimir-mixin +# path to the mimir jsonnet manifests +JSONNET_MANIFESTS_PATH := operations/mimir + .PHONY: image-tag image-tag: @echo $(IMAGE_TAG) @@ -369,7 +372,6 @@ check-white-noise: clean-white-noise check-mixin: format-mixin check-mixin-jb check-mixin-mixtool check-mixin-playbook @echo "Checking diff:" - git diff @git diff --exit-code -- $(MIXIN_PATH) || (echo "Please format mixin by running 'make format-mixin'" && false) @cd $(MIXIN_PATH) && \ @@ -396,6 +398,13 @@ build-mixin: check-mixin-jb format-mixin: @find $(MIXIN_PATH) -type f -name '*.libsonnet' -print -o -name '*.jsonnet' -print | xargs jsonnetfmt -i +check-jsonnet-manifests: format-jsonnet-manifests + @echo "Checking diff:" + @git diff --exit-code -- $(JSONNET_MANIFESTS_PATH) || (echo "Please format jsonnet manifests by running 'make format-jsonnet-manifests'" && false) + +format-jsonnet-manifests: + @find $(JSONNET_MANIFESTS_PATH) -type f -name '*.libsonnet' -print -o -name '*.jsonnet' -print | xargs jsonnetfmt -i + check-tsdb-blocks-storage-s3-docker-compose-yaml: cd development/tsdb-blocks-storage-s3 && make check diff --git a/operations/mimir/alertmanager.libsonnet b/operations/mimir/alertmanager.libsonnet new file mode 100644 index 0000000000..480112d382 --- /dev/null +++ b/operations/mimir/alertmanager.libsonnet @@ -0,0 +1,136 @@ +{ + local pvc = $.core.v1.persistentVolumeClaim, + local volumeMount = $.core.v1.volumeMount, + local volume = $.core.v1.volume, + local container = $.core.v1.container, + local statefulSet = $.apps.v1.statefulSet, + local service = $.core.v1.service, + local configMap = $.core.v1.configMap, + + // The Alertmanager has three operational modes. + local haType = if $._config.alertmanager.sharding_enabled then + 'sharding' + else if $._config.alertmanager.replicas > 1 then + 'gossip_multi_replica' + else + 'gossip_single_replica', + // mode represents which operational mode the alertmanager runs in. + // ports: array of container ports used for gossiping. + // args: arguments that are eventually converted to flags on the container + // flags: arguments directly added to the container. For legacy reasons, we need to use -- as a prefix for some flags. 
+ // service: the service definition + local mode = { + sharding: { + ports: [], + args: { + 'alertmanager.sharding-enabled': true, + 'alertmanager.sharding-ring.store': $._config.alertmanager.ring_store, + 'alertmanager.sharding-ring.consul.hostname': $._config.alertmanager.ring_hostname, + 'alertmanager.sharding-ring.replication-factor': $._config.alertmanager.ring_replication_factor, + }, + flags: [], + service: + $.util.serviceFor($.alertmanager_statefulset) + + service.mixin.spec.withClusterIp('None'), + }, + gossip_multi_replica: { + ports: [ + $.core.v1.containerPort.newUDP('gossip-udp', $._config.alertmanager.gossip_port), + $.core.v1.containerPort.new('gossip-tcp', $._config.alertmanager.gossip_port), + ], + args: {}, + flags: [ + '--alertmanager.cluster.listen-address=[$(POD_IP)]:%s' % $._config.alertmanager.gossip_port, + '--alertmanager.cluster.peers=%s' % std.join(',', peers), + ], + service: + $.util.serviceFor($.alertmanager_statefulset) + + service.mixin.spec.withClusterIp('None'), + }, + gossip_single_replica: { + ports: [], + args: {}, + flags: ['--alertmanager.cluster.listen-address=""'], + service: $.util.serviceFor($.alertmanager_statefulset), + }, + }[haType], + local hasFallbackConfig = std.length($._config.alertmanager.fallback_config) > 0, + local peers = [ + 'alertmanager-%d.alertmanager.%s.svc.%s.local:%s' % [i, $._config.namespace, $._config.cluster, $._config.alertmanager.gossip_port] + for i in std.range(0, $._config.alertmanager.replicas - 1) + ], + alertmanager_args:: + $._config.grpcConfig + + $._config.alertmanagerStorageClientConfig + + mode.args + + { + target: 'alertmanager', + 'log.level': 'debug', + 'runtime-config.file': '/etc/cortex/overrides.yaml', + 'experimental.alertmanager.enable-api': 'true', + 'alertmanager.storage.path': '/data', + 'alertmanager.web.external-url': '%s/alertmanager' % $._config.external_url, + } + + (if hasFallbackConfig then { + 'alertmanager.configs.fallback': '/configs/alertmanager_fallback_config.yaml', + } else {}), + + alertmanager_fallback_config_map: + if hasFallbackConfig then + configMap.new('alertmanager-fallback-config') + + configMap.withData({ + 'alertmanager_fallback_config.yaml': $.util.manifestYaml($._config.alertmanager.fallback_config), + }) + else {}, + + + alertmanager_pvc:: + if $._config.alertmanager_enabled then + pvc.new() + + pvc.mixin.metadata.withName('alertmanager-data') + + pvc.mixin.spec.withAccessModes('ReadWriteOnce') + + pvc.mixin.spec.resources.withRequests({ storage: '100Gi' }) + else {}, + + alertmanager_container:: + if $._config.alertmanager_enabled then + container.new('alertmanager', $._images.alertmanager) + + container.withPorts($.util.defaultPorts + mode.ports) + + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + + container.withArgsMixin( + $.util.mapToFlags($.alertmanager_args) + + mode.flags + ) + + container.withVolumeMountsMixin( + [volumeMount.new('alertmanager-data', '/data')] + + if hasFallbackConfig then + [volumeMount.new('alertmanager-fallback-config', '/configs')] + else [] + ) + + $.util.resourcesRequests('100m', '1Gi') + + $.util.readinessProbe + + $.jaeger_mixin + else {}, + + alertmanager_statefulset: + if $._config.alertmanager_enabled then + statefulSet.new('alertmanager', $._config.alertmanager.replicas, [$.alertmanager_container], $.alertmanager_pvc) + + statefulSet.mixin.spec.withServiceName('alertmanager') + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 
'alertmanager' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'alertmanager' }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'alertmanager' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + statefulSet.mixin.spec.template.spec.withVolumesMixin( + if hasFallbackConfig then + [volume.fromConfigMap('alertmanager-fallback-config', 'alertmanager-fallback-config')] + else [] + ) + else {}, + + alertmanager_service: + if $._config.alertmanager_enabled then mode.service else {}, +} diff --git a/operations/mimir/common.libsonnet b/operations/mimir/common.libsonnet new file mode 100644 index 0000000000..9daf7ee57e --- /dev/null +++ b/operations/mimir/common.libsonnet @@ -0,0 +1,21 @@ +{ + namespace: + $.core.v1.namespace.new($._config.namespace), + + util+:: { + local containerPort = $.core.v1.containerPort, + local container = $.core.v1.container, + + defaultPorts:: + [ + containerPort.newNamed(name='http-metrics', containerPort=80), + containerPort.newNamed(name='grpc', containerPort=9095), + ], + + readinessProbe:: + container.mixin.readinessProbe.httpGet.withPath('/ready') + + container.mixin.readinessProbe.httpGet.withPort(80) + + container.mixin.readinessProbe.withInitialDelaySeconds(15) + + container.mixin.readinessProbe.withTimeoutSeconds(1), + }, +} diff --git a/operations/mimir/config.libsonnet b/operations/mimir/config.libsonnet new file mode 100644 index 0000000000..7cf316b73e --- /dev/null +++ b/operations/mimir/config.libsonnet @@ -0,0 +1,538 @@ +{ + _config+: { + namespace: error 'must define namespace', + cluster: error 'must define cluster', + replication_factor: 3, + external_url: error 'must define external url for cluster', + + storage_backend: error 'must specify storage backend (cassandra, gcp, aws)', + table_prefix: $._config.namespace, + cassandra_addresses: error 'must specify cassandra addresses', + bigtable_instance: error 'must specify bigtable instance', + bigtable_project: error 'must specify bigtable project', + aws_region: error 'must specify AWS region', + s3_bucket_name: error 'must specify S3 bucket name', + + // If false, ingesters are not unregistered on shutdown and left in the ring with + // the LEAVING state. Setting to false prevents series resharding during ingesters rollouts, + // but requires to: + // 1. Either manually forget ingesters on scale down or invoke the /shutdown endpoint + // 2. Ensure ingester ID is preserved during rollouts + unregister_ingesters_on_shutdown: true, + + // Controls whether multiple pods for the same service can be scheduled on the same node. + cortex_distributor_allow_multiple_replicas_on_same_node: false, + cortex_ruler_allow_multiple_replicas_on_same_node: false, + cortex_querier_allow_multiple_replicas_on_same_node: false, + cortex_query_frontend_allow_multiple_replicas_on_same_node: false, + + // schema is used to generate the storage schema yaml file used by + // the Cortex chunks storage: + // - More information: https://github.com/cortexproject/cortex/pull/1072 + // - Blocks storage doesn't support / uses the schema config. 
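+    //
+    // A minimal illustrative entry (values taken from cortex-manifests.jsonnet.example in
+    // this diff; adjust store/object_store/prefixes for your environment):
+    //
+    //   schema: [{
+    //     from: '2019-11-15',
+    //     store: 'bigtable-hashed',
+    //     object_store: 'gcs',
+    //     schema: 'v10',
+    //     index: { prefix: 'dev_index_', period: '168h' },
+    //     chunks: { prefix: 'dev_chunks_', period: '168h' },
+    //   }],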
+ schema: if $._config.storage_engine != 'blocks' then + error 'must specify a schema config' + else + [], + + max_chunk_idle: '15m', + + test_exporter_enabled: false, + test_exporter_start_time: error 'must specify test exporter start time', + test_exporter_user_id: error 'must specify test exporter used id', + + querier: { + replicas: 6, + concurrency: 8, + }, + + queryFrontend: { + replicas: 2, + }, + + jaeger_agent_host: null, + + // Use the Cortex chunks storage engine by default, while giving the ability + // to switch to blocks storage. + storage_engine: 'chunks', // Available options are 'chunks' or 'blocks' + blocks_storage_backend: 'gcs', // Available options are 'gcs', 's3', 'azure' + blocks_storage_bucket_name: error 'must specify blocks storage bucket name', + blocks_storage_s3_endpoint: 's3.dualstack.us-east-1.amazonaws.com', + blocks_storage_azure_account_name: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account name' else '', + blocks_storage_azure_account_key: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account key' else '', + + // Secondary storage engine is only used for querying. + querier_second_storage_engine: null, + + store_gateway_replication_factor: 3, + + // By default ingesters will be run as StatefulSet with WAL. + // If this is set to true, ingesters will use staless deployments without WAL. + ingester_deployment_without_wal: false, + + ingester: { + // These config options are only for the chunks storage. + wal_dir: '/wal_data', + statefulset_disk: '150Gi', + }, + + // Blocks storage engine doesn't require the table manager. + // When running blocks with chunks as secondary storage engine for querier only, we need table-manager to apply + // retention policies. + table_manager_enabled: $._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks', + + // Blocks storage engine doesn't support index-writes (for writes deduplication) cache. + memcached_index_writes_enabled: $._config.storage_engine != 'blocks', + memcached_index_writes_max_item_size_mb: 1, + + // Index and chunks caches are supported by both blocks storage engine and chunks engine. + memcached_index_queries_enabled: true, + memcached_index_queries_max_item_size_mb: 5, + + memcached_chunks_enabled: true, + memcached_chunks_max_item_size_mb: 1, + + memcached_metadata_enabled: $._config.storage_engine == 'blocks', + memcached_metadata_max_item_size_mb: 1, + + // The query-tee is an optional service which can be used to send + // the same input query to multiple backends and make them compete + // (comparing performances). 
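+    //
+    // Illustrative example (endpoint values are hypothetical):
+    //
+    //   query_tee_enabled: true,
+    //   query_tee_backend_endpoints: ['http://query-frontend-a', 'http://query-frontend-b'],
+    //   query_tee_backend_preferred: 'query-frontend-a',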
+ query_tee_enabled: false, + query_tee_backend_endpoints: [], + query_tee_backend_preferred: '', + + enabledBackends: [ + backend + for backend in std.split($._config.storage_backend, ',') + ], + + client_configs: { + aws: + if std.count($._config.enabledBackends, 'aws') > 0 then { + 'dynamodb.api-limit': 10, + 'dynamodb.url': 'https://%s' % $._config.aws_region, + 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.s3_bucket_name], + } else {}, + cassandra: + if std.count($._config.enabledBackends, 'cassandra') > 0 then { + 'cassandra.keyspace': $._config.namespace, + 'cassandra.addresses': $._config.cassandra_addresses, + 'cassandra.replication-factor': $._config.replication_factor, + } else {}, + gcp: + if std.count($._config.enabledBackends, 'gcp') > 0 then { + 'bigtable.project': $._config.bigtable_project, + 'bigtable.instance': $._config.bigtable_instance, + } else {}, + }, + + storeConfig: self.storeMemcachedChunksConfig, + + storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled && ($._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks') then + { + 'store.chunks-cache.memcached.hostname': 'memcached.%s.svc.cluster.local' % $._config.namespace, + 'store.chunks-cache.memcached.service': 'memcached-client', + 'store.chunks-cache.memcached.timeout': '3s', + } + else {}, + + grpcConfig:: { + 'server.grpc.keepalive.min-time-between-pings': '10s', + 'server.grpc.keepalive.ping-without-stream-allowed': true, + }, + + storageConfig: + $._config.client_configs.aws + + $._config.client_configs.cassandra + + $._config.client_configs.gcp + + { 'schema-config-file': '/etc/cortex/schema/config.yaml' }, + + genericBlocksStorageConfig:: { + 'store.engine': $._config.storage_engine, // May still be chunks + }, + queryBlocksStorageConfig:: { + 'blocks-storage.bucket-store.sync-dir': '/data/tsdb', + 'blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', + + 'store-gateway.sharding-enabled': true, + 'store-gateway.sharding-ring.store': 'consul', + 'store-gateway.sharding-ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'store-gateway.sharding-ring.prefix': '', + 'store-gateway.sharding-ring.replication-factor': $._config.store_gateway_replication_factor, + }, + gcsBlocksStorageConfig:: $._config.genericBlocksStorageConfig { + 'blocks-storage.backend': 'gcs', + 'blocks-storage.gcs.bucket-name': $._config.blocks_storage_bucket_name, + }, + s3BlocksStorageConfig:: $._config.genericBlocksStorageConfig { + 'blocks-storage.backend': 's3', + 'blocks-storage.s3.bucket-name': $._config.blocks_storage_bucket_name, + 'blocks-storage.s3.endpoint': $._config.blocks_storage_s3_endpoint, + }, + azureBlocksStorageConfig:: $._config.genericBlocksStorageConfig { + 'blocks-storage.backend': 'azure', + 'blocks-storage.azure.container-name': $._config.blocks_storage_bucket_name, + 'blocks-storage.azure.account-name': $._config.blocks_storage_azure_account_name, + 'blocks-storage.azure.account-key': $._config.blocks_storage_azure_account_key, + }, + // Blocks storage configuration, used only when 'blocks' storage + // engine is explicitly enabled. 
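+    // It picks one of the backend-specific flag sets (gcsBlocksStorageConfig,
+    // s3BlocksStorageConfig, azureBlocksStorageConfig) based on
+    // $._config.blocks_storage_backend; e.g. storage_engine: 'blocks' with
+    // blocks_storage_backend: 'gcs' applies the GCS flags and requires
+    // blocks_storage_bucket_name to be set.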
+ blocksStorageConfig: ( + if $._config.storage_engine == 'blocks' || $._config.querier_second_storage_engine == 'blocks' then ( + if $._config.blocks_storage_backend == 'gcs' then $._config.gcsBlocksStorageConfig + else if $._config.blocks_storage_backend == 's3' then $._config.s3BlocksStorageConfig + else if $._config.blocks_storage_backend == 'azure' then $._config.azureBlocksStorageConfig + else $._config.genericBlocksStorageConfig + ) else {} + ), + + // Querier component config (shared between the ruler and querier). + queryConfig: { + 'runtime-config.file': '/etc/cortex/overrides.yaml', + + // Limit the size of the rows we read from the index. + 'store.cardinality-limit': 1e6, + + // Don't allow individual queries of longer than 32days. Due to day query + // splitting in the frontend, the reality is this only limits rate(foo[32d]) + // type queries. 32 days to allow for comparision over the last month (31d) and + // then some. + 'store.max-query-length': '768h', + } + ( + if $._config.storage_engine == 'chunks' then { + // Don't query ingesters for older queries. + // Chunks are held in memory for up to 6hrs right now. Additional 6h are granted for safety reasons because + // the remote writing Prometheus may have a delay or write requests into the database are queued. + 'querier.query-ingesters-within': '12h', + + // Don't query the chunk store for data younger than max_chunk_idle. + 'querier.query-store-after': $._config.max_chunk_idle, + } else if $._config.storage_engine == 'blocks' then { + // Ingesters don't have data older than 13h, no need to ask them. + 'querier.query-ingesters-within': '13h', + + // No need to look at store for data younger than 12h, as ingesters have all of it. + 'querier.query-store-after': '12h', + } + ) + ( + if $._config.memcached_index_queries_enabled && ($._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks') then + { + // Setting for index cache. + 'store.index-cache-validity': '14m', // ingester.retain-period=15m, 1m less for safety. + 'store.index-cache-read.cache.enable-fifocache': true, + 'store.index-cache-read.fifocache.max-size-items': 102400, + 'store.index-cache-read.memcached.hostname': 'memcached-index-queries.%(namespace)s.svc.cluster.local' % $._config, + 'store.index-cache-read.memcached.service': 'memcached-client', + 'store.index-cache-read.memcached.timeout': '500ms', + 'store.cache-lookups-older-than': '36h', + } + else {} + ), + + // PromQL query engine config (shared between all services running PromQL engine, like the ruler and querier). + queryEngineConfig: { + // Keep it even if empty, to allow downstream projects to easily configure it. + }, + + ringConfig: { + 'consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'ring.prefix': '', + }, + + // Some distributor config is shared with the querier. 
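+    // (the same replication-factor / shard-by-all-labels / ring settings below are applied
+    // to the querier so it selects and deduplicates ingesters consistently with the
+    // write path.)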
+ distributorConfig: { + 'distributor.replication-factor': $._config.replication_factor, + 'distributor.shard-by-all-labels': true, + 'distributor.health-check-ingesters': true, + 'ring.heartbeat-timeout': '10m', + }, + + ruler_enabled: false, + ruler_client_type: error 'you must specify a storage backend type for the ruler (azure, gcs, s3, local)', + ruler_storage_bucket_name: error 'must specify the ruler storage bucket name', + ruler_storage_azure_account_name: error 'must specify the ruler storage Azure account name', + ruler_storage_azure_account_key: error 'must specify the ruler storage Azure account key', + + rulerClientConfig: + { + 'ruler-storage.backend': $._config.ruler_client_type, + } + + { + gcs: { + 'ruler-storage.gcs.bucket-name': $._config.ruler_storage_bucket_name, + }, + s3: { + 'ruler-storage.s3.region': $._config.aws_region, + 'ruler-storage.s3.bucket-name': $._config.ruler_storage_bucket_name, + 'ruler-storage.s3.endpoint': 's3.dualstack.%s.amazonaws.com' % $._config.aws_region, + }, + azure: { + 'ruler-storage.azure.container-name': $._config.ruler_storage_bucket_name, + 'ruler-storage.azure.account-name': $._config.ruler_storage_azure_account_name, + 'ruler-storage.azure.account-key': $._config.ruler_storage_azure_account_key, + }, + 'local': { + 'ruler-storage.local.directory': $._config.ruler_local_directory, + }, + }[$._config.ruler_client_type], + + alertmanager: { + replicas: 3, + sharding_enabled: false, + gossip_port: 9094, + fallback_config: {}, + ring_store: 'consul', + ring_hostname: 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + ring_replication_factor: $._config.replication_factor, + }, + + alertmanager_client_type: error 'you must specify a storage backend type for the alertmanager (azure, gcs, s3, local)', + alertmanager_s3_bucket_name: error 'you must specify the alertmanager S3 bucket name', + alertmanager_gcs_bucket_name: error 'you must specify a GCS bucket name', + alertmanager_azure_container_name: error 'you must specify an Azure container name', + + alertmanagerStorageClientConfig: + { + 'alertmanager-storage.backend': $._config.alertmanager_client_type, + } + + { + azure: { + 'alertmanager-storage.azure.account-key': $._config.alertmanager_azure_account_key, + 'alertmanager-storage.azure.account-name': $._config.alertmanager_azure_account_name, + 'alertmanager-storage.azure.container-name': $._config.alertmanager_azure_container_name, + }, + gcs: { + 'alertmanager-storage.gcs.bucket-name': $._config.alertmanager_gcs_bucket_name, + }, + s3: { + 'alertmanager-storage.s3.region': $._config.aws_region, + 'alertmanager-storage.s3.bucket-name': $._config.alertmanager_s3_bucket_name, + }, + 'local': { + 'alertmanager-storage.local.path': $._config.alertmanager_local_directory, + }, + }[$._config.alertmanager_client_type], + + // === Per-tenant usage limits. === + // + // These are the defaults. + limits: $._config.overrides.extra_small_user, + + // These are all the flags for the default limits. 
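+    // They are driven by the tier selected in `limits` above; for example, a cluster could
+    // default to the small tier instead:
+    //
+    //   limits: $._config.overrides.small_user,
+    //
+    // Per-tenant values go through the `overrides` object further down, which is rendered
+    // into the overrides.yaml runtime config.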
+ distributorLimitsConfig: { + 'distributor.ingestion-rate-limit-strategy': 'global', + 'distributor.ingestion-rate-limit': $._config.limits.ingestion_rate, + 'distributor.ingestion-burst-size': $._config.limits.ingestion_burst_size, + }, + ingesterLimitsConfig: { + 'ingester.max-series-per-user': $._config.limits.max_series_per_user, + 'ingester.max-series-per-metric': $._config.limits.max_series_per_metric, + 'ingester.max-global-series-per-user': $._config.limits.max_global_series_per_user, + 'ingester.max-global-series-per-metric': $._config.limits.max_global_series_per_metric, + 'ingester.max-series-per-query': $._config.limits.max_series_per_query, + }, + rulerLimitsConfig: { + 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, + 'ruler.max-rule-groups-per-tenant': $._config.limits.ruler_max_rule_groups_per_tenant, + }, + compactorLimitsConfig: { + 'compactor.blocks-retention-period': $._config.limits.compactor_blocks_retention_period, + }, + + limitsConfig: self.distributorLimitsConfig + self.ingesterLimitsConfig + self.rulerLimitsConfig + self.compactorLimitsConfig, + + overrides_configmap: 'overrides', + + overrides: { + extra_small_user:: { + max_series_per_user: 0, // Disabled in favour of the max global limit + max_series_per_metric: 0, // Disabled in favour of the max global limit + + // Our limit should be 100k, but we need some room of about ~50% to take rollouts into account + max_global_series_per_user: 150000, + max_global_series_per_metric: 20000, + + max_series_per_query: 100000, + + ingestion_rate: 10000, + ingestion_burst_size: 200000, + + // 700 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 35, + + // No retention for now. + compactor_blocks_retention_period: '0', + }, + + medium_small_user:: { + max_series_per_user: 0, // Disabled in favour of the max global limit + max_series_per_metric: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 300000, + max_global_series_per_metric: 30000, + + max_series_per_query: 100000, + + ingestion_rate: 30000, + ingestion_burst_size: 300000, + + // 1000 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 50, + }, + + small_user:: { + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 1000000, + max_global_series_per_metric: 100000, + + max_series_per_query: 100000, + + ingestion_rate: 100000, + ingestion_burst_size: 1000000, + + // 1400 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 70, + }, + + medium_user:: { + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 3000000, // 3M + max_global_series_per_metric: 300000, // 300K + + max_series_per_query: 100000, + + ingestion_rate: 350000, // 350K + ingestion_burst_size: 3500000, // 3.5M + + // 1800 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 90, + }, + + big_user:: { + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_series_per_query: 100000, + + max_global_series_per_user: 6000000, // 6M + max_global_series_per_metric: 600000, // 600K + + ingestion_rate: 700000, // 700K + ingestion_burst_size: 7000000, // 7M + + // 2200 rules + 
ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 110, + }, + + super_user:: { + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 12000000, // 12M + max_global_series_per_metric: 1200000, // 1.2M + + max_series_per_query: 100000, + + ingestion_rate: 1500000, // 1.5M + ingestion_burst_size: 15000000, // 15M + + // 2600 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 130, + }, + + // This user class has limits increased by +50% compared to the previous one. + mega_user+:: { + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 16000000, // 16M + max_global_series_per_metric: 1600000, // 1.6M + + max_series_per_query: 100000, + + ingestion_rate: 2250000, // 2.25M + ingestion_burst_size: 22500000, // 22.5M + + // 3000 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 150, + }, + }, + + // if not empty, passed to overrides.yaml as another top-level field + multi_kv_config: {}, + + schemaID: std.md5(std.toString($._config.schema)), + + enable_pod_priorities: true, + + alertmanager_enabled: false, + + // Enables query-scheduler component, and reconfigures querier and query-frontend to use it. + query_scheduler_enabled: false, + + // Enables streaming of chunks from ingesters using blocks. + // Changing it will not cause new rollout of ingesters, as it gets passed to them via runtime-config. + ingester_stream_chunks_when_using_blocks: true, + + // Ingester limits are put directly into runtime config, if not null. Available limits: + // ingester_instance_limits: { + // max_inflight_push_requests: 0, // Max inflight push requests per ingester. 0 = no limit. + // max_ingestion_rate: 0, // Max ingestion rate (samples/second) per ingester. 0 = no limit. + // max_series: 0, // Max number of series per ingester. 0 = no limit. + // max_tenants: 0, // Max number of tenants per ingester. 0 = no limit. + // }, + ingester_instance_limits: null, + }, + + local configMap = $.core.v1.configMap, + + overrides_config: + configMap.new($._config.overrides_configmap) + + configMap.withData({ + 'overrides.yaml': $.util.manifestYaml( + { overrides: $._config.overrides } + + (if std.length($._config.multi_kv_config) > 0 then { multi_kv_config: $._config.multi_kv_config } else {}) + + (if $._config.ingester_stream_chunks_when_using_blocks then { ingester_stream_chunks_when_using_blocks: true } else {}) + + (if $._config.ingester_instance_limits != null then { ingester_limits: $._config.ingester_instance_limits } else {}), + ), + }), + + storage_config: + configMap.new('schema-' + $._config.schemaID) + + configMap.withData({ + 'config.yaml': $.util.manifestYaml({ + configs: $._config.schema, + }), + }), + + local deployment = $.apps.v1.deployment, + storage_config_mixin:: + deployment.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + + $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), + + // This removed the CPU limit from the config. NB won't show up in subset + // diffs, but ks apply will do the right thing. + removeCPULimitsMixin:: { + resources+: { + // Can't use super.memory in limits, as we want to + // override the whole limits struct. 
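+      // (capture the memory value first, then rebuild `limits` with only that key so the
+      // cpu limit is dropped instead of being merged back in.)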
+ local memoryLimit = super.limits.memory, + + limits: { + memory: memoryLimit, + }, + }, + }, +} diff --git a/operations/mimir/consul.libsonnet b/operations/mimir/consul.libsonnet new file mode 100644 index 0000000000..7e017f8fcd --- /dev/null +++ b/operations/mimir/consul.libsonnet @@ -0,0 +1,65 @@ +local consul = import 'consul/consul.libsonnet'; + +{ + _config+:: { + consul_replicas: 1, + other_namespaces+: [], + }, + + consul: consul { + _config+:: { + consul_replicas: $._config.consul_replicas, + namespace: $._config.namespace, + }, + + // Snapshot the raft.db very frequently, to stop it getting to big. + consul_config+:: { + raft_snapshot_threshold: 128, + raft_trailing_logs: 10e3, + }, + + local container = $.core.v1.container, + + consul_container+:: + container.withArgsMixin([ + '-ui-content-path=/%s/consul/' % $._config.namespace, + ]) + + $.util.resourcesRequests('4', '4Gi'), + + local deployment = $.apps.v1.deployment, + local podAntiAffinity = deployment.mixin.spec.template.spec.affinity.podAntiAffinity, + local volume = $.core.v1.volume, + + // backwards compatibility with ksonnet + local podAffinityTerm = + if std.objectHasAll($.core.v1, 'podAffinityTerm') + then $.core.v1.podAffinityTerm + else podAntiAffinity.requiredDuringSchedulingIgnoredDuringExecutionType, + + consul_deployment+: + + // Keep the consul state on a ramdisk, as they are ephemeral to us. + $.util.emptyVolumeMount( + 'data', + '/consul/data/', + volumeMixin=volume.mixin.emptyDir.withMedium('Memory'), + ) + + + // Ensure Consul is not scheduled on the same host as an ingester + // (in any namespace - hence other_namespaces). + podAntiAffinity.withRequiredDuringSchedulingIgnoredDuringExecutionMixin([ + podAffinityTerm.mixin.labelSelector.withMatchLabels({ name: 'ingester' }) + + podAffinityTerm.withNamespaces([$._config.namespace] + $._config.other_namespaces) + + podAffinityTerm.withTopologyKey('kubernetes.io/hostname'), + ]) + + + $.util.podPriority('high'), + + // Don't healthcheck services, adds load to consul. 
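+    // (the exporter flags below skip per-service health summaries and allow stale reads,
+    // so scrapes don't add query load to the consul servers.)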
+ consul_exporter+:: + container.withArgsMixin([ + '--no-consul.health-summary', + '--consul.allow_stale', + ]), + }, +} diff --git a/operations/mimir/cortex-manifests.jsonnet.example b/operations/mimir/cortex-manifests.jsonnet.example new file mode 100644 index 0000000000..9abcc1b174 --- /dev/null +++ b/operations/mimir/cortex-manifests.jsonnet.example @@ -0,0 +1,27 @@ +local cortex = import "cortex/cortex.libsonnet"; + +cortex { + _config+:: { + namespace: "default", + schema: [{ + from: '2019-11-15', + store: 'bigtable-hashed', + object_store: 'gcs', + schema: 'v10', + index: { + prefix: 'dev_index_', + period: '168h', + }, + chunks: { + prefix: 'dev_chunks_', + period: '168h', + }, + }], + + storage_backend: 'gcp', + bigtable_instance: 'example-instance-prod', + bigtable_project: 'example-project1-cortex', + ruler_client_type: 'gcs' + }, +} + diff --git a/operations/mimir/cortex.libsonnet b/operations/mimir/cortex.libsonnet new file mode 100644 index 0000000000..b8716d19cc --- /dev/null +++ b/operations/mimir/cortex.libsonnet @@ -0,0 +1,21 @@ +(import 'ksonnet-util/kausal.libsonnet') + +(import 'jaeger-agent-mixin/jaeger.libsonnet') + +(import 'images.libsonnet') + +(import 'common.libsonnet') + +(import 'config.libsonnet') + +(import 'consul.libsonnet') + + +// Cortex services +(import 'distributor.libsonnet') + +(import 'ingester.libsonnet') + +(import 'querier.libsonnet') + +(import 'query-frontend.libsonnet') + +(import 'table-manager.libsonnet') + +(import 'ruler.libsonnet') + +(import 'alertmanager.libsonnet') + +(import 'query-scheduler.libsonnet') + + +// Supporting services +(import 'etcd.libsonnet') + +(import 'memcached.libsonnet') + +(import 'test-exporter.libsonnet') diff --git a/operations/mimir/distributor.libsonnet b/operations/mimir/distributor.libsonnet new file mode 100644 index 0000000000..ea22523e6f --- /dev/null +++ b/operations/mimir/distributor.libsonnet @@ -0,0 +1,71 @@ +{ + local container = $.core.v1.container, + local containerPort = $.core.v1.containerPort, + + distributor_args:: + $._config.grpcConfig + + $._config.ringConfig + + $._config.distributorConfig + + $._config.distributorLimitsConfig + + { + target: 'distributor', + + 'validation.reject-old-samples': true, + 'validation.reject-old-samples.max-age': '12h', + 'runtime-config.file': '/etc/cortex/overrides.yaml', + 'distributor.remote-timeout': '20s', + + 'distributor.ha-tracker.enable': true, + 'distributor.ha-tracker.enable-for-all-users': true, + 'distributor.ha-tracker.store': 'etcd', + 'distributor.ha-tracker.etcd.endpoints': 'etcd-client.%s.svc.cluster.local.:2379' % $._config.namespace, + 'distributor.ha-tracker.prefix': 'prom_ha/', + + // The memory requests are 2G, and we barely use 100M. + // By adding a ballast of 1G, we can drastically reduce GC, but also keep the usage at + // around 1.25G, reducing the 99%ile. + 'mem-ballast-size-bytes': 1 << 30, // 1GB + + 'server.grpc.keepalive.max-connection-age': '2m', + 'server.grpc.keepalive.max-connection-age-grace': '5m', + 'server.grpc.keepalive.max-connection-idle': '1m', + + // The ingestion rate global limit requires the distributors to form a ring. + 'distributor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'distributor.ring.prefix': '', + + // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" + // is set to false. 
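+      // (i.e. extend-writes stays enabled only when ingesters do unregister on shutdown.)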
+ 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, + }, + + distributor_ports:: $.util.defaultPorts, + + distributor_container:: + container.new('distributor', $._images.distributor) + + container.withPorts($.distributor_ports) + + container.withArgsMixin($.util.mapToFlags($.distributor_args)) + + $.util.resourcesRequests('2', '2Gi') + + $.util.resourcesLimits(null, '4Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + local deployment = $.apps.v1.deployment, + + distributor_deployment_labels:: {}, + + distributor_deployment: + deployment.new('distributor', 3, [$.distributor_container], $.distributor_deployment_labels) + + (if $._config.cortex_distributor_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), + + local service = $.core.v1.service, + + distributor_service_ignored_labels:: [], + + distributor_service: + $.util.serviceFor($.distributor_deployment, $.distributor_service_ignored_labels) + + service.mixin.spec.withClusterIp('None'), +} diff --git a/operations/mimir/etcd.libsonnet b/operations/mimir/etcd.libsonnet new file mode 100644 index 0000000000..41981db9ff --- /dev/null +++ b/operations/mimir/etcd.libsonnet @@ -0,0 +1,9 @@ +local etcd_cluster = import 'etcd-operator/etcd-cluster.libsonnet'; + +etcd_cluster { + etcd: + $.etcd_cluster('etcd', env=[{ + name: 'ETCD_AUTO_COMPACTION_RETENTION', + value: '1h', + }]), +} diff --git a/operations/mimir/flusher-job-blocks.libsonnet b/operations/mimir/flusher-job-blocks.libsonnet new file mode 100644 index 0000000000..1e6266caf7 --- /dev/null +++ b/operations/mimir/flusher-job-blocks.libsonnet @@ -0,0 +1,49 @@ +{ + // Usage example: + // + // local flusher_job = import 'cortex/flusher-job-blocks.libsonnet'; + // + // flusher_job { + // 'flusher-25': $.flusher_job_func('flusher-25', 'ingester-data-ingester-25'), + // } + // + // Where 'flusher-25' is a job name, and 'ingester-data-ingester-25' is PVC to flush. + + local container = $.core.v1.container, + local job = $.batch.v1.job, + local volumeMount = $.core.v1.volumeMount, + local volume = $.core.v1.volume, + + flusher_container:: + container.new('flusher', $._images.flusher) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.ingester_args { + target: 'flusher', + 'blocks-storage.tsdb.retention-period': '10000h', // don't delete old blocks too soon. 
+ })) + + $.util.resourcesRequests('4', '15Gi') + + $.util.resourcesLimits(null, '25Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + flusher_job_func(jobName, pvcName):: + job.new() + + job.mixin.spec.template.spec.withContainers([ + $.flusher_container + + container.withVolumeMountsMixin([ + volumeMount.new('flusher-data', '/data'), + ]), + ]) + + job.mixin.spec.template.spec.withRestartPolicy('Never') + + job.mixin.spec.template.spec.withVolumes([ + volume.fromPersistentVolumeClaim('flusher-data', pvcName), + ]) + + job.mixin.metadata.withName(jobName) + + job.mixin.metadata.withNamespace($._config.namespace) + + job.mixin.metadata.withLabels({ name: 'flusher' }) + + job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) + + job.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + $.util.podPriority('high'), +} diff --git a/operations/mimir/flusher-job.libsonnet b/operations/mimir/flusher-job.libsonnet new file mode 100644 index 0000000000..4d9a576240 --- /dev/null +++ b/operations/mimir/flusher-job.libsonnet @@ -0,0 +1,51 @@ +{ + // Usage example: + // local flusher_job = import 'cortex/flusher-job.libsonnet'; + // flusher_job + { + // flusher_job: + // $.flusher_job_func('pvc-af8947e6-182e-11ea-82e4-42010a9a0137', 'ingester-pvc-ingester-5'), + // } + + local container = $.core.v1.container, + local job = $.batch.v1.job, + local volumeMount = $.core.v1.volumeMount, + local volume = $.core.v1.volume, + + flusher_container:: + container.new('flusher', $._images.flusher) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.ingester_args { + target: 'flusher', + 'flusher.wal-dir': $._config.wal_dir, + })) + + $.util.resourcesRequests('4', '15Gi') + + $.util.resourcesLimits(null, '25Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + flusher_job_storage_config_mixin:: + job.mixin.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + + $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), + + flusher_job_func(volumeName, pvcName):: + job.new() + + job.mixin.spec.template.spec.withContainers([ + $.flusher_container + + container.withVolumeMountsMixin([ + volumeMount.new(volumeName, $._config.wal_dir), + ]), + ]) + + job.mixin.spec.template.spec.withRestartPolicy('Never') + + job.mixin.spec.template.spec.withVolumes([ + volume.fromPersistentVolumeClaim(volumeName, pvcName), + ]) + + $.flusher_job_storage_config_mixin + + job.mixin.metadata.withName('flusher') + + job.mixin.metadata.withNamespace($._config.namespace) + + job.mixin.metadata.withLabels({ name: 'flusher' }) + + job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) + + job.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + $.util.podPriority('high'), +} diff --git a/operations/mimir/gossip.libsonnet b/operations/mimir/gossip.libsonnet new file mode 100644 index 0000000000..8b6261641d --- /dev/null +++ b/operations/mimir/gossip.libsonnet @@ -0,0 +1,82 @@ +{ + _config+:: { + // Use memberlist only. This works fine on already-migrated clusters. + // To do a migration from Consul to memberlist, multi kv storage needs to be used (See below). 
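+    // A possible migration sketch (an assumption, not a prescriptive procedure): point
+    // `ringConfig` at `ringConfigMulti` below so consul stays primary with memberlist
+    // mirrored as secondary, roll that out, then flip the primary to memberlist at runtime
+    // via `multi_kv_config`, and finally switch back to this memberlist-only `ringConfig`.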
+ ringConfig+: { + 'ring.store': 'memberlist', + 'memberlist.abort-if-join-fails': false, + 'memberlist.bind-port': gossipRingPort, + 'memberlist.join': 'gossip-ring.%s.svc.cluster.local:%d' % [$._config.namespace, gossipRingPort], + }, + + // This can be used to enable multi KV store, with consul and memberlist. + ringConfigMulti: { + 'ring.store': 'multi', + 'multi.primary': 'consul', + 'multi.secondary': 'memberlist', + }, + + // When doing migration via multi KV store, this section can be used + // to configure runtime parameters of multi KV store + /* + multi_kv_config: { + primary: 'memberlist', + // 'mirror-enabled': false, // renamed to 'mirror_enabled' on after r67 + }, + */ + }, + + ingester_args+: { + // wait longer to see LEAVING ingester in the gossiped ring, to avoid + // auto-join without transfer from LEAVING ingester. + 'ingester.join-after': '60s', + + // Updating heartbeat is low-cost operation when using gossiped ring, we can + // do it more often (gossiping will happen no matter what, we may as well send + // recent timestamps). + // It also helps other components to see more recent update in the ring. + 'ingester.heartbeat-period': '5s', + }, + + local gossipRingPort = 7946, + + local containerPort = $.core.v1.containerPort, + local gossipPort = containerPort.newNamed(name='gossip-ring', containerPort=gossipRingPort), + + distributor_ports+:: [gossipPort], + querier_ports+:: [gossipPort], + ingester_ports+:: [gossipPort], + + local gossip_member_label = 'gossip_ring_member', + + distributor_deployment_labels+:: { [gossip_member_label]: 'true' }, + ingester_deployment_labels+:: { [gossip_member_label]: 'true' }, + querier_deployment_labels+:: { [gossip_member_label]: 'true' }, + + // Don't use gossip ring member label in service definition. + distributor_service_ignored_labels+:: [gossip_member_label], + ingester_service_ignored_labels+:: [gossip_member_label], + querier_service_ignored_labels+:: [gossip_member_label], + + // Headless service (= no assigned IP, DNS returns all targets instead) pointing to some + // users of gossiped-ring. We use ingesters as seed nodes for joining gossip cluster. + // During migration to gossip, it may be useful to use distributors instead, since they are restarted faster. + gossip_ring_service: + local service = $.core.v1.service; + + // backwards compatibility with ksonnet + local servicePort = + if std.objectHasAll($.core.v1, 'servicePort') + then $.core.v1.servicePort + else service.mixin.spec.portsType; + + local ports = [ + servicePort.newNamed('gossip-ring', gossipRingPort, gossipRingPort) + + servicePort.withProtocol('TCP'), + ]; + service.new( + 'gossip-ring', // name + { [gossip_member_label]: 'true' }, // point to all gossip members + ports, + ) + service.mixin.spec.withClusterIp('None'), // headless service +} diff --git a/operations/mimir/images.libsonnet b/operations/mimir/images.libsonnet new file mode 100644 index 0000000000..87a9dc61ac --- /dev/null +++ b/operations/mimir/images.libsonnet @@ -0,0 +1,26 @@ +{ + _images+:: { + // Various third-party images. + memcached: 'memcached:1.6.9-alpine', + memcachedExporter: 'prom/memcached-exporter:v0.6.0', + + // Our services. 
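+    // Every Cortex component below defaults to the single `cortex` image; override either
+    // `cortex` or an individual component, e.g. (hypothetical tag):
+    //
+    //   _images+:: { cortex: 'cortexproject/cortex:v1.9.1' },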
+ cortex: 'cortexproject/cortex:v1.9.0', + + alertmanager: self.cortex, + distributor: self.cortex, + ingester: self.cortex, + querier: self.cortex, + query_frontend: self.cortex, + tableManager: self.cortex, + compactor: self.cortex, + flusher: self.cortex, + ruler: self.cortex, + store_gateway: self.cortex, + query_scheduler: self.cortex, + + cortex_tools: 'grafana/cortex-tools:v0.4.0', + query_tee: 'quay.io/cortexproject/query-tee:v1.9.0', + testExporter: 'cortexproject/test-exporter:v1.9.0', + }, +} diff --git a/operations/mimir/ingester.libsonnet b/operations/mimir/ingester.libsonnet new file mode 100644 index 0000000000..3078db366b --- /dev/null +++ b/operations/mimir/ingester.libsonnet @@ -0,0 +1,145 @@ +{ + ingester_args:: + $._config.grpcConfig + + $._config.ringConfig + + $._config.storeConfig + + $._config.storageConfig + + $._config.blocksStorageConfig + + $._config.distributorConfig + // This adds the distributor ring flags to the ingester. + $._config.ingesterLimitsConfig + + { + target: 'ingester', + + // Ring config. + 'ingester.num-tokens': 512, + 'ingester.join-after': '30s', + 'ingester.max-transfer-retries': 60, // Each retry is backed off by 5s, so 5mins for new ingester to come up. + 'ingester.heartbeat-period': '15s', + 'ingester.max-stale-chunk-idle': '5m', + 'ingester.unregister-on-shutdown': $._config.unregister_ingesters_on_shutdown, + + // Chunk building/flushing config. + 'ingester.chunk-encoding': 3, // Bigchunk encoding + 'ingester.retain-period': '15m', + 'ingester.max-chunk-age': '6h', + + // Limits config. + 'ingester.max-chunk-idle': $._config.max_chunk_idle, + 'runtime-config.file': '/etc/cortex/overrides.yaml', + 'server.grpc-max-concurrent-streams': 10000, + 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, + 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, + } + ( + if $._config.memcached_index_writes_enabled then + { + // Setup index write deduping. 
+ 'store.index-cache-write.memcached.hostname': 'memcached-index-writes.%(namespace)s.svc.cluster.local' % $._config, + 'store.index-cache-write.memcached.service': 'memcached-client', + } + else {} + ), + + ingester_statefulset_args:: + $._config.grpcConfig + { + 'ingester.wal-enabled': true, + 'ingester.checkpoint-enabled': true, + 'ingester.recover-from-wal': true, + 'ingester.wal-dir': $._config.ingester.wal_dir, + 'ingester.checkpoint-duration': '15m', + '-log.level': 'info', + 'ingester.tokens-file-path': $._config.ingester.wal_dir + '/tokens', + }, + + ingester_ports:: $.util.defaultPorts, + + local name = 'ingester', + local container = $.core.v1.container, + + ingester_container:: + container.new(name, $._images.ingester) + + container.withPorts($.ingester_ports) + + container.withArgsMixin($.util.mapToFlags($.ingester_args)) + + $.util.resourcesRequests('4', '15Gi') + + $.util.resourcesLimits(null, '25Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + local volumeMount = $.core.v1.volumeMount, + + ingester_statefulset_container:: + $.ingester_container + + container.withArgsMixin($.util.mapToFlags($.ingester_statefulset_args)) + + container.withVolumeMountsMixin([ + volumeMount.new('ingester-pvc', $._config.ingester.wal_dir), + ]), + + ingester_deployment_labels:: {}, + + local pvc = $.core.v1.persistentVolumeClaim, + local volume = $.core.v1.volume, + local statefulSet = $.apps.v1.statefulSet, + + local ingester_pvc = + pvc.new('ingester-pvc') + + pvc.mixin.spec.resources.withRequests({ storage: $._config.ingester.statefulset_disk }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName('fast'), + + statefulset_storage_config_mixin:: + statefulSet.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + + $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), + + ingester_statefulset: + if $._config.ingester_deployment_without_wal == false then + statefulSet.new('ingester', 3, [$.ingester_statefulset_container], ingester_pvc) + + statefulSet.mixin.spec.withServiceName('ingester') + + statefulSet.mixin.spec.template.spec.withVolumes([volume.fromPersistentVolumeClaim('ingester-pvc', 'ingester-pvc')]) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: 'ingester' }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: 'ingester' } + $.ingester_deployment_labels) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: 'ingester' }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + $.statefulset_storage_config_mixin + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + $.util.podPriority('high') + + $.util.antiAffinityStatefulSet + else null, + + local deployment = $.apps.v1.deployment, + + ingester_deployment: + if $._config.ingester_deployment_without_wal then + deployment.new(name, 3, [$.ingester_container], $.ingester_deployment_labels) + + $.util.antiAffinity + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + deployment.mixin.metadata.withLabels({ name: name }) + + deployment.mixin.spec.withMinReadySeconds(60) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + + 
deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + $.storage_config_mixin + + $.util.podPriority('high') + else null, + + ingester_service_ignored_labels:: [], + + ingester_service: + if $._config.ingester_deployment_without_wal then + $.util.serviceFor($.ingester_deployment, $.ingester_service_ignored_labels) + else + $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), + + local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, + + newIngesterPdb(pdbName, ingesterName):: + podDisruptionBudget.new() + + podDisruptionBudget.mixin.metadata.withName(pdbName) + + podDisruptionBudget.mixin.metadata.withLabels({ name: pdbName }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: ingesterName }) + + podDisruptionBudget.mixin.spec.withMaxUnavailable(1), + + ingester_pdb: self.newIngesterPdb('ingester-pdb', name), +} diff --git a/operations/mimir/jsonnetfile.json b/operations/mimir/jsonnetfile.json new file mode 100644 index 0000000000..e83b85fc06 --- /dev/null +++ b/operations/mimir/jsonnetfile.json @@ -0,0 +1,51 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "consul" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "etcd-operator" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "jaeger-agent-mixin" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "ksonnet-util" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "memcached" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/operations/mimir/jsonnetfile.lock.json b/operations/mimir/jsonnetfile.lock.json new file mode 100644 index 0000000000..3751cc2092 --- /dev/null +++ b/operations/mimir/jsonnetfile.lock.json @@ -0,0 +1,56 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "consul" + } + }, + "version": "c19a92e586a6752f11745b47f309b13f02ef7147", + "sum": "qlVBnIShhHEPglAl1xYIAmOP/W8LD0wQmHCT0m9sTLU=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "etcd-operator" + } + }, + "version": "815b0364886cc7bdf6bde2cdcd424bb8cef842b8", + "sum": "dnKsZ5FkKBtCycNVVSYa1AMNjCLofO4VGFrmzoz4344=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "jaeger-agent-mixin" + } + }, + "version": "65a623593025007ef54549550a3569c0e72f085d", + "sum": "DsdBoqgx5kE3zc6fMYnfiGjW2+9Mx2OXFieWm1oFHgY=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "ksonnet-util" + } + }, + "version": "8fa7669cc7b1b1822eb0220f2eda9c6aaa5c5119", + "sum": "/l/RofjusGrnNpJMD0ST+jDgtARyjvBP5vC7kEjPoQI=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "memcached" + } + }, + "version": "c19a92e586a6752f11745b47f309b13f02ef7147", + "sum": "GQeyWFtqhwM+hGxQbdywWG1PFJ/KmSC1at0hai7AHXU=" + } + ], + "legacyImports": false +} diff --git a/operations/mimir/memcached.libsonnet b/operations/mimir/memcached.libsonnet new file mode 100644 index 
0000000000..011328c33b --- /dev/null +++ b/operations/mimir/memcached.libsonnet @@ -0,0 +1,78 @@ +local memcached = import 'memcached/memcached.libsonnet'; + +memcached { + memcached+:: { + cpu_limits:: null, + + deployment: {}, + + local statefulSet = $.apps.v1.statefulSet, + + statefulSet: + statefulSet.new(self.name, 3, [ + self.memcached_container, + self.memcached_exporter, + ], []) + + statefulSet.mixin.spec.withServiceName(self.name) + + $.util.antiAffinity, + + local service = $.core.v1.service, + + service: + $.util.serviceFor(self.statefulSet) + + service.mixin.spec.withClusterIp('None'), + }, + + // Dedicated memcached instance used to cache query results. + memcached_frontend: $.memcached { + name: 'memcached-frontend', + max_item_size: '5m', + }, + + // Dedicated memcached instance used to temporarily cache index lookups. + memcached_index_queries: if $._config.memcached_index_queries_enabled then + $.memcached { + name: 'memcached-index-queries', + max_item_size: '%dm' % [$._config.memcached_index_queries_max_item_size_mb], + connection_limit: 16384, + } + else {}, + + // Dedicated memcached instance used to dedupe writes to the index. + memcached_index_writes: if $._config.memcached_index_writes_enabled then + $.memcached { + name: 'memcached-index-writes', + max_item_size: '%dm' % [$._config.memcached_index_writes_max_item_size_mb], + } + else {}, + + // Memcached instance used to cache chunks. + memcached_chunks: if $._config.memcached_chunks_enabled then + $.memcached { + name: 'memcached', + max_item_size: '%dm' % [$._config.memcached_chunks_max_item_size_mb], + + // Save memory by more tightly provisioning memcached chunks. + memory_limit_mb: 6 * 1024, + overprovision_factor: 1.05, + connection_limit: 16384, + + local container = $.core.v1.container, + } + else {}, + + // Memcached instance for caching TSDB blocks metadata (meta.json files, deletion marks, list of users and blocks). + memcached_metadata: if $._config.memcached_metadata_enabled then + $.memcached { + name: 'memcached-metadata', + max_item_size: '%dm' % [$._config.memcached_metadata_max_item_size_mb], + connection_limit: 16384, + + // Metadata cache doesn't need much memory. + memory_limit_mb: 512, + + local statefulSet = $.apps.v1.statefulSet, + statefulSet+: + statefulSet.mixin.spec.withReplicas(1), + }, +} diff --git a/operations/mimir/overrides-exporter.libsonnet b/operations/mimir/overrides-exporter.libsonnet new file mode 100644 index 0000000000..d8eb411ad2 --- /dev/null +++ b/operations/mimir/overrides-exporter.libsonnet @@ -0,0 +1,67 @@ +// this enables overrides exporter, which will expose the configured +// overrides and presets (if configured). Those metrics can be potentially +// high cardinality. 
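+// A minimal usage sketch (the entrypoint file name below is an assumption, mirroring
+// the commented examples in the other libsonnet files): import this file on top of the
+// main Cortex jsonnet, e.g.
+//
+//   (import 'cortex/cortex.libsonnet') +
+//   (import 'cortex/overrides-exporter.libsonnet')
+//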
+{ + local name = 'overrides-exporter', + + _config+: { + // The overrides exporter can also make the configured presets available; this + // list references entries within $._config.overrides + + overrides_exporter_presets:: [ + 'extra_small_user', + 'small_user', + 'medium_user', + 'big_user', + 'super_user', + 'mega_user', + ], + }, + + local presets_enabled = std.length($._config.overrides_exporter_presets) > 0, + + local configMap = $.core.v1.configMap, + overrides_exporter_presets_configmap: + if presets_enabled then + configMap.new('overrides-presets') + + configMap.withData({ + 'overrides-presets.yaml': $.util.manifestYaml( + { + presets: { + [key]: $._config.overrides[key] + for key in $._config.overrides_exporter_presets + }, + } + ), + }), + + local containerPort = $.core.v1.containerPort, + overrides_exporter_port:: containerPort.newNamed(name='http-metrics', containerPort=9683), + + overrides_exporter_args:: { + 'overrides-file': '/etc/cortex/overrides.yaml', + } + if presets_enabled then { + 'presets-file': '/etc/cortex_presets/overrides-presets.yaml', + } else {}, + + local container = $.core.v1.container, + overrides_exporter_container:: + container.new(name, $._images.cortex_tools) + + container.withPorts([ + $.overrides_exporter_port, + ]) + + container.withArgsMixin([name] + $.util.mapToFlags($.overrides_exporter_args, prefix='--')) + + $.util.resourcesRequests('0.5', '0.5Gi') + + $.util.readinessProbe + + container.mixin.readinessProbe.httpGet.withPort($.overrides_exporter_port.name), + + local deployment = $.apps.v1.deployment, + overrides_exporter_deployment: + deployment.new(name, 1, [$.overrides_exporter_container], { name: name }) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + $.util.configVolumeMount('overrides-presets', '/etc/cortex_presets') + + deployment.mixin.metadata.withLabels({ name: name }), + + overrides_exporter_service: + $.util.serviceFor($.overrides_exporter_deployment), +} diff --git a/operations/mimir/querier.libsonnet b/operations/mimir/querier.libsonnet new file mode 100644 index 0000000000..eb807ee28b --- /dev/null +++ b/operations/mimir/querier.libsonnet @@ -0,0 +1,74 @@ +{ + local container = $.core.v1.container, + + querier_args:: + $._config.grpcConfig + + $._config.ringConfig + + $._config.storeConfig + + $._config.storageConfig + + $._config.blocksStorageConfig + + $._config.queryConfig + + $._config.queryEngineConfig + + $._config.distributorConfig + + { + target: 'querier', + + // Increase HTTP server response write timeout, as we were seeing some + // queries that return a lot of data timing out. + 'server.http-write-timeout': '1m', + + // Limit query concurrency to prevent multiple large queries from causing an OOM. + 'querier.max-concurrent': $._config.querier.concurrency, + + // Limit to N/2 worker threads per frontend, as we have two frontends. + 'querier.worker-parallelism': $._config.querier.concurrency / $._config.queryFrontend.replicas, + 'querier.frontend-address': 'query-frontend-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, + 'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, + + 'querier.second-store-engine': $._config.querier_second_storage_engine, + + // We request high memory but the Go heap is typically very low (< 100MB) and this causes + // the GC to trigger continuously. Setting a ballast of 256MB reduces GC.
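+      // (1 << 28 bytes = 268,435,456 bytes, i.e. 256MiB.)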
+ 'mem-ballast-size-bytes': 1 << 28, // 256M + + 'log.level': 'debug', + }, + + querier_ports:: $.util.defaultPorts, + + querier_env_map:: { + JAEGER_REPORTER_MAX_QUEUE_SIZE: '1024', // Default is 100. + }, + + querier_container:: + container.new('querier', $._images.querier) + + container.withPorts($.querier_ports) + + container.withArgsMixin($.util.mapToFlags($.querier_args)) + + $.jaeger_mixin + + $.util.readinessProbe + + container.withEnvMap($.querier_env_map) + + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '24Gi'), + + local deployment = $.apps.v1.deployment, + + querier_deployment_labels: {}, + + newQuerierDeployment(name, container):: + deployment.new(name, $._config.querier.replicas, [container], $.querier_deployment_labels) + + (if $._config.cortex_querier_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + + $.storage_config_mixin, + + querier_deployment: + self.newQuerierDeployment('querier', $.querier_container), + + local service = $.core.v1.service, + + querier_service_ignored_labels:: [], + + querier_service: + $.util.serviceFor($.querier_deployment, $.querier_service_ignored_labels), +} diff --git a/operations/mimir/query-frontend.libsonnet b/operations/mimir/query-frontend.libsonnet new file mode 100644 index 0000000000..80f36d0473 --- /dev/null +++ b/operations/mimir/query-frontend.libsonnet @@ -0,0 +1,75 @@ +{ + local container = $.core.v1.container, + + query_frontend_args:: + $._config.grpcConfig + { + target: 'query-frontend', + + // Need log.level=debug so all queries are logged, needed for analyse.py. + 'log.level': 'debug', + + // Increase HTTP server response write timeout, as we were seeing some + // queries that return a lot of data timing out. + 'server.http-write-timeout': '1m', + + // Split long queries up into multiple day-long queries. + 'querier.split-queries-by-interval': '24h', + + // Cache query results. + 'querier.align-querier-with-step': false, + 'querier.cache-results': true, + 'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace, + 'frontend.memcached.service': 'memcached-client', + 'frontend.memcached.timeout': '500ms', + + // So that exporters like cloudwatch can still send in data and be un-cached. + 'frontend.max-cache-freshness': '10m', + + // Use GZIP compression for API responses; improves latency for very big results and slow + // connections. + 'api.response-compression-enabled': true, + + // So it can receive big responses from the querier. + 'server.grpc-max-recv-msg-size-bytes': 100 << 20, + + // Limit queries to 500 days; allow this to be overridden per-user.
+ 'store.max-query-length': '12000h', // 500 Days + 'runtime-config.file': '/etc/cortex/overrides.yaml', + }, + + query_frontend_container:: + container.new('query-frontend', $._images.query_frontend) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + + $.jaeger_mixin + + $.util.readinessProbe + + $.util.resourcesRequests('2', '600Mi') + + $.util.resourcesLimits(null, '1200Mi'), + + local deployment = $.apps.v1.deployment, + + newQueryFrontendDeployment(name, container):: + deployment.new(name, $._config.queryFrontend.replicas, [container]) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + (if $._config.cortex_query_frontend_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(1) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), + + query_frontend_deployment: self.newQueryFrontendDeployment('query-frontend', $.query_frontend_container), + + local service = $.core.v1.service, + + query_frontend_service: + $.util.serviceFor($.query_frontend_deployment), + + query_frontend_discovery_service: + $.util.serviceFor($.query_frontend_deployment) + + // Make sure that query-frontend workers, running in the queriers, resolve + // each query-frontend pod IP and NOT the service IP. To achieve this, we do NOT + // use the service cluster IP so that when the service DNS is resolved it + // returns the set of query-frontend IPs. + service.mixin.spec.withPublishNotReadyAddresses(true) + + service.mixin.spec.withClusterIp('None') + + service.mixin.metadata.withName('query-frontend-discovery'), +} diff --git a/operations/mimir/query-scheduler.libsonnet b/operations/mimir/query-scheduler.libsonnet new file mode 100644 index 0000000000..604d258a6c --- /dev/null +++ b/operations/mimir/query-scheduler.libsonnet @@ -0,0 +1,57 @@ +// Query-scheduler is an optional service. When query-scheduler.libsonnet is added to Cortex, the querier and query-frontend +// are reconfigured to use the query-scheduler service. +{ + local container = $.core.v1.container, + local deployment = $.apps.v1.deployment, + local service = $.core.v1.service, + + query_scheduler_args+:: + $._config.grpcConfig + { + target: 'query-scheduler', + 'log.level': 'debug', + 'query-scheduler.max-outstanding-requests-per-tenant': 100, + }, + + query_scheduler_container:: + container.new('query-scheduler', $._images.query_scheduler) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.query_scheduler_args)) + + $.jaeger_mixin + + $.util.readinessProbe + + $.util.resourcesRequests('2', '1Gi') + + $.util.resourcesLimits(null, '2Gi'), + + newQuerySchedulerDeployment(name, container):: + deployment.new(name, 2, [container]) + + $.util.configVolumeMount('overrides', '/etc/cortex') + + $.util.antiAffinity + + // Do not run more query-schedulers than expected. + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), + + query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else + self.newQuerySchedulerDeployment('query-scheduler', $.query_scheduler_container), + + query_scheduler_service: if !$._config.query_scheduler_enabled then {} else + $.util.serviceFor($.query_scheduler_deployment), + + // Headless to make sure resolution gets the IP addresses of target pods, and not the service IP.
+ query_scheduler_discovery_service: if !$._config.query_scheduler_enabled then {} else + $.util.serviceFor($.query_scheduler_deployment) + + service.mixin.spec.withPublishNotReadyAddresses(true) + + service.mixin.spec.withClusterIp('None') + + service.mixin.metadata.withName('query-scheduler-discovery'), + + // Reconfigure querier and query-frontend to use scheduler. + querier_args+:: if !$._config.query_scheduler_enabled then {} else { + 'querier.worker-match-max-concurrent': 'true', + 'querier.worker-parallelism': null, // Disabled since we set worker-match-max-concurrent. + 'querier.frontend-address': null, + 'querier.scheduler-address': 'query-scheduler-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, + }, + + query_frontend_args+:: if !$._config.query_scheduler_enabled then {} else { + 'frontend.scheduler-address': 'query-scheduler-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, + }, +} diff --git a/operations/mimir/query-tee.libsonnet b/operations/mimir/query-tee.libsonnet new file mode 100644 index 0000000000..4ac3b0a127 --- /dev/null +++ b/operations/mimir/query-tee.libsonnet @@ -0,0 +1,33 @@ +{ + local container = $.core.v1.container, + local containerPort = $.core.v1.containerPort, + local deployment = $.apps.v1.deployment, + local service = $.core.v1.service, + local servicePort = $.core.v1.servicePort, + + query_tee_args:: { + 'log.level': 'debug', + 'backend.endpoints': std.join(',', $._config.query_tee_backend_endpoints), + 'backend.preferred': $._config.query_tee_backend_preferred, + }, + + query_tee_container:: if !($._config.query_tee_enabled) then {} else + container.new('query-tee', $._images.query_tee) + + container.withPorts([ + containerPort.newNamed(name='http', containerPort=80), + containerPort.newNamed(name='http-metrics', containerPort=9900), + ]) + + container.withArgsMixin($.util.mapToFlags($.query_tee_args)) + + $.util.resourcesRequests('1', '512Mi') + + $.jaeger_mixin, + + query_tee_deployment: if !($._config.query_tee_enabled) then {} else + deployment.new('query-tee', 2, [$.query_tee_container]), + + query_tee_service: if !($._config.query_tee_enabled) then {} else + service.new('query-tee', { name: 'query-tee' }, [ + servicePort.newNamed('http', 80, 80) + + servicePort.withNodePort($._config.query_tee_node_port), + ]) + + service.mixin.spec.withType('NodePort'), +} diff --git a/operations/mimir/ruler.libsonnet b/operations/mimir/ruler.libsonnet new file mode 100644 index 0000000000..a7df54fd54 --- /dev/null +++ b/operations/mimir/ruler.libsonnet @@ -0,0 +1,68 @@ +{ + local container = $.core.v1.container, + + ruler_args:: + $._config.grpcConfig + + $._config.ringConfig + + $._config.storeConfig + + $._config.storageConfig + + $._config.blocksStorageConfig + + $._config.queryConfig + + $._config.queryEngineConfig + + $._config.distributorConfig + + $._config.rulerClientConfig + + $._config.rulerLimitsConfig + + { + target: 'ruler', + // Alertmanager configs + 'ruler.alertmanager-url': 'http://alertmanager.%s.svc.cluster.local/alertmanager' % $._config.namespace, + 'experimental.ruler.enable-api': true, + 'api.response-compression-enabled': true, + + // Ring Configs + 'ruler.enable-sharding': true, + 'ruler.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + + // Limits + 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, + 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, + + // Storage + 'querier.second-store-engine': $._config.querier_second_storage_engine, + + // Do not extend the 
replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" + // is set to false. + 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, + }, + + ruler_container:: + if $._config.ruler_enabled then + container.new('ruler', $._images.ruler) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.ruler_args)) + + $.util.resourcesRequests('1', '6Gi') + + $.util.resourcesLimits('16', '16Gi') + + $.util.readinessProbe + + $.jaeger_mixin + else {}, + + local deployment = $.apps.v1.deployment, + + ruler_deployment: + if $._config.ruler_enabled then + deployment.new('ruler', 2, [$.ruler_container]) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + + (if $._config.cortex_ruler_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + $.storage_config_mixin + else {}, + + local service = $.core.v1.service, + + ruler_service: + if $._config.ruler_enabled then + $.util.serviceFor($.ruler_deployment) + else {}, +} diff --git a/operations/mimir/table-manager.libsonnet b/operations/mimir/table-manager.libsonnet new file mode 100644 index 0000000000..90cb733c33 --- /dev/null +++ b/operations/mimir/table-manager.libsonnet @@ -0,0 +1,44 @@ +{ + local container = $.core.v1.container, + + table_manager_args:: + $._config.storageConfig + { + target: 'table-manager', + + // Rate limit Bigtable Admin calls. Google seems to limit to ~100QPS, + // and given 2yrs worth of tables (~100) a sync will take 20s. This + // allows you to run up to 20 independent Cortex clusters on the same + // Google project before running into issues.
+ 'bigtable.grpc-client-rate-limit': 5.0, + 'bigtable.grpc-client-rate-limit-burst': 5, + 'bigtable.backoff-on-ratelimits': true, + 'bigtable.table-cache.enabled': true, + 'table-manager.poll-interval': '10m', + 'table-manager.periodic-table.grace-period': '3h', + }, + + table_manager_container:: + if $._config.table_manager_enabled then + container.new('table-manager', $._images.tableManager) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.table_manager_args)) + + $.util.resourcesRequests('100m', '100Mi') + + $.util.resourcesLimits('200m', '200Mi') + + $.util.readinessProbe + + $.jaeger_mixin + else {}, + + local deployment = $.apps.v1.deployment, + + table_manager_deployment: + if $._config.table_manager_enabled then + deployment.new('table-manager', 1, [$.table_manager_container]) + + $.storage_config_mixin + else {}, + + table_manager_service: + if $._config.table_manager_enabled then + $.util.serviceFor($.table_manager_deployment) + else {}, +} diff --git a/operations/mimir/test-exporter.libsonnet b/operations/mimir/test-exporter.libsonnet new file mode 100644 index 0000000000..9d69abee68 --- /dev/null +++ b/operations/mimir/test-exporter.libsonnet @@ -0,0 +1,40 @@ +{ + local container = $.core.v1.container, + local containerPort = $.core.v1.containerPort, + + test_exporter_args:: { + 'user-id': $._config.test_exporter_user_id, + 'prometheus-address': 'http://query-frontend.%(namespace)s.svc.cluster.local/prometheus' % $._config, + 'test-query-start': $._config.test_exporter_start_time, + 'extra-selectors': 'job="%(namespace)s/test-exporter"' % $._config, + 'test-query-min-size': '1m', + 'test-epsilion': '0.05', // There is enough jitter in our system for scrapes to be off by 5%. + }, + + test_exporter_container:: + if !($._config.test_exporter_enabled) + then {} + else + container.new('test-exporter', $._images.testExporter) + + container.withPorts($.util.defaultPorts) + + container.withArgsMixin($.util.mapToFlags($.test_exporter_args)) + + $.util.resourcesRequests('100m', '100Mi') + + $.util.resourcesLimits('100m', '100Mi') + + $.jaeger_mixin, + + local deployment = $.apps.v1.deployment, + + test_exporter_deployment: + if !($._config.test_exporter_enabled) + then {} + else + deployment.new('test-exporter', 1, [ + $.test_exporter_container, + ]), + + test_exporter_service: + if !($._config.test_exporter_enabled) + then {} + else + $.util.serviceFor($.test_exporter_deployment), +} diff --git a/operations/mimir/tsdb.libsonnet b/operations/mimir/tsdb.libsonnet new file mode 100644 index 0000000000..1590209935 --- /dev/null +++ b/operations/mimir/tsdb.libsonnet @@ -0,0 +1,290 @@ +{ + local pvc = $.core.v1.persistentVolumeClaim, + local volumeMount = $.core.v1.volumeMount, + local container = $.core.v1.container, + local statefulSet = $.apps.v1.statefulSet, + local service = $.core.v1.service, + + _config+:: { + // Enforce blocks storage + storage_backend: 'none', + storage_engine: 'blocks', + + // Allow to configure the ingester disk. + cortex_ingester_data_disk_size: '100Gi', + cortex_ingester_data_disk_class: 'fast', + + // Allow to configure the store-gateway disk. + cortex_store_gateway_data_disk_size: '50Gi', + cortex_store_gateway_data_disk_class: 'standard', + + // Allow to configure the compactor disk. + cortex_compactor_data_disk_size: '250Gi', + cortex_compactor_data_disk_class: 'standard', + + // Allow to fine tune compactor. 
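+    // For example, an environment with heavier compaction load might override
+    // these from its own config (hypothetical values):
+    //
+    //   _config+:: { cortex_compactor_max_concurrency: 2 },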
+ cortex_compactor_max_concurrency: 1, + // While this is the default value, we want to pass the same to the -blocks-storage.bucket-store.sync-interval + cortex_compactor_cleanup_interval: '15m', + + // Enable use of bucket index by querier, ruler and store-gateway. + // Bucket index is generated by compactor from Cortex 1.7, there is no flag required to enable this on compactor. + cortex_bucket_index_enabled: false, + }, + + blocks_chunks_caching_config:: + ( + if $._config.memcached_index_queries_enabled then { + 'blocks-storage.bucket-store.index-cache.backend': 'memcached', + 'blocks-storage.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', + } else {} + ) + ( + if $._config.memcached_chunks_enabled then { + 'blocks-storage.bucket-store.chunks-cache.backend': 'memcached', + 'blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', + } else {} + ), + + blocks_metadata_caching_config:: if $._config.memcached_metadata_enabled then { + 'blocks-storage.bucket-store.metadata-cache.backend': 'memcached', + 'blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', + } else {}, + + bucket_index_config:: if $._config.cortex_bucket_index_enabled then { + 'blocks-storage.bucket-store.bucket-index.enabled': true, + + // Bucket index is updated by compactor on each cleanup cycle. + 'blocks-storage.bucket-store.sync-interval': $._config.cortex_compactor_cleanup_interval, + } else {}, + + querier_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config, + ruler_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config, + + // The ingesters should persist TSDB blocks and WAL on a persistent + // volume in order to be crash resilient. 
+ local ingester_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_ingester_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_ingester_data_disk_class) + + pvc.mixin.metadata.withName('ingester-data'), + + ingester_deployment: {}, + + ingester_args+:: { + 'blocks-storage.tsdb.dir': '/data/tsdb', + 'blocks-storage.tsdb.block-ranges-period': '2h', + 'blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. + 'blocks-storage.tsdb.ship-interval': '1m', + + // Disable TSDB blocks transfer because of persistent volumes. + 'ingester.max-transfer-retries': 0, + 'ingester.join-after': '0s', + + // Persist ring tokens so that when the ingester is restarted + // it will pick up the same tokens. + 'ingester.tokens-file-path': '/data/tokens', + }, + + newIngesterStatefulSet(name, container, with_anti_affinity=true):: + statefulSet.new(name, 3, [ + container + $.core.v1.container.withVolumeMountsMixin([ + volumeMount.new('ingester-data', '/data'), + ]), + ], ingester_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name } + $.ingester_deployment_labels) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + // When the ingester needs to flush blocks to the storage, it may take quite a lot of time. + // For this reason, we grant a high termination grace period (20 minutes). + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(1200) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + $.util.podPriority('high') + + // Parallelly scale up/down ingester instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + (if with_anti_affinity then $.util.antiAffinity else {}), + + ingester_statefulset: self.newIngesterStatefulSet('ingester', $.ingester_container), + + ingester_service: + $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), + + // The compactor runs a statefulset with a single replica, because + // it does not support horizontal scalability yet. + local compactor_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_compactor_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_compactor_data_disk_class) + + pvc.mixin.metadata.withName('compactor-data'), + + compactor_args:: + $._config.grpcConfig + + $._config.storageConfig + + $._config.blocksStorageConfig + + $._config.compactorLimitsConfig + + { + target: 'compactor', + + // Compactor config. + 'compactor.block-ranges': '2h,12h,24h', + 'compactor.data-dir': '/data', + 'compactor.compaction-interval': '30m', + 'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency, + 'compactor.cleanup-interval': $._config.cortex_compactor_cleanup_interval, + + // Enable sharding.
+ 'compactor.sharding-enabled': true, + 'compactor.ring.store': 'consul', + 'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'compactor.ring.prefix': '', + + // Limits config. + 'runtime-config.file': '/etc/cortex/overrides.yaml', + }, + + compactor_ports:: $.util.defaultPorts, + + compactor_container:: + container.new('compactor', $._images.compactor) + + container.withPorts($.compactor_ports) + + container.withArgsMixin($.util.mapToFlags($.compactor_args)) + + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + + // Do not limit compactor CPU and request enough cores to honor configured max concurrency. + $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') + + $.util.resourcesLimits(null, '6Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + newCompactorStatefulSet(name, container):: + statefulSet.new(name, 1, [container], compactor_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + + // Parallelly scale up/down compactor instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), + + compactor_statefulset: + $.newCompactorStatefulSet('compactor', $.compactor_container), + + // The store-gateway runs a statefulset. + local store_gateway_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_store_gateway_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_store_gateway_data_disk_class) + + pvc.mixin.metadata.withName('store-gateway-data'), + + store_gateway_args:: + $._config.grpcConfig + + $._config.storageConfig + + $._config.blocksStorageConfig + + $._config.queryBlocksStorageConfig + + { + target: 'store-gateway', + 'runtime-config.file': '/etc/cortex/overrides.yaml', + + // Persist ring tokens so that when the store-gateway is restarted + // it will pick up the same tokens. + 'store-gateway.sharding-ring.tokens-file-path': '/data/tokens', + + // Block index-headers are pre-downloaded but lazily mmapped and loaded at query time. + 'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', + 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', + + 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, + + // We should keep a number of idle connections equal to the max "get" concurrency, + // in order to avoid re-opening connections continuously (this would be slower + // and fill up the conntrack table too). + // + // The downside of this approach is that we'll end up with a higher number of + // active connections to memcached, so we have to make sure the connection limit + // set in memcached is high enough.
+ 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], + } + + $.blocks_chunks_caching_config + + $.blocks_metadata_caching_config + + $.bucket_index_config, + + store_gateway_ports:: $.util.defaultPorts, + + store_gateway_container:: + container.new('store-gateway', $._images.store_gateway) + + container.withPorts($.store_gateway_ports) + + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '18Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + newStoreGatewayStatefulSet(name, container):: + statefulSet.new(name, 3, [container], store_gateway_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120) + + // Parallelly scale up/down store-gateway instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), + + store_gateway_statefulset: self.newStoreGatewayStatefulSet('store-gateway', $.store_gateway_container), + + store_gateway_service: + $.util.serviceFor($.store_gateway_statefulset), + + local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, + + store_gateway_pdb: + podDisruptionBudget.new() + + podDisruptionBudget.mixin.metadata.withName('store-gateway-pdb') + + podDisruptionBudget.mixin.metadata.withLabels({ name: 'store-gateway-pdb' }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + + // To avoid any disruption in the read path we need at least 1 replica of each + // block available, so the disruption budget depends on the blocks replication factor. + podDisruptionBudget.mixin.spec.withMaxUnavailable(if $._config.store_gateway_replication_factor > 1 then $._config.store_gateway_replication_factor - 1 else 1), +}