diff --git a/CHANGELOG.md b/CHANGELOG.md index b0d814ec..a3bd9325 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ * [CHANGE] Removed `CortexQuerierCapacityFull` alert. #342 * [CHANGE] Changes blocks storage alerts to group metrics by the configured `cluster_labels` (supporting the deprecated `alert_aggregation_labels`). #351 * [CHANGE] Increased `CortexIngesterReachingSeriesLimit` critical alert threshold from 80% to 85%. #363 +* [CHANGE] Decreased `-server.grpc-max-concurrent-streams` from 100k to 10k. #369 +* [CHANGE] Decreased blocks storage ingesters graceful termination period from 80m to 20m. #369 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 * [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. #328 * [ENHANCEMENT] Ruler dashboard: added object storage metrics. #354 @@ -38,11 +40,13 @@ * "Tenant Configuration Sync" row - information about the configuration sync procedure. * "Sharding Initial State Sync" row - information about the initial state sync procedure when sharding is enabled. * "Sharding Runtime State Sync" row - information about various state operations which occur when sharding is enabled (replication, fetch, marge, persist). +* [ENHANCEMENT] Added 256MB memory ballast to querier. #369 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. 
#335 * [BUGFIX] Fixed scaling dashboard to correctly work when a Cortex service deployment spans across multiple zones (a zone is expected to have the `zone-[a-z]` suffix). #365 * [BUGFIX] Fixed rollout progress dashboard to correctly work when a Cortex service deployment spans across multiple zones (a zone is expected to have the `zone-[a-z]` suffix). #366 +* [BUGFIX] Fixed `-distributor.extend-writes` setting on ruler when `unregister_ingesters_on_shutdown` is disabled. #369 ## 1.9.0 / 2021-05-18 diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet index 34b4d987..e0753d84 100644 --- a/cortex/ingester.libsonnet +++ b/cortex/ingester.libsonnet @@ -31,7 +31,7 @@ 'ingester.max-series-per-query': $._config.limits.max_series_per_query, 'ingester.max-samples-per-query': $._config.limits.max_samples_per_query, 'runtime-config.file': '/etc/cortex/overrides.yaml', - 'server.grpc-max-concurrent-streams': 100000, + 'server.grpc-max-concurrent-streams': 10000, 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, } + ( diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet index 5dc3c834..574358d8 100644 --- a/cortex/querier.libsonnet +++ b/cortex/querier.libsonnet @@ -26,6 +26,10 @@ 'querier.second-store-engine': $._config.querier_second_storage_engine, + // We request high memory but the Go heap is typically very low (< 100MB) and this causes + // the GC to trigger continuously. Setting a ballast of 256MB reduces GC. + 'mem-ballast-size-bytes': 1 << 28, // 256M + 'log.level': 'debug', }, diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet index 73029a20..dfb5727e 100644 --- a/cortex/ruler.libsonnet +++ b/cortex/ruler.libsonnet @@ -29,6 +29,10 @@ // Storage 'querier.second-store-engine': $._config.querier_second_storage_engine, + + // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" + // is set to false. 
+ 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, }, ruler_container:: diff --git a/cortex/tsdb.libsonnet b/cortex/tsdb.libsonnet index 5c69bf00..1c77abd9 100644 --- a/cortex/tsdb.libsonnet +++ b/cortex/tsdb.libsonnet @@ -115,7 +115,7 @@ statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + // When the ingester needs to flush blocks to the storage, it may take quite a lot of time. - // For this reason, we grant an high termination period (80 minutes). - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + // For this reason, we grant a high termination period (20 minutes). + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(1200) + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + $.util.podPriority('high') +