Skip to content

Commit

Permalink
[Task Manager] Log at different levels based on the state (elastic#10…
Browse files Browse the repository at this point in the history
…1751) (elastic#102402)

* Log at different levels based on the state

* Fix types and add tests

* Remove unnecessary code

* Add more descriptive message

* Partially fix failing tests

* Move into separate function

* Get rid of customStatus in favor of moving the logging logic to a separate, mockable function

* Remove debug logging

* Do not log as an error if the stats are empty

* PR feedback

* Add docker whitelist

* alpha order

* English is hard

* Removing extra newline

* PR feedback around ignoring capacity estimation

* Move json utils
  • Loading branch information
chrisronline committed Jun 16, 2021
1 parent df5c56d commit b16513a
Show file tree
Hide file tree
Showing 17 changed files with 576 additions and 80 deletions.
3 changes: 3 additions & 0 deletions docs/settings/task-manager-settings.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ Task Manager runs background tasks by polling for work on an interval. You can
| `xpack.task_manager.max_workers`
| The maximum number of tasks that this Kibana instance will run simultaneously. Defaults to 10.
Starting in 8.0, it will not be possible to set the value greater than 100.

| `xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds`
| The number of seconds a task's start can be delayed before a warning is written to the server log. Defaults to 60.
|===

[float]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ kibana_vars=(
xpack.task_manager.monitored_aggregated_stats_refresh_rate
xpack.task_manager.monitored_stats_required_freshness
xpack.task_manager.monitored_stats_running_average_window
xpack.task_manager.monitored_stats_warn_delayed_task_start_in_seconds
xpack.task_manager.monitored_task_execution_thresholds
xpack.task_manager.poll_interval
xpack.task_manager.request_capacity
Expand Down
3 changes: 3 additions & 0 deletions x-pack/plugins/task_manager/server/config.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ describe('config validation', () => {
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
Expand Down Expand Up @@ -68,6 +69,7 @@ describe('config validation', () => {
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {},
"default": Object {
Expand Down Expand Up @@ -103,6 +105,7 @@ describe('config validation', () => {
"monitored_aggregated_stats_refresh_rate": 60000,
"monitored_stats_required_freshness": 4000,
"monitored_stats_running_average_window": 50,
"monitored_stats_warn_delayed_task_start_in_seconds": 60,
"monitored_task_execution_thresholds": Object {
"custom": Object {
"alerting:always-fires": Object {
Expand Down
5 changes: 5 additions & 0 deletions x-pack/plugins/task_manager/server/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export const DEFAULT_VERSION_CONFLICT_THRESHOLD = 80;
// Refresh aggregated monitored stats at a default rate of once a minute
export const DEFAULT_MONITORING_REFRESH_RATE = 60 * 1000;
export const DEFAULT_MONITORING_STATS_RUNNING_AVERGAE_WINDOW = 50;
export const DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS = 60;

export const taskExecutionFailureThresholdSchema = schema.object(
{
Expand Down Expand Up @@ -109,6 +110,10 @@ export const configSchema = schema.object(
defaultValue: {},
}),
}),
/* The number of seconds a task's start can be delayed before a warning is written to the server log */
monitored_stats_warn_delayed_task_start_in_seconds: schema.number({
defaultValue: DEFAULT_MONITORING_STATS_WARN_DELAYED_TASK_START_IN_SECONDS,
}),
},
{
validate: (config) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ describe('managed configuration', () => {
version_conflict_threshold: 80,
max_poll_inactivity_cycles: 10,
monitored_aggregated_stats_refresh_rate: 60000,
monitored_stats_warn_delayed_task_start_in_seconds: 60,
monitored_stats_required_freshness: 4000,
monitored_stats_running_average_window: 50,
request_capacity: 1000,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

/**
 * Builds a fresh Jest mock function to stand in for `calculateHealthStatus`.
 * A new mock is created on every call.
 */
function createCalculateHealthStatusMock() {
  return jest.fn();
}

/** Mock factory holder: call `create()` to obtain a new mock. */
export const calculateHealthStatusMock = {
  create: createCalculateHealthStatusMock,
};
79 changes: 79 additions & 0 deletions x-pack/plugins/task_manager/server/lib/calculate_health_status.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { isString } from 'lodash';
import { JsonValue } from '@kbn/common-utils';
import { HealthStatus, RawMonitoringStats } from '../monitoring';
import { TaskManagerConfig } from '../config';

/**
 * Derives the overall health status from the summarized monitoring stats.
 *
 * - `Error` when any stat reports `Error`, or when the "hot" or "cold"
 *   timestamps are older than their allowed freshness window.
 * - `Warning` when none of the above apply but any stat reports `Warning`.
 * - `OK` otherwise.
 *
 * @param summarizedStats The monitored stats to evaluate
 * @param config Task Manager configuration supplying the freshness thresholds
 * @returns The calculated health status
 */
export function calculateHealthStatus(
  summarizedStats: RawMonitoringStats,
  config: TaskManagerConfig
): HealthStatus {
  const now = Date.now();

  // "Hot" stats may be no staler than monitored_stats_required_freshness
  // (noted in the original as pollInterval + 1s buffer by default).
  const requiredHotStatsFreshness = config.monitored_stats_required_freshness;

  // "Cold" stats may be no staler than the configured refresh rate plus a 50% buffer.
  const requiredColdStatsFreshness = config.monitored_aggregated_stats_refresh_rate * 1.5;

  // An explicit error, or stats that are not fresh enough, yields a red status.
  if (
    hasStatus(summarizedStats.stats, HealthStatus.Error) ||
    hasExpiredHotTimestamps(summarizedStats, now, requiredHotStatsFreshness) ||
    hasExpiredColdTimestamps(summarizedStats, now, requiredColdStatsFreshness)
  ) {
    return HealthStatus.Error;
  }

  if (hasStatus(summarizedStats.stats, HealthStatus.Warning)) {
    return HealthStatus.Warning;
  }

  return HealthStatus.OK;
}

/**
 * Checks whether at least one of the given stats carries the given status.
 * Missing (nullish) stat entries never match.
 *
 * @param stats The collection of monitored stats to inspect
 * @param status The status to look for
 * @returns `true` if any stat's `status` strictly equals `status`
 */
function hasStatus(stats: RawMonitoringStats['stats'], status: HealthStatus): boolean {
  return Object.values(stats).some((stat) => stat?.status === status);
}

/**
 * Determines whether the "hot" stats are stale. When they are, the _health api
 * should return a Red status.
 *
 * The age is measured from the older of the stats' `last_update` and the
 * runtime polling `last_successful_poll` timestamps.
 *
 * @param monitoringStats The monitored stats
 * @param now The time to compare against
 * @param requiredFreshness How fresh should these stats be
 * @returns `true` if the oldest hot timestamp is more than `requiredFreshness` behind `now`
 */
function hasExpiredHotTimestamps(
  monitoringStats: RawMonitoringStats,
  now: number,
  requiredFreshness: number
): boolean {
  const lastUpdate = monitoringStats.last_update;
  const lastSuccessfulPoll = monitoringStats.stats.runtime?.value.polling.last_successful_poll;
  const oldestHotTimestamp = getOldestTimestamp(lastUpdate, lastSuccessfulPoll);
  const age = now - oldestHotTimestamp;
  return age > requiredFreshness;
}

/**
 * Determines whether the "cold" stats are stale, judged by the workload
 * stat's timestamp.
 *
 * @param monitoringStats The monitored stats
 * @param now The time to compare against
 * @param requiredFreshness How fresh should these stats be
 * @returns `true` if the workload timestamp is more than `requiredFreshness` behind `now`
 */
function hasExpiredColdTimestamps(
  monitoringStats: RawMonitoringStats,
  now: number,
  requiredFreshness: number
): boolean {
  const workloadTimestamp = getOldestTimestamp(monitoringStats.stats.workload?.timestamp);
  const age = now - workloadTimestamp;
  return age > requiredFreshness;
}

/**
 * Finds the earliest of the supplied timestamps.
 *
 * Only string values that `Date.parse` can interpret are considered; anything
 * else is ignored.
 *
 * @param timestamps Candidate timestamp values
 * @returns The smallest parsed epoch-millisecond value, or `0` when no
 *          candidate could be parsed
 */
function getOldestTimestamp(...timestamps: Array<JsonValue | undefined>): number {
  let oldest = 0;
  let foundValid = false;

  for (const candidate of timestamps) {
    const parsed = isString(candidate) ? Date.parse(candidate) : NaN;
    if (isNaN(parsed)) {
      continue;
    }
    oldest = foundValid ? Math.min(oldest, parsed) : parsed;
    foundValid = true;
  }

  return oldest;
}
14 changes: 14 additions & 0 deletions x-pack/plugins/task_manager/server/lib/log_health_metrics.mock.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

/**
 * Builds a fresh Jest mock function to stand in for `logHealthMetrics`.
 * A new mock is created on every call.
 */
function createLogHealthMetricsMock() {
  return jest.fn();
}

/** Mock factory holder: call `create()` to obtain a new mock. */
export const logHealthMetricsMock = {
  create: createLogHealthMetricsMock,
};
Loading

0 comments on commit b16513a

Please sign in to comment.