Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Alerting] Add telemetry for number of scheduled actions during rule execution #128891

Merged
merged 8 commits into from
Apr 1, 2022
201 changes: 201 additions & 0 deletions x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
getExecutionsPerDayCount,
getExecutionTimeoutsPerDayCount,
getFailedAndUnrecognizedTasksPerDay,
parsePercentileAggsByRuleType,
} from './alerting_telemetry';

describe('alerting telemetry', () => {
Expand Down Expand Up @@ -181,6 +182,41 @@ Object {
avgTotalSearchDuration: {
value: 30.642857142857142,
},
percentileScheduledActions: {
values: {
'50.0': 4.0,
'90.0': 26.0,
'99.0': 26.0,
},
},
aggsByType: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{
key: '.index-threshold',
doc_count: 149,
percentileScheduledActions: {
values: {
'50.0': 4.0,
'90.0': 26.0,
'99.0': 26.0,
},
},
},
{
key: 'logs.alert.document.count',
doc_count: 1,
percentileScheduledActions: {
values: {
'50.0': 10.0,
'90.0': 10.0,
'99.0': 10.0,
},
},
},
],
},
},
hits: {
hits: [],
Expand Down Expand Up @@ -228,6 +264,25 @@ Object {
},
countTotal: 4,
countTotalFailures: 4,
scheduledActionsPercentiles: {
p50: 4,
p90: 26,
p99: 26,
},
scheduledActionsPercentilesByType: {
p50: {
'__index-threshold': 4,
logs__alert__document__count: 10,
},
p90: {
'__index-threshold': 26,
logs__alert__document__count: 10,
},
p99: {
'__index-threshold': 26,
logs__alert__document__count: 10,
},
},
});
});

Expand Down Expand Up @@ -316,4 +371,150 @@ Object {
countTotal: 5,
});
});

test('parsePercentileAggsByRuleType', () => {
  // Shapes a single `terms` bucket with its nested percentiles sub-aggregation.
  const toBucket = (key: string, docCount: number, values: Record<string, number | null>) => ({
    key,
    doc_count: docCount,
    percentileScheduledActions: { values },
  });

  const aggsByType = {
    doc_count_error_upper_bound: 0,
    sum_other_doc_count: 0,
    buckets: [
      toBucket('.index-threshold', 149, { '50.0': 4, '90.0': 26, '99.0': 26 }),
      toBucket('logs.alert.document.count', 1, { '50.0': 10, '90.0': 10, '99.0': 10 }),
      // Null percentile values are expected to be reported as 0.
      toBucket('document.test.', 1, { '50.0': null, '90.0': null, '99.0': null }),
    ],
  };

  const parsed = parsePercentileAggsByRuleType(
    aggsByType.buckets,
    'percentileScheduledActions.values'
  );

  const expected = {
    p50: {
      '__index-threshold': 4,
      document__test__: 0,
      logs__alert__document__count: 10,
    },
    p90: {
      '__index-threshold': 26,
      document__test__: 0,
      logs__alert__document__count: 10,
    },
    p99: {
      '__index-threshold': 26,
      document__test__: 0,
      logs__alert__document__count: 10,
    },
  };
  expect(parsed).toEqual(expected);
});

test('parsePercentileAggsByRuleType handles unknown path', () => {
  // Shapes a single `terms` bucket with its nested percentiles sub-aggregation.
  const toBucket = (key: string, docCount: number, values: Record<string, number>) => ({
    key,
    doc_count: docCount,
    percentileScheduledActions: { values },
  });

  const aggsByType = {
    doc_count_error_upper_bound: 0,
    sum_other_doc_count: 0,
    buckets: [
      toBucket('.index-threshold', 149, { '50.0': 4, '90.0': 26, '99.0': 26 }),
      toBucket('logs.alert.document.count', 1, { '50.0': 10, '90.0': 10, '99.0': 10 }),
    ],
  };

  // 'foo.values' does not exist on any bucket, so every percentile map stays empty.
  const parsed = parsePercentileAggsByRuleType(aggsByType.buckets, 'foo.values');

  expect(parsed).toEqual({
    p50: {},
    p90: {},
    p99: {},
  });
});

test('parsePercentileAggsByRuleType handles unrecognized percentiles', () => {
  // Shapes a single `terms` bucket with its nested percentiles sub-aggregation.
  const toBucket = (key: string, docCount: number, values: Record<string, number>) => ({
    key,
    doc_count: docCount,
    percentileScheduledActions: { values },
  });

  const aggsByType = {
    doc_count_error_upper_bound: 0,
    sum_other_doc_count: 0,
    buckets: [
      // The '75.0' entries have no counterpart in the expected output below.
      toBucket('.index-threshold', 149, { '50.0': 4, '75.0': 8, '90.0': 26, '99.0': 26 }),
      toBucket('logs.alert.document.count', 1, {
        '50.0': 10,
        '75.0': 10,
        '90.0': 10,
        '99.0': 10,
      }),
    ],
  };

  const parsed = parsePercentileAggsByRuleType(
    aggsByType.buckets,
    'percentileScheduledActions.values'
  );

  const expected = {
    p50: {
      '__index-threshold': 4,
      logs__alert__document__count: 10,
    },
    p90: {
      '__index-threshold': 26,
      logs__alert__document__count: 10,
    },
    p99: {
      '__index-threshold': 26,
      logs__alert__document__count: 10,
    },
  };
  expect(parsed).toEqual(expected);
});
});
76 changes: 76 additions & 0 deletions x-pack/plugins/alerting/server/usage/alerting_telemetry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,17 @@
* 2.0.
*/

import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient } from 'kibana/server';
import { get, merge } from 'lodash';
import { AlertingUsage } from './types';
import { NUM_ALERTING_RULE_TYPES } from './alerting_usage_collector';

// Maps the percentile keys found in the aggregation `values` object to telemetry field names:
// '50.0' -> 'p50', '90.0' -> 'p90', '99.0' -> 'p99'.
const percentileFieldNameMapping: Record<string, string> = Object.fromEntries(
  [50, 90, 99].map((percent): [string, string] => [`${percent}.0`, `p${percent}`])
);

const ruleTypeMetric = {
scripted_metric: {
Expand Down Expand Up @@ -38,6 +47,13 @@ const ruleTypeMetric = {
},
};

// `percentiles` aggregation definition over the number-of-scheduled-actions execution metric,
// requesting the 50th, 90th and 99th percentiles. It is referenced both as a top-level
// aggregation and as a sub-aggregation of the per-rule-type `terms` aggregation.
const scheduledActionsPercentilesAgg = {
percentiles: {
field: 'kibana.alert.rule.execution.metrics.number_of_scheduled_actions',
percents: [50, 90, 99],
},
};

const ruleTypeExecutionsWithDurationMetric = {
scripted_metric: {
init_script:
Expand Down Expand Up @@ -409,6 +425,16 @@ export async function getExecutionsPerDayCount(
avgTotalSearchDuration: {
avg: { field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms' },
},
percentileScheduledActions: scheduledActionsPercentilesAgg,
aggsByType: {
terms: {
field: 'rule.category',
size: NUM_ALERTING_RULE_TYPES,
},
aggs: {
percentileScheduledActions: scheduledActionsPercentilesAgg,
},
},
},
},
});
Expand Down Expand Up @@ -439,6 +465,14 @@ export async function getExecutionsPerDayCount(
searchResult.aggregations.avgTotalSearchDuration.value
);

const aggsScheduledActionsPercentiles =
// @ts-expect-error aggregation type is not specified
searchResult.aggregations.percentileScheduledActions.values;

const aggsByTypeBuckets =
// @ts-expect-error aggregation type is not specified
searchResult.aggregations.aggsByType.buckets;

const executionFailuresAggregations = searchResult.aggregations as {
failuresByReason: { value: { reasons: Record<string, Record<string, string>> } };
};
Expand Down Expand Up @@ -537,6 +571,21 @@ export async function getExecutionsPerDayCount(
}),
{}
),
scheduledActionsPercentiles: Object.keys(aggsScheduledActionsPercentiles).reduce(
// ES DSL aggregations are returned as `any` by esClient.search
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(acc: any, curr: string) => ({
...acc,
...(percentileFieldNameMapping[curr]
? { [percentileFieldNameMapping[curr]]: aggsScheduledActionsPercentiles[curr] }
: {}),
}),
{}
),
scheduledActionsPercentilesByType: parsePercentileAggsByRuleType(
aggsByTypeBuckets,
'percentileScheduledActions.values'
),
};
}

Expand Down Expand Up @@ -701,3 +750,30 @@ function replaceDotSymbolsInRuleTypeIds(ruleTypeIdObj: Record<string, string>) {
{}
);
}

/**
 * Re-shapes per-rule-type percentile aggregation buckets into a map keyed first by percentile
 * field name (`p50`, `p90`, `p99`) and then by rule type id.
 *
 * @param aggsByType - `terms` aggregation buckets, one per rule type; a nullish value is treated
 *   as an empty list.
 * @param path - lodash-style path, relative to each bucket, of the percentile `values` object
 *   (for example `'percentileScheduledActions.values'`).
 * @returns An object that always has `p50`, `p90` and `p99` keys. Each one maps the bucket key,
 *   passed through `replaceDotSymbols`, to that percentile's value; a nullish percentile value is
 *   reported as `0`. Percentiles missing from `percentileFieldNameMapping` are skipped, and
 *   buckets with nothing at `path` contribute nothing.
 */
export function parsePercentileAggsByRuleType(
  aggsByType: estypes.AggregationsStringTermsBucketKeys[],
  path: string
) {
  return (aggsByType ?? []).reduce(
    (acc, curr) => {
      // `get` only falls back to its default for `undefined`, so coalesce here to also cover an
      // explicit `null` at `path`, which would otherwise make the iteration below throw.
      const percentiles = get(curr, path) ?? {};
      const parsed: Record<string, Record<string, number>> = {};
      for (const [percentile, value] of Object.entries(percentiles)) {
        const fieldName = percentileFieldNameMapping[percentile];
        if (fieldName) {
          parsed[fieldName] = { [replaceDotSymbols(curr.key)]: value ?? 0 };
        }
      }
      // `merge` mutates `acc`; that is safe because the seed object below is created per call.
      return merge(acc, parsed);
    },
    { p50: {}, p90: {}, p99: {} }
  );
}
28 changes: 28 additions & 0 deletions x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ const byTypeSchema: MakeSchemaFrom<AlertingUsage>['count_by_type'] = {
xpack__ml__anomaly_detection_jobs_health: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
};

// Number of keys declared in `byTypeSchema`; exported so the telemetry queries can use it as the
// `size` of their per-rule-type `terms` aggregation.
export const NUM_ALERTING_RULE_TYPES = Object.keys(byTypeSchema).length;

const byReasonSchema: MakeSchemaFrom<AlertingUsage>['count_rules_executions_failured_by_reason_per_day'] =
{
// TODO: Find out an automated way to populate the keys or reformat these into an array (and change the Remote Telemetry indexer accordingly)
Expand All @@ -66,6 +68,20 @@ const byReasonSchema: MakeSchemaFrom<AlertingUsage>['count_rules_executions_fail
unknown: { type: 'long' },
};

// Telemetry schema for `percentile_num_scheduled_actions_per_day`: one `long` field per reported
// percentile (p50, p90, p99).
const byPercentileSchema: MakeSchemaFrom<AlertingUsage>['percentile_num_scheduled_actions_per_day'] =
{
p50: { type: 'long' },
p90: { type: 'long' },
p99: { type: 'long' },
};

// Telemetry schema for `percentile_num_scheduled_actions_by_type_per_day`: each reported
// percentile (p50, p90, p99) reuses `byTypeSchema` for its nested fields.
const byPercentileSchemaByType: MakeSchemaFrom<AlertingUsage>['percentile_num_scheduled_actions_by_type_per_day'] =
{
p50: byTypeSchema,
p90: byTypeSchema,
p99: byTypeSchema,
};

const byReasonSchemaByType: MakeSchemaFrom<AlertingUsage>['count_rules_executions_failured_by_reason_by_type_per_day'] =
{
// TODO: Find out an automated way to populate the keys or reformat these into an array (and change the Remote Telemetry indexer accordingly)
Expand Down Expand Up @@ -160,6 +176,16 @@ export function createAlertingUsageCollector(
avg_es_search_duration_by_type_per_day: {},
avg_total_search_duration_per_day: 0,
avg_total_search_duration_by_type_per_day: {},
percentile_num_scheduled_actions_per_day: {
p50: 0,
p90: 0,
p99: 0,
},
percentile_num_scheduled_actions_by_type_per_day: {
p50: {},
p90: {},
p99: {},
},
};
}
},
Expand Down Expand Up @@ -211,6 +237,8 @@ export function createAlertingUsageCollector(
avg_es_search_duration_by_type_per_day: byTypeSchema,
avg_total_search_duration_per_day: { type: 'long' },
avg_total_search_duration_by_type_per_day: byTypeSchema,
percentile_num_scheduled_actions_per_day: byPercentileSchema,
percentile_num_scheduled_actions_by_type_per_day: byPercentileSchemaByType,
},
});
}
Expand Down
Loading