Skip to content

Commit

Permalink
[Alerting] Add telemetry for number of scheduled actions during rule …
Browse files Browse the repository at this point in the history
…execution (#128891) (#129253)

* Adding telemetry for number of scheduled actions

* Adding percentile by type types

* Parsing percentiles by rule type and adding tests

* Adding functional tests

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
(cherry picked from commit e36a08c)

Co-authored-by: Ying Mao <ying.mao@elastic.co>
  • Loading branch information
kibanamachine and ymao1 authored Apr 1, 2022
1 parent 7254eb1 commit ee581d0
Show file tree
Hide file tree
Showing 7 changed files with 740 additions and 20 deletions.
201 changes: 201 additions & 0 deletions x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
getExecutionsPerDayCount,
getExecutionTimeoutsPerDayCount,
getFailedAndUnrecognizedTasksPerDay,
parsePercentileAggsByRuleType,
} from './alerting_telemetry';

describe('alerting telemetry', () => {
Expand Down Expand Up @@ -181,6 +182,41 @@ Object {
avgTotalSearchDuration: {
value: 30.642857142857142,
},
percentileScheduledActions: {
values: {
'50.0': 4.0,
'90.0': 26.0,
'99.0': 26.0,
},
},
aggsByType: {
doc_count_error_upper_bound: 0,
sum_other_doc_count: 0,
buckets: [
{
key: '.index-threshold',
doc_count: 149,
percentileScheduledActions: {
values: {
'50.0': 4.0,
'90.0': 26.0,
'99.0': 26.0,
},
},
},
{
key: 'logs.alert.document.count',
doc_count: 1,
percentileScheduledActions: {
values: {
'50.0': 10.0,
'90.0': 10.0,
'99.0': 10.0,
},
},
},
],
},
},
hits: {
hits: [],
Expand Down Expand Up @@ -228,6 +264,25 @@ Object {
},
countTotal: 4,
countTotalFailures: 4,
scheduledActionsPercentiles: {
p50: 4,
p90: 26,
p99: 26,
},
scheduledActionsPercentilesByType: {
p50: {
'__index-threshold': 4,
logs__alert__document__count: 10,
},
p90: {
'__index-threshold': 26,
logs__alert__document__count: 10,
},
p99: {
'__index-threshold': 26,
logs__alert__document__count: 10,
},
},
});
});

Expand Down Expand Up @@ -316,4 +371,150 @@ Object {
countTotal: 5,
});
});

test('parsePercentileAggsByRuleType', () => {
  // Builds a single terms-aggregation bucket carrying the percentile values for one rule type.
  const bucket = (key: string, docCount: number, values: Record<string, number | null>) => ({
    key,
    doc_count: docCount,
    percentileScheduledActions: { values },
  });

  const termsAggregation = {
    doc_count_error_upper_bound: 0,
    sum_other_doc_count: 0,
    buckets: [
      bucket('.index-threshold', 149, { '50.0': 4, '90.0': 26, '99.0': 26 }),
      bucket('logs.alert.document.count', 1, { '50.0': 10, '90.0': 10, '99.0': 10 }),
      // Null percentile values are expected to come back as 0.
      bucket('document.test.', 1, { '50.0': null, '90.0': null, '99.0': null }),
    ],
  };

  const parsed = parsePercentileAggsByRuleType(
    termsAggregation.buckets,
    'percentileScheduledActions.values'
  );

  // Dots in rule type ids are expected to be replaced by double underscores.
  expect(parsed).toEqual({
    p50: {
      '__index-threshold': 4,
      document__test__: 0,
      logs__alert__document__count: 10,
    },
    p90: {
      '__index-threshold': 26,
      document__test__: 0,
      logs__alert__document__count: 10,
    },
    p99: {
      '__index-threshold': 26,
      document__test__: 0,
      logs__alert__document__count: 10,
    },
  });
});

test('parsePercentileAggsByRuleType handles unknown path', () => {
  // Builds a single terms-aggregation bucket carrying the percentile values for one rule type.
  const bucket = (key: string, docCount: number, values: Record<string, number>) => ({
    key,
    doc_count: docCount,
    percentileScheduledActions: { values },
  });

  const termsAggregation = {
    doc_count_error_upper_bound: 0,
    sum_other_doc_count: 0,
    buckets: [
      bucket('.index-threshold', 149, { '50.0': 4, '90.0': 26, '99.0': 26 }),
      bucket('logs.alert.document.count', 1, { '50.0': 10, '90.0': 10, '99.0': 10 }),
    ],
  };

  // 'foo.values' does not exist on any bucket, so every percentile map stays empty.
  const parsed = parsePercentileAggsByRuleType(termsAggregation.buckets, 'foo.values');

  expect(parsed).toEqual({
    p50: {},
    p90: {},
    p99: {},
  });
});

test('parsePercentileAggsByRuleType handles unrecognized percentiles', () => {
  // Builds a single terms-aggregation bucket carrying the percentile values for one rule type.
  const bucket = (key: string, docCount: number, values: Record<string, number>) => ({
    key,
    doc_count: docCount,
    percentileScheduledActions: { values },
  });

  const termsAggregation = {
    doc_count_error_upper_bound: 0,
    sum_other_doc_count: 0,
    buckets: [
      // '75.0' is not one of the reported percentiles and must not show up in the result.
      bucket('.index-threshold', 149, { '50.0': 4, '75.0': 8, '90.0': 26, '99.0': 26 }),
      bucket('logs.alert.document.count', 1, {
        '50.0': 10,
        '75.0': 10,
        '90.0': 10,
        '99.0': 10,
      }),
    ],
  };

  const parsed = parsePercentileAggsByRuleType(
    termsAggregation.buckets,
    'percentileScheduledActions.values'
  );

  expect(parsed).toEqual({
    p50: {
      '__index-threshold': 4,
      logs__alert__document__count: 10,
    },
    p90: {
      '__index-threshold': 26,
      logs__alert__document__count: 10,
    },
    p99: {
      '__index-threshold': 26,
      logs__alert__document__count: 10,
    },
  });
});
});
76 changes: 76 additions & 0 deletions x-pack/plugins/alerting/server/usage/alerting_telemetry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,17 @@
* 2.0.
*/

import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient } from 'kibana/server';
import { get, merge } from 'lodash';
import { AlertingUsage } from './types';
import { NUM_ALERTING_RULE_TYPES } from './alerting_usage_collector';

// Translates the keys of a percentiles aggregation result (e.g. '50.0') into the
// field names used in the telemetry payload (e.g. 'p50'). Percentile keys that
// have no entry here are dropped when the aggregation results are parsed.
const percentileFieldNameMapping: Record<string, string> = {
'50.0': 'p50',
'90.0': 'p90',
'99.0': 'p99',
};

const ruleTypeMetric = {
scripted_metric: {
Expand Down Expand Up @@ -38,6 +47,13 @@ const ruleTypeMetric = {
},
};

// Percentiles aggregation (50th, 90th and 99th) over the number of scheduled
// actions recorded per rule execution. The same definition is used both for the
// overall aggregation and for the per-rule-type sub-aggregation.
const scheduledActionsPercentilesAgg = {
percentiles: {
field: 'kibana.alert.rule.execution.metrics.number_of_scheduled_actions',
percents: [50, 90, 99],
},
};

const ruleTypeExecutionsWithDurationMetric = {
scripted_metric: {
init_script:
Expand Down Expand Up @@ -409,6 +425,16 @@ export async function getExecutionsPerDayCount(
avgTotalSearchDuration: {
avg: { field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms' },
},
percentileScheduledActions: scheduledActionsPercentilesAgg,
aggsByType: {
terms: {
field: 'rule.category',
size: NUM_ALERTING_RULE_TYPES,
},
aggs: {
percentileScheduledActions: scheduledActionsPercentilesAgg,
},
},
},
},
});
Expand Down Expand Up @@ -439,6 +465,14 @@ export async function getExecutionsPerDayCount(
searchResult.aggregations.avgTotalSearchDuration.value
);

const aggsScheduledActionsPercentiles =
// @ts-expect-error aggregation type is not specified
searchResult.aggregations.percentileScheduledActions.values;

const aggsByTypeBuckets =
// @ts-expect-error aggregation type is not specified
searchResult.aggregations.aggsByType.buckets;

const executionFailuresAggregations = searchResult.aggregations as {
failuresByReason: { value: { reasons: Record<string, Record<string, string>> } };
};
Expand Down Expand Up @@ -537,6 +571,21 @@ export async function getExecutionsPerDayCount(
}),
{}
),
scheduledActionsPercentiles: Object.keys(aggsScheduledActionsPercentiles).reduce(
// ES DSL aggregations are returned as `any` by esClient.search
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(acc: any, curr: string) => ({
...acc,
...(percentileFieldNameMapping[curr]
? { [percentileFieldNameMapping[curr]]: aggsScheduledActionsPercentiles[curr] }
: {}),
}),
{}
),
scheduledActionsPercentilesByType: parsePercentileAggsByRuleType(
aggsByTypeBuckets,
'percentileScheduledActions.values'
),
};
}

Expand Down Expand Up @@ -701,3 +750,30 @@ function replaceDotSymbolsInRuleTypeIds(ruleTypeIdObj: Record<string, string>) {
{}
);
}

/**
 * Pivots per-rule-type percentile aggregation buckets into an object keyed first
 * by percentile name (`p50`, `p90`, `p99`) and then by rule type id.
 *
 * @param aggsByType - Buckets of a terms aggregation; each bucket's `key` is used
 *   as the rule type id. A nullish value is treated as an empty list.
 * @param path - Path (lodash `get` syntax) inside each bucket to the object holding
 *   the percentile values, e.g. `'percentileScheduledActions.values'`.
 * @returns An object that always contains `p50`, `p90` and `p99`. Each entry maps
 *   the rule type id (passed through `replaceDotSymbols`) to its percentile value.
 *   Percentile keys without an entry in `percentileFieldNameMapping` are ignored,
 *   `null`/`undefined` percentile values are reported as `0`, and buckets where
 *   `path` does not resolve to an object contribute nothing.
 */
export function parsePercentileAggsByRuleType(
  aggsByType: estypes.AggregationsStringTermsBucketKeys[],
  path: string
) {
  return (aggsByType ?? []).reduce(
    (acc, curr) => {
      // lodash `get` only falls back to its default when the resolved value is
      // `undefined`; a `null` at `path` would otherwise reach Object.keys() and
      // throw. Treat anything that is not an object as "no percentiles".
      const resolved: unknown = get(curr, path);
      const percentiles: Record<string, number | null | undefined> =
        typeof resolved === 'object' && resolved !== null
          ? (resolved as Record<string, number | null | undefined>)
          : {};

      return merge(
        acc,
        Object.keys(percentiles).reduce((pacc, pcurr) => {
          const fieldName = percentileFieldNameMapping[pcurr];
          // Skip percentiles that have no telemetry field.
          if (!fieldName) {
            return pacc;
          }
          return {
            ...pacc,
            [fieldName]: {
              [replaceDotSymbols(curr.key)]: percentiles[pcurr] ?? 0,
            },
          };
        }, {})
      );
    },
    { p50: {}, p90: {}, p99: {} }
  );
}
28 changes: 28 additions & 0 deletions x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ const byTypeSchema: MakeSchemaFrom<AlertingUsage>['count_by_type'] = {
xpack__ml__anomaly_detection_jobs_health: { type: 'long' }, // eslint-disable-line @typescript-eslint/naming-convention
};

// Number of rule type keys declared in `byTypeSchema`. Exported so the telemetry
// queries can use it as the `size` of their per-rule-type terms aggregation.
export const NUM_ALERTING_RULE_TYPES = Object.keys(byTypeSchema).length;

const byReasonSchema: MakeSchemaFrom<AlertingUsage>['count_rules_executions_failured_by_reason_per_day'] =
{
// TODO: Find out an automated way to populate the keys or reformat these into an array (and change the Remote Telemetry indexer accordingly)
Expand All @@ -66,6 +68,20 @@ const byReasonSchema: MakeSchemaFrom<AlertingUsage>['count_rules_executions_fail
unknown: { type: 'long' },
};

// Telemetry schema for `percentile_num_scheduled_actions_per_day`: one `long`
// value for each reported percentile (p50, p90, p99).
const byPercentileSchema: MakeSchemaFrom<AlertingUsage>['percentile_num_scheduled_actions_per_day'] =
{
p50: { type: 'long' },
p90: { type: 'long' },
p99: { type: 'long' },
};

// Telemetry schema for `percentile_num_scheduled_actions_by_type_per_day`: for
// each reported percentile (p50, p90, p99), a per-rule-type breakdown that reuses
// `byTypeSchema`.
const byPercentileSchemaByType: MakeSchemaFrom<AlertingUsage>['percentile_num_scheduled_actions_by_type_per_day'] =
{
p50: byTypeSchema,
p90: byTypeSchema,
p99: byTypeSchema,
};

const byReasonSchemaByType: MakeSchemaFrom<AlertingUsage>['count_rules_executions_failured_by_reason_by_type_per_day'] =
{
// TODO: Find out an automated way to populate the keys or reformat these into an array (and change the Remote Telemetry indexer accordingly)
Expand Down Expand Up @@ -160,6 +176,16 @@ export function createAlertingUsageCollector(
avg_es_search_duration_by_type_per_day: {},
avg_total_search_duration_per_day: 0,
avg_total_search_duration_by_type_per_day: {},
percentile_num_scheduled_actions_per_day: {
p50: 0,
p90: 0,
p99: 0,
},
percentile_num_scheduled_actions_by_type_per_day: {
p50: {},
p90: {},
p99: {},
},
};
}
},
Expand Down Expand Up @@ -211,6 +237,8 @@ export function createAlertingUsageCollector(
avg_es_search_duration_by_type_per_day: byTypeSchema,
avg_total_search_duration_per_day: { type: 'long' },
avg_total_search_duration_by_type_per_day: byTypeSchema,
percentile_num_scheduled_actions_per_day: byPercentileSchema,
percentile_num_scheduled_actions_by_type_per_day: byPercentileSchemaByType,
},
});
}
Expand Down
Loading

0 comments on commit ee581d0

Please sign in to comment.