Skip to content

Commit

Permalink
Merge branch 'main' into Java-RMI-Installation-upgrade
Browse files Browse the repository at this point in the history
  • Loading branch information
nr-mlosier authored Oct 5, 2023
2 parents f61138a + 3d80446 commit 2667222
Show file tree
Hide file tree
Showing 122 changed files with 14,509 additions and 1,495 deletions.
35 changes: 35 additions & 0 deletions alert-policies/adobe-commerce-business-insights/5xxErrors.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: 5xx Server Errors

# The description names both severities so it agrees with the terms below
# (CRITICAL above 10, WARNING above 5, each over a 300-second duration).
description: |+
  This alert is triggered if the customer faces 5xx server errors more than 10 times (critical) or more than 5 times (warning) in 5 minutes.
type: STATIC
nrql:
  # Counts transactions whose HTTP response code starts with "5".
  query: "SELECT count(*) as '5xx Server Errors' from Transaction WHERE httpResponseCode LIKE '5%'"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds evaluated by this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 10
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL
  - priority: WARNING
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 5
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
35 changes: 35 additions & 0 deletions alert-policies/adobe-commerce-business-insights/cpuUsage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: CPU Usage (%)

description: |+
  This alert is triggered if CPU usage exceeds 90% for 5 minutes.
type: STATIC
nrql:
  # Reads the most recent host.cpuPercent value from Metric data.
  query: "SELECT latest(host.cpuPercent) AS 'CPU Used %' FROM Metric"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds evaluated by this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 90
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL
  - priority: WARNING
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 80
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
35 changes: 35 additions & 0 deletions alert-policies/adobe-commerce-business-insights/downtime.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: Downtime (%)

description: |+
  This alert is triggered if Downtime is more than 1% for 2 minutes.
type: STATIC
nrql:
  # Share of SyntheticCheck results whose result is 'FAILED'.
  query: "SELECT percentage(count(result), where result = 'FAILED') as 'Downtime (%)' from SyntheticCheck"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds evaluated by this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 1
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 120
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL
  - priority: WARNING
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 0.5
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 120
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
35 changes: 35 additions & 0 deletions alert-policies/adobe-commerce-business-insights/memoryUsage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: Memory Usage (%)

description: |+
  This alert is triggered if Memory usage exceeds 90% for 5 minutes.
type: STATIC
nrql:
  # Reads the most recent host.memoryUsedPercent value from Metric data.
  query: "SELECT latest(host.memoryUsedPercent) as 'Memory Used %' FROM Metric"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds evaluated by this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 90
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL
  - priority: WARNING
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 80
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-appstream/HighCapacityUtilization.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: High Capacity Utilization

description: |+
  This alert is triggered when the Capacity Utilization is above 90%.
type: STATIC
nrql:
  # Averages the aws.appstream.CapacityUtilization metric.
  query: "SELECT average(`aws.appstream.CapacityUtilization`) as 'Query' FROM Metric"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Only a Critical threshold is defined for this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 90
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-appstream/HighInsufficientCapacityErrors.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: High Insufficient Capacity Errors

# The description previously quoted "above 10", which disagreed with the
# configured threshold of 100. It now reflects the values in `terms`.
# NOTE(review): if 10 was the intended limit, lower `threshold` instead.
description: |+
  This alert is triggered when Insufficient Capacity Errors are above 100 for 10 minutes.
type: STATIC
nrql:
  # Counts reported aws.appstream.InsufficientCapacityError data points.
  query: "SELECT count(`aws.appstream.InsufficientCapacityError`) as 'Query' FROM Metric"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Only a Critical threshold is defined for this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 100
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 600
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/amazon-cloudsearch/HighIndexUtilization.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: High Index Utilization

description: |+
  This alert is triggered when the Index Utilization is above 90%.
type: STATIC
nrql:
  # Averages the aws.cloudsearch.IndexUtilization metric.
  query: "SELECT average(`aws.cloudsearch.IndexUtilization`) as 'Query' FROM Metric"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Only a Critical threshold is defined for this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 90
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
33 changes: 33 additions & 0 deletions alert-policies/azure-machine-learning/ModelDeployFailed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name: Model Deployment Failed

description: |+
  This alert is triggered if the number of Failure exceeds 20 within 10 minutes.
type: STATIC
nrql:
  # Sums the azure.machinelearningservices.workspaces.ModelDeployFailed metric.
  query: "FROM Metric SELECT sum(azure.machinelearningservices.workspaces.ModelDeployFailed) AS 'ModelDeployFailed'"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds evaluated by this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # A violation opens once the value is above this number.
    threshold: 20
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 600
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

  # The Warning term is optional; it uses the same fields as the Critical one.
  - priority: WARNING
    operator: ABOVE
    threshold: 10
    thresholdDuration: 600
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
19 changes: 19 additions & 0 deletions alert-policies/f5/f5-node-offline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Alert condition: F5 nodes reporting an offline availability state.
name: F5 Node Offline
description: |+
  This alert fires when an F5 Node has an availability state = 'offline' for at least 10 minutes.
type: STATIC
nrql:
  # Counts node samples with availabilityState = 0, faceted per reporting
  # endpoint and node display name. The description above equates this state
  # with 'offline'.
  query: "FROM F5BigIpNodeSample SELECT count(*) FACET reportingEndpoint, displayName WHERE node.availabilityState = 0"
valueFunction: SINGLE_VALUE
terms:
  # Any count above zero, held for the full 600-second duration, is critical.
  - priority: CRITICAL
    operator: ABOVE
    threshold: 0
    thresholdDuration: 600
    thresholdOccurrences: ALL
# Signal aggregation settings.
# NOTE(review): delay and window are presumably in seconds - confirm against
# the New Relic NRQL condition documentation.
signal:
  aggregationDelay: 120
  aggregationMethod: EVENT_FLOW
  aggregationWindow: 60

# Seconds after which an open violation is closed automatically.
violationTimeLimitSeconds: 259200
19 changes: 19 additions & 0 deletions alert-policies/f5/f5-pool-member-offline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Alert condition: F5 pool members reporting an offline availability state.
name: F5 Pool Member Offline
description: |+
  This alert fires when an F5 Pool Member has an availability state = 'offline' for at least 10 minutes.
type: STATIC
nrql:
  # Counts pool-member samples with availabilityState = 0, faceted by a value
  # parsed out of `url`, the pool name and the member display name. The
  # description above equates this state with 'offline'.
  query: "FROM F5BigIpPoolMemberSample SELECT count(*) FACET aparse(url, '%//*'), poolName, displayName WHERE member.availabilityState = 0"
valueFunction: SINGLE_VALUE
terms:
  # Any count above zero, held for the full 600-second duration, is critical.
  - priority: CRITICAL
    operator: ABOVE
    threshold: 0
    thresholdDuration: 600
    thresholdOccurrences: ALL
# Signal aggregation settings.
# NOTE(review): delay and window are presumably in seconds - confirm against
# the New Relic NRQL condition documentation.
signal:
  aggregationDelay: 120
  aggregationMethod: EVENT_FLOW
  aggregationWindow: 60

# Seconds after which an open violation is closed automatically.
violationTimeLimitSeconds: 259200
19 changes: 19 additions & 0 deletions alert-policies/f5/f5-pool-offline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Alert condition: F5 pools reporting an offline availability state.
name: F5 Pool Offline
description: |+
  This alert fires when an F5 Pool has an availability state = 'offline' for at least 10 minutes.
type: STATIC
nrql:
  # Counts pool samples with availabilityState = 0, faceted per reporting
  # endpoint and pool display name. The description above equates this state
  # with 'offline'.
  query: "FROM F5BigIpPoolSample SELECT count(*) FACET reportingEndpoint, displayName WHERE pool.availabilityState = 0"
valueFunction: SINGLE_VALUE
terms:
  # Any count above zero, held for the full 600-second duration, is critical.
  - priority: CRITICAL
    operator: ABOVE
    threshold: 0
    thresholdDuration: 600
    thresholdOccurrences: ALL
# Signal aggregation settings.
# NOTE(review): delay and window are presumably in seconds - confirm against
# the New Relic NRQL condition documentation.
signal:
  aggregationDelay: 120
  aggregationMethod: EVENT_FLOW
  aggregationWindow: 60

# Seconds after which an open violation is closed automatically.
violationTimeLimitSeconds: 259200
19 changes: 19 additions & 0 deletions alert-policies/f5/f5-virtual-server-offline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Alert condition: F5 virtual servers reporting an offline availability state.
name: F5 Virtual Server Offline
description: |+
  This alert fires when an F5 Virtual Server has an availability state = 'offline' for at least 10 minutes.
type: STATIC
nrql:
  # Counts virtual-server samples with availabilityState = 0, faceted per
  # reporting endpoint and display name. The description above equates this
  # state with 'offline'.
  query: "FROM F5BigIpVirtualServerSample SELECT count(*) FACET reportingEndpoint, displayName WHERE virtualserver.availabilityState = 0"
valueFunction: SINGLE_VALUE
terms:
  # Any count above zero, held for the full 600-second duration, is critical.
  - priority: CRITICAL
    operator: ABOVE
    threshold: 0
    thresholdDuration: 600
    thresholdOccurrences: ALL
# Signal aggregation settings.
# NOTE(review): delay and window are presumably in seconds - confirm against
# the New Relic NRQL condition documentation.
signal:
  aggregationDelay: 120
  aggregationMethod: EVENT_FLOW
  aggregationWindow: 60

# Seconds after which an open violation is closed automatically.
violationTimeLimitSeconds: 259200
32 changes: 32 additions & 0 deletions alert-policies/openstack-controller/ImageStatus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Name of the alert
name: Image Status

# Description and details
description: |+
  This alert is triggered when the image status is inactive for 5 minutes.
# Type of alert
type: STATIC

# NRQL query: counts image samples whose Glance status is anything but 'active'.
nrql:

  query: "SELECT count(*)FROM OSImageSample where openstack.glance.image.status != 'active'"

# How the NRQL result is reduced before it is compared with terms.threshold
# (SINGLE_VALUE is the default).
valueFunction: SINGLE_VALUE

# Only a Critical threshold is defined for this condition.
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold.
    operator: ABOVE
    # Threshold is 0 so that a count of exactly one non-active image sample is
    # already a violation; with ABOVE 1 a count of 1 would never trigger.
    threshold: 0
    # Evaluation period in seconds (allowed range: 120 - 3600).
    thresholdDuration: 300
    # ALL: every data point in the duration must be in violation.
    thresholdOccurrences: ALL

# Open violations close automatically after this many seconds
# (allowed range: 300 - 2592000; default 86400, i.e. 1 day).
violationTimeLimitSeconds: 86400
Loading

0 comments on commit 2667222

Please sign in to comment.