From 40be71066c57f116e5a0a5a3329ba521ef5ce0f8 Mon Sep 17 00:00:00 2001 From: Doua Vue Date: Tue, 17 Sep 2024 12:22:41 -0500 Subject: [PATCH 01/10] feat: Add new quickstart and alert policies for k8s otel --- .../ContainerCPUThrottling.yaml | 76 ++++++++++++++++++ .../ContainerHighCPUUtil.yaml | 76 ++++++++++++++++++ .../ContainerHighMemUtil.yaml | 76 ++++++++++++++++++ .../ContainerRestarting.yaml | 73 ++++++++++++++++++ .../ContainerWaiting.yaml | 73 ++++++++++++++++++ .../DaemonsetPodsMissing.yaml | 73 ++++++++++++++++++ .../DeploymentPodsMissing.yaml | 73 ++++++++++++++++++ .../JobFailedOtel.yaml | 77 +++++++++++++++++++ .../NodeHighAllocatableCPUUtil.yaml | 76 ++++++++++++++++++ .../NodeHighAllocatableMemUtil.yaml | 76 ++++++++++++++++++ .../NodeHighFSCapacityUtil.yaml | 76 ++++++++++++++++++ .../NodeIsNotReady.yaml | 77 +++++++++++++++++++ .../NodePodCapacity.yaml | 77 +++++++++++++++++++ .../PersistentVolumeErrors.yaml | 73 ++++++++++++++++++ .../kubernetes-opentelemetry/PodNotReady.yaml | 77 +++++++++++++++++++ .../PodNotScheduled.yaml | 76 ++++++++++++++++++ .../PodsFailingNamespace.yaml | 73 ++++++++++++++++++ .../StatefulsetPodsMissing.yaml | 73 ++++++++++++++++++ .../kubernetes-opentelemetry/config.yml | 54 +++++++++++++ quickstarts/kubernetes-opentelemetry/logo.svg | 1 + 20 files changed, 1406 insertions(+) create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/DaemonsetPodsMissing.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/DeploymentPodsMissing.yaml create mode 100644 
alert-policies/kubernetes-opentelemetry/JobFailedOtel.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/PersistentVolumeErrors.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/PodNotReady.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/PodNotScheduled.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml create mode 100644 alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml create mode 100644 quickstarts/kubernetes-opentelemetry/config.yml create mode 100644 quickstarts/kubernetes-opentelemetry/logo.svg diff --git a/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml b/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml new file mode 100644 index 0000000000..896c786d39 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml @@ -0,0 +1,76 @@ +name: Container cpu throttling is high +# Description and details +description: | + Alert when container is being throttled > 25% of the time for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(container_cpu_cfs_throttled_periods_total) / latest(container_cpu_cfs_periods_total)* 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | 
UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 25 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. + # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. 
+ aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. + evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml b/alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml new file mode 100644 index 0000000000..625670c141 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml @@ -0,0 +1,76 @@ +name: Container high cpu utilization +# Description and details +description: | + Alert when the average container cpu utilization (vs. 
Limit) is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select average(container.cpu.utilization) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml b/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml new file mode 100644 index 0000000000..fe3db565c0 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml @@ -0,0 +1,76 @@ +name: Container high memory utilization +# Description and details +description: | + Alert when the average container memory utilization (vs. Limit) is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select max(container_memory_working_set_bytes) / filter(max(kube_pod_container_resource_limits), where resource = 'memory') * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml b/alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml new file mode 100644 index 0000000000..9df6519ebd --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml @@ -0,0 +1,73 @@ +name: Container is Restarting + +# Description and details +description: | + Alert when the container restart count is greater than 0 in a sliding 5 minute window + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select sum(kube_pod_container_status_restarts_total) where metricName = 'kube_pod_container_status_restarts_total' and k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') FACET k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml b/alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml new file mode 100644 index 0000000000..d47e2bc813 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml @@ -0,0 +1,73 @@ +name: Container is Waiting + +# Description and details +description: | + Alert when a container is Waiting for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select uniqueCount(k8s.pod.name) WHERE container_phase = 'waiting' and k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') FACET k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/DaemonsetPodsMissing.yaml b/alert-policies/kubernetes-opentelemetry/DaemonsetPodsMissing.yaml new file mode 100644 index 0000000000..3dd5ae52f3 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/DaemonsetPodsMissing.yaml @@ -0,0 +1,73 @@ +name: Daemonset is missing Pods + +# Description and details +description: | + Alert when Daemonset is missing Pods for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(kube_daemonset_status_desired_number_scheduled) - latest(kube_daemonset_status_number_ready) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.daemonset.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/DeploymentPodsMissing.yaml b/alert-policies/kubernetes-opentelemetry/DeploymentPodsMissing.yaml new file mode 100644 index 0000000000..0dc3e4f664 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/DeploymentPodsMissing.yaml @@ -0,0 +1,73 @@ +name: Deployment is missing Pods + +# Description and details +description: | + Alert when Deployment is missing Pods for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(kube_deployment_spec_replicas) - latest(kube_deployment_status_replicas) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.deployment.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/JobFailedOtel.yaml b/alert-policies/kubernetes-opentelemetry/JobFailedOtel.yaml new file mode 100644 index 0000000000..2b5c52374e --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/JobFailedOtel.yaml @@ -0,0 +1,77 @@ +name: Job Failed + +# Description and details +description: | + Alert when a Job reports a failed status + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select if(latest(kube_job_failed), uniqueCount(job_name), 0) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet job_name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + # - priority: CRITICAL + # # Operator used to compare against the threshold. + # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 300 + # # How many data points must be in violation for the duration + # thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + - priority: WARNING + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 60 + # How many data points must be in violation for the duration + thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: null + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml new file mode 100644 index 0000000000..ecf61d4fe2 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml @@ -0,0 +1,76 @@ +name: Node allocatable cpu utilization is high +# Description and details +description: | + Alert when the average Node allocatable cpu utilization is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(k8s.node.cpu.utilization) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 900 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml new file mode 100644 index 0000000000..da0e55aa99 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml @@ -0,0 +1,76 @@ +name: Node allocatable memory utilization is high +# Description and details +description: | + Alert when the average Node allocatable memory utilization is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (working set / allocatable memory, scaled by 100 so it is comparable to the percentage threshold) +nrql: + query: "from Metric select latest(k8s.node.memory.working_set) / filter(latest(kube_node_status_allocatable), WHERE resource = 'memory') * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 900 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml new file mode 100644 index 0000000000..a09b71d454 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml @@ -0,0 +1,76 @@ +name: Node root file system capacity utilization is high +# Description and details +description: | + Alert when the average Node root file system capacity utilization is > 90% for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (filesystem usage / capacity, scaled by 100 so it is comparable to the percentage threshold) +nrql: + query: "from Metric select max(k8s.node.filesystem.usage) / max(k8s.node.filesystem.capacity) * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 900 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 300 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: 60 # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml b/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml new file mode 100644 index 0000000000..b46ee313d6 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml @@ -0,0 +1,77 @@ +name: Node is not ready + +# Description and details +description: | + Alert when a Node is not ready for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (1 = Ready, 0 = not Ready; the BELOW 1 term below fires on not-ready nodes) +nrql: + query: "from Metric select if(latest(condition) = 'Ready', 1, 0) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: BELOW + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 300 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml b/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml new file mode 100644 index 0000000000..09c20296fd --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml @@ -0,0 +1,77 @@ +name: Node Pod count nearing capacity + +# Description and details +description: | + Alert when the Running pod count on a Node is > 90% of the Node's Pod Capacity for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (running non-Job pods as a percentage of the node's allocatable pod count) +nrql: + query: "FROM Metric select filter(uniqueCount(k8s.pod.name), where phase = 'Running' AND (metricName = 'kube_pod_status_phase' AND kube_pod_status_phase ['latest'] = 1) and created_by_kind != 'Job' ) / filter(latest(kube_node_status_allocatable), WHERE resource = 'pods' ) * 100 as 'Pod Capacity %' where k8s.node.name != '' and k8s.node.name is not null and k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + # - priority: CRITICAL + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 90 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 300 + # # How many data points must be in violation for the duration + # thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + - priority: WARNING + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 90 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. 
+ fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. + evaluationDelay: 300 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/PersistentVolumeErrors.yaml b/alert-policies/kubernetes-opentelemetry/PersistentVolumeErrors.yaml new file mode 100644 index 0000000000..73fd69119f --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/PersistentVolumeErrors.yaml @@ -0,0 +1,73 @@ +name: Persistent Volume has errors + +# Description and details +description: | + Alert when Persistent Volume is in a Failed or Pending state for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select uniqueCount(persistentvolume) where phase in ('Failed','Pending') and k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet persistentvolume, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/PodNotReady.yaml b/alert-policies/kubernetes-opentelemetry/PodNotReady.yaml new file mode 100644 index 0000000000..1418e1189e --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/PodNotReady.yaml @@ -0,0 +1,77 @@ +name: Pod is not ready + +# Description and details +description: | + Alert when a Pod is not ready for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(kube_pod_status_ready) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: BELOW + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/PodNotScheduled.yaml b/alert-policies/kubernetes-opentelemetry/PodNotScheduled.yaml new file mode 100644 index 0000000000..38aea1d89d --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/PodNotScheduled.yaml @@ -0,0 +1,76 @@ +name: Pod cannot be scheduled + +# Description and details +description: | + Alert when a Pod cannot be scheduled for more than 5 minutes +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select latest(kube_pod_status_scheduled) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') and metricName = 'kube_pod_status_scheduled' facet k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: BELOW + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # # Operator used to compare against the threshold. 
+ # operator: ABOVE + # # Value that triggers a violation + # threshold: 0 + # # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + # thresholdDuration: 60 + # # How many data points must be in violation for the duration + # thresholdOccurrences: AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml b/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml new file mode 100644 index 0000000000..e8926f955c --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml @@ -0,0 +1,73 @@ +name: More than 5 pods failing in namespace + +# Description and details +description: | + Alert when more than 5 pods are failing in a namespace for more than 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query +nrql: + query: "from Metric select uniqueCount(k8s.pod.name) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') and phase = 'Failed' facet k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation (more than 5 failing pods, per the policy name) + threshold: 5 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. 
+ slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml b/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml new file mode 100644 index 0000000000..7452575bb6 --- /dev/null +++ b/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml @@ -0,0 +1,73 @@ +name: Statefulset is missing Pods + +# Description and details +description: | + Alert when Statefulset is missing Pods for > 5 minutes + +# Type of alert: BASELINE | STATIC +type: STATIC + +# NRQL query (desired replicas minus ready replicas; > 0 means Pods are missing) +nrql: + query: "from Metric select latest(kube_statefulset_replicas) - latest(kube_statefulset_status_replicas_ready) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.statefulset.name, k8s.namespace.name, k8s.cluster.name" + +# Direction in which baseline is set (Default: LOWER_ONLY) +# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. 
+ operator: ABOVE + # Value that triggers a violation + threshold: 0 + # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL + + # Adding a Warning threshold is optional + # - priority: WARNING + # operator: ABOVE + # threshold: 1 + # thresholdDuration: 300 + # thresholdOccurrences: ALL | AT_LEAST_ONCE + +# Loss of Signal Settings +expiration: + # Close open violations if signal is lost (Default: false) + closeViolationsOnExpiration: true + # Open "Loss of Signal" violation if signal is lost (Default: false) + openViolationOnExpiration: false + # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false' + expirationDuration: 300 + +# Advanced Signal Settings +# Duration of the time window used to evaluate the NRQL Condition +signal: + # How long we wait for data that belongs in each aggregation window + aggregationDelay: 60 # seconds + # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations. + aggregationMethod: EVENT_FLOW + # How long we wait after each data point arrives to make sure we've processed the whole batch. + aggregationTimer: null # seconds + # Controls the duration of the time window used to evaluate the NRQL query + aggregationWindow: 60 # seconds; 30 seconds <= x < 15 minutes + # Option that determines the type of value that should be used to fill gaps (empty windows). + fillOption: NONE # defaults to STATIC + # If using the static fill option, this value is used for filling. + fillValue: null # default + # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends. + slideBy: null # seconds + # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition. 
+ evaluationDelay: 60 + +# OPTIONAL: URL of runbook to be sent with notification +runbookUrl: + +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 21600 diff --git a/quickstarts/kubernetes-opentelemetry/config.yml b/quickstarts/kubernetes-opentelemetry/config.yml new file mode 100644 index 0000000000..0a2b17214f --- /dev/null +++ b/quickstarts/kubernetes-opentelemetry/config.yml @@ -0,0 +1,54 @@ +slug: kubernetes-opentelemetry +title: Kubernetes (OpenTelemetry) +description: | + ## Why monitor Kubernetes? + + Kubernetes is an open-source system for automating deployment, scaling, and, management of containerized applications. The New Relic Kubernetes monitoring quickstart gives you visibility into your Kubernetes clusters and workloads in minutes, whether your clusters are hosted on-premises or in the cloud. + + ### Kubernetes quickstart highlights + + The New Relic Kubernetes quickstart uses dashboards to proactively monitor your metrics, like: + + - resources used + - number of K8s objects + - namespaces per cluster + - pods by namespace + - container cpu usage + - container restarts + - missing pods by deployment + - node resource consumption, and more. + + This quickstart is also compatible with on-host integrations like: + + - Cassandra + - MySQL + - Apache, and more. + + ### New Relic + Kubernetes = Optimum performance monitoring + + The [New Relic Kubernetes](https://docs.newrelic.com/docs/integrations/kubernetes-integration/installation/kubernetes-integration-install-configure/) quickstart has multiple components that work together to give you end-to-end observability across your clusters. While you have the flexibility to deploy the components that you prefer, to achieve full observability, you need to install the complete package to monitor all metrics. Use our quickstart to generate a Kubernetes manifest and add Pixie for more fine-grained telemetry data. 
You can also do the installation with Pixie for fine-grained telemetry data. + Our quickstart monitors the aggregated core and memory usage across all nodes in your cluster. This allows you to meet resource requirements for optimal application performance. It also empowers you to track resource consumption, find pods that aren't running, monitor disk usage, and troubleshoot container restarts. The New Relic Kubernetes integration has dashboards and a cluster explorer that provide a multi-dimensional representation of a Kubernetes cluster from which you can explore your namespaces, deployments, nodes, pods, containers, and applications. Download the New Relic Kubernetes quickstart today to gain instant visibility into your Kubernetes services, clusters and workloads in minutes. +summary: | + Monitoring Kubernetes is crucial to gain instant visibility into Kubernetes clusters and workloads. Download New Relic Kubernetes quickstart to proactively monitor Kubernetes cluster health and capacity. +level: New Relic +authors: + - New Relic +documentation: + - name: Kubernetes installation docs + description: | + Kubernetes is an open-source container-orchestration system for automating + computer application deployment, scaling, and management. 
+ url: >- + https://docs.newrelic.com/docs/integrations/host-integrations/host-integrations-list/kubernetes-monitoring-integration +dataSourceIds: + - kubernetes +keywords: + - kubernetes + - containers + - pixie + - k8s + - opentelemetry +dashboards: + - kubernetes +alertPolicies: + - kubernetes-opentelemetry diff --git a/quickstarts/kubernetes-opentelemetry/logo.svg b/quickstarts/kubernetes-opentelemetry/logo.svg new file mode 100644 index 0000000000..1efad8e7a6 --- /dev/null +++ b/quickstarts/kubernetes-opentelemetry/logo.svg @@ -0,0 +1 @@ + \ No newline at end of file From da8ce23e60212f847135836220bb908cb7197ca8 Mon Sep 17 00:00:00 2001 From: Doua Vue Date: Mon, 7 Oct 2024 14:42:47 -0500 Subject: [PATCH 02/10] feat: Add datasource for k8s otel --- data-sources/kubernetes-opentelemetry/config.yml | 9 +++++++++ data-sources/kubernetes-opentelemetry/logo.svg | 1 + quickstarts/kubernetes-opentelemetry/config.yml | 12 +++++------- 3 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 data-sources/kubernetes-opentelemetry/config.yml create mode 100644 data-sources/kubernetes-opentelemetry/logo.svg diff --git a/data-sources/kubernetes-opentelemetry/config.yml b/data-sources/kubernetes-opentelemetry/config.yml new file mode 100644 index 0000000000..2cf845d05d --- /dev/null +++ b/data-sources/kubernetes-opentelemetry/config.yml @@ -0,0 +1,9 @@ +id: kubernetes-opentelemetry +displayName: Kubernetes (OpenTelemetry) +description: | + Manage telemetry data coming into New Relic. 
+install: + primary: + link: + url: https://docs.newrelic.com/docs/kubernetes-pixie/kubernetes-integration/installation/k8s-otel/#install +icon: logo.png diff --git a/data-sources/kubernetes-opentelemetry/logo.svg b/data-sources/kubernetes-opentelemetry/logo.svg new file mode 100644 index 0000000000..e33c0ab527 --- /dev/null +++ b/data-sources/kubernetes-opentelemetry/logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/quickstarts/kubernetes-opentelemetry/config.yml b/quickstarts/kubernetes-opentelemetry/config.yml index 0a2b17214f..2900593745 100644 --- a/quickstarts/kubernetes-opentelemetry/config.yml +++ b/quickstarts/kubernetes-opentelemetry/config.yml @@ -5,7 +5,7 @@ description: | Kubernetes is an open-source system for automating deployment, scaling, and, management of containerized applications. The New Relic Kubernetes monitoring quickstart gives you visibility into your Kubernetes clusters and workloads in minutes, whether your clusters are hosted on-premises or in the cloud. - ### Kubernetes quickstart highlights + ### Kubernetes (OpenTelemetry) quickstart highlights The New Relic Kubernetes quickstart uses dashboards to proactively monitor your metrics, like: @@ -26,7 +26,7 @@ description: | ### New Relic + Kubernetes = Optimum performance monitoring - The [New Relic Kubernetes](https://docs.newrelic.com/docs/integrations/kubernetes-integration/installation/kubernetes-integration-install-configure/) quickstart has multiple components that work together to give you end-to-end observability across your clusters. While you have the flexibility to deploy the components that you prefer, to achieve full observability, you need to install the complete package to monitor all metrics. Use our quickstart to generate a Kubernetes manifest and add Pixie for more fine-grained telemetry data. You can also do the installation with Pixie for fine-grained telemetry data. 
+ The [New Relic Kubernetes (OpenTelemetry)](https://docs.newrelic.com/docs/integrations/kubernetes-integration/installation/kubernetes-integration-install-configure/) quickstart has multiple components that work together to give you end-to-end observability across your clusters. While you have the flexibility to deploy the components that you prefer, to achieve full observability, you need to install the complete package to monitor all metrics. Use our quickstart to generate a Kubernetes manifest and add Pixie for more fine-grained telemetry data. You can also do the installation with Pixie for fine-grained telemetry data. Our quickstart monitors the aggregated core and memory usage across all nodes in your cluster. This allows you to meet resource requirements for optimal application performance. It also empowers you to track resource consumption, find pods that aren't running, monitor disk usage, and troubleshoot container restarts. The New Relic Kubernetes integration has dashboards and a cluster explorer that provide a multi-dimensional representation of a Kubernetes cluster from which you can explore your namespaces, deployments, nodes, pods, containers, and applications. Download the New Relic Kubernetes quickstart today to gain instant visibility into your Kubernetes services, clusters and workloads in minutes. summary: | Monitoring Kubernetes is crucial to gain instant visibility into Kubernetes clusters and workloads. Download New Relic Kubernetes quickstart to proactively monitor Kubernetes cluster health and capacity. @@ -34,21 +34,19 @@ level: New Relic authors: - New Relic documentation: - - name: Kubernetes installation docs + - name: Kubernetes (Opentelemetry) installation docs description: | Kubernetes is an open-source container-orchestration system for automating computer application deployment, scaling, and management. 
url: >- - https://docs.newrelic.com/docs/integrations/host-integrations/host-integrations-list/kubernetes-monitoring-integration + https://docs.newrelic.com/docs/kubernetes-pixie/kubernetes-integration/installation/k8s-otel/#install dataSourceIds: - - kubernetes + - kubernetes-opentelemetry keywords: - kubernetes - containers - pixie - k8s - opentelemetry -dashboards: - - kubernetes alertPolicies: - kubernetes-opentelemetry From ef5d5bfe659194132e3a95888f157f0f0c638d27 Mon Sep 17 00:00:00 2001 From: Doua Vue Date: Tue, 8 Oct 2024 08:43:15 -0500 Subject: [PATCH 03/10] fix: Update image extension --- data-sources/kubernetes-opentelemetry/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-sources/kubernetes-opentelemetry/config.yml b/data-sources/kubernetes-opentelemetry/config.yml index 2cf845d05d..371748f75a 100644 --- a/data-sources/kubernetes-opentelemetry/config.yml +++ b/data-sources/kubernetes-opentelemetry/config.yml @@ -6,4 +6,4 @@ install: primary: link: url: https://docs.newrelic.com/docs/kubernetes-pixie/kubernetes-integration/installation/k8s-otel/#install -icon: logo.png +icon: logo.svg From d0572acb07e1a4e6d31430b70077855d39c885a3 Mon Sep 17 00:00:00 2001 From: Doua Vue Date: Tue, 8 Oct 2024 08:47:06 -0500 Subject: [PATCH 04/10] fix: Update image for k8s-otel quickstart --- quickstarts/kubernetes-opentelemetry/logo.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quickstarts/kubernetes-opentelemetry/logo.svg b/quickstarts/kubernetes-opentelemetry/logo.svg index 1efad8e7a6..e33c0ab527 100644 --- a/quickstarts/kubernetes-opentelemetry/logo.svg +++ b/quickstarts/kubernetes-opentelemetry/logo.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file From 08de38e8256afa986992cf7fa157582ac7937b1d Mon Sep 17 00:00:00 2001 From: Doua Vue Date: Wed, 9 Oct 2024 09:50:24 -0500 Subject: [PATCH 05/10] fix: Address CR feedback re nrql and configs --- 
.../kubernetes-opentelemetry/ContainerCPUThrottling.yaml | 2 +- .../kubernetes-opentelemetry/ContainerHighMemUtil.yaml | 2 +- .../kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml | 2 +- .../kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml | 2 +- .../kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml | 2 +- alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml | 2 +- alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml | 2 +- .../kubernetes-opentelemetry/PodsFailingNamespace.yaml | 2 +- .../kubernetes-opentelemetry/StatefulsetPodsMissing.yaml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml b/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml index 896c786d39..6ef77428b7 100644 --- a/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml +++ b/alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml @@ -22,7 +22,7 @@ terms: # Operator used to compare against the threshold. 
operator: ABOVE # Value that triggers a violation - threshold: 90 + threshold: 25 # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions thresholdDuration: 300 # How many data points must be in violation for the duration diff --git a/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml b/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml index fe3db565c0..20f68e6954 100644 --- a/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml +++ b/alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml @@ -8,7 +8,7 @@ type: STATIC # NRQL query nrql: - query: "from Metric max(container_memory_working_set_bytes) / filter(max(kube_pod_container_resource_limits), where resource = 'memory') where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name" + query: "from Metric select max(container_memory_working_set_bytes) / filter(max(kube_pod_container_resource_limits), where resource = 'memory') where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet pod, container, k8s.namespace.name, k8s.cluster.name" # Direction in which baseline is set (Default: LOWER_ONLY) # baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml index ecf61d4fe2..400094e63f 100644 --- a/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml +++ b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableCPUUtil.yaml @@ -8,7 +8,7 @@ type: STATIC # NRQL query nrql: - query: "from Metric select latest(k8s.node.cpu.utilization) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + query: "from Metric select average(k8s.node.cpu.utilization) * 100 where 
k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" # Direction in which baseline is set (Default: LOWER_ONLY) # baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml index da0e55aa99..c34b683fae 100644 --- a/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml +++ b/alert-policies/kubernetes-opentelemetry/NodeHighAllocatableMemUtil.yaml @@ -8,7 +8,7 @@ type: STATIC # NRQL query nrql: - query: "from Metric select latest(k8s.node.memory.working_set) / filter(latest(kube_node_status_allocatable), WHERE resource = 'memory') where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + query: "from Metric select (average(k8s.node.memory.working_set) / filter(latest(kube_node_status_allocatable), WHERE resource = 'memory')) * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" # Direction in which baseline is set (Default: LOWER_ONLY) # baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY diff --git a/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml b/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml index a09b71d454..4c57ae6d0b 100644 --- a/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml +++ b/alert-policies/kubernetes-opentelemetry/NodeHighFSCapacityUtil.yaml @@ -8,7 +8,7 @@ type: STATIC # NRQL query nrql: - query: "from Metric select max(k8s.node.filesystem.usage) / max(k8s.node.filesystem.capacity) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + query: "from Metric select (average(k8s.node.filesystem.usage) / average(k8s.node.filesystem.capacity)) * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" # Direction in which baseline is set (Default: 
LOWER_ONLY) # baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY diff --git a/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml b/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml index b46ee313d6..b64fd15328 100644 --- a/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml +++ b/alert-policies/kubernetes-opentelemetry/NodeIsNotReady.yaml @@ -9,7 +9,7 @@ type: STATIC # NRQL query nrql: - query: "from Metric if(latest(condition) = 'Ready', 0, 1) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + query: "from Metric select if(latest(condition) = 'Ready', 0, 1) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" # Direction in which baseline is set (Default: LOWER_ONLY) # baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY diff --git a/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml b/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml index 09c20296fd..41026339b3 100644 --- a/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml +++ b/alert-policies/kubernetes-opentelemetry/NodePodCapacity.yaml @@ -9,7 +9,7 @@ type: STATIC # NRQL query nrql: - query: "FROM Metric select filter(uniqueCount(k8s.pod.name), where phase = 'Running' AND (metricName = 'kube_pod_status_phase' AND kube_pod_status_phase ['latest'] = 1) and created_by_kind != 'Job' ) / filter(latest(kube_node_status_allocatable), WHERE resource = 'cpu' ) * 100 as 'Pod Capacity %' where k8s.node.name != '' and k8s.node.name is not null and k8s.cluster.name in ('YOUR_CLUSTER_NAME') facet k8s.node.name, k8s.cluster.name" + query: "FROM Metric select filter(uniqueCount(k8s.pod.name), where phase = 'Running' AND (metricName = 'kube_pod_status_phase' AND kube_pod_status_phase ['latest'] = 1) and created_by_kind != 'Job' ) / filter(latest(kube_node_status_allocatable), WHERE resource = 'pods' ) * 100 as 'Pod Capacity %' where k8s.cluster.name in ('YOUR_CLUSTER_NAME')" # 
Direction in which baseline is set (Default: LOWER_ONLY) # baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY diff --git a/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml b/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml index e8926f955c..17ad1daaef 100644 --- a/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml +++ b/alert-policies/kubernetes-opentelemetry/PodsFailingNamespace.yaml @@ -23,7 +23,7 @@ terms: # Operator used to compare against the threshold. operator: ABOVE # Value that triggers a violation - threshold: 0 + threshold: 5 # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions thresholdDuration: 300 # How many data points must be in violation for the duration diff --git a/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml b/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml index 7452575bb6..0e666bae6c 100644 --- a/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml +++ b/alert-policies/kubernetes-opentelemetry/StatefulsetPodsMissing.yaml @@ -9,7 +9,7 @@ type: STATIC # NRQL query nrql: - query: "from Metric latest(kube_statefulset_replicas) - latest(kube_statefulset_status_replicas_ready) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.statefulset.name, k8s.namespace.name, k8s.cluster.name" + query: "from Metric select latest(kube_statefulset_replicas) - latest(kube_statefulset_status_replicas_ready) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.statefulset.name, k8s.namespace.name, k8s.cluster.name" # Direction in which baseline is set (Default: LOWER_ONLY) # baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY From 949697005e84bfb4e120b226bba593b1a1fa2918 Mon Sep 17 00:00:00 2001 From: rossfromwork <150628820+rossfromwork@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:16:13 +1100 
Subject: [PATCH 06/10] Updated replication.master.slaves.Offset NRQL Corrected NRQL used for Replication byte offset widget. Changed from average(replication.master.slaves.Offset) to average('replication.master.slaves.Offset') --- dashboards/gcp-redis/gcp-redis.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboards/gcp-redis/gcp-redis.json b/dashboards/gcp-redis/gcp-redis.json index 4672c7c03e..3306d54db0 100644 --- a/dashboards/gcp-redis/gcp-redis.json +++ b/dashboards/gcp-redis/gcp-redis.json @@ -335,7 +335,7 @@ "nrqlQueries": [ { "accountIds": [], - "query": "SELECT AVERAGE(replication.master.slaves.Offset) AS `Average Replication Byte Offset(Slaves)`, AVERAGE(replication.MasterReplOffset) AS `Average Replication Byte Offset(Master)` FROM GcpRedisInstanceSample WHERE (`provider` = 'GcpRedisInstance') FACET slave TIMESERIES AUTO" + "query": "SELECT AVERAGE('replication.master.slaves.Offset') AS `Average Replication Byte Offset(Slaves)`, AVERAGE(replication.MasterReplOffset) AS `Average Replication Byte Offset(Master)` FROM GcpRedisInstanceSample WHERE (`provider` = 'GcpRedisInstance') FACET slave TIMESERIES AUTO" } ], "platformOptions": { From 4f54b21c51df0e3b837ba652c11056dc284ffc02 Mon Sep 17 00:00:00 2001 From: Doua Vue Date: Tue, 15 Oct 2024 09:00:11 -0500 Subject: [PATCH 07/10] feat: Update k8s otel description --- .../kubernetes-opentelemetry/config.yml | 63 +++++++++++-------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/quickstarts/kubernetes-opentelemetry/config.yml b/quickstarts/kubernetes-opentelemetry/config.yml index 2900593745..1a4e7592a1 100644 --- a/quickstarts/kubernetes-opentelemetry/config.yml +++ b/quickstarts/kubernetes-opentelemetry/config.yml @@ -1,43 +1,53 @@ slug: kubernetes-opentelemetry title: Kubernetes (OpenTelemetry) description: | - ## Why monitor Kubernetes? + ## Why monitor Kubernetes using OpenTelemetry? 
+ Kubernetes is an open-source system for automating deployment, scaling, and management of containerized applications. + OpenTelemetry is an open source observability framework that provides IT teams with standardized protocols and tools for collecting and routing telemetry. + Organizations adopting OpenTelemetry will take advantage of vendor neutrality by using flexible and open-source agents and/or SDKs. - Kubernetes is an open-source system for automating deployment, scaling, and, management of containerized applications. The New Relic Kubernetes monitoring quickstart gives you visibility into your Kubernetes clusters and workloads in minutes, whether your clusters are hosted on-premises or in the cloud. + New Relic provides OpenTelemetry observability for Kubernetes which aims not only to ingest all the data sent but also to provide first-class K8s experiences and insights. + This quickstart gives you visibility into your Kubernetes clusters and workloads in minutes, whether your clusters are hosted on-premises or in the cloud. ### Kubernetes (OpenTelemetry) quickstart highlights - - The New Relic Kubernetes quickstart uses dashboards to proactively monitor your metrics, like: - - - resources used - - number of K8s objects - - namespaces per cluster - - pods by namespace - - container cpu usage - - container restarts - - missing pods by deployment - - node resource consumption, and more. - - This quickstart is also compatible with on-host integrations like: - - - Cassandra - - MySQL - - Apache, and more. - + Included in this quickstart you will find: + + * Instructions to install our [K8s instrumentation with OpenTelemetry](https://docs.newrelic.com/docs/kubernetes-pixie/kubernetes-integration/installation/k8s-otel/). + * A complete recommended alert policy including alert conditions to be notified on most of the common relevant issues.
+ * Container CPU throttling is high (alert condition) + * Container high CPU utilization (alert condition) + * Container high memory utilization (alert condition) + * Container is restarting (alert condition) + * Container is waiting (alert condition) + * Daemonset is missing pods (alert condition) + * Deployment is missing pods (alert condition) + * Etcd file descriptor utilization is high (alert condition) + * Etcd has no leader (alert condition) + * HPA current replicas < desired replicas (alert condition) + * HPA has reached maximum replicas (alert condition) + * Job Failed (alert condition) + * More than 5 pods failing in namespace (alert condition) + * Node allocatable CPU utilization is high (alert condition) + * Node allocatable memory utilization is high (alert condition) + * Node is not ready (alert condition) + * Node is unschedulable (alert condition) + * Node pod count nearing capacity (alert condition) + * Node root file system capacity utilization is high (alert condition) + * Persistent volume has errors (alert condition) + * Pod cannot be scheduled (alert condition) + * Pod is not ready (alert condition) + * Statefulset is missing pods (alert condition) ### New Relic + Kubernetes = Optimum performance monitoring - - The [New Relic Kubernetes (OpenTelemetry)](https://docs.newrelic.com/docs/integrations/kubernetes-integration/installation/kubernetes-integration-install-configure/) quickstart has multiple components that work together to give you end-to-end observability across your clusters. While you have the flexibility to deploy the components that you prefer, to achieve full observability, you need to install the complete package to monitor all metrics. Use our quickstart to generate a Kubernetes manifest and add Pixie for more fine-grained telemetry data. You can also do the installation with Pixie for fine-grained telemetry data. - Our quickstart monitors the aggregated core and memory usage across all nodes in your cluster.
This allows you to meet resource requirements for optimal application performance. It also empowers you to track resource consumption, find pods that aren't running, monitor disk usage, and troubleshoot container restarts. The New Relic Kubernetes integration has dashboards and a cluster explorer that provide a multi-dimensional representation of a Kubernetes cluster from which you can explore your namespaces, deployments, nodes, pods, containers, and applications. Download the New Relic Kubernetes quickstart today to gain instant visibility into your Kubernetes services, clusters and workloads in minutes. summary: | - Monitoring Kubernetes is crucial to gain instant visibility into Kubernetes clusters and workloads. Download New Relic Kubernetes quickstart to proactively monitor Kubernetes cluster health and capacity. + Monitoring Kubernetes with OpenTelemetry is crucial to gain instant visibility into Kubernetes clusters and workloads using open-source agents which provide vendor neutrality. + level: New Relic authors: - New Relic documentation: - name: Kubernetes (Opentelemetry) installation docs description: | - Kubernetes is an open-source container-orchestration system for automating - computer application deployment, scaling, and management. + OpenTelemetry observability for Kubernetes provides complete, open-source setup paired with a top-notch Kubernetes UI that is already compatible with our proprietary Kubernetes instrumentation. Our K8s UIs are designed to be provider agnostic, allowing you to select either OpenTelemetry or New Relic instrumentation based on your needs. 
url: >- https://docs.newrelic.com/docs/kubernetes-pixie/kubernetes-integration/installation/k8s-otel/#install dataSourceIds: @@ -45,7 +55,6 @@ dataSourceIds: keywords: - kubernetes - containers - - pixie - k8s - opentelemetry alertPolicies: From 5439de35a07d9bf455e380339786d237497115fe Mon Sep 17 00:00:00 2001 From: Doua Vue Date: Wed, 16 Oct 2024 09:00:35 -0500 Subject: [PATCH 08/10] fix: Add icon key/value --- quickstarts/kubernetes-opentelemetry/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/quickstarts/kubernetes-opentelemetry/config.yml b/quickstarts/kubernetes-opentelemetry/config.yml index 1a4e7592a1..ef24cee69f 100644 --- a/quickstarts/kubernetes-opentelemetry/config.yml +++ b/quickstarts/kubernetes-opentelemetry/config.yml @@ -41,6 +41,7 @@ description: | summary: | Monitoring Kubernetes with OpenTelemetry is crucial to gain instant visibility into Kubernetes clusters and workloads using open-source agents which provide vendor neutrality. +icon: logo.svg level: New Relic authors: - New Relic From 12115d369d60fa177a00b35ce35e28e0e12191ae Mon Sep 17 00:00:00 2001 From: nr-opensource-bot Date: Thu, 17 Oct 2024 18:36:01 +0000 Subject: [PATCH 09/10] chore: generate UUID(s) [skip ci] --- quickstarts/kubernetes-opentelemetry/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/quickstarts/kubernetes-opentelemetry/config.yml b/quickstarts/kubernetes-opentelemetry/config.yml index ef24cee69f..aa9eed87af 100644 --- a/quickstarts/kubernetes-opentelemetry/config.yml +++ b/quickstarts/kubernetes-opentelemetry/config.yml @@ -1,3 +1,4 @@ +id: 7cc82a78-e523-4d35-bba9-aee81029d0f7 slug: kubernetes-opentelemetry title: Kubernetes (OpenTelemetry) description: | From 599132fb10bd7f3ed19023e69d7ee1dd3b19b87d Mon Sep 17 00:00:00 2001 From: Michel Losier Date: Thu, 17 Oct 2024 11:48:17 -0700 Subject: [PATCH 10/10] triggering the build