Modifying alerting logic based on alert classifications #219

Merged: 2 commits, Oct 30, 2017
2 changes: 0 additions & 2 deletions tendrl/monitoring_integration/alert/handlers/__init__.py
@@ -79,7 +79,6 @@ def _load_handlers(self):
             # node.cpu_utilization
             # cluster.gluster.cluster_utilization
             alert_classification = alert_handlers.rsplit(".", 1)[0]
-            cls.classification = alert_classification.split(".")[0]
             alert_classification = alert_classification.replace(
                 ".", "/")
             if alert_classification in self.alert_types:
@@ -106,7 +105,6 @@ def handle_alert(self, alert_id):
         handled_alert = False
         for handler in AlertHandler.handlers:
             if handler.handles in alert_json['Name'].lower():
-                alert_json["classification"] = handler.classification
                 handler.handle(alert_json)
                 handled_alert = True
         if not handled_alert:
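Note: a quick worked example of the classification parsing kept above, using the handler names from the comments in the hunk (nothing beyond the shown logic is assumed):

    >>> for name in ("node.cpu_utilization", "cluster.gluster.cluster_utilization"):
    ...     classification = name.rsplit(".", 1)[0]
    ...     print(classification.replace(".", "/"))
    node
    cluster/gluster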
@@ -13,7 +13,7 @@
 class BrickHandler(AlertHandler):
 
     handles = 'brick'
-    representive_name = 'bricks_utilization_alert'
+    representive_name = 'brick_utilization'
 
     def __init__(self):
         AlertHandler.__init__(self)
@@ -37,7 +37,6 @@ def format_alert(self, alert_json):
         alert['significance'] = constants.SIGNIFICANCE_HIGH
         alert['pid'] = utils.find_grafana_pid()
         alert['source'] = constants.ALERT_SOURCE
-        alert['classification'] = alert_json["classification"]
         alert['tags']['cluster_name'] = utils.find_cluster_name(
             alert['tags']['integration_id'])
         if alert['severity'] == "WARNING":
@@ -13,7 +13,7 @@
 class ClusterHandler(AlertHandler):
 
     handles = 'cluster'
-    representive_name = 'cluster_utilization_alert'
+    representive_name = 'cluster_utilization'
 
     def __init__(self):
         AlertHandler.__init__(self)
@@ -33,7 +33,6 @@ def format_alert(self, alert_json):
         alert['significance'] = constants.SIGNIFICANCE_HIGH
         alert['pid'] = utils.find_grafana_pid()
         alert['source'] = constants.ALERT_SOURCE
-        alert['classification'] = alert_json["classification"]
         alert['tags']['cluster_name'] = utils.find_cluster_name(
             alert['tags']['integration_id'])
         if alert['severity'] == "WARNING":
@@ -13,12 +13,12 @@
 class VolumeHandler(AlertHandler):
 
     handles = 'volume'
-    representive_name = 'volume_utilization_alert'
+    representive_name = 'volume_utilization'
 
     def __init__(self):
         AlertHandler.__init__(self)
         self.template = "tendrl.clusters.{cluster_id}.volumes.{volume_name}."\
-            "nodes.*.bricks.*.utilization.gauge-total"
+            "pcnt_used"
 
     def format_alert(self, alert_json):
         alert = self.parse_alert_metrics(alert_json)
@@ -33,7 +33,6 @@ def format_alert(self, alert_json):
         alert['significance'] = constants.SIGNIFICANCE_HIGH
         alert['pid'] = utils.find_grafana_pid()
         alert['source'] = constants.ALERT_SOURCE
-        alert['classification'] = alert_json["classification"]
         alert['tags']['cluster_name'] = utils.find_cluster_name(
             alert['tags']['integration_id'])
         if alert['severity'] == "WARNING":
@@ -13,18 +13,18 @@
 class CpuHandler(AlertHandler):
 
     handles = 'cpu'
-    representive_name = 'cpu_alert'
+    representive_name = 'cpu_utilization'
 
     def __init__(self):
         AlertHandler.__init__(self)
-        self.template = "tendrl.clusters.{cluster_id}.nodes.{host_name}.cpu"
+        self.template = "tendrl.clusters.{integration_id}.nodes.{host_name}.cpu"
 
     def format_alert(self, alert_json):
-        alert = self.parse_alert_metrics(alert_json)
+        alert, integration_id = self.parse_alert_metrics(alert_json)
         try:
             alert["alert_id"] = None
             alert["node_id"] = utils.find_node_id(
-                alert['tags']['integration_id'],
+                integration_id,
                 alert['tags']['fqdn']
             )
             alert["time_stamp"] = alert_json['NewStateDate']
@@ -36,7 +36,6 @@ def format_alert(self, alert_json):
         alert['pid'] = utils.find_grafana_pid()
         alert['source'] = constants.ALERT_SOURCE
         alert['tags']['fqdn'] = alert['tags']['fqdn']
-        alert['classification'] = alert_json["classification"]
         if alert['severity'] == "WARNING":
             alert['tags']['message'] = (
                 "Cpu utilization of node %s is"
@@ -122,6 +121,6 @@ def parse_alert_metrics(self, alert_json):
         # Cpu target is an aggregation, so splitting and taking [0]
         # because both have the same cluster and node ids
         result = utils.parse_target(target, self.template)
-        alert['tags']['integration_id'] = result["cluster_id"]
+        integration_id = result["integration_id"]
         alert["tags"]["fqdn"] = result["host_name"].replace("_", ".")
-        return alert
+        return alert, integration_id
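Note: utils.parse_target itself is not part of this diff, so the snippet below is only an illustrative sketch of the behaviour the CPU/memory/swap handlers rely on after this change: the template's placeholders are matched against the Graphite target and returned by name.

    import re

    def parse_target_sketch(target, template):
        # Turn "{integration_id}" / "{host_name}" into named regex groups,
        # then match the template against the start of the target string.
        pattern = re.sub(r"\\{(\w+)\\}", r"(?P<\1>[^.]+)", re.escape(template))
        match = re.match(pattern, target)
        return match.groupdict() if match else {}

    result = parse_target_sketch(
        "tendrl.clusters.ab12.nodes.node1_example_com.cpu.percent-user",
        "tendrl.clusters.{integration_id}.nodes.{host_name}.cpu",
    )
    # result == {"integration_id": "ab12", "host_name": "node1_example_com"}
    # "node1_example_com".replace("_", ".") then restores the FQDN, as above.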
@@ -13,18 +13,18 @@
 class MemoryHandler(AlertHandler):
 
     handles = 'memory'
-    representive_name = 'memory_alert'
+    representive_name = 'memory_utilization'
 
     def __init__(self):
         AlertHandler.__init__(self)
-        self.template = "tendrl.clusters.{cluster_id}.nodes.{host_name}.memory"
+        self.template = "tendrl.clusters.{integration_id}.nodes.{host_name}.memory"
 
     def format_alert(self, alert_json):
-        alert = self.parse_alert_metrics(alert_json)
+        alert, integration_id = self.parse_alert_metrics(alert_json)
         try:
             alert["alert_id"] = None
             alert["node_id"] = utils.find_node_id(
-                alert['tags']['integration_id'],
+                integration_id,
                 alert['tags']['fqdn']
             )
             alert["time_stamp"] = alert_json['NewStateDate']
@@ -36,7 +36,6 @@ def format_alert(self, alert_json):
         alert['pid'] = utils.find_grafana_pid()
         alert['source'] = constants.ALERT_SOURCE
         alert['tags']['fqdn'] = alert['tags']['fqdn']
-        alert['classification'] = alert_json["classification"]
         if alert['severity'] == "WARNING":
             alert['tags']['message'] = (
                 "Memory utilization of node %s is"
@@ -116,6 +115,6 @@ def parse_alert_metrics(self, alert_json):
             alert_json['Settings']['conditions'][0]['evaluator']['params'])
         # identifying cluster_id and node_id from target
         result = utils.parse_target(target, self.template)
-        alert['tags']['integration_id'] = result["cluster_id"]
+        integration_id = result["integration_id"]
         alert["tags"]["fqdn"] = result["host_name"].replace("_", ".")
-        return alert
+        return alert, integration_id
@@ -13,18 +13,18 @@
 class SwapHandler(AlertHandler):
 
     handles = 'swap'
-    representive_name = 'swap_alert'
+    representive_name = 'swap_utilization'
 
     def __init__(self):
         AlertHandler.__init__(self)
-        self.template = "tendrl.clusters.{cluster_id}.nodes.{host_name}.swap"
+        self.template = "tendrl.clusters.{integration_id}.nodes.{host_name}.swap"
 
     def format_alert(self, alert_json):
-        alert = self.parse_alert_metrics(alert_json)
+        alert, integration_id = self.parse_alert_metrics(alert_json)
         try:
             alert["alert_id"] = None
             alert["node_id"] = utils.find_node_id(
-                alert['tags']['integration_id'],
+                integration_id,
                 alert['tags']['fqdn']
             )
             alert["time_stamp"] = alert_json['NewStateDate']
@@ -36,7 +36,6 @@ def format_alert(self, alert_json):
         alert['pid'] = utils.find_grafana_pid()
         alert['source'] = constants.ALERT_SOURCE
         alert['tags']['fqdn'] = alert['tags']['fqdn']
-        alert['classification'] = alert_json["classification"]
         if alert['severity'] == "WARNING":
             alert['tags']['message'] = ("Swap utilization of node %s is"
                                         " %s which is above the %s "
@@ -115,6 +114,6 @@ def parse_alert_metrics(self, alert_json):
             alert_json['Settings']['conditions'][0]['evaluator']['params'])
         # identifying cluster_id and node_id from target
         result = utils.parse_target(target, self.template)
-        alert['tags']['integration_id'] = result["cluster_id"]
+        integration_id = result["integration_id"]
         alert["tags"]["fqdn"] = result["host_name"].replace("_", ".")
-        return alert
+        return alert, integration_id
46 changes: 28 additions & 18 deletions tendrl/monitoring_integration/grafana/webhook_receiver.py
@@ -2,9 +2,11 @@
 import threading
 
 from werkzeug.serving import run_simple
+from werkzeug.wrappers import Request, Response
 
 from tendrl.commons.event import Event
 from tendrl.commons.message import ExceptionMessage
+from tendrl.commons.utils import log_utils as logger
 from tendrl.monitoring_integration.alert.handlers import AlertHandlerManager
 
 HOST = "127.0.0.1"
@@ -20,23 +22,30 @@ def __init__(self):
     def _application(self, env, start_response):
         try:
             if env['PATH_INFO'] != '/grafana_callback':
-                start_response(
-                    '404 Not Found',
-                    [('Content-Type', 'text/html')]
-                )
-                response = [b'<h1>Alert Not Found</h1>']
+                response = Response('Alert not found')
+                response.headers['content-length'] = len(response.data)
+                response.status_code = 404
             else:
-                data = env['wsgi.input'].read()
+                data = env['wsgi.input'].read(
+                    int(env['CONTENT_LENGTH'])
+                )
                 data = json.loads(data)
-                self.alert_handler.handle_alert(
-                    data["ruleId"]
-                )
-                start_response(
-                    '200 OK',
-                    [('Content-Type', 'text/html')]
-                )
-                response = [b'<h1>Alert Received</h1>']
-        except (IOError, AssertionError) as ex:
+                if "ruleId" in data:
+                    self.alert_handler.handle_alert(
+                        data["ruleId"]
+                    )
+                    response = Response('Alert received successfully')
+                    response.headers['content-length'] = len(response.data)
+                    response.status_code = 200
+                else:
+                    logger.log(
+                        "error",
+                        NS.publisher_id,
+                        {
+                            "message": "Unable to find ruleId %s" % data
+                        }
+                    )
+        except (IOError, AssertionError, KeyError) as ex:
             Event(
                 ExceptionMessage(
                     priority="error",
@@ -47,9 +56,10 @@ def _application(self, env, start_response):
                 }
             )
         )
-        response = [b'<h1>Error in reading alert from socket</h1>']
-
-        return response
+        response = Response('Error in reading alert from socket')
+        response.headers['content-length'] = len(response.data)
+        response.status_code = 500
+        return response(env, start_response)
 
     def run(self):
         try:
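Note: a minimal way to exercise the reworked receiver is to POST a fake Grafana notification to the /grafana_callback path. This is only a sketch: the port is a placeholder (PORT is defined in this module but collapsed out of the diff), and a real Grafana payload carries many more fields than "ruleId", which is the only key the handler reads here.

    import json

    import requests  # assumption: python-requests is available

    payload = {"ruleId": 7}  # hypothetical rule id
    resp = requests.post(
        "http://127.0.0.1:8789/grafana_callback",  # 8789 is a placeholder port
        data=json.dumps(payload),
        headers={"Content-Type": "application/json"},
    )
    print(resp.status_code, resp.text)
    # Expect 200 ("Alert received successfully") for a known ruleId,
    # 404 for any other path, and 500 if reading or handling the alert fails.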