Skip to content

Commit

Permalink
WIP - Added alerting mechanism for job status
Browse files Browse the repository at this point in the history
tendrl-bug-id: Tendrl/node-agent#602
Signed-off-by: Shubhendu <shtripat@redhat.com>
  • Loading branch information
Shubhendu committed Sep 6, 2017
1 parent 2c36537 commit cb3db7e
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 0 deletions.
13 changes: 13 additions & 0 deletions tendrl/commons/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from tendrl.commons.message import Message
from tendrl.commons.objects import AtomExecutionFailedError
from tendrl.commons.objects.job import Job
from tendrl.commons.utils import alert_utils
from tendrl.commons.utils import time_utils


Expand Down Expand Up @@ -104,6 +105,10 @@ def process_job(job):
_msg = str("Timed-out (>10min as 'new')")
job.errors = _msg
job.save()
alert_utils.alert_job_status(
"failed",
"Job %s timed out" % jid
)
return
else:
_now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
Expand Down Expand Up @@ -219,6 +224,10 @@ def process_job(job):
}
)
)
alert_utils.alert_job_status(
"finished",
"Job %s finished successfully" % job.job_id
)
except (FlowExecutionFailedError,
AtomExecutionFailedError,
Exception) as e:
Expand Down Expand Up @@ -265,6 +274,10 @@ def process_job(job):
else:
job = job.load()
job.errors = _trace
alert_utils.alert_job_status(
"failed",
"Job %s failed" % job.job_id
)
job.save()


Expand Down
40 changes: 40 additions & 0 deletions tendrl/commons/utils/alert_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import json
import os
import socket

from tendrl.commons.event import Event
from tendrl.commons.message import Message
from tendrl.commons.utils.time_utils import now as tendrl_now


def alert_job_status(curr_value, msg):
    """Emit a cluster-level STATUS alert describing a job's outcome.

    The alert is serialised to JSON and published as a "notice" priority
    message from the "alerting" publisher via ``Event(Message(...))``.

    :param curr_value: job status string (callers in the jobs module pass
        "failed" or "finished"). A case-insensitive "failed" is reported
        with severity WARNING; any other value is reported as INFO.
    :param msg: human readable description placed in the alert tags.
    :returns: None. Nothing is emitted when the node id is not set.

    NOTE(review): ``NS`` is not imported in this module; it is presumably
    the namespace object Tendrl makes globally available -- confirm.
    """
    # Bail out before building the payload (and before the FQDN lookup)
    # when there is no node id to attribute the alert to.
    node_id = NS.node_context.node_id
    if not node_id:
        return

    # Only a failed job is escalated; every other status is informational.
    if curr_value.lower() == "failed":
        severity = "WARNING"
    else:
        severity = "INFO"

    tendrl_context = NS.tendrl_context
    alert = {
        'source': NS.publisher_id,
        'classification': 'cluster',
        'pid': os.getpid(),
        'time_stamp': tendrl_now().isoformat(),
        'alert_type': 'STATUS',
        'severity': severity,
        'resource': 'cluster',
        'current_value': curr_value,
        'tags': {
            'message': msg,
            'integration_id': tendrl_context.integration_id,
            'cluster_name': tendrl_context.cluster_name,
            'sds_name': tendrl_context.sds_name,
            'fqdn': socket.getfqdn(),
        },
        'node_id': node_id,
    }

    # No debugger breakpoint here: this runs inside the job-processing
    # service, where pdb.set_trace() would block waiting on stdin.
    Event(
        Message(
            "notice",
            "alerting",
            {'message': json.dumps(alert)}
        )
    )

0 comments on commit cb3db7e

Please sign in to comment.