Skip to content

Commit

Permalink
Added alerting mechanism for job status
Browse files Browse the repository at this point in the history
tendrl-bug-id: Tendrl/node-agent#602
Signed-off-by: Shubhendu <shtripat@redhat.com>
  • Loading branch information
Shubhendu committed Sep 8, 2017
1 parent 2c36537 commit 380dcc2
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 0 deletions.
40 changes: 40 additions & 0 deletions tendrl/commons/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from tendrl.commons.message import Message
from tendrl.commons.objects import AtomExecutionFailedError
from tendrl.commons.objects.job import Job
from tendrl.commons.utils import alert_utils
from tendrl.commons.utils import time_utils


Expand Down Expand Up @@ -104,6 +105,19 @@ def process_job(job):
_msg = str("Timed-out (>10min as 'new')")
job.errors = _msg
job.save()
if job.payload.get('parent') is None:
alert_utils.alert_job_status(
"failed",
"Job timed out (job_id: %s)" % jid,
integration_id=NS.tendrl_context.integration_id or
job.payload['parameters'].get(
'TendrlContext.integration_id'
),
cluster_name=NS.tendrl_context.cluster_name or
job.payload['parameters'].get(
'TendrlContext.cluster_name'
)
)
return
else:
_now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
Expand Down Expand Up @@ -219,6 +233,19 @@ def process_job(job):
}
)
)
if job.payload.get('parent') is None:
alert_utils.alert_job_status(
"finished",
"Job finished successfully (job_id: %s)" % job.job_id,
integration_id=NS.tendrl_context.integration_id or
job.payload['parameters'].get(
'TendrlContext.integration_id'
),
cluster_name=NS.tendrl_context.cluster_name or
job.payload['parameters'].get(
'TendrlContext.cluster_name'
)
)
except (FlowExecutionFailedError,
AtomExecutionFailedError,
Exception) as e:
Expand Down Expand Up @@ -265,6 +292,19 @@ def process_job(job):
else:
job = job.load()
job.errors = _trace
if job.payload.get('parent') is None:
alert_utils.alert_job_status(
"failed",
"Job failed (job_id: %s)" % job.job_id,
integration_id=NS.tendrl_context.integration_id or
job.payload['parameters'].get(
'TendrlContext.integration_id'
),
cluster_name=NS.tendrl_context.cluster_name or
job.payload['parameters'].get(
'TendrlContext.cluster_name'
)
)
job.save()


Expand Down
37 changes: 37 additions & 0 deletions tendrl/commons/utils/alert_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import json
import os

from tendrl.commons.utils import log_utils as logger
from tendrl.commons.utils.time_utils import now as tendrl_now


def alert_job_status(curr_value, msg, integration_id=None, cluster_name=None):
    """Build a job-status alert and hand it to the logger as JSON.

    The alert is a dict with resource ``job_status`` and alert type
    ``STATUS``; it is serialized with ``json.dumps`` and passed to
    ``logger.log`` with priority ``notice`` and publisher ``alerting``.

    :param curr_value: job status string; stored as ``current_value``.
        A value equal to "failed" (case-insensitive) yields severity
        ``WARNING``, anything else yields ``INFO``.
    :param msg: text placed in the alert tags under ``message``.
    :param integration_id: optional; when falsy,
        ``NS.tendrl_context.integration_id`` is used instead.
    :param cluster_name: optional; when falsy,
        ``NS.tendrl_context.cluster_name`` is used instead.
    :returns: None. Nothing is logged when ``NS.node_context.node_id``
        is falsy.
    """
    # NOTE(review): ``NS`` is not imported in this module; presumably it is
    # a process-wide namespace injected elsewhere -- confirm.
    job_alert = {
        'source': NS.publisher_id,
        'classification': 'cluster',
        'pid': os.getpid(),
        'time_stamp': tendrl_now().isoformat(),
        'alert_type': 'STATUS',
        'severity': (
            "WARNING" if curr_value.lower() == "failed" else "INFO"
        ),
        'resource': 'job_status',
        'current_value': curr_value,
        'tags': {
            'message': msg,
            # Explicit arguments win; the node's own context is the fallback.
            'integration_id': (
                integration_id or NS.tendrl_context.integration_id
            ),
            'cluster_name': (
                cluster_name or NS.tendrl_context.cluster_name
            ),
            'sds_name': NS.tendrl_context.sds_name,
            'fqdn': NS.node_context.fqdn,
        },
        'node_id': NS.node_context.node_id,
    }

    # Without a node id the alert is dropped rather than emitted.
    if not NS.node_context.node_id:
        return

    logger.log("notice", "alerting", {'message': json.dumps(job_alert)})
1 change: 1 addition & 0 deletions tendrl/commons/utils/monitoring_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ def update_dashboard(res_name, res_type, integration_id, action):
_job_id = str(uuid.uuid4())
_params = {
"TendrlContext.integration_id": NS.tendrl_context.integration_id,
"TendrlContext.cluster_name": NS.tendrl_context.cluster_name,
"Trigger.resource_name": res_name,
"Trigger.resource_type": res_type,
"Trigger.action": action
Expand Down

0 comments on commit 380dcc2

Please sign in to comment.