Skip to content

Commit

Permalink
add etcd increased-traffic check
Browse files Browse the repository at this point in the history
  • Loading branch information
juanvallejo committed May 30, 2017
1 parent f8ad97d commit 899d89f
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 0 deletions.
95 changes: 95 additions & 0 deletions roles/openshift_health_checker/library/etcdlogs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# pylint: disable=missing-docstring

"""Interface to journalctl"""

from time import time
import json
import re
import subprocess

from ansible.module_utils.basic import AnsibleModule


def exit_json(module, failed, matched, result):
    """Finish the module run by reporting the scan outcome to Ansible.

    The module never modifies the host, so ``changed`` is always False.
    ``failed``, ``matched`` and ``result`` are forwarded unchanged to
    ``module.exit_json``.
    """
    payload = {
        "changed": False,
        "failed": failed,
        "matched": matched,
        "result": result,
    }
    module.exit_json(**payload)


def stamp_too_old(stamp, time_limit):
    """Return True when ``stamp`` lies more than ``time_limit`` seconds in the past.

    ``stamp`` is converted with ``int()`` and divided by 1,000,000 before
    being compared with the current time, i.e. it is treated as a count of
    microseconds since the epoch. ``time_limit`` is expressed in seconds.
    """
    stamp_seconds = int(stamp) / 1000000
    oldest_allowed = time() - time_limit
    return stamp_seconds - oldest_allowed < 0


def _find_matching_entry(raw_lines, matcher, start_matcher, log_count_limit, time_limit_seconds):
    """Return the first journal line whose message matches ``matcher``, or None.

    ``raw_lines`` yields the byte lines printed by journalctl. Scanning stops
    when ``log_count_limit`` lines have been examined, when a message matches
    ``start_matcher``, or when an entry is older than ``time_limit_seconds``.
    Lines that are not JSON objects, or that lack a textual message or a
    usable timestamp, are skipped.
    """
    text_types = (str, type(u""))  # also covers ``unicode`` on Python 2

    for count, raw_line in enumerate(raw_lines, start=1):
        if count > log_count_limit:
            break

        line = raw_line.decode("utf-8", "replace").rstrip()
        try:
            entry = json.loads(line)
        except ValueError:
            continue
        if not isinstance(entry, dict):
            continue

        # journalctl's JSON output stores the log text under "MESSAGE";
        # "msg" is kept as a fallback for the key this module used to read.
        message = entry.get("MESSAGE", entry.get("msg"))
        if not isinstance(message, text_types):
            # journald emits null or a byte array for oversized/binary fields.
            continue

        if start_matcher.match(message):
            break

        try:
            if stamp_too_old(entry["__REALTIME_TIMESTAMP"], time_limit_seconds):
                break
        except (KeyError, TypeError, ValueError):
            continue

        if matcher.match(message):
            return line

    return None


def main():
    """Scan the docker unit's journal for a message matching ``log_matcher``.

    Module arguments:
        log_count_limit: maximum number of journal lines to examine.
        start_match: regular expression; scanning stops at the first message
            that matches it.
        log_matcher: dict whose ``regexp`` entry is the regular expression
            to look for.
        output: value passed to ``journalctl --output``.

    The module exits through ``exit_json`` with ``matched`` set to True and
    ``result`` set to the matching journal line when one is found. ``failed``
    is True when the patterns are invalid or journalctl cannot be started.
    """
    module = AnsibleModule(
        argument_spec=dict(
            log_count_limit=dict(type="int", default=500),
            start_match=dict(type="str", default="Starting Etcd Server"),
            log_matcher=dict(type="dict", required=True),
            output=dict(type="str", default="json"),
        ),
    )

    cmd = [
        '/bin/journalctl',
        '-ru', 'docker',
        '--output', module.params["output"],
    ]

    time_limit_seconds = 60 * 60  # 1 hour

    log_count_limit = module.params["log_count_limit"]
    log_matcher = module.params["log_matcher"]

    regexp = log_matcher.get("regexp")
    if not regexp:
        exit_json(module, True, False, "log_matcher must contain a 'regexp' entry")
        return

    # The pattern is compiled as given: callers pass a plain regular
    # expression, which is not valid JSON and must not go through json.loads.
    try:
        start_matcher = re.compile(module.params["start_match"])
        matcher = re.compile(regexp)
    except (TypeError, re.error) as err:
        exit_json(module, True, False, str(err))
        return

    cmd_result = "{}"
    failed = True
    matched = False
    process = None
    try:
        process = subprocess.Popen(list(cmd), stdout=subprocess.PIPE)

        # The pipe yields bytes, so b'' is the end-of-file sentinel on both
        # Python 2 and Python 3.
        found = _find_matching_entry(
            iter(process.stdout.readline, b''),
            matcher,
            start_matcher,
            log_count_limit,
            time_limit_seconds,
        )
        if found is not None:
            matched = True
            cmd_result = found

        failed = False
    except OSError as exc:
        # Popen raises OSError when journalctl cannot be executed.
        cmd_result = str(exc)
    finally:
        if process is not None:
            # The scan usually stops before journalctl is done; close the
            # pipe and reap the child so it is not left behind.
            process.stdout.close()
            if process.poll() is None:
                try:
                    process.terminate()
                except OSError:
                    pass
            process.wait()

    exit_json(module, failed, matched, cmd_result)


if __name__ == '__main__':
    main()
32 changes: 32 additions & 0 deletions roles/openshift_health_checker/openshift_checks/etcd_traffic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# pylint: disable=missing-docstring
from openshift_checks import OpenShiftCheck, get_var


class EtcdTraffic(OpenShiftCheck):
    """Check the etcd logs for slow-sync warnings that indicate heavy traffic."""

    name = "etcd_traffic"
    tags = ["health", "etcd"]

    @classmethod
    def is_active(cls, task_vars):
        """Run only on hosts that belong to the "masters" or "etcd" group."""
        group_names = get_var(task_vars, "group_names", default=[])
        active = "masters" in group_names or "etcd" in group_names
        return super(EtcdTraffic, cls).is_active(task_vars) and active

    def run(self, tmp, task_vars):
        """Run the ``etcdlogs`` module and fail if a slow-sync message is found.

        Returns an empty dict when nothing matched, otherwise a dict with
        ``failed`` set to True and the explanation under ``msg``.
        """
        match = self.module_executor("etcdlogs", {
            "log_matcher": {
                # Raw string: "\d" and "\." are regex escapes, not string escapes.
                "regexp": r"etcd: sync duration of \d+\.\d+s, expected less than 1s",
                "level": "warning",
            }
        }, task_vars)

        if match["matched"]:
            msg = ("Higher than normal etcd traffic detected. "
                   "OpenShift 3.4 introduced an increase in etcd traffic by at least a factor of 4."
                   "\nUpgrading to OpenShift 3.6 is recommended in order to fix this issue.")
            # The key must be the literal "msg"; using the variable would
            # store the message text as its own key.
            return {"failed": True, "msg": msg}

        return {}

0 comments on commit 899d89f

Please sign in to comment.