# ---------------------------------------------------------------------------
# File: roles/openshift_health_checker/library/etcdlogs.py
# ---------------------------------------------------------------------------
# pylint: disable=missing-docstring

"""Interface to journalctl"""

from time import time
import json
import re
import subprocess

from ansible.module_utils.basic import AnsibleModule


def exit_json(module, failed, matched, result):
    """Report the module outcome through ``module.exit_json``.

    The module only reads logs, so ``changed`` is always False.

    module  -- the AnsibleModule instance used to emit the result
    failed  -- True when the journal could not be inspected
    matched -- True when a log entry matched the requested expression
    result  -- the matching log line, or a description of the failure
    """
    module.exit_json(
        changed=False,
        failed=failed,
        matched=matched,
        result=result,
    )


def stamp_too_old(stamp, time_limit):
    """Return True when ``stamp`` lies more than ``time_limit`` seconds in the past.

    ``stamp`` is divided by 1,000,000 before being compared against
    ``time()``, i.e. it is treated as microseconds since the epoch.
    Raises ValueError when ``stamp`` cannot be converted to an integer.
    """
    epoch = int(stamp) / 1000000
    return epoch < time() - time_limit


def _find_match(raw_lines, start_matcher, matcher, log_count_limit, time_limit):
    """Return the first journal line whose MESSAGE matches ``matcher``, else None.

    ``raw_lines`` yields byte strings (newest entry first, as produced by
    ``journalctl -r``).  Scanning stops at the first entry that matches
    ``start_matcher``, at the first entry older than ``time_limit`` seconds,
    or once ``log_count_limit`` lines have been examined.
    """
    for log_count, raw in enumerate(raw_lines, 1):
        if log_count > log_count_limit:
            break

        # Decode explicitly so behaviour does not depend on the locale or on
        # whether the pipe hands back bytes (Python 3) or str (Python 2).
        entry = raw.decode("utf-8", "replace").rstrip()

        try:
            fields = json.loads(entry)
            message = fields["MESSAGE"]

            if start_matcher.match(message):
                break

            if stamp_too_old(fields["__REALTIME_TIMESTAMP"], time_limit):
                break

            if matcher.match(message):
                return entry

        except (ValueError, KeyError, TypeError):
            # Not JSON, a required field is missing, or MESSAGE is not a
            # string: skip the entry instead of aborting the whole scan.
            continue

    return None


def _stop_process(proc):
    """Terminate ``proc`` if it is still running, close its pipe and reap it."""
    try:
        if proc.poll() is None:
            proc.terminate()
    except OSError:
        # The process exited between poll() and terminate(); nothing to stop.
        pass
    proc.stdout.close()
    proc.wait()


def main():
    """Search recent etcd journal entries for a message matching a regexp.

    Module parameters:
      log_count_limit -- maximum number of journal lines to examine
      start_match     -- regexp marking the service start; scanning stops there
      log_matcher     -- dict whose "regexp" entry is the expression to look for
      output          -- journalctl output format (entries are parsed as JSON)
    """
    module = AnsibleModule(
        argument_spec=dict(
            log_count_limit=dict(type="int", default=500),
            start_match=dict(type="str", default="Starting Etcd Server"),
            log_matcher=dict(type="dict", required=True),
            output=dict(type="str", default="json"),
        ),
    )

    cmd = [
        '/bin/journalctl',
        '-ru', 'etcd',
        '--output', module.params["output"],
    ]

    time_limit_seconds = 60 * 60  # 1 hour

    log_count_limit = module.params["log_count_limit"]
    log_matcher = module.params["log_matcher"]

    # The regexp arrives as a plain string, so it is compiled directly;
    # decoding it as JSON would reject every ordinary expression.
    try:
        start_matcher = re.compile(module.params["start_match"])
        matcher = re.compile(log_matcher["regexp"])
    except KeyError as err:
        exit_json(module, True, False, "log_matcher is missing key {}".format(err))
        return
    except (TypeError, re.error) as err:
        exit_json(module, True, False, "invalid regular expression: {}".format(err))
        return

    cmd_result = "{}"
    failed = True
    matched = False
    proc = None
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)

        # readline() with a b'' sentinel ends at EOF on both Python 2 and 3.
        found = _find_match(
            iter(proc.stdout.readline, b''),
            start_matcher,
            matcher,
            log_count_limit,
            time_limit_seconds,
        )
        if found is not None:
            matched = True
            cmd_result = found

        failed = False
    except (OSError, IOError) as exc:
        # Popen raises OSError when journalctl cannot be executed.
        cmd_result = str(exc)
    finally:
        # The scan usually stops early; do not leave journalctl running.
        if proc is not None:
            _stop_process(proc)

    exit_json(module, failed, matched, cmd_result)


if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# File: roles/openshift_health_checker/openshift_checks/etcd_traffic.py
# ---------------------------------------------------------------------------
# pylint: disable=missing-docstring
from openshift_checks import OpenShiftCheck, get_var


class EtcdTraffic(OpenShiftCheck):
    """Check whether etcd logs report higher than normal traffic."""

    name = "etcd_traffic"
    tags = ["health", "etcd"]

    @classmethod
    def is_active(cls, task_vars):
        """Run only on hosts that belong to the "masters" or "etcd" group."""
        group_names = get_var(task_vars, "group_names", default=[])
        active = "masters" in group_names or "etcd" in group_names
        return super(EtcdTraffic, cls).is_active(task_vars) and active

    def run(self, tmp, task_vars):
        """Fail when the etcdlogs module finds a slow-sync warning in the journal.

        Returns an empty dict when nothing was found, otherwise a dict with
        "failed" set to True and a "msg" describing the problem.
        """
        match = self.module_executor("etcdlogs", {
            "log_matcher": {
                "regexp": r"etcd: sync duration of \d+\.\d+s, expected less than 1s",
                "level": "warning",
            }
        }, task_vars)

        if match.get("matched"):
            msg = ("Higher than normal etcd traffic detected. "
                   "OpenShift 3.4 introduced an increase in etcd traffic by at least a factor of 4."
                   "\nUpgrading to OpenShift 3.6 is recommended in order to fix this issue.")
            return {"failed": True, "msg": msg}

        if match.get("failed"):
            # Surface a module failure instead of reporting a healthy host.
            reason = match.get("result") or match.get("msg") or "etcdlogs module failed"
            return {"failed": True, "msg": reason}

        return {}