Commit 153fbaa

add fluentd logging driver config check

juanvallejo committed Jul 27, 2017
1 parent 1cd8964 commit 153fbaa
Showing 11 changed files with 563 additions and 109 deletions.
23 changes: 23 additions & 0 deletions roles/openshift_health_checker/openshift_checks/__init__.py
@@ -105,6 +105,29 @@ def get_var(self, *keys, **kwargs):
             raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
         return value
 
+    @staticmethod
+    def get_major_minor_version(openshift_image_tag):
+        """Parse and return the deployed version of OpenShift as a tuple."""
+        if openshift_image_tag and openshift_image_tag[0] == 'v':
+            openshift_image_tag = openshift_image_tag[1:]
+
+        # map major release versions across releases
+        # to a common major version
+        openshift_major_release_version = {
+            "1": "3",
+        }
+
+        components = openshift_image_tag.split(".")
+        if not components or len(components) < 2:
+            msg = "An invalid version of OpenShift was found for this host: {}"
+            raise OpenShiftCheckException(msg.format(openshift_image_tag))
+
+        if components[0] in openshift_major_release_version:
+            components[0] = openshift_major_release_version[components[0]]
+
+        components = tuple(int(x) for x in components[:2])
+        return components
+
 
 LOADER_EXCLUDES = (
     "__init__.py",
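For reference: the new helper strips a leading "v", maps origin 1.x tags onto the corresponding OCP 3.x version, and keeps only the first two components, so checks can gate on comparisons such as `version < (3, 6)` (as `fluentd_config.is_active` does below). A standalone sketch of the same behavior, with a stand-in exception class:

class OpenShiftCheckException(Exception):
    """Stand-in for openshift_checks.OpenShiftCheckException."""


def get_major_minor_version(openshift_image_tag):
    # Mirrors the static method added above.
    if openshift_image_tag and openshift_image_tag[0] == 'v':
        openshift_image_tag = openshift_image_tag[1:]
    components = openshift_image_tag.split(".")
    if len(components) < 2:
        msg = "An invalid version of OpenShift was found for this host: {}"
        raise OpenShiftCheckException(msg.format(openshift_image_tag))
    if components[0] == "1":  # origin 1.x tags map to OCP 3.x
        components[0] = "3"
    return tuple(int(x) for x in components[:2])


assert get_major_minor_version("v3.6.173") == (3, 6)
assert get_major_minor_version("v1.5.1") == (3, 5)  # mapped across releases
assert get_major_minor_version("v1.5.1") < (3, 6)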
roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -9,13 +9,11 @@ class Curator(LoggingCheck):
     name = "curator"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        curator_pods, error = super(Curator, self).get_pods_for_component(
+        curator_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "curator",
         )
@@ -23,7 +23,6 @@ def run(self):
 
         if check_error:
             msg = ("The following Curator deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -39,7 +38,7 @@ def check_curator(self, pods):
                 "Is Curator correctly deployed?"
             )
 
-        not_running = super(Curator, self).not_running_pods(pods)
+        not_running = self.not_running_pods(pods)
         if len(not_running) == len(pods):
            return (
                 "The Curator pod is not currently in a running state,\n"
roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -12,13 +12,11 @@ class Elasticsearch(LoggingCheck):
     name = "elasticsearch"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+        es_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "es",
         )
@@ -28,7 +26,6 @@ def run(self):
 
         if check_error:
             msg = ("The following Elasticsearch deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -37,7 +34,7 @@ def run(self):
 
     def _not_running_elasticsearch_pods(self, es_pods):
         """Returns: list of pods that are not running, list of errors about non-running pods"""
-        not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+        not_running = self.not_running_pods(es_pods)
         if not_running:
             return not_running, [(
                 'The following Elasticsearch pods are not running:\n'
@@ -78,7 +75,7 @@ def _check_elasticsearch_masters(self, pods_by_name):
         for pod_name in pods_by_name.keys():
             # Compare what each ES node reports as master and compare for split brain
             get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
-            master_name_str = self._exec_oc(get_master_cmd, [])
+            master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, [])
             master_names = (master_name_str or '').split(' ')
             if len(master_names) > 1:
                 es_master_names.add(master_names[1])
@@ -111,7 +108,7 @@ def _check_elasticsearch_node_list(self, pods_by_name):
 
         # get ES cluster nodes
         node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
-        cluster_node_data = self._exec_oc(node_cmd, [])
+        cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, [])
         try:
             cluster_nodes = json.loads(cluster_node_data)['nodes']
         except (ValueError, KeyError):
@@ -138,7 +135,7 @@ def _check_es_cluster_health(self, pods_by_name):
         error_msgs = []
         for pod_name in pods_by_name.keys():
             cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
-            cluster_health_data = self._exec_oc(cluster_health_cmd, [])
+            cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, [])
             try:
                 health_res = json.loads(cluster_health_data)
                 if not health_res or not health_res.get('status'):
@@ -165,7 +162,7 @@ def _check_elasticsearch_diskspace(self, pods_by_name):
         error_msgs = []
         for pod_name in pods_by_name.keys():
             df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
-            disk_output = self._exec_oc(df_cmd, [])
+            disk_output = self.exec_oc(self.logging_namespace, df_cmd, [])
             lines = disk_output.splitlines()
             # expecting one header looking like 'IUse% Use%' and one body line
             body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
@@ -201,10 +198,3 @@ def _check_elasticsearch_diskspace(self, pods_by_name):
             ))
 
         return error_msgs
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Elasticsearch, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
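A note on `_check_elasticsearch_masters` above: every ES pod is asked for `_cat/master`, the second whitespace-separated field of each reply is collected into a set, and more than one distinct name means the cluster has split-brained. A rough standalone illustration, with invented `_cat/master` replies (the field layout is assumed from the check):

# Invented `_cat/master` replies keyed by pod name; the master's name is
# taken from the second whitespace-separated field, as in the check above.
replies = {
    "logging-es-abc": "x1y2z3 logging-es-abc 10.128.0.4 logging-es-abc",
    "logging-es-def": "x1y2z3 logging-es-abc 10.128.0.4 logging-es-abc",
}

es_master_names = set()
for master_name_str in replies.values():
    master_names = (master_name_str or '').split(' ')
    if len(master_names) > 1:
        es_master_names.add(master_names[1])

split_brain = len(es_master_names) > 1  # False here: both pods agree on one master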
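Similarly, `_check_elasticsearch_diskspace` pulls an inode-usage and a disk-usage percentage out of `df --output=ipcent,pcent`; a quick sketch against sample output (values invented):

import re

# Invented sample output of `df --output=ipcent,pcent /elasticsearch/persistent`:
# one header line ('IUse% Use%') followed by one body line.
disk_output = "IUse% Use%\n   1%  54%\n"

lines = disk_output.splitlines()
body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
match = re.match(body_re, lines[1])
inode_pct, disk_pct = int(match.group(1)), int(match.group(2))  # 1, 54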
16 changes: 5 additions & 11 deletions roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -11,8 +11,6 @@ class Fluentd(LoggingCheck):
     name = "fluentd"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
@@ -27,7 +25,6 @@ def run(self):
 
         if check_error:
             msg = ("The following Fluentd deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -147,7 +144,11 @@ def check_fluentd(self, pods):
 
     def get_nodes_by_name(self):
         """Retrieve all the node definitions. Returns: dict(name: node), error string"""
-        nodes_json = self._exec_oc("get nodes -o json", [])
+        nodes_json = self.exec_oc(
+            self.logging_namespace,
+            "get nodes -o json",
+            []
+        )
         try:
             nodes = json.loads(nodes_json)
         except ValueError:  # no valid json - should not happen
@@ -158,10 +159,3 @@ def get_nodes_by_name(self):
             node['metadata']['name']: node
             for node in nodes['items']
         }, None
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Fluentd, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
138 changes: 138 additions & 0 deletions roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,138 @@
"""
Module for performing checks on a Fluentd logging deployment configuration
"""

from openshift_checks import OpenShiftCheckException
from openshift_checks.logging.logging import LoggingCheck


class FluentdConfig(LoggingCheck):
"""Module that checks logging configuration of an integrated logging Fluentd deployment"""
name = "fluentd_config"
tags = ["health"]

def is_active(self):
logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)

try:
version = self.get_major_minor_version(self.get_var("openshift_image_tag"))
except ValueError:
# if failed to parse OpenShift version, perform check anyway (if logging enabled)
return logging_deployed

return logging_deployed and version < (3, 6)

def run(self):
"""Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
config_error = self.check_logging_config()
if config_error:
msg = ("The following Fluentd logging configuration problem was found:"
"\n{}".format(config_error))
return {"failed": True, "msg": msg}

return {}

def check_logging_config(self):
"""Ensure that the configured Docker logging driver matches fluentd settings.
This means that, at least for now, if the following condition is met:
openshift_logging_fluentd_use_journal == True
then the value of the configured Docker logging driver should be "journald".
Otherwise, the value of the Docker logging driver should be "json-file".
Returns an error string if the above condition is not met, or None otherwise."""
use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)

# if check is running on a master, retrieve all running pods
# and check any pod's container for the env var "USE_JOURNAL"
group_names = self.get_var("group_names")
if "masters" in group_names:
use_journald = self.check_fluentd_env_var()

+        docker_info = self.execute_module("docker_info", {})
+        try:
+            logging_driver = docker_info["info"]["LoggingDriver"]
+        except KeyError:
+            return "Unable to determine Docker logging driver."
+
+        recommended_logging_driver = "journald"
+        error = None
+
+        # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+        # option as an inventory file variable, or adding the log driver value as part of the
+        # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+        # can be passed to the Docker binary; the only other recommendation that can be made would be
+        # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+        # individual containers.
+        if use_journald and logging_driver != "journald":
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+                     'This differs from your Docker configuration, which has been set to use "{driver}" '
+                     'as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+                     'from your Docker containers.').format(driver=logging_driver)
+        elif not use_journald and logging_driver != "json-file":
+            recommended_logging_driver = "json-file"
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+                     'individual json log files per container.\n'
+                     'This differs from your Docker configuration, which has been set to use '
+                     '"{driver}" as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+                     'from your Docker containers.').format(driver=logging_driver)
+
+        if error:
+            error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+                      '  openshift_docker_options="--log-driver={driver}"\n\n'
+                      'Alternatively, you can add the following option to your Docker configuration, located in '
+                      '"/etc/docker/daemon.json":\n\n'
+                      '{{ "log-driver": "{driver}" }}\n\n'
+                      'See https://docs.docker.com/engine/admin/logging/json-file '
+                      'for more information.').format(driver=recommended_logging_driver)
+
+        return error
+
+    def check_fluentd_env_var(self):
+        """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+        running_pods = self.running_fluentd_pods()
+
+        try:
+            pod_containers = running_pods[0]["spec"]["containers"]
+        except KeyError:
+            return "Unable to detect running containers on selected Fluentd pod."
+
+        if not pod_containers:
+            msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+                   'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+            raise OpenShiftCheckException(msg)
+
+        pod_env = pod_containers[0].get("env")
+        if not pod_env:
+            msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+                   'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+            raise OpenShiftCheckException(msg)
+
+        for env in pod_env:
+            if env["name"] == "USE_JOURNAL":
+                return env.get("value", "false") != "false"
+
+        return False
+
+    def running_fluentd_pods(self):
+        """Return a list of running fluentd pods."""
+        fluentd_pods, error = self.get_pods_for_component(
+            self.logging_namespace,
+            "fluentd",
+        )
+        if error:
+            msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error)
+            raise OpenShiftCheckException(msg)
+
+        running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+        if not running_fluentd_pods:
+            msg = ('No Fluentd pods were found to be in the "Running" state. '
+                   'At least one Fluentd pod is required in order to perform this check.')
+
+            raise OpenShiftCheckException(msg)
+
+        return running_fluentd_pods
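The heart of the new check is a two-row expectation table: when `openshift_logging_fluentd_use_journal` is true, Docker's default log driver must be "journald"; otherwise it must be "json-file", since Fluentd then reads the per-container JSON log files. A condensed standalone sketch of that decision, including the string-to-boolean handling of `USE_JOURNAL` (the container spec and `docker_info` payload are invented examples):

def expected_logging_driver(use_journald):
    # Fluentd either tails the journal or the per-container json log files.
    return "journald" if use_journald else "json-file"


def use_journal_from_container(container):
    # USE_JOURNAL arrives as a string in the pod spec; anything other than
    # the literal "false" counts as enabled, as in check_fluentd_env_var.
    for env in container.get("env", []):
        if env["name"] == "USE_JOURNAL":
            return env.get("value", "false") != "false"
    return False


container = {"env": [{"name": "USE_JOURNAL", "value": "true"}]}  # invented
docker_info = {"info": {"LoggingDriver": "json-file"}}           # invented

use_journald = use_journal_from_container(container)             # True
driver = docker_info["info"]["LoggingDriver"]                    # "json-file"
mismatch = driver != expected_logging_driver(use_journald)       # True -> check fails

On a mismatch, the error text above points at the two supported fixes: `openshift_docker_options="--log-driver=..."` in the Ansible inventory, or a `"log-driver"` entry in /etc/docker/daemon.json.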
18 changes: 6 additions & 12 deletions roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -21,13 +21,11 @@ class Kibana(LoggingCheck):
     name = "kibana"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        kibana_pods, error = super(Kibana, self).get_pods_for_component(
+        kibana_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "kibana",
         )
@@ -40,7 +38,6 @@ def run(self):
 
         if check_error:
             msg = ("The following Kibana deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -118,7 +115,11 @@ def _get_kibana_url(self):
         """
 
         # Get logging url
-        get_route = self._exec_oc("get route logging-kibana -o json", [])
+        get_route = self.exec_oc(
+            self.logging_namespace,
+            "get route logging-kibana -o json",
+            [],
+        )
         if not get_route:
             return None, 'no_route_exists'
 
@@ -217,10 +218,3 @@ def _check_kibana_route(self):
             ).format(error=error)
             return error
         return None
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Kibana, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )