add fluentd logging driver config check #4592

Merged
23 changes: 23 additions & 0 deletions roles/openshift_health_checker/openshift_checks/__init__.py
@@ -105,6 +105,29 @@ def get_var(self, *keys, **kwargs):
             raise OpenShiftCheckException("'{}' is undefined".format(".".join(map(str, keys))))
         return value
 
+    @staticmethod
+    def get_major_minor_version(openshift_image_tag):
+        """Parse and return the deployed version of OpenShift as a tuple."""
+        if openshift_image_tag and openshift_image_tag[0] == 'v':
+            openshift_image_tag = openshift_image_tag[1:]
+
+        # map major release versions across releases
+        # to a common major version
+        openshift_major_release_version = {
+            "1": "3",
+        }
+
+        components = openshift_image_tag.split(".")
+        if not components or len(components) < 2:
+            msg = "An invalid version of OpenShift was found for this host: {}"
+            raise OpenShiftCheckException(msg.format(openshift_image_tag))
+
+        if components[0] in openshift_major_release_version:
+            components[0] = openshift_major_release_version[components[0]]
+
+        components = tuple(int(x) for x in components[:2])
+        return components
+
 
 LOADER_EXCLUDES = (
     "__init__.py",
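A quick sketch of how the new parser behaves, assuming the enclosing class in `openshift_checks/__init__.py` is `OpenShiftCheck` (the tags below are illustrative):

```python
from openshift_checks import OpenShiftCheck, OpenShiftCheckException  # assumed import path

assert OpenShiftCheck.get_major_minor_version("v3.6.173.0.5") == (3, 6)
assert OpenShiftCheck.get_major_minor_version("1.5.1") == (3, 5)  # origin 1.x maps onto 3.x

try:
    OpenShiftCheck.get_major_minor_version("v3")  # fewer than two components
except OpenShiftCheckException:
    pass
```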
roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -9,11 +9,11 @@ class Curator(LoggingCheck):
     name = "curator"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        curator_pods, error = super(Curator, self).get_pods_for_component(
+        curator_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "curator",
         )
@@ -23,7 +23,6 @@ def run(self):
 
         if check_error:
             msg = ("The following Curator deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -39,7 +38,7 @@ def check_curator(self, pods):
             "Is Curator correctly deployed?"
         )
 
-        not_running = super(Curator, self).not_running_pods(pods)
+        not_running = self.not_running_pods(pods)
         if len(not_running) == len(pods):
             return (
                 "The Curator pod is not currently in a running state,\n"
roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -12,13 +12,11 @@ class Elasticsearch(LoggingCheck):
     name = "elasticsearch"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        es_pods, error = super(Elasticsearch, self).get_pods_for_component(
+        es_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "es",
         )
@@ -28,7 +26,6 @@ def run(self):
 
         if check_error:
             msg = ("The following Elasticsearch deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -37,7 +34,7 @@ def run(self):
 
     def _not_running_elasticsearch_pods(self, es_pods):
         """Returns: list of pods that are not running, list of errors about non-running pods"""
-        not_running = super(Elasticsearch, self).not_running_pods(es_pods)
+        not_running = self.not_running_pods(es_pods)
         if not_running:
             return not_running, [(
                 'The following Elasticsearch pods are not running:\n'
@@ -78,7 +75,7 @@ def _check_elasticsearch_masters(self, pods_by_name):
         for pod_name in pods_by_name.keys():
             # Compare what each ES node reports as master and compare for split brain
             get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
-            master_name_str = self._exec_oc(get_master_cmd, [])
+            master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, [])
             master_names = (master_name_str or '').split(' ')
             if len(master_names) > 1:
                 es_master_names.add(master_names[1])
@@ -111,7 +108,7 @@ def _check_elasticsearch_node_list(self, pods_by_name):
 
         # get ES cluster nodes
         node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
-        cluster_node_data = self._exec_oc(node_cmd, [])
+        cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, [])
         try:
             cluster_nodes = json.loads(cluster_node_data)['nodes']
         except (ValueError, KeyError):
@@ -138,7 +135,7 @@ def _check_es_cluster_health(self, pods_by_name):
         error_msgs = []
         for pod_name in pods_by_name.keys():
             cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
-            cluster_health_data = self._exec_oc(cluster_health_cmd, [])
+            cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, [])
             try:
                 health_res = json.loads(cluster_health_data)
                 if not health_res or not health_res.get('status'):
@@ -165,7 +162,7 @@ def _check_elasticsearch_diskspace(self, pods_by_name):
         error_msgs = []
         for pod_name in pods_by_name.keys():
             df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
-            disk_output = self._exec_oc(df_cmd, [])
+            disk_output = self.exec_oc(self.logging_namespace, df_cmd, [])
             lines = disk_output.splitlines()
             # expecting one header looking like 'IUse% Use%' and one body line
             body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
@@ -201,10 +198,3 @@ def _check_elasticsearch_diskspace(self, pods_by_name):
             ))
 
         return error_msgs
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Elasticsearch, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
16 changes: 5 additions & 11 deletions roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -11,8 +11,6 @@ class Fluentd(LoggingCheck):
     name = "fluentd"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
@@ -27,7 +25,6 @@ def run(self):
 
         if check_error:
             msg = ("The following Fluentd deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -147,7 +144,11 @@ def check_fluentd(self, pods):
 
     def get_nodes_by_name(self):
         """Retrieve all the node definitions. Returns: dict(name: node), error string"""
-        nodes_json = self._exec_oc("get nodes -o json", [])
+        nodes_json = self.exec_oc(
+            self.logging_namespace,
+            "get nodes -o json",
+            []
+        )
         try:
             nodes = json.loads(nodes_json)
         except ValueError:  # no valid json - should not happen
@@ -158,10 +159,3 @@ def get_nodes_by_name(self):
             node['metadata']['name']: node
             for node in nodes['items']
         }, None
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Fluentd, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )
138 changes: 138 additions & 0 deletions roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -0,0 +1,138 @@
+"""
+Module for performing checks on a Fluentd logging deployment configuration
+"""
+
+from openshift_checks import OpenShiftCheckException
+from openshift_checks.logging.logging import LoggingCheck
+
+
+class FluentdConfig(LoggingCheck):
+    """Module that checks logging configuration of an integrated logging Fluentd deployment"""
+    name = "fluentd_config"

Review comment (Contributor): @juanvallejo @sosiouxme, now that I've noticed we have two checks -- fluentd and fluentd_config -- is it justifiable to have them separate? Is the distinction clear enough to explain to users?

Review comment (Contributor, author): I am okay with combining these two checks; will wait for @sosiouxme's input.

Review comment (Member): fluentd would run on the first master only, while fluentd_config could theoretically run on all the nodes, but I'm forgetting now how the impl goes and maybe it doesn't. If is_active is different it makes sense to keep separate...

+    tags = ["health"]
+
+    def is_active(self):
+        logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
+
+        try:
+            version = self.get_major_minor_version(self.get_var("openshift_image_tag"))

Review comment (Contributor): Note that this can raise OpenShiftCheckException. We should protect the action plugin call to is_active with a try... except block for cases like this, to prevent the action plugin run from blowing up. I can include that in my current work branch.

Review comment (Member): Yeah, good idea. While you're at it, probably best to add an except Exception to catch any dumb bug we leave in one check so the others still run.


+        except ValueError:
+            # if failed to parse OpenShift version, perform check anyway (if logging enabled)
+            return logging_deployed
+
+        return logging_deployed and version < (3, 6)
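In effect, the new check activates only when logging is deployed and the parsed OpenShift version is below 3.6, falling back to the logging flag alone when the tag cannot be parsed. A standalone restatement of that gate, for illustration only (the helper name and version tuples below are not part of the PR):

```python
# Standalone sketch of FluentdConfig.is_active; not the check's actual method.
def gate(logging_deployed, version_or_none):
    # version_or_none is the parsed (major, minor) tuple, or None when
    # parsing the image tag raised ValueError.
    if version_or_none is None:
        return logging_deployed              # fall back to the logging flag alone
    return logging_deployed and version_or_none < (3, 6)

assert gate(True, (3, 5)) is True    # pre-3.6 with logging deployed: check runs
assert gate(True, (3, 6)) is False   # 3.6+: driver config no longer needs checking
assert gate(True, None) is True      # unparseable tag: run the check anyway
```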

+    def run(self):
+        """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
+        self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
+        config_error = self.check_logging_config()
+        if config_error:
+            msg = ("The following Fluentd logging configuration problem was found:"
+                   "\n{}".format(config_error))
+            return {"failed": True, "msg": msg}
+
+        return {}

+    def check_logging_config(self):
+        """Ensure that the configured Docker logging driver matches fluentd settings.
+        This means that, at least for now, if the following condition is met:
+            openshift_logging_fluentd_use_journal == True
+        then the value of the configured Docker logging driver should be "journald".
+        Otherwise, the value of the Docker logging driver should be "json-file".
+        Returns an error string if the above condition is not met, or None otherwise."""
+        use_journald = self.get_var("openshift_logging_fluentd_use_journal", default=True)
+
+        # if the check is running on a master, retrieve all running pods
+        # and check any pod's container for the env var "USE_JOURNAL"
+        group_names = self.get_var("group_names")
+        if "masters" in group_names:
+            use_journald = self.check_fluentd_env_var()
+
+        docker_info = self.execute_module("docker_info", {})
+        try:
+            logging_driver = docker_info["info"]["LoggingDriver"]
+        except KeyError:
+            return "Unable to determine Docker logging driver."
+
+        recommended_logging_driver = "journald"
+        error = None
+
+        # If fluentd is set to use journald but Docker is not, recommend setting the `--log-driver`
+        # option as an inventory file variable, or adding the log driver value as part of the
+        # Docker configuration in /etc/docker/daemon.json. There is no global --log-driver flag that
+        # can be passed to the Docker binary; the only other recommendation that can be made would be
+        # to pass the `--log-driver` flag to the "run" sub-command of the `docker` binary when running
+        # individual containers.
+        if use_journald and logging_driver != "journald":
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from "journald".\n'
+                     'This differs from your Docker configuration, which has been set to use "{driver}" '
+                     'as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+                     'from your Docker containers.').format(driver=logging_driver)
+        elif not use_journald and logging_driver != "json-file":
+            recommended_logging_driver = "json-file"
+            error = ('Your Fluentd configuration is set to aggregate Docker container logs from '
+                     'individual json log files per container.\n'
+                     'This differs from your Docker configuration, which has been set to use '
+                     '"{driver}" as the default method of storing logs.\n'
+                     'This discrepancy in configuration will prevent Fluentd from receiving any logs '
+                     'from your Docker containers.').format(driver=logging_driver)
+
+        if error:
+            error += ('\nTo resolve this issue, add the following variable to your Ansible inventory file:\n\n'
+                      '  openshift_docker_options="--log-driver={driver}"\n\n'
+                      'Alternatively, you can add the following option to your Docker configuration, located in '
+                      '"/etc/docker/daemon.json":\n\n'
+                      '{{ "log-driver": "{driver}" }}\n\n'
+                      'See https://docs.docker.com/engine/admin/logging/json-file '
+                      'for more information.').format(driver=recommended_logging_driver)
+
+        return error
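The shape of the `docker_info` module result this method relies on is sketched below; the `LoggingDriver` key mirrors `docker info` output, and the values here are illustrative, not from the PR:

```python
# Illustrative docker_info result; only the field the check reads is shown.
docker_info = {"info": {"LoggingDriver": "json-file"}}

use_journald = True  # default of openshift_logging_fluentd_use_journal
driver = docker_info["info"]["LoggingDriver"]

# The same mismatch test the check applies: journald expected, json-file found,
# so check_logging_config would return the remediation error above.
assert use_journald and driver != "journald"
```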

+    def check_fluentd_env_var(self):
+        """Read and return the value of the 'USE_JOURNAL' environment variable on a fluentd pod."""
+        running_pods = self.running_fluentd_pods()
+
+        try:
+            pod_containers = running_pods[0]["spec"]["containers"]
+        except KeyError:
+            return "Unable to detect running containers on selected Fluentd pod."
+
+        if not pod_containers:
+            msg = ('There are no running containers on selected Fluentd pod "{}".\n'
+                   'Unable to calculate expected logging driver.').format(running_pods[0]["metadata"].get("name", ""))
+            raise OpenShiftCheckException(msg)
+
+        pod_env = pod_containers[0].get("env")
+        if not pod_env:
+            msg = ('There are no environment variables set on the Fluentd container "{}".\n'
+                   'Unable to calculate expected logging driver.').format(pod_containers[0].get("name"))
+            raise OpenShiftCheckException(msg)
+
+        for env in pod_env:
+            if env["name"] == "USE_JOURNAL":
+                return env.get("value", "false") != "false"
+
+        return False
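The scan above treats any `USE_JOURNAL` value other than the literal string "false" as journald mode, and defaults to `False` when the variable is absent. A minimal sketch of that parsing against an illustrative container env block (the entries below are made up):

```python
# Container env shaped like the pod spec consumed above; values are illustrative.
pod_env = [
    {"name": "K8S_HOST_URL", "value": "https://kubernetes.default.svc"},
    {"name": "USE_JOURNAL", "value": "true"},
]

use_journald = next(
    (env.get("value", "false") != "false"
     for env in pod_env if env["name"] == "USE_JOURNAL"),
    False,  # matches the method's fallback when USE_JOURNAL is absent
)
assert use_journald is True
```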

+    def running_fluentd_pods(self):
+        """Return a list of running fluentd pods."""
+        fluentd_pods, error = self.get_pods_for_component(
+            self.logging_namespace,
+            "fluentd",
+        )
+        if error:
+            msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error)
+            raise OpenShiftCheckException(msg)
+
+        running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
+        if not running_fluentd_pods:
+            msg = ('No Fluentd pods were found to be in the "Running" state. '
+                   'At least one Fluentd pod is required in order to perform this check.')
+            raise OpenShiftCheckException(msg)
+
+        return running_fluentd_pods
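For context, here is the pod-phase filter above run against trimmed pod objects (names and phases are illustrative):

```python
fluentd_pods = [
    {"status": {"phase": "Running"}, "metadata": {"name": "logging-fluentd-1abcd"}},
    {"status": {"phase": "Pending"}, "metadata": {"name": "logging-fluentd-2efgh"}},
]
running = [pod for pod in fluentd_pods if pod["status"]["phase"] == "Running"]
assert [p["metadata"]["name"] for p in running] == ["logging-fluentd-1abcd"]
```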
18 changes: 6 additions & 12 deletions roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -21,13 +21,11 @@ class Kibana(LoggingCheck):
     name = "kibana"
     tags = ["health", "logging"]
 
-    logging_namespace = None
-
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
         self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        kibana_pods, error = super(Kibana, self).get_pods_for_component(
+        kibana_pods, error = self.get_pods_for_component(
             self.logging_namespace,
             "kibana",
         )
@@ -40,7 +38,6 @@ def run(self):
 
         if check_error:
             msg = ("The following Kibana deployment issue was found:"
-                   "\n-------\n"
                    "{}".format(check_error))
             return {"failed": True, "changed": False, "msg": msg}
 
@@ -118,7 +115,11 @@ def _get_kibana_url(self):
         """
 
         # Get logging url
-        get_route = self._exec_oc("get route logging-kibana -o json", [])
+        get_route = self.exec_oc(
+            self.logging_namespace,
+            "get route logging-kibana -o json",
+            [],
+        )
         if not get_route:
             return None, 'no_route_exists'
 
@@ -217,10 +218,3 @@ def _check_kibana_route(self):
             ).format(error=error)
             return error
         return None
-
-    def _exec_oc(self, cmd_str, extra_args):
-        return super(Kibana, self).exec_oc(
-            self.logging_namespace,
-            cmd_str,
-            extra_args,
-        )