From 6419b9b45838356154f18fc1c3cc2479a669b336 Mon Sep 17 00:00:00 2001 From: Benjamin Fernandes Date: Thu, 17 Jul 2014 19:33:22 -0400 Subject: [PATCH 1/3] Revamp Docker check, more metrics, less bugs, support of CoreOS --- checks.d/docker.py | 162 ++++++++++++++++++++----------------- conf.d/docker.yaml.example | 68 ++++++++-------- 2 files changed, 125 insertions(+), 105 deletions(-) diff --git a/checks.d/docker.py b/checks.d/docker.py index 4f99731a5c..4a1b1ffb0a 100644 --- a/checks.d/docker.py +++ b/checks.d/docker.py @@ -16,10 +16,10 @@ DEFAULT_MAX_CONTAINERS = 20 EVENT_TYPE = SOURCE_TYPE_NAME = 'docker' -LXC_METRICS = [ +CGROUP_METRICS = [ { "cgroup": "memory", - "file": "%s/%s/memory.stat", + "file": "memory.stat", "metrics": { "active_anon": ("docker.mem.active_anon", "gauge"), "active_file": ("docker.mem.active_file", "gauge"), @@ -53,7 +53,7 @@ }, { "cgroup": "cpuacct", - "file": "%s/%s/cpuacct.stat", + "file": "cpuacct.stat", "metrics": { "user": ("docker.cpu.user", "rate"), "system": ("docker.cpu.system", "rate"), @@ -72,6 +72,7 @@ SOCKET_TIMEOUT = 5 + class UnixHTTPConnection(httplib.HTTPConnection, object): """Class used in conjuction with UnixSocketHandler to make urllib2 compatible with Unix sockets.""" @@ -110,71 +111,54 @@ def unix_open(self, req): class Docker(AgentCheck): - def __init__(self, *args, **kwargs): - super(Docker, self).__init__(*args, **kwargs) + def __init__(self, name, init_config, agentConfig): + AgentCheck.__init__(self, name, init_config, agentConfig) self._mountpoints = {} - self.cgroup_path_prefix = None # Depending on the version - for metric in LXC_METRICS: - self._mountpoints[metric["cgroup"]] = self._find_cgroup(metric["cgroup"]) - self._path_prefix = None + docker_root = init_config.get('docker_root', '/') + for metric in CGROUP_METRICS: + self._mountpoints[metric["cgroup"]] = self._find_cgroup(metric["cgroup"], docker_root) + self._cgroup_filename_pattern = self._find_cgroup_filename_pattern() 
self._last_event_collection_ts = defaultdict(lambda: None) self.url_opener = urllib2.build_opener(UnixSocketHandler()) self.should_get_size = True - @property - def path_prefix(self): - if self._path_prefix is None: - metric = LXC_METRICS[0] - mountpoint = self._mountpoints[metric["cgroup"]] - stat_file_lxc = os.path.join(mountpoint, "lxc") - stat_file_docker = os.path.join(mountpoint, "docker") - - if os.path.exists(stat_file_lxc): - self._path_prefix = "lxc" - elif os.path.exists(stat_file_docker): - self._path_prefix = "docker" - else: - raise Exception("Cannot find Docker cgroup file. If you are using Docker 0.9 or 0.10, it is a known bug in Docker fixed in Docker 0.11") - return self._path_prefix + def _find_cgroup_filename_pattern(self): + if self._mountpoints: + mountpoint = self._mountpoints.values()[0] + stat_file_path_lxc = os.path.join(mountpoint, "lxc") + stat_file_path_docker = os.path.join(mountpoint, "docker") + stat_file_path_coreos = os.path.join(mountpoint, "system.slice") - def check(self, instance): - tags = instance.get("tags") or [] - skipped_cgroup = 0 + if os.path.exists(stat_file_path_lxc): + return os.path.join('%(mountpoint)s/lxc/%(id)s/%(file)s') + elif os.path.exists(stat_file_path_docker): + return os.path.join('%(mountpoint)s/docker/%(id)s/%(file)s') + elif os.path.exists(stat_file_path_coreos): + return os.path.join('%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s') + + raise Exception("Cannot find Docker cgroup directory. If you are using Docker 0.9 or 0.10," + "it is a known bug in Docker fixed in Docker 0.11") + def check(self, instance): try: self._process_events(self._get_events(instance)) - except socket.timeout: + except socket.timeout, urllib2.URLError: self.warning('Timeout during socket connection. 
Events will be missing.') - if self.should_get_size: - try: - containers = self._get_containers(instance, with_size=True) - except socket.timeout: - # Probably because of: https://github.com/DataDog/dd-agent/issues/963 - # Then we should stop trying to get size info - self.log.info('Cannot get container size because of API timeout. Turn size flag off.') - self.should_get_size = False - - if not self.should_get_size: - containers = self._get_containers(instance, with_size=False) - - if not containers: - containers = [] - self.warning("No containers are running.") - return - - self.gauge("docker.containers.running", len(containers)) + self._count_images(instance) + containers = self._get_and_count_containers(instance) max_containers = instance.get('max_containers', DEFAULT_MAX_CONTAINERS) if not instance.get("exclude") or not instance.get("include"): if len(containers) > max_containers: - self.warning("Too many containers to collect. Please refine the containers to collect by editing the configuration file. Truncating to %s containers" % max_containers) + self.warning("Too many containers to collect. Please refine the containers to collect" + "by editing the configuration file. 
Truncating to %s containers" % max_containers) containers = containers[:max_containers] collected_containers = 0 for container in containers: - container_tags = list(tags) + container_tags = instance.get("tags", []) for name in container["Names"]: container_tags.append(self._make_tag("name", name.lstrip("/"))) for key in DOCKER_TAGS: @@ -187,27 +171,25 @@ def check(self, instance): for key, (dd_key, metric_type) in DOCKER_METRICS.items(): if key in container: getattr(self, metric_type)(dd_key, int(container[key]), tags=container_tags) - for metric in LXC_METRICS: - mountpoint = self._mountpoints[metric["cgroup"]] - stat_file = os.path.join(mountpoint, metric["file"] % (self.path_prefix, container["Id"])) + for cgroup in CGROUP_METRICS: + stat_file = self._cgroup_filename_pattern % (dict( + mountpoint=self._mountpoints[cgroup["cgroup"]], + id=container['Id'], + file=cgroup['file'], + )) stats = self._parse_cgroup_file(stat_file) if stats: - for key, (dd_key, metric_type) in metric["metrics"].items(): - if key.startswith("total_") and not instance.get("collect_total"): + for key, (dd_key, metric_type) in cgroup['metrics'].items(): + if key.startswith('total_') and not instance.get('collect_total'): continue if key in stats: getattr(self, metric_type)(dd_key, int(stats[key]), tags=container_tags) - else: - skipped_cgroup += 1 collected_containers += 1 if collected_containers >= max_containers: self.warning("Too many containers are matching the current configuration. Some containers will not be collected. Please refine your configuration") break - if skipped_cgroup and skipped_cgroup == collected_containers * len(LXC_METRICS): - raise IOError("We were unable to open cgroup files. 
If you are using Docker 0.9 or 0.10, it is a known bug in Docker fixed in Docker 0.11") - def _process_events(self, events): for ev in events: self.log.debug("Creating event for %s" % ev) @@ -220,6 +202,39 @@ def _process_events(self, events): 'event_object': ev['from'], }) + def _count_images(self, instance): + tags = instance.get("tags", []) + active_images = len(self._get_images(instance, get_all=False)) + all_images = len(self._get_images(instance, get_all=True)) + + self.gauge("docker.images.available", active_images, tags=tags) + self.gauge("docker.images.intermediate", (all_images - active_images), tags=tags) + + def _get_and_count_containers(self, instance): + tags = instance.get("tags", []) + if self.should_get_size: + try: + containers = self._get_containers(instance, with_size=True) + except socket.timeout, urllib.URLError: + # Probably because of: https://github.com/DataDog/dd-agent/issues/963 + # Then we should stop trying to get size info + self.log.info("Cannot get container size because of API timeout. 
Stop asking for it.") + self.should_get_size = False + + if not self.should_get_size: + containers = self._get_containers(instance, with_size=False) + + if not containers: + containers = [] + self.warning("No containers are running.") + return + + stopped_containers_count = len(self._get_containers(instance, get_all=True)) - len(containers) + self.gauge("docker.containers.running", len(containers), tags=tags) + self.gauge("docker.containers.stopped", stopped_containers_count, tags=tags) + + return containers + def _make_tag(self, key, value): return "%s:%s" % (key.lower(), value.strip()) @@ -238,9 +253,14 @@ def _is_tag_included(tag): return True return False - def _get_containers(self, instance, with_size=True): - """Gets the list of running containers in Docker.""" - return self._get_json("%(url)s/containers/json" % instance, params={'size': with_size}) + + def _get_containers(self, instance, with_size=False, get_all=False): + """Gets the list of running/all containers in Docker.""" + return self._get_json("%(url)s/containers/json" % instance, params={'size': with_size, 'all': get_all}) + + def _get_images(self, instance, with_size=True, get_all=False): + """Gets the list of images in Docker.""" + return self._get_json("%(url)s/images/json" % instance, params={'all': get_all}) def _get_events(self, instance): """Get the list of events """ @@ -275,35 +295,33 @@ def _get_json(self, uri, params=None, multi=False): return json.loads(response) - def _find_cgroup(self, hierarchy): + def _find_cgroup(self, hierarchy, docker_root): """Finds the mount point for a specified cgroup hierarchy. 
Works with old style and new style mounts.""" try: - fp = open("/proc/mounts") + fp = open(os.path.join(docker_root, "/proc/mounts")) mounts = map(lambda x: x.split(), fp.read().splitlines()) finally: fp.close() cgroup_mounts = filter(lambda x: x[2] == "cgroup", mounts) # Old cgroup style if len(cgroup_mounts) == 1: - return cgroup_mounts[0][1] + return os.path.join(docker_root, cgroup_mounts[0][1]) for _, mountpoint, _, opts, _, _ in cgroup_mounts: if hierarchy in opts: - return mountpoint + return os.path.join(docker_root, mountpoint) - def _parse_cgroup_file(self, file_): + def _parse_cgroup_file(self, stat_file): """Parses a cgroup pseudo file for key/values.""" fp = None - self.log.debug("Opening file: %s" % file_) + self.log.debug("Opening file: %s" % stat_file) try: try: - fp = open(file_) + fp = open(stat_file) return dict(map(lambda x: x.split(), fp.read().splitlines())) except IOError: - # Can be because of Docker 0.9/0.10 bug or because the container got stopped - # Count this kind of exception, if it happens to often it is because of the bug - self.log.info("Can't open %s. Metrics for this container are skipped." % file_) - return None + # Can be because the container got stopped + self.log.info("Can't open %s. Metrics for this container are skipped." % stat_file) finally: if fp is not None: fp.close() diff --git a/conf.d/docker.yaml.example b/conf.d/docker.yaml.example index 71a2266c81..c72b586c6d 100644 --- a/conf.d/docker.yaml.example +++ b/conf.d/docker.yaml.example @@ -2,41 +2,43 @@ # The user running the Datadog Agent (usually "dd-agent") must be part of the "docker" group init_config: + # Change the root directory to look at to get cgroup statistics. Useful when running inside a + # container with host directories mounted on a different folder. Default: /. 
+ # + # docker_root: /host instances: - url: "unix://var/run/docker.sock" -# Include/Exclude rules -# -# To include or exclude containers based on their tags, use the include and -# exclude keys in your instance. -# The reasoning is: if a tag matches an exclude rule, it won't be included -# unless it also matches an include rule. -# -# Examples: -# exclude all, except ubuntu and debian: -# instances: -# - url: "unix://var/run/docker.sock" -# include: -# - "image:ubuntu" -# - "image:debian" -# exclude: -# - ".*" -# -# include all, except ubuntu and debian: -# instances: -# - url: "unix://var/run/docker.sock" -# include: [] -# exclude: -# - "image:ubuntu" -# - "image:debian" + # You can add extra tags to your Docker metrics with the tags list option. Default: []. + # + # tags: [extra_tag1, extra_tag2] -# Sub-cgroups metrics -# -# If you want to include sub-cgroups metrics, turn the collect_total option on. -# It is useless for a normal usage, as total_ metrics will be the same as normal ones. -# -# Example: -# instances: -# - url: "unix://var/run/docker.sock" -# collect_total: true + # It is possible to collect the number of Docker images with the collect_images option. Default: false. + # collect_images: true + + # To include or exclude containers based on their tags, use the include and + # exclude keys in your instance. + # The reasoning is: if a tag matches an exclude rule, it won't be included + # unless it also matches an include rule. + # + # Examples: + # exclude all, except ubuntu and debian. + # + # include: + # - "image:ubuntu" + # - "image:debian" + # exclude: + # - ".*" + # + # include all, except ubuntu and debian. + # + # include: [] + # exclude: + # - "image:ubuntu" + # - "image:debian" + + # If you want to include sub-cgroups metrics, turn the collect_total option on. + # It is useless for a normal usage, as total_ metrics will be the same as normal ones. Default: false. 
+ # + # collect_total: true \ No newline at end of file From 0819c793e2d0ad4f54300dd92fe79a1ba6a341f5 Mon Sep 17 00:00:00 2001 From: Benjamin Fernandes Date: Fri, 18 Jul 2014 18:10:02 -0400 Subject: [PATCH 2/3] Clarify Docker check exception messages --- checks.d/docker.py | 6 ++++-- conf.d/docker.yaml.example | 5 +---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/checks.d/docker.py b/checks.d/docker.py index 4a1b1ffb0a..93dec86df1 100644 --- a/checks.d/docker.py +++ b/checks.d/docker.py @@ -136,8 +136,7 @@ def _find_cgroup_filename_pattern(self): elif os.path.exists(stat_file_path_coreos): return os.path.join('%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s') - raise Exception("Cannot find Docker cgroup directory. If you are using Docker 0.9 or 0.10," - "it is a known bug in Docker fixed in Docker 0.11") + raise Exception("Cannot find Docker cgroup directory.") def check(self, instance): try: @@ -304,6 +303,9 @@ def _find_cgroup(self, hierarchy, docker_root): finally: fp.close() cgroup_mounts = filter(lambda x: x[2] == "cgroup", mounts) + if len(cgroup_mounts) == 0: + raise Exception("Can't find mounted cgroups. If you run the Agent inside a container," + " please refer to the documentation.") # Old cgroup style if len(cgroup_mounts) == 1: return os.path.join(docker_root, cgroup_mounts[0][1]) diff --git a/conf.d/docker.yaml.example b/conf.d/docker.yaml.example index c72b586c6d..43fda46a25 100644 --- a/conf.d/docker.yaml.example +++ b/conf.d/docker.yaml.example @@ -14,9 +14,6 @@ instances: # # tags: [extra_tag1, extra_tag2] - # It is possible to collect the number of Docker images with the collect_images option. Default: false. - # collect_images: true - # To include or exclude containers based on their tags, use the include and # exclude keys in your instance. 
# The reasoning is: if a tag matches an exclude rule, it won't be included @@ -41,4 +38,4 @@ instances: # If you want to include sub-cgroups metrics, turn the collect_total option on. # It is useless for a normal usage, as total_ metrics will be the same as normal ones. Default: false. # - # collect_total: true \ No newline at end of file + # collect_total: true From 8d7998ebf253b9c2854cad61461a95a2207121f2 Mon Sep 17 00:00:00 2001 From: Benjamin Fernandes Date: Fri, 18 Jul 2014 19:10:22 -0400 Subject: [PATCH 3/3] Find cgroup file at first check instead of init Also some minor fixes --- checks.d/docker.py | 79 ++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/checks.d/docker.py b/checks.d/docker.py index 93dec86df1..bd669e900f 100644 --- a/checks.d/docker.py +++ b/checks.d/docker.py @@ -117,31 +117,43 @@ def __init__(self, name, init_config, agentConfig): docker_root = init_config.get('docker_root', '/') for metric in CGROUP_METRICS: self._mountpoints[metric["cgroup"]] = self._find_cgroup(metric["cgroup"], docker_root) - self._cgroup_filename_pattern = self._find_cgroup_filename_pattern() self._last_event_collection_ts = defaultdict(lambda: None) self.url_opener = urllib2.build_opener(UnixSocketHandler()) self.should_get_size = True + self._cgroup_filename_pattern = None def _find_cgroup_filename_pattern(self): if self._mountpoints: - mountpoint = self._mountpoints.values()[0] - stat_file_path_lxc = os.path.join(mountpoint, "lxc") - stat_file_path_docker = os.path.join(mountpoint, "docker") - stat_file_path_coreos = os.path.join(mountpoint, "system.slice") - - if os.path.exists(stat_file_path_lxc): - return os.path.join('%(mountpoint)s/lxc/%(id)s/%(file)s') - elif os.path.exists(stat_file_path_docker): - return os.path.join('%(mountpoint)s/docker/%(id)s/%(file)s') - elif os.path.exists(stat_file_path_coreos): - return os.path.join('%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s') - - raise 
Exception("Cannot find Docker cgroup directory.") + # We try with different cgroups so that it works even if only one is properly working + for mountpoint in self._mountpoints.values(): + stat_file_path_lxc = os.path.join(mountpoint, "lxc") + stat_file_path_docker = os.path.join(mountpoint, "docker") + stat_file_path_coreos = os.path.join(mountpoint, "system.slice") + + if os.path.exists(stat_file_path_lxc): + return os.path.join('%(mountpoint)s/lxc/%(id)s/%(file)s') + elif os.path.exists(stat_file_path_docker): + return os.path.join('%(mountpoint)s/docker/%(id)s/%(file)s') + elif os.path.exists(stat_file_path_coreos): + return os.path.join('%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s') + + raise Exception("Cannot find Docker cgroup directory. Be sure your system is supported.") + + def _get_cgroup_file(self, cgroup, container_id, filename): + # This can't be initialized at startup because cgroups may not be mounted + if not self._cgroup_filename_pattern: + self._cgroup_filename_pattern = self._find_cgroup_filename_pattern() + + return self._cgroup_filename_pattern % (dict( + mountpoint=self._mountpoints[cgroup], + id=container_id, + file=filename, + )) def check(self, instance): try: self._process_events(self._get_events(instance)) - except socket.timeout, urllib2.URLError: + except (socket.timeout, urllib2.URLError): self.warning('Timeout during socket connection. 
Events will be missing.') self._count_images(instance) @@ -171,11 +183,7 @@ def check(self, instance): if key in container: getattr(self, metric_type)(dd_key, int(container[key]), tags=container_tags) for cgroup in CGROUP_METRICS: - stat_file = self._cgroup_filename_pattern % (dict( - mountpoint=self._mountpoints[cgroup["cgroup"]], - id=container['Id'], - file=cgroup['file'], - )) + stat_file = self._get_cgroup_file(cgroup["cgroup"], container['Id'], cgroup['file']) stats = self._parse_cgroup_file(stat_file) if stats: for key, (dd_key, metric_type) in cgroup['metrics'].items(): @@ -211,17 +219,15 @@ def _count_images(self, instance): def _get_and_count_containers(self, instance): tags = instance.get("tags", []) - if self.should_get_size: - try: - containers = self._get_containers(instance, with_size=True) - except socket.timeout, urllib.URLError: - # Probably because of: https://github.com/DataDog/dd-agent/issues/963 - # Then we should stop trying to get size info - self.log.info("Cannot get container size because of API timeout. Stop asking for it.") - self.should_get_size = False - - if not self.should_get_size: - containers = self._get_containers(instance, with_size=False) + + try: + containers = self._get_containers(instance, with_size=self.should_get_size) + except (socket.timeout, urllib2.URLError): + # Probably because of: https://github.com/DataDog/dd-agent/issues/963 + # Then we should stop trying to get size info + self.log.info("Cannot get container size because of API timeout. Stop collecting it.") + self.should_get_size = False + containers = self._get_containers(instance, with_size=self.should_get_size) if not containers: containers = [] @@ -318,12 +324,11 @@ def _parse_cgroup_file(self, stat_file): fp = None self.log.debug("Opening file: %s" % stat_file) try: - try: - fp = open(stat_file) - return dict(map(lambda x: x.split(), fp.read().splitlines())) - except IOError: - # Can be because the container got stopped - self.log.info("Can't open %s. 
Metrics for this container are skipped." % stat_file) + fp = open(stat_file) + return dict(map(lambda x: x.split(), fp.read().splitlines())) + except IOError: + # Can be because the container got stopped + self.log.info("Can't open %s. Metrics for this container are skipped." % stat_file) finally: if fp is not None: fp.close()