diff --git a/.travis.yml b/.travis.yml index e5b6c4638e..a8f85756bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,3 +12,4 @@ before_script: - sudo apt-get install python-mysqldb - sudo apt-get install nginx - sudo apt-get install apache2 + - sudo apt-get install sysstat diff --git a/agent.py b/agent.py index 6adcbe9de4..ed3a952c31 100755 --- a/agent.py +++ b/agent.py @@ -19,6 +19,10 @@ import time import urllib +# Constants +PID_DIR="/var/run/dd-agent" +PID_FILE="dd-agent.pid" + # Watchdog implementation from threading import Timer WATCHDOG_MULTIPLIER = 10 # will fire if no checks have been collected in N * checkFreq, 150s by default @@ -31,6 +35,7 @@ # Custom modules from checks.common import checks +from checks.ec2 import EC2 from config import get_config, get_system_stats, get_parsed_args from daemon import Daemon from emitter import http_emitter @@ -38,22 +43,6 @@ # Override the generic daemon class to run our checks class agent(Daemon): - EC2_URL = "http://169.254.169.254/latest/meta-data/instance-id" - - @staticmethod - def get_ec2_instance_id(): - """Fetch EC2 instance ID if possible. If not on EC2 returns None""" - try: - url = urllib.urlopen(agent.EC2_URL) - instanceId = url.read() - assert instanceId.startswith("i-"), "Malformed instance-id: %s" % instanceId - return instanceId - - except Exception, e: - logging.getLogger('agent').exception('Cannot determine instance-id. Is this machine on EC2?') - - return None - def late(self, cks, threshold, crash=True): """Determine whether the agent run is late and optionally kill it if so. """ @@ -75,19 +64,17 @@ def run(self, agentConfig=None, run_forever=True): agentLogger.debug('Creating checks instance') if agentConfig is None: - agentConfig, rawConfig = get_config() - else: - rawConfig = {} + agentConfig = get_config() # Try to fetch instance Id from EC2 if not hostname has been set # in the config file if agentConfig.get('hostname') is None and agentConfig.get('useEC2InstanceId'): - instanceId = self.get_ec2_instance_id() + instanceId = EC2.get_instance_id() if instanceId is not None: agentLogger.info("Running on EC2, instanceId: %s" % instanceId) agentConfig['hostname'] = instanceId else: - agentLogger.info('Not running on EC2') + agentLogger.info('Not running on EC2, using hostname to identify this server') emitter = http_emitter @@ -95,7 +82,7 @@ def run(self, agentConfig=None, run_forever=True): lateThresh = checkFreq * WATCHDOG_MULTIPLIER # Checks instance - c = checks(agentConfig, rawConfig, emitter) + c = checks(agentConfig, emitter) # Run once c.doChecks(True, systemStats) @@ -120,9 +107,10 @@ def run(self, agentConfig=None, run_forever=True): agentLogger.debug("Getting ready to sleep for %s seconds." % lateThresh) def setupLogging(agentConfig): - """Used by ddagent.py as well""" + """Configure logging to use syslog whenever possible. + Also controls debugMode.""" if agentConfig['debugMode']: - logFile = os.path.join(agentConfig['tmpDirectory'], 'dd-agent.log') + logFile = "/tmp/dd-agent.log" logging.basicConfig(filename=logFile, filemode='w', level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logging.info("Logging to %s" % logFile) else: @@ -146,49 +134,68 @@ def setupLogging(agentConfig): sys.stderr.write("Error while setting up syslog logging (%s). No logging available" % str(e)) logging.disable(logging.ERROR) -def getPidFile(command, agentConfig, clean): - """Used by ddagent.py as well""" - - if command == 'init': - # This path added for newer Linux packages which run under - # a separate dd-agent user account. - if os.path.exists('/var/run/dd-agent/'): - pidFile = '/var/run/dd-agent/dd-agent.pid' - else: - pidFile = '/var/run/dd-agent.pid' - +def getPidFile(pid_dir=PID_DIR): + """Find a good spot for the pid file. + By default PID_DIR/PID_FILE + """ + try: + # Can we write to the directory + if os.access(pid_dir, os.W_OK): + pidfile = os.path.join(pid_dir, PID_FILE) + logging.info("Pid file is: %s" % pidfile) + return pidfile + except: + logging.exception("Cannot locate pid file, defaulting to /tmp/%s" % PID_FILE) + # continue + + # if all else fails + if os.access("/tmp", os.W_OK): + logging.warn("Pid file: /tmp/%s" % PID_FILE) + return os.path.join("/tmp", PID_FILE) else: - pidFile = os.path.join(agentConfig['pidfileDirectory'], 'dd-agent.pid') - - if clean: - logging.debug('Agent called with --clean option, removing .pid') - try: - os.remove(pidFile) - except OSError: - # Did not find pid file - pass - - return pidFile + # Can't save pid file, bail out + logging.error("Cannot save pid file anywhere") + sys.exit(-2) + +def cleanPidFile(pid_dir=PID_DIR): + try: + logging.debug("Cleaning up pid file %s" % getPidFile(pid_dir)) + os.remove(getPidFile(pid_dir)) + return True + except: + logging.exception("Could not clean up pid file") + return False + +def getPid(pid_dir=PID_DIR): + "Retrieve the actual pid" + try: + pf = open(getPidFile(pid_dir)) + pid_s = pf.read() + pf.close() + + return int(pid_s.strip()) + except: + logging.exception("Cannot read pid") + return None # Control of daemon if __name__ == '__main__': options, args = get_parsed_args() - agentConfig, rawConfig = get_config() + agentConfig = get_config() # Logging setupLogging(agentConfig) - # FIXME - # Ever heard of optparse? - argLen = len(sys.argv) if len(args) > 0: command = args[0] + + if options.clean: + cleanPidFile() - # Daemon instance from agent class - pidFile = getPidFile(command, agentConfig, options.clean) + pidFile = getPidFile() daemon = agent(pidFile) if 'start' == command: @@ -208,16 +215,8 @@ def getPidFile(command, agentConfig, clean): daemon.run() elif 'status' == command: - try: - pf = file(pidFile,'r') - pid = int(pf.read().strip()) - pf.close() - except IOError: - pid = None - except SystemExit: - pid = None - - if pid: + pid = getPid() + if pid is not None: sys.stdout.write('dd-agent is running as pid %s.\n' % pid) logging.info("dd-agent is running as pid %s." % pid) else: diff --git a/checks/common.py b/checks/common.py index b353b636f2..d526880a8a 100644 --- a/checks/common.py +++ b/checks/common.py @@ -65,9 +65,8 @@ def wrapper(*args, **kwargs): return wrapper class checks: - def __init__(self, agentConfig, rawConfig, emitter): + def __init__(self, agentConfig, emitter): self.agentConfig = agentConfig - self.rawConfig = rawConfig self.plugins = None self.emitter = emitter self.last_post_ts = None diff --git a/checks/ec2.py b/checks/ec2.py index 116b3ff9d5..4c08203aeb 100644 --- a/checks/ec2.py +++ b/checks/ec2.py @@ -8,13 +8,14 @@ class EC2(Check): """Retrieve EC2 metadata """ - URL = "http://169.254.169.254/1.0/meta-data/" + URL = "http://169.254.169.254/latest/meta-data/" TIMEOUT = 0.1 # second def __init__(self, logger): Check.__init__(self, logger) - def get_metadata(self): + @staticmethod + def get_metadata(): """Use the ec2 http service to introspect the instance. This adds latency if not running on EC2 """ # >>> import urllib2 @@ -42,7 +43,7 @@ def get_metadata(self): assert type(v) in (types.StringType, types.UnicodeType) and len(v) > 0, "%s is not a string" % v metadata[k] = v except: - self.logger.exception("(Ignore if !ec2) Cannot extract EC2 metadata %s" % k) + pass try: if socket_to is None: @@ -52,3 +53,10 @@ def get_metadata(self): pass return metadata + + @staticmethod + def get_instance_id(): + try: + return EC2.get_metadata().get("instance-id", None) + except: + return None diff --git a/config.py b/config.py index 4f89c9e760..e7ab6f8537 100644 --- a/config.py +++ b/config.py @@ -9,6 +9,7 @@ # CONSTANTS DATADOG_CONF = "datadog.conf" +DEFAULT_CHECK_FREQUENCY = 15 # seconds def get_parsed_args(): parser = OptionParser() @@ -25,7 +26,7 @@ def get_parsed_args(): return options, args def get_version(): - return "2.2.16" + return "2.2.17" def skip_leading_wsp(f): "Works on a file, returns a file-like object" @@ -42,11 +43,9 @@ def get_config(parse_args = True, cfg_path=None): agentConfig = {} agentConfig['debugMode'] = False # not really a frequency, but the time to sleep between checks - agentConfig['checkFreq'] = 15 + agentConfig['checkFreq'] = DEFAULT_CHECK_FREQUENCY agentConfig['version'] = get_version() - rawConfig = {} - # Config handling try: # Find the right config file @@ -67,7 +66,11 @@ def get_config(parse_args = True, cfg_path=None): config = ConfigParser.ConfigParser() config.readfp(skip_leading_wsp(open(config_path))) + # # Core config + # + + # Where to send the data if options is not None and options.use_forwarder: listen_port = 17123 if config.has_option('Main','listen_port'): @@ -80,14 +83,10 @@ def get_config(parse_args = True, cfg_path=None): if agentConfig['ddUrl'].endswith('/'): agentConfig['ddUrl'] = agentConfig['ddUrl'][:-1] + # Which API key to use agentConfig['apiKey'] = config.get('Main', 'api_key') - - if os.path.exists('/var/log/dd-agent/'): - agentConfig['tmpDirectory'] = '/var/log/dd-agent/' - else: - agentConfig['tmpDirectory'] = '/tmp/' # default which may be overriden in the config later - agentConfig['pidfileDirectory'] = agentConfig['tmpDirectory'] - + + # Debug mode agentConfig['debugMode'] = config.get('Main', 'debug_mode').lower() in ("yes", "true") if config.has_option('Main', 'use_ec2_instance_id'): @@ -98,7 +97,10 @@ def get_config(parse_args = True, cfg_path=None): agentConfig['useEC2InstanceId'] = False if config.has_option('Main', 'check_freq'): - agentConfig['checkFreq'] = int(config.get('Main', 'check_freq')) + try: + agentConfig['checkFreq'] = int(config.get('Main', 'check_freq')) + except: + agentConfig['checkFreq'] = DEFAULT_CHECK_FREQUENCY if config.has_option('Main','hostname'): agentConfig['hostname'] = config.get('Main','hostname') @@ -161,12 +163,6 @@ def get_config(parse_args = True, cfg_path=None): if config.has_option('Main', 'nginx_status_url'): agentConfig['nginxStatusUrl'] = config.get('Main', 'nginx_status_url') - if config.has_option('Main', 'tmp_directory'): - agentConfig['tmpDirectory'] = config.get('Main', 'tmp_directory') - - if config.has_option('Main', 'pidfile_directory'): - agentConfig['pidfileDirectory'] = config.get('Main', 'pidfile_directory') - if config.has_option('Main', 'plugin_directory'): agentConfig['pluginDirectory'] = config.get('Main', 'plugin_directory') @@ -308,13 +304,7 @@ def get_config(parse_args = True, cfg_path=None): sys.stderr.write('You have configured MongoDB for monitoring, but the pymongo module is not installed.\n') sys.exit(2) - for section in config.sections(): - rawConfig[section] = {} - - for option in config.options(section): - rawConfig[section][option] = config.get(section, option) - - return agentConfig, rawConfig + return agentConfig def get_system_stats(): systemStats = { diff --git a/datadog.conf.example b/datadog.conf.example index b752e2e0d2..3ece980471 100644 --- a/datadog.conf.example +++ b/datadog.conf.example @@ -17,7 +17,7 @@ debug_mode: no # Use the amazon EC2 instance-id instead of hostname (unless hostname is # explicitly set) -use_ec2_instance_id: no +use_ec2_instance_id: yes # Use mount points instead of volumes to track disk and fs metrics use_mount: no diff --git a/ddagent.py b/ddagent.py index 594e376fe4..0c0763271a 100755 --- a/ddagent.py +++ b/ddagent.py @@ -227,7 +227,7 @@ def main(): import tornado.httpclient - agentConfig, rawConfig = get_config(parse_args = False) + agentConfig = get_config(parse_args = False) port = agentConfig.get('listen_port', None) if port is None: diff --git a/tests/test_cacti.py b/tests/test_cacti.py index 0717204388..b9e0c1fc8c 100644 --- a/tests/test_cacti.py +++ b/tests/test_cacti.py @@ -26,17 +26,22 @@ def setUp(self): pass def _restore_rrds(self, xml_dir): - for filename in os.listdir(xml_dir): - if filename.endswith('.xml'): - xml_path = '/'.join([xml_dir, filename]) - rrd_name = filename.replace('.xml', '.rrd') - subprocess.call( - ["/usr/bin/rrdtool","restore", xml_path, '/'.join([self.tmp_dir, rrd_name])] - ) + if os.access("/usr/bin/rrdtool", os.R_OK | os.X_OK): + for filename in os.listdir(xml_dir): + if filename.endswith('.xml'): + xml_path = '/'.join([xml_dir, filename]) + rrd_name = filename.replace('.xml', '.rrd') + subprocess.call( + ["/usr/bin/rrdtool","restore", xml_path, '/'.join([self.tmp_dir, rrd_name])] + ) + return True + else: + return False def testChecks(self): # Restore the RRDs from the XML dumps - self._restore_rrds(self.rrd_dir) + if not self._restore_rrds(self.rrd_dir): + return # Do a first check results1 = self.cacti.check(self.config) diff --git a/tests/test_config.py b/tests/test_config.py index 50203e0ad6..4dbfb929f5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,13 +1,46 @@ import unittest import os.path -from config import * +import tempfile + +from config import get_config +from agent import getPidFile, cleanPidFile, getPid class TestConfig(unittest.TestCase): def testWhiteSpaceConfig(self): """Leading whitespace confuse ConfigParser """ - agentConfig, rawConfig = get_config(cfg_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "badconfig.conf")) + agentConfig = get_config(cfg_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "badconfig.conf")) self.assertEquals(agentConfig["ddUrl"], "https://app.datadoghq.com") self.assertEquals(agentConfig["apiKey"], "1234") self.assertEquals(agentConfig["nagios_log"], "/var/log/nagios3/nagios.log") self.assertEquals(agentConfig["graphite_listen_port"], 17126) + + def testGoodPidFie(self): + """Verify that the pid file succeeds and fails appropriately""" + # Pidfile always writable + pid_dir = tempfile.mkdtemp() + pid_file = getPidFile(pid_dir) + pid = "666" + pid_f = open(pid_file, 'w') + pid_f.write(pid) + pid_f.close() + self.assertEquals(getPid(pid_dir), 666) + # clean up + self.assertEquals(cleanPidFile(pid_dir), True) + self.assertEquals(os.path.exists(pid_file), False) + + def testBadPidFile(self): + pid_dir = "/does-not-exist" + pid_file = getPidFile(pid_dir) + self.assertEquals(pid_file, "/tmp/dd-agent.pid") + pid = "666" + pid_f = open(pid_file, 'w') + pid_f.write(pid) + pid_f.close() + self.assertEquals(getPid(pid_dir), 666) + self.assertEquals(cleanPidFile(pid_dir), True) + self.assertEquals(os.path.exists(pid_file), False) + +if __name__ == '__main__': + unittest.main() + diff --git a/tests/test_mysql.py b/tests/test_mysql.py index bdbbb4e443..1846944ef0 100644 --- a/tests/test_mysql.py +++ b/tests/test_mysql.py @@ -1,15 +1,23 @@ import unittest import logging; logger = logging.getLogger() -import MySQLdb from checks.db.mysql import MySql class TestMySql(unittest.TestCase): def setUp(self): - self.mock = MySQLdb.MockSql() - self.mysql = MySql(logger) + # This should run on pre-2.7 python so no skiptest + self.skip = False + try: + import MySQLdb + self.mock = MySQLdb.MockSql() + self.mysql = MySql(logger) + except ImportError: + self.skip = True def testChecks(self): + if self.skip: + return + # First round for gauges results = self.mysql.check({"MySQLServer": "localhost", "MySQLUser": "dog", "MySQLPass": "dog"}) self.assertEquals(results["mysqlCreatedTmpDiskTables"], 2.0) diff --git a/tests/test_tail.py b/tests/test_tail.py index c81dfcde3f..968b9987a2 100644 --- a/tests/test_tail.py +++ b/tests/test_tail.py @@ -18,7 +18,7 @@ def setUp(self): self.logrotate_config.flush() self.logrotate_state_file = tempfile.NamedTemporaryFile() self.last_line = None - + def _trigger_logrotate(self): subprocess.check_call([ 'logrotate', @@ -50,17 +50,20 @@ def line_parser(l): # Verify that the tail consumed the data I wrote self.assertEquals(tail._size, len(init_string)) - # Trigger a copytruncate logrotation on the log file - self._trigger_logrotate() - - # Write a new line to the log file - new_string = "I am shorter\n" - self.log_file.write(new_string) - self.log_file.flush() - - # Verify that the tail recognized the logrotation - gen.next() - self.assertEquals(self.last_line, new_string[:-1], self.last_line) + try: + # Trigger a copytruncate logrotation on the log file + self._trigger_logrotate() + + # Write a new line to the log file + new_string = "I am shorter\n" + self.log_file.write(new_string) + self.log_file.flush() + + # Verify that the tail recognized the logrotation + gen.next() + self.assertEquals(self.last_line, new_string[:-1], self.last_line) + except OSError: + "logrotate is not present" if __name__ == '__main__': logging.getLogger().setLevel(logging.DEBUG) diff --git a/tests/test_watchdog.py b/tests/test_watchdog.py index 9d32235eda..35f0f892dc 100644 --- a/tests/test_watchdog.py +++ b/tests/test_watchdog.py @@ -23,7 +23,7 @@ def runTest(self): """ start = time.time() try: - result = subprocess.check_output(["python", "test_watchdog.py", "busy"], stderr=subprocess.STDOUT) + result = subprocess.check_output(["python", "tests/test_watchdog.py", "busy"], stderr=subprocess.STDOUT) raise Exception("Should have died with an error") except subprocess.CalledProcessError: duration = int(time.time() - start) @@ -32,8 +32,9 @@ def runTest(self): # Start pseudo web server print "nc pid", subprocess.Popen(["nc", "-l", "31834"]).pid + start = time.time() try: - subprocess.check_call(["python", "test_watchdog.py", "net"]) + subprocess.check_call(["python", "tests/test_watchdog.py", "net"]) raise Exception("Should have died with an error") except subprocess.CalledProcessError: duration = int(time.time() - start) diff --git a/tests/test_web.py b/tests/test_web.py index d1dd0f7341..f539835610 100644 --- a/tests/test_web.py +++ b/tests/test_web.py @@ -13,19 +13,21 @@ def setUp(self): def testApache(self): results = self.apache.check({"apacheStatusUrl": "apache", "version": "test"}) - self.assertEquals(results["apacheBusyWorkers"], 1.0) - self.assertEquals(results["apacheIdleWorkers"], 15.0) - self.assertEquals(results["apacheTotalAccesses"], 456.0) - self.assertEquals(results["apacheUptime"], 3.0) - self.assertEquals(results["apacheCPULoad"], 0.00817439) - self.assertEquals(results["apacheTotalBytes"], 12345 * 1024.0) + if results: + self.assertEquals(results["apacheBusyWorkers"], 1.0) + self.assertEquals(results["apacheIdleWorkers"], 15.0) + self.assertEquals(results["apacheTotalAccesses"], 456.0) + self.assertEquals(results["apacheUptime"], 3.0) + self.assertEquals(results["apacheCPULoad"], 0.00817439) + self.assertEquals(results["apacheTotalBytes"], 12345 * 1024.0) def testNginx(self): results = self.nginx.check({"nginxStatusUrl": "nginx", "version": "test"}) - self.assertEquals(results["nginxConnections"], 8.0) - self.assertEquals(results["nginxReading"], 0.0) - self.assertEquals(results["nginxWriting"], 1.0) - self.assertEquals(results["nginxWaiting"], 7.0) + if results: + self.assertEquals(results["nginxConnections"], 8.0) + self.assertEquals(results["nginxReading"], 0.0) + self.assertEquals(results["nginxWriting"], 1.0) + self.assertEquals(results["nginxWaiting"], 7.0) if __name__ == '__main__': unittest.main()