Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Logging #297

Merged
merged 27 commits into from
Jan 12, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2302ad9
consistent logging using centrally configured python logger
xcolour Dec 4, 2012
f9fb135
consistent logger naming
xcolour Dec 4, 2012
efd30dc
programmatically configure loggers to clear up logging config
xcolour Dec 10, 2012
14ea3a3
deb packaging for new logging
xcolour Dec 11, 2012
81e3227
clean up some py24 incompatibilities
xcolour Dec 11, 2012
d245f44
rpm packaging for new logging
xcolour Dec 11, 2012
8cf9df5
give pid files safe permissions. fixes #293.
xcolour Dec 11, 2012
e114614
don't log during 'info' command
xcolour Dec 11, 2012
9f3fde1
show log locations in info command
xcolour Dec 11, 2012
364e771
version 3.5.0
xcolour Dec 11, 2012
8f9d191
clean up logging config
xcolour Dec 12, 2012
80a5481
rotate logs
xcolour Dec 12, 2012
e1912d0
dump the vestigial 'get_logger_name()'
xcolour Dec 12, 2012
96572d0
print the whole traceback when logging fails
xcolour Dec 12, 2012
b6294cc
syslog logging. on by default, and configurable.
xcolour Dec 12, 2012
ad88088
show deprecation warning if old-style python logging config exists
xcolour Dec 12, 2012
d70b556
rename 'logger' to 'log' for oli
xcolour Dec 12, 2012
0c67cc4
Merge branch 'master' into logging
conorbranagan Jan 9, 2013
f80a706
Only log "starting collector run.." every 20 runs.
conorbranagan Jan 9, 2013
342c0c5
fix some issues from the conflicting merge
conorbranagan Jan 10, 2013
f909947
Merge branch 'master' into logging
conorbranagan Jan 10, 2013
b3ae1e5
remove old setup_logging call
conorbranagan Jan 10, 2013
fcabffc
Always show "starting run #x" logging because it's at debug level.
conorbranagan Jan 10, 2013
8a57fd4
update travis to use master branch for downloads
conorbranagan Jan 11, 2013
cb6abeb
generate a datadog.conf file before testing with travis
conorbranagan Jan 11, 2013
cdbf8b0
Fix syntax in travis.yml
conorbranagan Jan 12, 2013
d7194b8
Add a branch for datadog.conf file in travis.yml
conorbranagan Jan 12, 2013
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,22 @@ before_script:
- sudo apt-get install sysstat
- sudo apt-get install haproxy
- sudo apt-get install python-mysqldb
- curl -L https://raw.github.com/DataDog/dd-agent/check-haproxy/tests/haproxy.cfg > /tmp/haproxy.cfg
- curl -L https://raw.github.com/DataDog/dd-agent/master/tests/haproxy.cfg > /tmp/haproxy.cfg
- curl -L http://mirror.sdunix.com/apache/tomcat/tomcat-6/v6.0.36/bin/apache-tomcat-6.0.36.tar.gz | tar -C /tmp -xzf - && mv /tmp/apache-tomcat-6.0.36 /tmp/apache-tomcat-6 && echo 'export CATALINA_OPTS="-Dcom.sun.management.jmxremote.port=8090 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" export CATALINA_OUT="/tmp/apache-tomcat-6/catalina.out"' > /tmp/apache-tomcat-6/bin/setenv.sh
- curl -L http://mirrors.ibiblio.org/apache/tomcat/tomcat-7/v7.0.34/bin/apache-tomcat-7.0.34.tar.gz | tar -C /tmp -xzf - && mv /tmp/apache-tomcat-7.0.34/ /tmp/apache-tomcat-7 && echo 'export CATALINA_OPTS="-Dcom.sun.management.jmxremote.port=8091 -Dcom.sun.management.jmxremote.authenticate=true -Dcom.sun.management.jmxremote.password.file=/tmp/apache-tomcat-7/conf/jmxremote.password -Dcom.sun.management.jmxremote.access.file=/tmp/apache-tomcat-7/conf/jmxremote.access -Dcom.sun.management.jmxremote.ssl=false" export CATALINA_OUT="/tmp/apache-tomcat-7/catalina.out"' > /tmp/apache-tomcat-7/bin/setenv.sh && echo 'monitorRole readonly' > /tmp/apache-tomcat-7/conf/jmxremote.access && echo 'monitorRole tomcat' > /tmp/apache-tomcat-7/conf/jmxremote.password && chmod 400 /tmp/apache-tomcat-7/conf/jmxremote.password
- curl -L https://raw.github.com/DataDog/dd-agent/jmx_multiple_checks/tests/tomcat_cfg.xml > /tmp/apache-tomcat-6/conf/server.xml
- curl -L https://raw.github.com/DataDog/dd-agent/master/tests/tomcat_cfg.xml > /tmp/apache-tomcat-6/conf/server.xml
- curl -L http://mirror.cc.columbia.edu/pub/software/apache/lucene/solr/3.6.1/apache-solr-3.6.1.tgz > /tmp/solr.tgz && tar -C /tmp -xzf /tmp/solr.tgz && mv /tmp/apache-solr-3.6.1 /tmp/apache-solr-3 && echo 'monitorRole readonly' > /tmp/apache-solr-3/example/jmxremote.access && echo 'monitorRole solr' > /tmp/apache-solr-3/example/jmxremote.password && chmod 400 /tmp/apache-solr-3/example/jmxremote.password
- sudo apt-get install nginx
- curl -L https://raw.github.com/DataDog/dd-agent/multiple_instances/tests/nginx.conf > /tmp/default.conf
- curl -L https://raw.github.com/DataDog/dd-agent/master/tests/nginx.conf > /tmp/default.conf
- sudo cp /tmp/default.conf /etc/nginx/conf.d/default.conf
- sudo /etc/init.d/nginx restart
- sudo apt-get install apache2
- sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/checks_to_checksd/tests/apache/ports.conf > /etc/apache2/ports.conf"
- sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/checks_to_checksd/tests/apache/apache.conf > /etc/apache2/apache.conf"
- sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/master/tests/apache/ports.conf > /etc/apache2/ports.conf"
- sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/master/tests/apache/apache.conf > /etc/apache2/apache.conf"
- sudo /etc/init.d/apache2 restart
- sudo apt-get remove memcached
- sudo apt-get install memcached
- sudo mkdir -p /etc/dd-agent/
- sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/master/datadog.conf.example > /etc/dd-agent/datadog.conf"
env:
- DB=redis
66 changes: 18 additions & 48 deletions agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
(C) Datadog, Inc. 2010 all rights reserved
'''

# set up logging before importing any other components
from config import initialize_logging; initialize_logging('collector')

import os; os.umask(022)

# Core modules
Expand Down Expand Up @@ -44,8 +47,7 @@
WATCHDOG_MULTIPLIER = 10

# Globals
agent_logger = logging.getLogger('agent')

log = logging.getLogger('collector')

class Agent(Daemon):
"""
Expand All @@ -58,7 +60,7 @@ def __init__(self, pidfile):
self.collector = None

def _handle_sigterm(self, signum, frame):
agent_logger.debug("Caught sigterm. Stopping run loop.")
log.debug("Caught sigterm. Stopping run loop.")
self.run_forever = False
if self.collector:
self.collector.stop()
Expand Down Expand Up @@ -87,7 +89,7 @@ def run(self, config=None):
# Configure the watchdog.
check_frequency = int(agentConfig['check_freq'])
watchdog = self._get_watchdog(check_frequency, agentConfig)

# Run the main loop.
while self.run_forever:
# Do the work.
Expand All @@ -108,7 +110,7 @@ def run(self, config=None):

# Explicitly kill the process, because it might be running
# as a daemon.
agent_logger.info("Exiting. Bye bye.")
log.info("Exiting. Bye bye.")
sys.exit(0)

def _get_emitters(self, agentConfig):
Expand All @@ -132,51 +134,16 @@ def _set_agent_config_hostname(self, agentConfig):
if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
instanceId = EC2.get_instance_id()
if instanceId is not None:
agent_logger.info("Running on EC2, instanceId: %s" % instanceId)
log.info("Running on EC2, instanceId: %s" % instanceId)
agentConfig['hostname'] = instanceId
else:
agent_logger.info('Not running on EC2, using hostname to identify this server')
log.info('Not running on EC2, using hostname to identify this server')
return agentConfig


def setup_logging(agentConfig):
"""Configure logging to use syslog whenever possible.
Also controls debug_mode."""
if agentConfig['debug_mode']:
logFile = "/tmp/dd-agent.log"
logging.basicConfig(filename=logFile, filemode='w',
level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.info("Logging to %s" % logFile)
else:
try:
from logging.handlers import SysLogHandler
rootLog = logging.getLogger()
rootLog.setLevel(logging.INFO)

sys_log_addr = "/dev/log"

# Special-case macs
if sys.platform == 'darwin':
sys_log_addr = "/var/run/syslog"

handler = SysLogHandler(address=sys_log_addr, facility=SysLogHandler.LOG_DAEMON)
formatter = logging.Formatter("dd-agent - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
rootLog.addHandler(handler)
logging.info('Logging to syslog is set up')
except Exception,e:
sys.stderr.write("Error while setting up syslog logging (%s). No logging available" % str(e))
logging.disable(logging.ERROR)


def main():
options, args = get_parsed_args()
agentConfig = get_config(options=options)

# Logging
setup_logging(agentConfig)


COMMANDS = [
'start',
'stop',
Expand All @@ -196,7 +163,7 @@ def main():
return 3

pid_file = PidFile('dd-agent')

# Only initialize the Agent if we're starting or stopping it.
if command in ['start', 'stop', 'restart', 'foreground']:

Expand All @@ -206,15 +173,15 @@ def main():
agent = Agent(pid_file.get_path())

if 'start' == command:
logging.info('Start daemon')
log.info('Start daemon')
agent.start()

elif 'stop' == command:
logging.info('Stop daemon')
log.info('Stop daemon')
agent.stop()

elif 'restart' == command:
logging.info('Restart daemon')
log.info('Restart daemon')
agent.restart()

elif 'foreground' == command:
Expand All @@ -227,10 +194,13 @@ def main():
pid = pid_file.get_pid()
if pid is not None:
sys.stdout.write('dd-agent is running as pid %s.\n' % pid)
log.info("dd-agent is running as pid %s." % pid)
else:
sys.stdout.write('dd-agent is not running.\n')
log.info("dd-agent is not running.")

elif 'info' == command:
logging.getLogger().setLevel(logging.ERROR)
return CollectorStatus.print_latest_status(verbose=options.verbose)

return 0
Expand All @@ -239,10 +209,10 @@ def main():
if __name__ == '__main__':
try:
sys.exit(main())
except Exception:
except StandardError:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does StandardError work with Python 3.x too?
(If not, why don't we just catch "Exception"? Since we just log and exit, it's not a problem if we also catch sys.exit().)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

StandardError is gone in 3.x, but I don't think supporting everything from 2.4 - 3.x with the same code is a reasonable goal.

daemon.py does some process magic that calls sys.exit() a few times and results in ugly stack traces in 2.4. I think clean logs are more valuable than being 3.x compatible.

# Try our best to log the error.
try:
agent_logger.exception("Uncaught error running the agent")
log.exception("Uncaught error running the agent")
except:
pass
raise
12 changes: 5 additions & 7 deletions aggregator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import logging
from time import time


logger = logging.getLogger(__name__)

log = logging.getLogger(__name__)

class Infinity(Exception): pass
class UnknownValue(Exception): pass
Expand Down Expand Up @@ -199,12 +197,12 @@ def sample(self, value, sample_rate):
def _rate(self, sample1, sample2):
interval = sample2[0] - sample1[0]
if interval == 0:
logger.warn('Metric %s has an interval of 0. Not flushing.' % self.name)
log.warn('Metric %s has an interval of 0. Not flushing.' % self.name)
raise Infinity()

delta = sample2[1] - sample1[1]
if delta < 0:
logger.warn('Metric %s has a rate < 0. Not flushing.' % self.name)
log.warn('Metric %s has a rate < 0. Not flushing.' % self.name)
raise UnknownValue()

return (delta / interval)
Expand Down Expand Up @@ -349,13 +347,13 @@ def flush(self):
metrics = []
for context, metric in self.metrics.items():
if metric.last_sample_time < expiry_timestamp:
logger.debug("%s hasn't been submitted in %ss. Expiring." % (context, self.expiry_seconds))
log.debug("%s hasn't been submitted in %ss. Expiring." % (context, self.expiry_seconds))
del self.metrics[context]
else:
metrics += metric.flush(timestamp, self.interval)

# Save some stats.
logger.debug("received %s payloads since last flush" % self.count)
log.debug("received %s payloads since last flush" % self.count)
self.total_count += self.count
self.count = 0
return metrics
Expand Down
6 changes: 4 additions & 2 deletions checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from util import LaconicFilter
from checks import check_status

log = logging.getLogger(__name__)

# Konstants
class CheckException(Exception): pass
class Infinity(CheckException): pass
Expand Down Expand Up @@ -273,7 +275,7 @@ def __init__(self, name, init_config, agentConfig, instances=None):
self.init_config = init_config
self.agentConfig = agentConfig
self.hostname = gethostname(agentConfig)
self.log = logging.getLogger('checks.%s' % name)
self.log = logging.getLogger('%s.%s' % (__name__, name))
self.aggregator = MetricsAggregator(self.hostname, formatter=agent_formatter)
self.events = []
self.instances = instances or []
Expand Down Expand Up @@ -482,7 +484,7 @@ def gethostname(agentConfig):
try:
return socket.getfqdn()
except socket.error, e:
logging.debug("processes: unable to get hostname: " + str(e))
log.debug("processes: unable to get hostname: " + str(e))

def agent_formatter(metric, value, timestamp, tags, hostname, device_name=None):
""" Formats metrics coming from the MetricsAggregator. Will look like:
Expand Down
2 changes: 1 addition & 1 deletion checks/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def check(self, logger, agentConfig):

hudson_home, apiKey = sys.argv[1:3]

logger = logging.getLogger('hudson')
logger = logging.getLogger('ddagent.checks.hudson')
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())
hudson = Hudson()
Expand Down
16 changes: 16 additions & 0 deletions checks/check_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,21 @@ def stylize(cls, text, *styles):
def style(*args):
return Stylizer.stylize(*args)

def logger_info():
loggers = []
root_logger = logging.getLogger()
if len(root_logger.handlers) > 0:
for handler in root_logger.handlers:
if isinstance(handler, logging.StreamHandler):
loggers.append(handler.stream.name)
if isinstance(handler, logging.handlers.SysLogHandler):
if isinstance(handler.address, basestring):
loggers.append('syslog:%s' % handler.address)
else:
loggers.append('syslog:(%s, %s)' % handler.address)
else:
loggers.append("No loggers configured")
return ', '.join(loggers)


class AgentStatus(object):
Expand Down Expand Up @@ -125,6 +140,7 @@ def _header_lines(self, indent):
("Pid", self.created_by_pid),
("Platform", platform.platform()),
("Python Version", platform.python_version()),
("Logs", logger_info()),
]

for key, value in fields:
Expand Down
Loading