Update logging calls to take advantage of structured logging

Includes a style guide that details the basics of how to log.
influxdata · Feb 17, 2018 · 791f7a7 · 791f7a7
1 parent 1a3af44
commit 791f7a7
Show file tree

Hide file tree

Showing 25 changed files with 487 additions and 165 deletions.
diff --git a/cmd/influxd/main.go b/cmd/influxd/main.go
@@ -96,11 +96,11 @@ func (m *Main) Run(args ...string) error {
 		cmd.Logger.Info("Waiting for clean shutdown...")
 		select {
 		case <-signalCh:
-			cmd.Logger.Info("second signal received, initializing hard shutdown")
+			cmd.Logger.Info("Second signal received, initializing hard shutdown")
 		case <-time.After(time.Second * 30):
-			cmd.Logger.Info("time limit reached, initializing hard shutdown")
+			cmd.Logger.Info("Time limit reached, initializing hard shutdown")
 		case <-cmd.Closed:
-			cmd.Logger.Info("server shutdown completed")
+			cmd.Logger.Info("Server shutdown completed")
 		}
 
 		// goodbye.

diff --git a/cmd/influxd/run/command.go b/cmd/influxd/run/command.go
@@ -99,9 +99,13 @@ func (cmd *Command) Run(args ...string) error {
 	}
 
 	// Mark start-up in log.
-	cmd.Logger.Info(fmt.Sprintf("InfluxDB starting, version %s, branch %s, commit %s",
-		cmd.Version, cmd.Branch, cmd.Commit))
-	cmd.Logger.Info(fmt.Sprintf("Go version %s, GOMAXPROCS set to %d", runtime.Version(), runtime.GOMAXPROCS(0)))
+	cmd.Logger.Info("InfluxDB starting",
+		zap.String("version", cmd.Version),
+		zap.String("branch", cmd.Branch),
+		zap.String("commit", cmd.Commit))
+	cmd.Logger.Info("Go runtime",
+		zap.String("version", runtime.Version()),
+		zap.Int("maxprocs", runtime.GOMAXPROCS(0)))
 
 	// If there was an error on startup when creating the logger, output it now.
 	if logErr != nil {
@@ -187,7 +191,7 @@ func (cmd *Command) monitorServerErrors() {
 func (cmd *Command) removePIDFile() {
 	if cmd.pidfile != "" {
 		if err := os.Remove(cmd.pidfile); err != nil {
-			cmd.Logger.Error("unable to remove pidfile", zap.Error(err))
+			cmd.Logger.Error("Unable to remove pidfile", zap.Error(err))
 		}
 	}
 }
@@ -235,11 +239,11 @@ func (cmd *Command) writePIDFile(path string) error {
 func (cmd *Command) ParseConfig(path string) (*Config, error) {
 	// Use demo configuration if no config path is specified.
 	if path == "" {
-		cmd.Logger.Info("no configuration provided, using default settings")
+		cmd.Logger.Info("No configuration provided, using default settings")
 		return NewDemoConfig()
 	}
 
-	cmd.Logger.Info(fmt.Sprintf("Using configuration at: %s", path))
+	cmd.Logger.Info("Loading configuration file", zap.String("path", path))
 
 	config := NewConfig()
 	if err := config.FromTomlFile(path); err != nil {

diff --git a/coordinator/points_writer.go b/coordinator/points_writer.go
@@ -2,7 +2,6 @@ package coordinator
 
 import (
 	"errors"
-	"fmt"
 	"sort"
 	"sync"
 	"sync/atomic"
@@ -380,15 +379,15 @@ func (w *PointsWriter) writeToShard(shard *meta.ShardInfo, database, retentionPo
 	if err == tsdb.ErrShardNotFound {
 		err = w.TSDBStore.CreateShard(database, retentionPolicy, shard.ID, true)
 		if err != nil {
-			w.Logger.Info(fmt.Sprintf("write failed for shard %d: %v", shard.ID, err))
+			w.Logger.Info("Write failed", zap.Uint64("shard", shard.ID), zap.Error(err))
 
 			atomic.AddInt64(&w.stats.WriteErr, 1)
 			return err
 		}
 	}
 	err = w.TSDBStore.WriteToShard(shard.ID, points)
 	if err != nil {
-		w.Logger.Info(fmt.Sprintf("write failed for shard %d: %v", shard.ID, err))
+		w.Logger.Info("Write failed", zap.Uint64("shard", shard.ID), zap.Error(err))
 		atomic.AddInt64(&w.stats.WriteErr, 1)
 		return err
 	}

diff --git a/logger/logger.go b/logger/logger.go
@@ -11,6 +11,8 @@ import (
 	"go.uber.org/zap/zapcore"
 )
 
+const TimeFormat = "2006-01-02T15:04:05.000000Z07:00"
+
 func New(w io.Writer) *zap.Logger {
 	config := NewConfig()
 	l, _ := config.New(w)
@@ -63,11 +65,13 @@ func newEncoder(format string) (zapcore.Encoder, error) {
 func newEncoderConfig() zapcore.EncoderConfig {
 	config := zap.NewProductionEncoderConfig()
 	config.EncodeTime = func(ts time.Time, encoder zapcore.PrimitiveArrayEncoder) {
-		encoder.AppendString(ts.UTC().Format(time.RFC3339))
+		encoder.AppendString(ts.UTC().Format(TimeFormat))
 	}
 	config.EncodeDuration = func(d time.Duration, encoder zapcore.PrimitiveArrayEncoder) {
-		encoder.AppendString(d.String())
+		val := float64(d) / float64(time.Millisecond)
+		encoder.AppendString(fmt.Sprintf("%.3fms", val))
 	}
+	config.LevelKey = "lvl"
 	return config
 }
 
@@ -80,3 +84,44 @@ func IsTerminal(w io.Writer) bool {
 	}
 	return false
 }
+
+const (
+	year = 365 * 24 * time.Hour
+	week = 7 * 24 * time.Hour
+	day  = 24 * time.Hour
+)
+
+func DurationLiteral(key string, val time.Duration) zapcore.Field {
+	if val == 0 {
+		return zap.String(key, "0s")
+	}
+
+	var (
+		value int
+		unit  string
+	)
+	switch {
+	case val%year == 0:
+		value = int(val / year)
+		unit = "y"
+	case val%week == 0:
+		value = int(val / week)
+		unit = "w"
+	case val%day == 0:
+		value = int(val / day)
+		unit = "d"
+	case val%time.Hour == 0:
+		value = int(val / time.Hour)
+		unit = "h"
+	case val%time.Minute == 0:
+		value = int(val / time.Minute)
+		unit = "m"
+	case val%time.Second == 0:
+		value = int(val / time.Second)
+		unit = "s"
+	default:
+		value = int(val / time.Millisecond)
+		unit = "ms"
+	}
+	return zap.String(key, fmt.Sprintf("%d%s", value, unit))
+}
diff --git a/logger/style_guide.md b/logger/style_guide.md
@@ -0,0 +1,186 @@
+# Logging Style Guide
+
+The intention of logging is to give the administrator insight into how
+the server is running and also to notify the administrator of any problems
+or potential problems with the system.
+
+At the moment, log level filtering is the only option to configure
+logging in InfluxDB. Adding a logging message and choosing its level
+should be done according to the guidelines in this document for
+operational clarity. The available log levels are:
+
+* Error
+* Warn
+* Info
+* Debug
+
+InfluxDB uses structured logging. Structured logging is when you log
+messages and attach context to those messages with more easily read data
+regarding the state of the system. A structured log message is composed
+of:
+
+* Time
+* Level
+* Message
+* (Optionally) Additional context
+
+## Guidelines
+
+**Log messages** should be simple statements or phrases that begin with
+a capital letter, but have no punctuation at the end. The message should be a
+constant so that every time it is logged it is easily identified and can
+be filtered on without regular expressions.
+
+Any **dynamic content** should be expressed by context. The key should
+be a constant and the value is the dynamic content.
+
+Do not log messages in tight loops or other high performance locations.
+It will likely create a performance problem.
+
+## Naming Conventions
+
+If the log encoding format uses keys for the time, message, or level,
+the key names should be `ts` for time, `msg` for the message, and
+`lvl` for the level.
+
+If the log encoding format does not use keys for the time, message, or
+level and instead outputs them in some other method, this guideline can
+be ignored. The output formats logfmt and json both use keys when
+encoding these values.
+
+### Context Key Names
+
+The key for the dynamic content in the context should be formatted in
+`snake_case`. The key should be completely lower case.
+
+## Levels
+
+As a reminder, levels are usually the only way to configure what is
+logged. There are four available logging levels.
+
+* Error
+* Warn
+* Info
+* Debug
+
+It is important to get the right logging level to ensure the log
+messages are useful for end users to act on.
+
+In general, when considering which log level to use, you should use
+**info**. If you are considering using another level, read the below
+expanded descriptions to determine which level your message belongs in.
+
+### Error
+
+The **error** level is intended to communicate that there is a serious
+problem with the server. **An error should be emitted only when an
+on-call engineer can take some action to remedy the situation _and_ the
+system cannot continue operating properly without remedying the
+situation.**
+
+An example of what may qualify as an error level message is the creation
+of the internal storage for the monitor service. For that system to
+function at all, a database must be created. If no database is created,
+the service itself cannot function. The error has a clear actionable
+solution. Figure out why the database isn't being created and create it.
+
+An example of what does not qualify as an error is failing to parse a
+query or a socket closing prematurely. Both of these usually indicate
+some kind of user error rather than system error. Both are ephemeral
+errors and they would not be clearly actionable to an administrator who
+was paged at 3 AM. Both of these are examples of logging messages that
+should be emitted at the info level with an error key rather than being
+logged at the error level.
+
+Logged errors **must not propagate**. Propagating the error risks
+logging it in multiple locations and confusing users when the same error
+is reported multiple times. In general, if you are returning an error,
+never log at any level. By returning the error, you are telling the
+parent function to handle the error. Logging a message at any level is
+handling the error.
+
+This logging level should be used very rarely and any messages that
+use this logging level should not repeat frequently. Assume that
+anything that is logged with error will page someone in the middle of
+the night.
+
+### Warn
+
+The **warn** level is intended to communicate that there is likely to be
+a serious problem with the server if it is not addressed. **A warning
+should be emitted only when a support engineer can take some action to
+remedy the situation _and_ the system may not continue operating
+properly in the near future without remedying the situation.**
+
+An example of what may qualify as a warning is the `max-values-per-tag`
+setting. If the server starts to approach the maximum number of values,
+the server may stop being able to function properly when it reaches the
+maximum number.
+
+An example of what does not qualify as a warning is the
+`log-queries-after` setting. While the message is "warning" that a query
+was running for a long period of time, it is not clearly actionable and
+does not indicate that the server will fail in the near future. This
+should be logged at the info level instead.
+
+This logging level should be used very rarely and any messages that
+use this logging level should not repeat frequently. Assume that
+anything that is logged with warn will page someone in the middle of the
+night and potentially be ignored until normal working hours.
+
+### Info
+
+The **info** level should be used for almost anything. If you are not
+sure which logging level to use, use info. Temporary or user errors
+should be logged at the info level and any informational messages for
+administrators should be logged at this level. Info level messages
+should be safe for an administrator to discard if they really want to,
+but most people will run the system at the info level.
+
+### Debug
+
+The **debug** level exists to log messages that are useful only for
+debugging a bad running instance.
+
+This level should be rarely used if ever. If you intend to use this
+level, please have a rationale ready. Most messages that could be
+considered debug either shouldn't exist or should be logged at the info
+level. Debug messages will be suppressed by default.
+
+## Value Formatting
+
+Formatting for strings, integers, and other standard values is usually
+determined by the log format itself and those will be kept ambiguous.
+The following specific formatting choices are for data types that could
+be output in multiple ways.
+
+### Time
+
+Time values should be encoded using RFC3339 with microsecond precision.
+The size of the string should be normalized to the same number of digits
+every time to ensure that it is easier to read the time as a column.
+
+### Duration
+
+There are two types of durations.
+
+* Tracks a (usually small) period of time and is meant for timing how
+  long something takes. The content is dynamic and may be graphed.
+* Duration literal where the content is static, is unlikely to be
+  graphed, and usually comes from some type of configuration.
+
+If the content is dynamic, the duration should be printed as a number of
+milliseconds with a decimal indicating the number of microseconds. Any
+duration lower than microseconds should be truncated. The decimal section
+should always print exactly 3 digits after the decimal point.
+
+If the content is static, the duration should be printed with a single
+number and a suffix indicating the unit in years (`y`), weeks (`w`),
+days (`d`), hours (`h`), minutes (`m`), seconds (`s`), or
+milliseconds (`ms`). The suffix should be the greatest unit that can be
+used without truncating the value. As an example, if the duration is
+60 minutes, then `1h` should be used. If the duration is 61 minutes,
+then `61m` should be used.
+
+For anything lower than milliseconds that is static, the duration should
+be truncated. A value of zero should be shown as `0s`.