Skip to content

Commit

Permalink
Update logging calls to take advantage of structured logging
Browse files Browse the repository at this point in the history
Includes a style guide that details the basics of how to log.
  • Loading branch information
jsternberg committed Feb 17, 2018
1 parent 1a3af44 commit 791f7a7
Show file tree
Hide file tree
Showing 25 changed files with 487 additions and 165 deletions.
6 changes: 3 additions & 3 deletions cmd/influxd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,11 @@ func (m *Main) Run(args ...string) error {
cmd.Logger.Info("Waiting for clean shutdown...")
select {
case <-signalCh:
cmd.Logger.Info("second signal received, initializing hard shutdown")
cmd.Logger.Info("Second signal received, initializing hard shutdown")
case <-time.After(time.Second * 30):
cmd.Logger.Info("time limit reached, initializing hard shutdown")
cmd.Logger.Info("Time limit reached, initializing hard shutdown")
case <-cmd.Closed:
cmd.Logger.Info("server shutdown completed")
cmd.Logger.Info("Server shutdown completed")
}

// goodbye.
Expand Down
16 changes: 10 additions & 6 deletions cmd/influxd/run/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,13 @@ func (cmd *Command) Run(args ...string) error {
}

// Mark start-up in log.
cmd.Logger.Info(fmt.Sprintf("InfluxDB starting, version %s, branch %s, commit %s",
cmd.Version, cmd.Branch, cmd.Commit))
cmd.Logger.Info(fmt.Sprintf("Go version %s, GOMAXPROCS set to %d", runtime.Version(), runtime.GOMAXPROCS(0)))
cmd.Logger.Info("InfluxDB starting",
zap.String("version", cmd.Version),
zap.String("branch", cmd.Branch),
zap.String("commit", cmd.Commit))
cmd.Logger.Info("Go runtime",
zap.String("version", runtime.Version()),
zap.Int("maxprocs", runtime.GOMAXPROCS(0)))

// If there was an error on startup when creating the logger, output it now.
if logErr != nil {
Expand Down Expand Up @@ -187,7 +191,7 @@ func (cmd *Command) monitorServerErrors() {
func (cmd *Command) removePIDFile() {
if cmd.pidfile != "" {
if err := os.Remove(cmd.pidfile); err != nil {
cmd.Logger.Error("unable to remove pidfile", zap.Error(err))
cmd.Logger.Error("Unable to remove pidfile", zap.Error(err))
}
}
}
Expand Down Expand Up @@ -235,11 +239,11 @@ func (cmd *Command) writePIDFile(path string) error {
func (cmd *Command) ParseConfig(path string) (*Config, error) {
// Use demo configuration if no config path is specified.
if path == "" {
cmd.Logger.Info("no configuration provided, using default settings")
cmd.Logger.Info("No configuration provided, using default settings")
return NewDemoConfig()
}

cmd.Logger.Info(fmt.Sprintf("Using configuration at: %s", path))
cmd.Logger.Info("Loading configuration file", zap.String("path", path))

config := NewConfig()
if err := config.FromTomlFile(path); err != nil {
Expand Down
5 changes: 2 additions & 3 deletions coordinator/points_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package coordinator

import (
"errors"
"fmt"
"sort"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -380,15 +379,15 @@ func (w *PointsWriter) writeToShard(shard *meta.ShardInfo, database, retentionPo
if err == tsdb.ErrShardNotFound {
err = w.TSDBStore.CreateShard(database, retentionPolicy, shard.ID, true)
if err != nil {
w.Logger.Info(fmt.Sprintf("write failed for shard %d: %v", shard.ID, err))
w.Logger.Info("Write failed", zap.Uint64("shard", shard.ID), zap.Error(err))

atomic.AddInt64(&w.stats.WriteErr, 1)
return err
}
}
err = w.TSDBStore.WriteToShard(shard.ID, points)
if err != nil {
w.Logger.Info(fmt.Sprintf("write failed for shard %d: %v", shard.ID, err))
w.Logger.Info("Write failed", zap.Uint64("shard", shard.ID), zap.Error(err))
atomic.AddInt64(&w.stats.WriteErr, 1)
return err
}
Expand Down
49 changes: 47 additions & 2 deletions logger/logger.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"go.uber.org/zap/zapcore"
)

// TimeFormat is the layout used when encoding log timestamps. It is RFC3339
// with a fixed six-digit fractional second, so every encoded timestamp has
// the same width.
const TimeFormat = "2006-01-02T15:04:05.000000Z07:00"

func New(w io.Writer) *zap.Logger {
config := NewConfig()
l, _ := config.New(w)
Expand Down Expand Up @@ -63,11 +65,13 @@ func newEncoder(format string) (zapcore.Encoder, error) {
func newEncoderConfig() zapcore.EncoderConfig {
config := zap.NewProductionEncoderConfig()
config.EncodeTime = func(ts time.Time, encoder zapcore.PrimitiveArrayEncoder) {
encoder.AppendString(ts.UTC().Format(time.RFC3339))
encoder.AppendString(ts.UTC().Format(TimeFormat))
}
config.EncodeDuration = func(d time.Duration, encoder zapcore.PrimitiveArrayEncoder) {
encoder.AppendString(d.String())
val := float64(d) / float64(time.Millisecond)
encoder.AppendString(fmt.Sprintf("%.3fms", val))
}
config.LevelKey = "lvl"
return config
}

Expand All @@ -80,3 +84,44 @@ func IsTerminal(w io.Writer) bool {
}
return false
}

// Calendar-style units used when rendering duration literals. They are
// fixed-length approximations built on time.Hour: a day is always 24 hours
// and a year is always 365 days.
const (
	// day is 24 hours.
	day = 24 * time.Hour
	// week is seven days.
	week = 7 * day
	// year is 365 days.
	year = 365 * day
)

// DurationLiteral returns a string field named key that renders val as a
// duration literal: a single integer followed by a unit suffix.
//
// The suffix is the largest of y, w, d, h, m or s that divides val exactly.
// A value that is not a whole number of seconds is truncated to whole
// milliseconds and rendered with the ms suffix. A zero duration is rendered
// as "0s".
func DurationLiteral(key string, val time.Duration) zapcore.Field {
	if val == 0 {
		return zap.String(key, "0s")
	}

	var (
		// value is int64 rather than int: time.Duration is an int64, and a
		// millisecond count for anything longer than ~24.8 days would
		// overflow a 32-bit int.
		value int64
		unit  string
	)
	// Cases are ordered from the largest unit to the smallest so the first
	// unit that divides val without a remainder wins.
	switch {
	case val%year == 0:
		value = int64(val / year)
		unit = "y"
	case val%week == 0:
		value = int64(val / week)
		unit = "w"
	case val%day == 0:
		value = int64(val / day)
		unit = "d"
	case val%time.Hour == 0:
		value = int64(val / time.Hour)
		unit = "h"
	case val%time.Minute == 0:
		value = int64(val / time.Minute)
		unit = "m"
	case val%time.Second == 0:
		value = int64(val / time.Second)
		unit = "s"
	default:
		// Anything finer than a millisecond is dropped by the integer
		// division.
		value = int64(val / time.Millisecond)
		unit = "ms"
	}
	return zap.String(key, fmt.Sprintf("%d%s", value, unit))
}
186 changes: 186 additions & 0 deletions logger/style_guide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
# Logging Style Guide

The intention of logging is to give insight to the administrator of how
the server is running and also notify the administrator of any problems
or potential problems with the system.

At the moment, log level filtering is the only option to configure
logging in InfluxDB. Adding a logging message and choosing its level
should be done according to the guidelines in this document for
operational clarity. The available log levels are:

* Error
* Warn
* Info
* Debug

InfluxDB uses structured logging. Structured logging is when you log
messages and attach context to those messages with more easily read data
regarding the state of the system. A structured log message is composed
of:

* Time
* Level
* Message
* (Optionally) Additional context

## Guidelines

**Log messages** should be simple statements or phrases that begin with
a capital letter, but have no punctuation at the end. The message should be a
constant so that every time it is logged it is easily identified and can
be filtered on without regular expressions.

Any **dynamic content** should be expressed by context. The key should
be a constant and the value is the dynamic content.

Do not log messages in tight loops or other high performance locations.
It will likely create a performance problem.

## Naming Conventions

If the log encoding format uses keys for the time, message, or level,
the key names should be `ts` for time, `msg` for the message, and
`lvl` for the level.

If the log encoding format does not use keys for the time, message, or
level and instead outputs them in some other method, this guideline can
be ignored. The output formats logfmt and json both use keys when
encoding these values.

### Context Key Names

The key for the dynamic content in the context should be formatted in
`snake_case`. The key should be completely lower case.

## Levels

As a reminder, levels are usually the only way to configure what is
logged. There are four available logging levels.

* Error
* Warn
* Info
* Debug

It is important to get the right logging level to ensure the log
messages are useful for end users to act on.

In general, when considering which log level to use, you should use
**info**. If you are considering using another level, read the below
expanded descriptions to determine which level your message belongs in.

### Error

The **error** level is intended to communicate that there is a serious
problem with the server. **An error should be emitted only when an
on-call engineer can take some action to remedy the situation _and_ the
system cannot continue operating properly without remedying the
situation.**

An example of what may qualify as an error level message is the creation
of the internal storage for the monitor service. For that system to
function at all, a database must be created. If no database is created,
the service itself cannot function. The error has a clear actionable
solution. Figure out why the database isn't being created and create it.

An example of what does not qualify as an error is failing to parse a
query or a socket closing prematurely. Both of these usually indicate
some kind of user error rather than system error. Both are ephemeral
errors and they would not be clearly actionable to an administrator who
was paged at 3 AM. Both of these are examples of logging messages that
should be emitted at the info level with an error key rather than being
logged at the error level.

Logged errors **must not propagate**. Propagating the error risks
logging it in multiple locations and confusing users when the same error
is reported multiple times. In general, if you are returning an error,
never log at any level. By returning the error, you are telling the
parent function to handle the error. Logging a message at any level is
handling the error.

This logging level should be used very rarely and any messages that
use this logging level should not repeat frequently. Assume that
anything that is logged with error will page someone in the middle of
the night.

### Warn

The **warn** level is intended to communicate that there is likely to be
a serious problem with the server if it is not addressed. **A warning
should be emitted only when a support engineer can take some action to
remedy the situation _and_ the system may not continue operating
properly in the near future without remedying the situation.**

An example of what may qualify as a warning is the `max-values-per-tag`
setting. If the server starts to approach the maximum number of values,
the server may stop being able to function properly when it reaches the
maximum number.

An example of what does not qualify as a warning is the
`log-queries-after` setting. While the message is "warning" that a query
was running for a long period of time, it is not clearly actionable and
does not indicate that the server will fail in the near future. This
should be logged at the info level instead.

This logging level should be used very rarely and any messages that
use this logging level should not repeat frequently. Assume that
anything that is logged with warn will page someone in the middle of the
night and will potentially be ignored until normal working hours.

### Info

The **info** level should be used for almost anything. If you are not
sure which logging level to use, use info. Temporary or user errors
should be logged at the info level and any informational messages for
administrators should be logged at this level. Info level messages
should be safe for an administrator to discard if they really want to,
but most people will run the system at the info level.

### Debug

The **debug** level exists to log messages that are useful only for
debugging a bad running instance.

This level should be rarely used if ever. If you intend to use this
level, please have a rationale ready. Most messages that could be
considered debug either shouldn't exist or should be logged at the info
level. Debug messages will be suppressed by default.

## Value Formatting

Formatting for strings, integers, and other standard values is usually
determined by the log format itself, so it is left unspecified here.
The following specific formatting choices are for data types that could
be output in multiple ways.

### Time

Time values should be encoded using RFC3339 with microsecond precision.
The size of the string should be normalized to the same number of digits
every time to ensure that it is easier to read the time as a column.

### Duration

There are two types of durations.

* Tracks a (usually small) period of time and is meant for timing how
long something takes. The content is dynamic and may be graphed.
* Duration literal where the content is static, is unlikely to be
graphed, and usually comes from some type of configuration.

If the content is dynamic, the duration should be printed as a number of
milliseconds with a decimal indicating the number of microseconds. Any
duration lower than microseconds should be truncated. The decimal section
should always print exactly 3 digits after the decimal point.

If the content is static, the duration should be printed with a single
number and a suffix indicating the unit in years (`y`), weeks (`w`),
days (`d`), hours (`h`), minutes (`m`), seconds (`s`), or
milliseconds (`ms`). The suffix should be the greatest unit that can be
used without truncating the value. As an example, if the duration is
60 minutes, then `1h` should be used. If the duration is 61 minutes,
then `61m` should be used.

For anything lower than milliseconds that is static, the duration should
be truncated. A value of zero should be shown as `0s`.
Loading

0 comments on commit 791f7a7

Please sign in to comment.