Skip to content

Commit

Permalink
Merge pull request #3916 from influxdb/new_stats_diags
Browse files Browse the repository at this point in the history
Statistics and Diagnostics service
  • Loading branch information
otoolep committed Sep 2, 2015
2 parents b423599 + 366c011 commit 14c04eb
Show file tree
Hide file tree
Showing 16 changed files with 581 additions and 134 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ With this release InfluxDB is moving to Go 1.5.
### Features
- [#3863](https://github.com/influxdb/influxdb/pull/3863): Move to Go 1.5
- [#3892](https://github.com/influxdb/influxdb/pull/3892): Support IF NOT EXISTS for CREATE DATABASE
- [#3916](https://github.com/influxdb/influxdb/pull/3916): New statistics and diagnostics support. Graphite first to be instrumented.

### Bugfixes
- [#3804](https://github.com/influxdb/influxdb/pull/3804): init.d script fixes, fixes issue 3803.
Expand Down
7 changes: 3 additions & 4 deletions cmd/influxd/run/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ import (

"github.com/influxdb/influxdb/cluster"
"github.com/influxdb/influxdb/meta"
"github.com/influxdb/influxdb/monitor"
"github.com/influxdb/influxdb/services/admin"
"github.com/influxdb/influxdb/services/collectd"
"github.com/influxdb/influxdb/services/continuous_querier"
"github.com/influxdb/influxdb/services/graphite"
"github.com/influxdb/influxdb/services/hh"
"github.com/influxdb/influxdb/services/httpd"
"github.com/influxdb/influxdb/services/monitor"
"github.com/influxdb/influxdb/services/opentsdb"
"github.com/influxdb/influxdb/services/precreator"
"github.com/influxdb/influxdb/services/retention"
Expand All @@ -36,14 +36,14 @@ type Config struct {
Precreator precreator.Config `toml:"shard-precreation"`

Admin admin.Config `toml:"admin"`
Monitor monitor.Config `toml:"monitor"`
HTTPD httpd.Config `toml:"http"`
Graphites []graphite.Config `toml:"graphite"`
Collectd collectd.Config `toml:"collectd"`
OpenTSDB opentsdb.Config `toml:"opentsdb"`
UDPs []udp.Config `toml:"udp"`

// Snapshot SnapshotConfig `toml:"snapshot"`
Monitoring monitor.Config `toml:"monitoring"`
ContinuousQuery continuous_querier.Config `toml:"continuous_queries"`

HintedHandoff hh.Config `toml:"hinted-handoff"`
Expand All @@ -61,12 +61,12 @@ func NewConfig() *Config {
c.Precreator = precreator.NewConfig()

c.Admin = admin.NewConfig()
c.Monitor = monitor.NewConfig()
c.HTTPD = httpd.NewConfig()
c.Collectd = collectd.NewConfig()
c.OpenTSDB = opentsdb.NewConfig()
c.Graphites = append(c.Graphites, graphite.NewConfig())

c.Monitoring = monitor.NewConfig()
c.ContinuousQuery = continuous_querier.NewConfig()
c.Retention = retention.NewConfig()
c.HintedHandoff = hh.NewConfig()
Expand Down Expand Up @@ -95,7 +95,6 @@ func NewDemoConfig() (*Config, error) {
c.Data.WALDir = filepath.Join(homeDir, ".influxdb/wal")

c.Admin.Enabled = true
c.Monitoring.Enabled = false

return c, nil
}
Expand Down
2 changes: 0 additions & 2 deletions cmd/influxd/run/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ enabled = true
t.Fatalf("unexpected opentsdb bind address: %s", c.OpenTSDB.BindAddress)
} else if c.UDPs[0].BindAddress != ":4444" {
t.Fatalf("unexpected udp bind address: %s", c.UDPs[0].BindAddress)
} else if c.Monitoring.Enabled != true {
t.Fatalf("unexpected monitoring enabled: %v", c.Monitoring.Enabled)
} else if c.ContinuousQuery.Enabled != true {
t.Fatalf("unexpected continuous query enabled: %v", c.ContinuousQuery.Enabled)
}
Expand Down
15 changes: 15 additions & 0 deletions cmd/influxd/run/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/influxdb/influxdb/cluster"
"github.com/influxdb/influxdb/meta"
"github.com/influxdb/influxdb/monitor"
"github.com/influxdb/influxdb/services/admin"
"github.com/influxdb/influxdb/services/collectd"
"github.com/influxdb/influxdb/services/continuous_querier"
Expand Down Expand Up @@ -57,6 +58,8 @@ type Server struct {
ClusterService *cluster.Service
SnapshotterService *snapshotter.Service

MonitorService *monitor.Service

// Server reporting
reportingDisabled bool

Expand Down Expand Up @@ -85,6 +88,16 @@ func NewServer(c *Config, version string) (*Server, error) {
reportingDisabled: c.ReportingDisabled,
}

// Start the monitor service.
clusterID, err := s.MetaStore.ClusterID()
if err != nil {
return nil, err
}
s.MonitorService = monitor.NewService(c.Monitor)
if err := s.MonitorService.Open(clusterID, s.MetaStore.NodeID(), s.Hostname); err != nil {
return nil, err
}

// Copy TSDB configuration.
s.TSDBStore.EngineOptions.MaxWALSize = c.Data.MaxWALSize
s.TSDBStore.EngineOptions.WALFlushInterval = time.Duration(c.Data.WALFlushInterval)
Expand All @@ -100,6 +113,7 @@ func NewServer(c *Config, version string) (*Server, error) {
s.QueryExecutor = tsdb.NewQueryExecutor(s.TSDBStore)
s.QueryExecutor.MetaStore = s.MetaStore
s.QueryExecutor.MetaStatementExecutor = &meta.StatementExecutor{Store: s.MetaStore}
s.QueryExecutor.MonitorStatementExecutor = s.MonitorService
s.QueryExecutor.ShardMapper = s.ShardMapper

// Set the shard writer
Expand Down Expand Up @@ -230,6 +244,7 @@ func (s *Server) appendGraphiteService(c graphite.Config) error {

srv.PointsWriter = s.PointsWriter
srv.MetaStore = s.MetaStore
srv.MonitorService = s.MonitorService
s.Services = append(s.Services, srv)
return nil
}
Expand Down
18 changes: 10 additions & 8 deletions etc/config.sample.toml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ reporting-disabled = false
enabled = true
check-interval = "10m"

###
### Controls the system self-monitoring, statistics, diagnostics, and expvar data.
###

[monitor]
store-enabled = true # Whether to record statistics in an InfluxDB system
store-database = "_internal" # The destination database for recorded statistics
store-interval = "1m" # The interval at which to record statistics
store-address = "http://127.0.0.1" # The protocol and host for the recorded data

###
### [admin]
###
Expand Down Expand Up @@ -207,14 +217,6 @@ reporting-disabled = false
# batch-size = 1000 # will flush if this many points get buffered
# batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit

###
### [monitoring]
###

[monitoring]
enabled = true
write-interval = "24h"

###
### [continuous_queries]
###
Expand Down
6 changes: 0 additions & 6 deletions meta/statement_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@ func (e *StatementExecutor) ExecuteStatement(stmt influxql.Statement) *influxql.
return e.executeDropContinuousQueryStatement(stmt)
case *influxql.ShowContinuousQueriesStatement:
return e.executeShowContinuousQueriesStatement(stmt)
case *influxql.ShowStatsStatement:
return e.executeShowStatsStatement(stmt)
default:
panic(fmt.Sprintf("unsupported statement type: %T", stmt))
}
Expand Down Expand Up @@ -283,7 +281,3 @@ func (e *StatementExecutor) executeShowContinuousQueriesStatement(stmt *influxql
}
return &influxql.Result{Series: rows}
}

// executeShowStatsStatement is the handler for SHOW STATS statements. It does
// not inspect stmt; it always returns a Result whose Err reports that
// SHOW STATS is not implemented yet.
func (e *StatementExecutor) executeShowStatsStatement(stmt *influxql.ShowStatsStatement) *influxql.Result {
return &influxql.Result{Err: fmt.Errorf("SHOW STATS is not implemented yet")}
}
49 changes: 49 additions & 0 deletions monitor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# System Monitoring
_System Monitoring_ means all statistical and diagnostic information made available to the user of an InfluxDB system, about the system itself. Its purpose is to assist with troubleshooting and performance analysis.

## Supported Commands

* `SHOW STATS`
* `SHOW DIAGNOSTICS`

If statistical information is also written to an InfluxDB system, the data will also be queryable by the InfluxQL query language.

## Statistics vs. Diagnostics
A distinction between _statistics_ and _diagnostics_ is made for the purposes of monitoring. Generally, a statistical quantity is something that is being counted, and which it makes sense to store for historical analysis. Diagnostic information is not necessarily numerical, and may not make sense to store.

An example of statistical information would be the number of points received over UDP, or the number of queries executed. Examples of diagnostic information would be a list of current Graphite TCP connections, the version of InfluxDB, or the uptime of the process.

## Design and Implementation

A new module named `monitor` supports all statistics and diagnostic functionality. This includes:

* Allowing other modules to register statistics and diagnostics information, allowing it to be accessed on demand by the `monitor` module.
* Serving the statistics and diagnostic information to the user, in response to commands such as `SHOW DIAGNOSTICS`.
* Exposing standard Go runtime information such as garbage collection statistics.
* Making all collected expvar data available via HTTP, for collection by 3rd-party tools.
* Writing the statistical information to an InfluxDB system, for historical analysis. This may be the same system generating the statistical information, but it does not have to be. Information is written using the Line Protocol.

To register with `monitor`, a module must implement the following interface:

```
type Client interface {
Statistics() (map[string]interface{}, error)
Diagnostics() (map[string]interface{}, error)
}
```

The module then calls `Register(name string, tags map[string]string, client Client)`. `name` is the Measurement name that will be associated with the statistics. `tags` will be the tags, though an empty map is acceptable. `client` is the module which implements the `Client` interface.

### expvar
Statistical information is gathered by each package using [expvar](https://golang.org/pkg/expvar). Each package registers a map using its package name.

Due to the nature of `expvar`, statistical information is reset to its initial state when a server is restarted.

## Configuration
The `monitor` module will allow the following configuration:

* Whether to write statistical and diagnostic information to an InfluxDB system. This is enabled by default.
* The name of the database to which this information should be written. Defaults to `_internal`. The information is written to the default retention policy for the given database.
* The name of the retention policy, along with full configuration control of the retention policy.
* The address and port of the InfluxDB system. This will default to the system generating the data.
* The rate at which this information should be written. The maximum rate will be once a second.
40 changes: 40 additions & 0 deletions monitor/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package monitor

import (
"time"

"github.com/influxdb/influxdb/toml"
)

// Default values for the monitor service's store-* configuration settings.
const (
// DefaultStoreEnabled is whether the system writes gathered information in
// an InfluxDB system for historical analysis.
DefaultStoreEnabled = true

// DefaultStoreDatabase is the name of the database where gathered
// information is written.
DefaultStoreDatabase = "_internal"

// DefaultStoreInterval is the period between storing gathered information
// (one minute).
DefaultStoreInterval = time.Minute

// DefaultStoreAddress is the destination system for gathered information,
// given as a host:port pair on the loopback interface.
DefaultStoreAddress = "127.0.0.1:8086"
)

// Config represents the configuration for the monitor service. Each field is
// mapped to the TOML key named in its struct tag.
type Config struct {
// StoreEnabled is whether gathered information is written to an InfluxDB system.
StoreEnabled bool `toml:"store-enabled"`
// StoreDatabase is the name of the database where gathered information is written.
StoreDatabase string `toml:"store-database"`
// StoreInterval is the period between storing gathered information.
StoreInterval toml.Duration `toml:"store-interval"`
// StoreAddress is the destination system for gathered information.
StoreAddress string `toml:"store-address"`
}

// NewConfig returns an instance of Config populated with the package
// defaults: DefaultStoreEnabled, DefaultStoreDatabase, DefaultStoreInterval
// and DefaultStoreAddress.
func NewConfig() Config {
	return Config{
		// Use the declared default instead of a hard-coded literal so the
		// constructor agrees with DefaultStoreEnabled and the documented
		// "enabled by default" behaviour.
		StoreEnabled:  DefaultStoreEnabled,
		StoreDatabase: DefaultStoreDatabase,
		StoreInterval: toml.Duration(DefaultStoreInterval),
		StoreAddress:  DefaultStoreAddress,
	}
}
33 changes: 33 additions & 0 deletions monitor/config_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package monitor_test

import (
"testing"
"time"

"github.com/BurntSushi/toml"
"github.com/influxdb/influxdb/monitor"
)

// TestConfig_Parse decodes a TOML document that sets every store-* key and
// checks that each value lands in the matching monitor.Config field.
func TestConfig_Parse(t *testing.T) {
	const input = `
store-enabled=true
store-database="the_db"
store-interval="10m"
store-address="server1"
`

	// Decode the document into a zero-valued config.
	var cfg monitor.Config
	_, err := toml.Decode(input, &cfg)
	if err != nil {
		t.Fatal(err)
	}

	// Check the fields one at a time; the first mismatch aborts the test.
	if !cfg.StoreEnabled {
		t.Fatalf("unexpected store-enabled: %v", cfg.StoreEnabled)
	}
	if cfg.StoreDatabase != "the_db" {
		t.Fatalf("unexpected store-database: %s", cfg.StoreDatabase)
	}
	if time.Duration(cfg.StoreInterval) != 10*time.Minute {
		t.Fatalf("unexpected store-interval: %s", cfg.StoreInterval)
	}
	if cfg.StoreAddress != "server1" {
		t.Fatalf("unexpected store-address: %s", cfg.StoreAddress)
	}
}
37 changes: 37 additions & 0 deletions monitor/go_runtime.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package monitor

import (
"runtime"
)

// goRuntime reports Go runtime statistics. It carries no state and provides
// the Statistics and Diagnostics methods of the monitor client interface.
type goRuntime struct{}

// Statistics reads the current runtime.MemStats and returns a selection of
// its counters, together with the current goroutine count, as int64 values
// keyed by name. The returned error is always nil.
func (g *goRuntime) Statistics() (map[string]interface{}, error) {
	var mem runtime.MemStats
	runtime.ReadMemStats(&mem)

	stats := make(map[string]interface{}, 15)

	// General allocator counters.
	stats["Alloc"] = int64(mem.Alloc)
	stats["TotalAlloc"] = int64(mem.TotalAlloc)
	stats["Sys"] = int64(mem.Sys)
	stats["Lookups"] = int64(mem.Lookups)
	stats["Mallocs"] = int64(mem.Mallocs)
	stats["Frees"] = int64(mem.Frees)

	// Heap counters.
	stats["HeapAlloc"] = int64(mem.HeapAlloc)
	stats["HeapSys"] = int64(mem.HeapSys)
	stats["HeapIdle"] = int64(mem.HeapIdle)
	stats["HeapInUse"] = int64(mem.HeapInuse)
	stats["HeapReleased"] = int64(mem.HeapReleased)
	stats["HeapObjects"] = int64(mem.HeapObjects)

	// Garbage collector and scheduler counters.
	stats["PauseTotalNs"] = int64(mem.PauseTotalNs)
	stats["NumGC"] = int64(mem.NumGC)
	stats["NumGoroutine"] = int64(runtime.NumGoroutine())

	return stats, nil
}

// Diagnostics returns the diagnostics for the goRuntime type. None are
// reported: the result is always a nil map and a nil error.
func (g *goRuntime) Diagnostics() (map[string]interface{}, error) {
	return nil, nil
}
Loading

0 comments on commit 14c04eb

Please sign in to comment.