From 16df404e80084e30e6c91bdfd929063ada6a7bd5 Mon Sep 17 00:00:00 2001 From: Daniel Liu Date: Fri, 13 Dec 2024 14:00:13 +0800 Subject: [PATCH] metrics: improve reading Go runtime metrics (#25886) This changes how we read performance metrics from the Go runtime. Instead of using runtime.ReadMemStats, we now rely on the API provided by package runtime/metrics. runtime/metrics provides more accurate information. For example, the new interface has better reporting of memory use. In my testing, the reported value of held memory more accurately reflects the usage reported by the OS. The semantics of metrics system/memory/allocs and system/memory/frees have changed to report amounts in bytes. ReadMemStats only reported the count of allocations in number-of-objects. This is imprecise: 'tiny objects' are not counted because the runtime allocates them in batches; and certain improvements in allocation behavior, such as struct size optimizations, will be less visible when the number of allocs doesn't change. Changing allocation reports to be in bytes makes it appear in graphs that lots more is being allocated. I don't think that's a problem because this metric is primarily interesting for geth developers. The metric system/memory/pauses has been changed to report statistical values from the histogram provided by the runtime. Its name in influxdb has changed from geth.system/memory/pauses.meter to geth.system/memory/pauses.histogram. We also have a new histogram metric, system/cpu/schedlatency, reporting the Go scheduler latency. --- metrics/influxdb/influxdb.go | 35 +-- metrics/metrics.go | 176 ++++++++++----- metrics/metrics_test.go | 32 +-- metrics/runtime.go | 212 ------------------ metrics/runtime_cgo.go | 10 - metrics/runtime_gccpufraction.go | 10 - metrics/runtime_no_cgo.go | 8 - metrics/runtime_no_gccpufraction.go | 10 - metrics/runtime_test.go | 88 -------- metrics/runtimehistogram.go | 319 ++++++++++++++++++++++++++++ metrics/runtimehistogram_test.go | 133 ++++++++++++ 11 files changed, 602 insertions(+), 431 deletions(-) delete mode 100644 metrics/runtime.go delete mode 100644 metrics/runtime_cgo.go delete mode 100644 metrics/runtime_gccpufraction.go delete mode 100644 metrics/runtime_no_cgo.go delete mode 100644 metrics/runtime_no_gccpufraction.go delete mode 100644 metrics/runtime_test.go create mode 100644 metrics/runtimehistogram.go create mode 100644 metrics/runtimehistogram_test.go diff --git a/metrics/influxdb/influxdb.go b/metrics/influxdb/influxdb.go index d4ebea74fbe7..56f19e944abf 100644 --- a/metrics/influxdb/influxdb.go +++ b/metrics/influxdb/influxdb.go @@ -135,27 +135,28 @@ func (r *reporter) send() error { }) case metrics.Histogram: ms := metric.Snapshot() - if ms.Count() > 0 { - ps := ms.Percentiles([]float64{0.5, 0.75, 0.95, 0.99, 0.999, 0.9999}) + ps := ms.Percentiles([]float64{0.25, 0.5, 0.75, 0.95, 0.99, 0.999, 0.9999}) + fields := map[string]interface{}{ + "count": ms.Count(), + "max": ms.Max(), + "mean": ms.Mean(), + "min": ms.Min(), + "stddev": ms.StdDev(), + "variance": ms.Variance(), + "p25": ps[0], + "p50": ps[1], + "p75": ps[2], + "p95": ps[3], + "p99": ps[4], + "p999": ps[5], + "p9999": ps[6], + } pts = append(pts, client.Point{ Measurement: fmt.Sprintf("%s%s.histogram", namespace, name), Tags: r.tags, - Fields: map[string]interface{}{ - "count": ms.Count(), - "max": ms.Max(), - "mean": ms.Mean(), - "min": ms.Min(), - "stddev": ms.StdDev(), - "variance": ms.Variance(), - "p50": ps[0], - "p75": ps[1], - "p95": ps[2], - "p99": ps[3], - "p999": ps[4], - "p9999": ps[5], - }, - Time: now, + Fields: fields, + Time: now, }) } case metrics.Meter: diff --git a/metrics/metrics.go b/metrics/metrics.go index 2df2404b5f60..2b8bad8bee36 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -7,7 +7,8 @@ package metrics import ( "os" - "runtime" + "runtime/metrics" + "runtime/pprof" "strings" "time" @@ -54,38 +55,106 @@ func init() { } } -// CollectProcessMetrics periodically collects various metrics about the running -// process. +var threadCreateProfile = pprof.Lookup("threadcreate") + +type runtimeStats struct { + GCPauses *metrics.Float64Histogram + GCAllocBytes uint64 + GCFreedBytes uint64 + + MemTotal uint64 + HeapObjects uint64 + HeapFree uint64 + HeapReleased uint64 + HeapUnused uint64 + + Goroutines uint64 + SchedLatency *metrics.Float64Histogram +} + +var runtimeSamples = []metrics.Sample{ + {Name: "/gc/pauses:seconds"}, // histogram + {Name: "/gc/heap/allocs:bytes"}, + {Name: "/gc/heap/frees:bytes"}, + {Name: "/memory/classes/total:bytes"}, + {Name: "/memory/classes/heap/objects:bytes"}, + {Name: "/memory/classes/heap/free:bytes"}, + {Name: "/memory/classes/heap/released:bytes"}, + {Name: "/memory/classes/heap/unused:bytes"}, + {Name: "/sched/goroutines:goroutines"}, + {Name: "/sched/latencies:seconds"}, // histogram +} + +func readRuntimeStats(v *runtimeStats) { + metrics.Read(runtimeSamples) + for _, s := range runtimeSamples { + // Skip invalid/unknown metrics. This is needed because some metrics + // are unavailable in older Go versions, and attempting to read a 'bad' + // metric panics. + if s.Value.Kind() == metrics.KindBad { + continue + } + + switch s.Name { + case "/gc/pauses:seconds": + v.GCPauses = s.Value.Float64Histogram() + case "/gc/heap/allocs:bytes": + v.GCAllocBytes = s.Value.Uint64() + case "/gc/heap/frees:bytes": + v.GCFreedBytes = s.Value.Uint64() + case "/memory/classes/total:bytes": + v.MemTotal = s.Value.Uint64() + case "/memory/classes/heap/objects:bytes": + v.HeapObjects = s.Value.Uint64() + case "/memory/classes/heap/free:bytes": + v.HeapFree = s.Value.Uint64() + case "/memory/classes/heap/released:bytes": + v.HeapReleased = s.Value.Uint64() + case "/memory/classes/heap/unused:bytes": + v.HeapUnused = s.Value.Uint64() + case "/sched/goroutines:goroutines": + v.Goroutines = s.Value.Uint64() + case "/sched/latencies:seconds": + v.SchedLatency = s.Value.Float64Histogram() + } + } +} + +// CollectProcessMetrics periodically collects various metrics about the running process. func CollectProcessMetrics(refresh time.Duration) { // Short circuit if the metrics system is disabled if !Enabled { return } + refreshFreq := int64(refresh / time.Second) // Create the various data collectors - cpuStats := make([]*CPUStats, 2) - memstats := make([]*runtime.MemStats, 2) - diskstats := make([]*DiskStats, 2) - for i := 0; i < len(memstats); i++ { - cpuStats[i] = new(CPUStats) - memstats[i] = new(runtime.MemStats) - diskstats[i] = new(DiskStats) - } - // Define the various metrics to collect var ( - cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry) - cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry) - cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry) - cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry) - cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry) - - memPauses = GetOrRegisterMeter("system/memory/pauses", DefaultRegistry) - memAllocs = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry) - memFrees = GetOrRegisterMeter("system/memory/frees", DefaultRegistry) - memHeld = GetOrRegisterGauge("system/memory/held", DefaultRegistry) - memUsed = GetOrRegisterGauge("system/memory/used", DefaultRegistry) + cpustats = make([]CPUStats, 2) + diskstats = make([]DiskStats, 2) + rstats = make([]runtimeStats, 2) + ) + + // This scale factor is used for the runtime's time metrics. It's useful to convert to + // ns here because the runtime gives times in float seconds, but runtimeHistogram can + // only provide integers for the minimum and maximum values. + const secondsToNs = float64(time.Second) + // Define the various metrics to collect + var ( + cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry) + cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry) + cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry) + cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry) + cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry) + cpuSchedLatency = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil) + memPauses = getOrRegisterRuntimeHistogram("system/memory/pauses", secondsToNs, nil) + memAllocs = GetOrRegisterMeter("system/memory/allocs", DefaultRegistry) + memFrees = GetOrRegisterMeter("system/memory/frees", DefaultRegistry) + memTotal = GetOrRegisterGauge("system/memory/held", DefaultRegistry) + heapUsed = GetOrRegisterGauge("system/memory/used", DefaultRegistry) + heapObjects = GetOrRegisterGauge("system/memory/objects", DefaultRegistry) diskReads = GetOrRegisterMeter("system/disk/readcount", DefaultRegistry) diskReadBytes = GetOrRegisterMeter("system/disk/readdata", DefaultRegistry) diskReadBytesCounter = GetOrRegisterCounter("system/disk/readbytes", DefaultRegistry) @@ -93,34 +162,43 @@ func CollectProcessMetrics(refresh time.Duration) { diskWriteBytes = GetOrRegisterMeter("system/disk/writedata", DefaultRegistry) diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry) ) - // Iterate loading the different stats and updating the meters - for i := 1; ; i++ { - location1 := i % 2 - location2 := (i - 1) % 2 - - ReadCPUStats(cpuStats[location1]) - cpuSysLoad.Update((cpuStats[location1].GlobalTime - cpuStats[location2].GlobalTime) / refreshFreq) - cpuSysWait.Update((cpuStats[location1].GlobalWait - cpuStats[location2].GlobalWait) / refreshFreq) - cpuProcLoad.Update((cpuStats[location1].LocalTime - cpuStats[location2].LocalTime) / refreshFreq) + + // Iterate loading the different stats and updating the meters. + now, prev := 0, 1 + for ; ; now, prev = prev, now { + // CPU + ReadCPUStats(&cpustats[now]) + cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq) + cpuSysWait.Update((cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq) + cpuProcLoad.Update((cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq) + + // Threads cpuThreads.Update(int64(threadCreateProfile.Count())) - cpuGoroutines.Update(int64(runtime.NumGoroutine())) - - runtime.ReadMemStats(memstats[location1]) - memPauses.Mark(int64(memstats[location1].PauseTotalNs - memstats[location2].PauseTotalNs)) - memAllocs.Mark(int64(memstats[location1].Mallocs - memstats[location2].Mallocs)) - memFrees.Mark(int64(memstats[location1].Frees - memstats[location2].Frees)) - memHeld.Update(int64(memstats[location1].HeapSys - memstats[location1].HeapReleased)) - memUsed.Update(int64(memstats[location1].Alloc)) - - if ReadDiskStats(diskstats[location1]) == nil { - diskReads.Mark(diskstats[location1].ReadCount - diskstats[location2].ReadCount) - diskReadBytes.Mark(diskstats[location1].ReadBytes - diskstats[location2].ReadBytes) - diskWrites.Mark(diskstats[location1].WriteCount - diskstats[location2].WriteCount) - diskWriteBytes.Mark(diskstats[location1].WriteBytes - diskstats[location2].WriteBytes) - - diskReadBytesCounter.Inc(diskstats[location1].ReadBytes - diskstats[location2].ReadBytes) - diskWriteBytesCounter.Inc(diskstats[location1].WriteBytes - diskstats[location2].WriteBytes) + + // Go runtime metrics + readRuntimeStats(&rstats[now]) + + cpuGoroutines.Update(int64(rstats[now].Goroutines)) + cpuSchedLatency.update(rstats[now].SchedLatency) + memPauses.update(rstats[now].GCPauses) + + memAllocs.Mark(int64(rstats[now].GCAllocBytes - rstats[prev].GCAllocBytes)) + memFrees.Mark(int64(rstats[now].GCFreedBytes - rstats[prev].GCFreedBytes)) + + memTotal.Update(int64(rstats[now].MemTotal)) + heapUsed.Update(int64(rstats[now].MemTotal - rstats[now].HeapUnused - rstats[now].HeapFree - rstats[now].HeapReleased)) + heapObjects.Update(int64(rstats[now].HeapObjects)) + + // Disk + if ReadDiskStats(&diskstats[now]) == nil { + diskReads.Mark(diskstats[now].ReadCount - diskstats[prev].ReadCount) + diskReadBytes.Mark(diskstats[now].ReadBytes - diskstats[prev].ReadBytes) + diskWrites.Mark(diskstats[now].WriteCount - diskstats[prev].WriteCount) + diskWriteBytes.Mark(diskstats[now].WriteBytes - diskstats[prev].WriteBytes) + diskReadBytesCounter.Inc(diskstats[now].ReadBytes - diskstats[prev].ReadBytes) + diskWriteBytesCounter.Inc(diskstats[now].WriteBytes - diskstats[prev].WriteBytes) } + time.Sleep(refresh) } } diff --git a/metrics/metrics_test.go b/metrics/metrics_test.go index 029c99870eba..e3fde1ea62ce 100644 --- a/metrics/metrics_test.go +++ b/metrics/metrics_test.go @@ -2,8 +2,6 @@ package metrics import ( "fmt" - "io" - "log" "sync" "testing" "time" @@ -11,11 +9,11 @@ import ( const FANOUT = 128 -// Stop the compiler from complaining during debugging. -var ( - _ = io.Discard - _ = log.LstdFlags -) +func TestReadRuntimeValues(t *testing.T) { + var v runtimeStats + readRuntimeStats(&v) + t.Logf("%+v", v) +} func BenchmarkMetrics(b *testing.B) { r := NewRegistry() @@ -26,7 +24,6 @@ func BenchmarkMetrics(b *testing.B) { m := NewRegisteredMeter("meter", r) t := NewRegisteredTimer("timer", r) RegisterDebugGCStats(r) - RegisterRuntimeMemStats(r) b.ResetTimer() ch := make(chan bool) @@ -48,24 +45,6 @@ func BenchmarkMetrics(b *testing.B) { }() //*/ - wgR := &sync.WaitGroup{} - //* - wgR.Add(1) - go func() { - defer wgR.Done() - //log.Println("go CaptureRuntimeMemStats") - for { - select { - case <-ch: - //log.Println("done CaptureRuntimeMemStats") - return - default: - CaptureRuntimeMemStatsOnce(r) - } - } - }() - //*/ - wgW := &sync.WaitGroup{} /* wgW.Add(1) @@ -104,7 +83,6 @@ func BenchmarkMetrics(b *testing.B) { wg.Wait() close(ch) wgD.Wait() - wgR.Wait() wgW.Wait() } diff --git a/metrics/runtime.go b/metrics/runtime.go deleted file mode 100644 index 9450c479bad7..000000000000 --- a/metrics/runtime.go +++ /dev/null @@ -1,212 +0,0 @@ -package metrics - -import ( - "runtime" - "runtime/pprof" - "time" -) - -var ( - memStats runtime.MemStats - runtimeMetrics struct { - MemStats struct { - Alloc Gauge - BuckHashSys Gauge - DebugGC Gauge - EnableGC Gauge - Frees Gauge - HeapAlloc Gauge - HeapIdle Gauge - HeapInuse Gauge - HeapObjects Gauge - HeapReleased Gauge - HeapSys Gauge - LastGC Gauge - Lookups Gauge - Mallocs Gauge - MCacheInuse Gauge - MCacheSys Gauge - MSpanInuse Gauge - MSpanSys Gauge - NextGC Gauge - NumGC Gauge - GCCPUFraction GaugeFloat64 - PauseNs Histogram - PauseTotalNs Gauge - StackInuse Gauge - StackSys Gauge - Sys Gauge - TotalAlloc Gauge - } - NumCgoCall Gauge - NumGoroutine Gauge - NumThread Gauge - ReadMemStats Timer - } - frees uint64 - lookups uint64 - mallocs uint64 - numGC uint32 - numCgoCalls int64 - - threadCreateProfile = pprof.Lookup("threadcreate") -) - -// Capture new values for the Go runtime statistics exported in -// runtime.MemStats. This is designed to be called as a goroutine. -func CaptureRuntimeMemStats(r Registry, d time.Duration) { - for range time.Tick(d) { - CaptureRuntimeMemStatsOnce(r) - } -} - -// Capture new values for the Go runtime statistics exported in -// runtime.MemStats. This is designed to be called in a background -// goroutine. Giving a registry which has not been given to -// RegisterRuntimeMemStats will panic. -// -// Be very careful with this because runtime.ReadMemStats calls the C -// functions runtime·semacquire(&runtime·worldsema) and runtime·stoptheworld() -// and that last one does what it says on the tin. -func CaptureRuntimeMemStatsOnce(r Registry) { - t := time.Now() - runtime.ReadMemStats(&memStats) // This takes 50-200us. - runtimeMetrics.ReadMemStats.UpdateSince(t) - - runtimeMetrics.MemStats.Alloc.Update(int64(memStats.Alloc)) - runtimeMetrics.MemStats.BuckHashSys.Update(int64(memStats.BuckHashSys)) - if memStats.DebugGC { - runtimeMetrics.MemStats.DebugGC.Update(1) - } else { - runtimeMetrics.MemStats.DebugGC.Update(0) - } - if memStats.EnableGC { - runtimeMetrics.MemStats.EnableGC.Update(1) - } else { - runtimeMetrics.MemStats.EnableGC.Update(0) - } - - runtimeMetrics.MemStats.Frees.Update(int64(memStats.Frees - frees)) - runtimeMetrics.MemStats.HeapAlloc.Update(int64(memStats.HeapAlloc)) - runtimeMetrics.MemStats.HeapIdle.Update(int64(memStats.HeapIdle)) - runtimeMetrics.MemStats.HeapInuse.Update(int64(memStats.HeapInuse)) - runtimeMetrics.MemStats.HeapObjects.Update(int64(memStats.HeapObjects)) - runtimeMetrics.MemStats.HeapReleased.Update(int64(memStats.HeapReleased)) - runtimeMetrics.MemStats.HeapSys.Update(int64(memStats.HeapSys)) - runtimeMetrics.MemStats.LastGC.Update(int64(memStats.LastGC)) - runtimeMetrics.MemStats.Lookups.Update(int64(memStats.Lookups - lookups)) - runtimeMetrics.MemStats.Mallocs.Update(int64(memStats.Mallocs - mallocs)) - runtimeMetrics.MemStats.MCacheInuse.Update(int64(memStats.MCacheInuse)) - runtimeMetrics.MemStats.MCacheSys.Update(int64(memStats.MCacheSys)) - runtimeMetrics.MemStats.MSpanInuse.Update(int64(memStats.MSpanInuse)) - runtimeMetrics.MemStats.MSpanSys.Update(int64(memStats.MSpanSys)) - runtimeMetrics.MemStats.NextGC.Update(int64(memStats.NextGC)) - runtimeMetrics.MemStats.NumGC.Update(int64(memStats.NumGC - numGC)) - runtimeMetrics.MemStats.GCCPUFraction.Update(gcCPUFraction(&memStats)) - - // - i := numGC % uint32(len(memStats.PauseNs)) - ii := memStats.NumGC % uint32(len(memStats.PauseNs)) - if memStats.NumGC-numGC >= uint32(len(memStats.PauseNs)) { - for i = 0; i < uint32(len(memStats.PauseNs)); i++ { - runtimeMetrics.MemStats.PauseNs.Update(int64(memStats.PauseNs[i])) - } - } else { - if i > ii { - for ; i < uint32(len(memStats.PauseNs)); i++ { - runtimeMetrics.MemStats.PauseNs.Update(int64(memStats.PauseNs[i])) - } - i = 0 - } - for ; i < ii; i++ { - runtimeMetrics.MemStats.PauseNs.Update(int64(memStats.PauseNs[i])) - } - } - frees = memStats.Frees - lookups = memStats.Lookups - mallocs = memStats.Mallocs - numGC = memStats.NumGC - - runtimeMetrics.MemStats.PauseTotalNs.Update(int64(memStats.PauseTotalNs)) - runtimeMetrics.MemStats.StackInuse.Update(int64(memStats.StackInuse)) - runtimeMetrics.MemStats.StackSys.Update(int64(memStats.StackSys)) - runtimeMetrics.MemStats.Sys.Update(int64(memStats.Sys)) - runtimeMetrics.MemStats.TotalAlloc.Update(int64(memStats.TotalAlloc)) - - currentNumCgoCalls := numCgoCall() - runtimeMetrics.NumCgoCall.Update(currentNumCgoCalls - numCgoCalls) - numCgoCalls = currentNumCgoCalls - - runtimeMetrics.NumGoroutine.Update(int64(runtime.NumGoroutine())) - - runtimeMetrics.NumThread.Update(int64(threadCreateProfile.Count())) -} - -// Register runtimeMetrics for the Go runtime statistics exported in runtime and -// specifically runtime.MemStats. The runtimeMetrics are named by their -// fully-qualified Go symbols, i.e. runtime.MemStats.Alloc. -func RegisterRuntimeMemStats(r Registry) { - runtimeMetrics.MemStats.Alloc = NewGauge() - runtimeMetrics.MemStats.BuckHashSys = NewGauge() - runtimeMetrics.MemStats.DebugGC = NewGauge() - runtimeMetrics.MemStats.EnableGC = NewGauge() - runtimeMetrics.MemStats.Frees = NewGauge() - runtimeMetrics.MemStats.HeapAlloc = NewGauge() - runtimeMetrics.MemStats.HeapIdle = NewGauge() - runtimeMetrics.MemStats.HeapInuse = NewGauge() - runtimeMetrics.MemStats.HeapObjects = NewGauge() - runtimeMetrics.MemStats.HeapReleased = NewGauge() - runtimeMetrics.MemStats.HeapSys = NewGauge() - runtimeMetrics.MemStats.LastGC = NewGauge() - runtimeMetrics.MemStats.Lookups = NewGauge() - runtimeMetrics.MemStats.Mallocs = NewGauge() - runtimeMetrics.MemStats.MCacheInuse = NewGauge() - runtimeMetrics.MemStats.MCacheSys = NewGauge() - runtimeMetrics.MemStats.MSpanInuse = NewGauge() - runtimeMetrics.MemStats.MSpanSys = NewGauge() - runtimeMetrics.MemStats.NextGC = NewGauge() - runtimeMetrics.MemStats.NumGC = NewGauge() - runtimeMetrics.MemStats.GCCPUFraction = NewGaugeFloat64() - runtimeMetrics.MemStats.PauseNs = NewHistogram(NewExpDecaySample(1028, 0.015)) - runtimeMetrics.MemStats.PauseTotalNs = NewGauge() - runtimeMetrics.MemStats.StackInuse = NewGauge() - runtimeMetrics.MemStats.StackSys = NewGauge() - runtimeMetrics.MemStats.Sys = NewGauge() - runtimeMetrics.MemStats.TotalAlloc = NewGauge() - runtimeMetrics.NumCgoCall = NewGauge() - runtimeMetrics.NumGoroutine = NewGauge() - runtimeMetrics.NumThread = NewGauge() - runtimeMetrics.ReadMemStats = NewTimer() - - r.Register("runtime.MemStats.Alloc", runtimeMetrics.MemStats.Alloc) - r.Register("runtime.MemStats.BuckHashSys", runtimeMetrics.MemStats.BuckHashSys) - r.Register("runtime.MemStats.DebugGC", runtimeMetrics.MemStats.DebugGC) - r.Register("runtime.MemStats.EnableGC", runtimeMetrics.MemStats.EnableGC) - r.Register("runtime.MemStats.Frees", runtimeMetrics.MemStats.Frees) - r.Register("runtime.MemStats.HeapAlloc", runtimeMetrics.MemStats.HeapAlloc) - r.Register("runtime.MemStats.HeapIdle", runtimeMetrics.MemStats.HeapIdle) - r.Register("runtime.MemStats.HeapInuse", runtimeMetrics.MemStats.HeapInuse) - r.Register("runtime.MemStats.HeapObjects", runtimeMetrics.MemStats.HeapObjects) - r.Register("runtime.MemStats.HeapReleased", runtimeMetrics.MemStats.HeapReleased) - r.Register("runtime.MemStats.HeapSys", runtimeMetrics.MemStats.HeapSys) - r.Register("runtime.MemStats.LastGC", runtimeMetrics.MemStats.LastGC) - r.Register("runtime.MemStats.Lookups", runtimeMetrics.MemStats.Lookups) - r.Register("runtime.MemStats.Mallocs", runtimeMetrics.MemStats.Mallocs) - r.Register("runtime.MemStats.MCacheInuse", runtimeMetrics.MemStats.MCacheInuse) - r.Register("runtime.MemStats.MCacheSys", runtimeMetrics.MemStats.MCacheSys) - r.Register("runtime.MemStats.MSpanInuse", runtimeMetrics.MemStats.MSpanInuse) - r.Register("runtime.MemStats.MSpanSys", runtimeMetrics.MemStats.MSpanSys) - r.Register("runtime.MemStats.NextGC", runtimeMetrics.MemStats.NextGC) - r.Register("runtime.MemStats.NumGC", runtimeMetrics.MemStats.NumGC) - r.Register("runtime.MemStats.GCCPUFraction", runtimeMetrics.MemStats.GCCPUFraction) - r.Register("runtime.MemStats.PauseNs", runtimeMetrics.MemStats.PauseNs) - r.Register("runtime.MemStats.PauseTotalNs", runtimeMetrics.MemStats.PauseTotalNs) - r.Register("runtime.MemStats.StackInuse", runtimeMetrics.MemStats.StackInuse) - r.Register("runtime.MemStats.StackSys", runtimeMetrics.MemStats.StackSys) - r.Register("runtime.MemStats.Sys", runtimeMetrics.MemStats.Sys) - r.Register("runtime.MemStats.TotalAlloc", runtimeMetrics.MemStats.TotalAlloc) - r.Register("runtime.NumCgoCall", runtimeMetrics.NumCgoCall) - r.Register("runtime.NumGoroutine", runtimeMetrics.NumGoroutine) - r.Register("runtime.NumThread", runtimeMetrics.NumThread) - r.Register("runtime.ReadMemStats", runtimeMetrics.ReadMemStats) -} diff --git a/metrics/runtime_cgo.go b/metrics/runtime_cgo.go deleted file mode 100644 index 4307ebdba689..000000000000 --- a/metrics/runtime_cgo.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build cgo && !appengine && !js -// +build cgo,!appengine,!js - -package metrics - -import "runtime" - -func numCgoCall() int64 { - return runtime.NumCgoCall() -} diff --git a/metrics/runtime_gccpufraction.go b/metrics/runtime_gccpufraction.go deleted file mode 100644 index 28cd44752b45..000000000000 --- a/metrics/runtime_gccpufraction.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build go1.5 -// +build go1.5 - -package metrics - -import "runtime" - -func gcCPUFraction(memStats *runtime.MemStats) float64 { - return memStats.GCCPUFraction -} diff --git a/metrics/runtime_no_cgo.go b/metrics/runtime_no_cgo.go deleted file mode 100644 index 1799bef63bfb..000000000000 --- a/metrics/runtime_no_cgo.go +++ /dev/null @@ -1,8 +0,0 @@ -//go:build !cgo || appengine || js -// +build !cgo appengine js - -package metrics - -func numCgoCall() int64 { - return 0 -} diff --git a/metrics/runtime_no_gccpufraction.go b/metrics/runtime_no_gccpufraction.go deleted file mode 100644 index af1a4b63c809..000000000000 --- a/metrics/runtime_no_gccpufraction.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build !go1.5 -// +build !go1.5 - -package metrics - -import "runtime" - -func gcCPUFraction(memStats *runtime.MemStats) float64 { - return 0 -} diff --git a/metrics/runtime_test.go b/metrics/runtime_test.go deleted file mode 100644 index f85f7868f71a..000000000000 --- a/metrics/runtime_test.go +++ /dev/null @@ -1,88 +0,0 @@ -package metrics - -import ( - "runtime" - "testing" - "time" -) - -func BenchmarkRuntimeMemStats(b *testing.B) { - r := NewRegistry() - RegisterRuntimeMemStats(r) - b.ResetTimer() - for i := 0; i < b.N; i++ { - CaptureRuntimeMemStatsOnce(r) - } -} - -func TestRuntimeMemStats(t *testing.T) { - r := NewRegistry() - RegisterRuntimeMemStats(r) - CaptureRuntimeMemStatsOnce(r) - zero := runtimeMetrics.MemStats.PauseNs.Count() // Get a "zero" since GC may have run before these tests. - runtime.GC() - CaptureRuntimeMemStatsOnce(r) - if count := runtimeMetrics.MemStats.PauseNs.Count(); count-zero != 1 { - t.Fatal(count - zero) - } - runtime.GC() - runtime.GC() - CaptureRuntimeMemStatsOnce(r) - if count := runtimeMetrics.MemStats.PauseNs.Count(); count-zero != 3 { - t.Fatal(count - zero) - } - for i := 0; i < 256; i++ { - runtime.GC() - } - CaptureRuntimeMemStatsOnce(r) - if count := runtimeMetrics.MemStats.PauseNs.Count(); count-zero != 259 { - t.Fatal(count - zero) - } - for i := 0; i < 257; i++ { - runtime.GC() - } - CaptureRuntimeMemStatsOnce(r) - if count := runtimeMetrics.MemStats.PauseNs.Count(); count-zero != 515 { // We lost one because there were too many GCs between captures. - t.Fatal(count - zero) - } -} - -func TestRuntimeMemStatsNumThread(t *testing.T) { - r := NewRegistry() - RegisterRuntimeMemStats(r) - CaptureRuntimeMemStatsOnce(r) - - if value := runtimeMetrics.NumThread.Value(); value < 1 { - t.Fatalf("got NumThread: %d, wanted at least 1", value) - } -} - -func TestRuntimeMemStatsBlocking(t *testing.T) { - if g := runtime.GOMAXPROCS(0); g < 2 { - t.Skipf("skipping TestRuntimeMemStatsBlocking with GOMAXPROCS=%d\n", g) - } - ch := make(chan int) - go testRuntimeMemStatsBlocking(ch) - var memStats runtime.MemStats - t0 := time.Now() - runtime.ReadMemStats(&memStats) - t1 := time.Now() - t.Log("i++ during runtime.ReadMemStats:", <-ch) - go testRuntimeMemStatsBlocking(ch) - d := t1.Sub(t0) - t.Log(d) - time.Sleep(d) - t.Log("i++ during time.Sleep:", <-ch) -} - -func testRuntimeMemStatsBlocking(ch chan int) { - i := 0 - for { - select { - case ch <- i: - return - default: - i++ - } - } -} diff --git a/metrics/runtimehistogram.go b/metrics/runtimehistogram.go new file mode 100644 index 000000000000..c68939af1ef7 --- /dev/null +++ b/metrics/runtimehistogram.go @@ -0,0 +1,319 @@ +package metrics + +import ( + "math" + "runtime/metrics" + "sort" + "sync/atomic" +) + +func getOrRegisterRuntimeHistogram(name string, scale float64, r Registry) *runtimeHistogram { + if r == nil { + r = DefaultRegistry + } + constructor := func() Histogram { return newRuntimeHistogram(scale) } + return r.GetOrRegister(name, constructor).(*runtimeHistogram) +} + +// runtimeHistogram wraps a runtime/metrics histogram. +type runtimeHistogram struct { + v atomic.Value + scaleFactor float64 +} + +func newRuntimeHistogram(scale float64) *runtimeHistogram { + h := &runtimeHistogram{scaleFactor: scale} + h.update(&metrics.Float64Histogram{}) + return h +} + +func (h *runtimeHistogram) update(mh *metrics.Float64Histogram) { + if mh == nil { + // The update value can be nil if the current Go version doesn't support a + // requested metric. It's just easier to handle nil here than putting + // conditionals everywhere. + return + } + + s := runtimeHistogramSnapshot{ + Counts: make([]uint64, len(mh.Counts)), + Buckets: make([]float64, len(mh.Buckets)), + } + copy(s.Counts, mh.Counts) + copy(s.Buckets, mh.Buckets) + for i, b := range s.Buckets { + s.Buckets[i] = b * h.scaleFactor + } + h.v.Store(&s) +} + +func (h *runtimeHistogram) load() *runtimeHistogramSnapshot { + return h.v.Load().(*runtimeHistogramSnapshot) +} + +func (h *runtimeHistogram) Clear() { + panic("runtimeHistogram does not support Clear") +} +func (h *runtimeHistogram) Update(int64) { + panic("runtimeHistogram does not support Update") +} +func (h *runtimeHistogram) Sample() Sample { + return NilSample{} +} + +// Snapshot returns a non-changing cop of the histogram. +func (h *runtimeHistogram) Snapshot() Histogram { + return h.load() +} + +// Count returns the sample count. +func (h *runtimeHistogram) Count() int64 { + return h.load().Count() +} + +// Mean returns an approximation of the mean. +func (h *runtimeHistogram) Mean() float64 { + return h.load().Mean() +} + +// StdDev approximates the standard deviation of the histogram. +func (h *runtimeHistogram) StdDev() float64 { + return h.load().StdDev() +} + +// Variance approximates the variance of the histogram. +func (h *runtimeHistogram) Variance() float64 { + return h.load().Variance() +} + +// Percentile computes the p'th percentile value. +func (h *runtimeHistogram) Percentile(p float64) float64 { + return h.load().Percentile(p) +} + +// Percentiles computes all requested percentile values. +func (h *runtimeHistogram) Percentiles(ps []float64) []float64 { + return h.load().Percentiles(ps) +} + +// Max returns the highest sample value. +func (h *runtimeHistogram) Max() int64 { + return h.load().Max() +} + +// Min returns the lowest sample value. +func (h *runtimeHistogram) Min() int64 { + return h.load().Min() +} + +// Sum returns the sum of all sample values. +func (h *runtimeHistogram) Sum() int64 { + return h.load().Sum() +} + +type runtimeHistogramSnapshot metrics.Float64Histogram + +func (h *runtimeHistogramSnapshot) Clear() { + panic("runtimeHistogram does not support Clear") +} +func (h *runtimeHistogramSnapshot) Update(int64) { + panic("runtimeHistogram does not support Update") +} +func (h *runtimeHistogramSnapshot) Sample() Sample { + return NilSample{} +} + +func (h *runtimeHistogramSnapshot) Snapshot() Histogram { + return h +} + +// Count returns the sample count. +func (h *runtimeHistogramSnapshot) Count() int64 { + var count int64 + for _, c := range h.Counts { + count += int64(c) + } + return count +} + +// Mean returns an approximation of the mean. +func (h *runtimeHistogramSnapshot) Mean() float64 { + if len(h.Counts) == 0 { + return 0 + } + mean, _ := h.mean() + return mean +} + +// mean computes the mean and also the total sample count. +func (h *runtimeHistogramSnapshot) mean() (mean, totalCount float64) { + var sum float64 + for i, c := range h.Counts { + midpoint := h.midpoint(i) + sum += midpoint * float64(c) + totalCount += float64(c) + } + return sum / totalCount, totalCount +} + +func (h *runtimeHistogramSnapshot) midpoint(bucket int) float64 { + high := h.Buckets[bucket+1] + low := h.Buckets[bucket] + if math.IsInf(high, 1) { + // The edge of the highest bucket can be +Inf, and it's supposed to mean that this + // bucket contains all remaining samples > low. We can't get the middle of an + // infinite range, so just return the lower bound of this bucket instead. + return low + } + if math.IsInf(low, -1) { + // Similarly, we can get -Inf in the left edge of the lowest bucket, + // and it means the bucket contains all remaining values < high. + return high + } + return (low + high) / 2 +} + +// StdDev approximates the standard deviation of the histogram. +func (h *runtimeHistogramSnapshot) StdDev() float64 { + return math.Sqrt(h.Variance()) +} + +// Variance approximates the variance of the histogram. +func (h *runtimeHistogramSnapshot) Variance() float64 { + if len(h.Counts) == 0 { + return 0 + } + + mean, totalCount := h.mean() + if totalCount <= 1 { + // There is no variance when there are zero or one items. + return 0 + } + + var sum float64 + for i, c := range h.Counts { + midpoint := h.midpoint(i) + d := midpoint - mean + sum += float64(c) * (d * d) + } + return sum / (totalCount - 1) +} + +// Percentile computes the p'th percentile value. +func (h *runtimeHistogramSnapshot) Percentile(p float64) float64 { + threshold := float64(h.Count()) * p + values := [1]float64{threshold} + h.computePercentiles(values[:]) + return values[0] +} + +// Percentiles computes all requested percentile values. +func (h *runtimeHistogramSnapshot) Percentiles(ps []float64) []float64 { + // Compute threshold values. We need these to be sorted + // for the percentile computation, but restore the original + // order later, so keep the indexes as well. + count := float64(h.Count()) + thresholds := make([]float64, len(ps)) + indexes := make([]int, len(ps)) + for i, percentile := range ps { + thresholds[i] = count * math.Max(0, math.Min(1.0, percentile)) + indexes[i] = i + } + sort.Sort(floatsAscendingKeepingIndex{thresholds, indexes}) + + // Now compute. The result is stored back into the thresholds slice. + h.computePercentiles(thresholds) + + // Put the result back into the requested order. + sort.Sort(floatsByIndex{thresholds, indexes}) + return thresholds +} + +func (h *runtimeHistogramSnapshot) computePercentiles(thresh []float64) { + var totalCount float64 + for i, count := range h.Counts { + totalCount += float64(count) + + for len(thresh) > 0 && thresh[0] < totalCount { + thresh[0] = h.Buckets[i] + thresh = thresh[1:] + } + if len(thresh) == 0 { + return + } + } +} + +// Note: runtime/metrics.Float64Histogram is a collection of float64s, but the methods +// below need to return int64 to satisfy the interface. The histogram provided by runtime +// also doesn't keep track of individual samples, so results are approximated. + +// Max returns the highest sample value. +func (h *runtimeHistogramSnapshot) Max() int64 { + for i := len(h.Counts) - 1; i >= 0; i-- { + count := h.Counts[i] + if count > 0 { + edge := h.Buckets[i+1] + if math.IsInf(edge, 1) { + edge = h.Buckets[i] + } + return int64(math.Ceil(edge)) + } + } + return 0 +} + +// Min returns the lowest sample value. +func (h *runtimeHistogramSnapshot) Min() int64 { + for i, count := range h.Counts { + if count > 0 { + return int64(math.Floor(h.Buckets[i])) + } + } + return 0 +} + +// Sum returns the sum of all sample values. +func (h *runtimeHistogramSnapshot) Sum() int64 { + var sum float64 + for i := range h.Counts { + sum += h.Buckets[i] * float64(h.Counts[i]) + } + return int64(math.Ceil(sum)) +} + +type floatsAscendingKeepingIndex struct { + values []float64 + indexes []int +} + +func (s floatsAscendingKeepingIndex) Len() int { + return len(s.values) +} + +func (s floatsAscendingKeepingIndex) Less(i, j int) bool { + return s.values[i] < s.values[j] +} + +func (s floatsAscendingKeepingIndex) Swap(i, j int) { + s.values[i], s.values[j] = s.values[j], s.values[i] + s.indexes[i], s.indexes[j] = s.indexes[j], s.indexes[i] +} + +type floatsByIndex struct { + values []float64 + indexes []int +} + +func (s floatsByIndex) Len() int { + return len(s.values) +} + +func (s floatsByIndex) Less(i, j int) bool { + return s.indexes[i] < s.indexes[j] +} + +func (s floatsByIndex) Swap(i, j int) { + s.values[i], s.values[j] = s.values[j], s.values[i] + s.indexes[i], s.indexes[j] = s.indexes[j], s.indexes[i] +} diff --git a/metrics/runtimehistogram_test.go b/metrics/runtimehistogram_test.go new file mode 100644 index 000000000000..d53a01438311 --- /dev/null +++ b/metrics/runtimehistogram_test.go @@ -0,0 +1,133 @@ +package metrics + +import ( + "fmt" + "math" + "reflect" + "runtime/metrics" + "testing" +) + +var _ Histogram = (*runtimeHistogram)(nil) + +type runtimeHistogramTest struct { + h metrics.Float64Histogram + + Count int64 + Min int64 + Max int64 + Sum int64 + Mean float64 + Variance float64 + StdDev float64 + Percentiles []float64 // .5 .8 .9 .99 .995 +} + +// This test checks the results of statistical functions implemented +// by runtimeHistogramSnapshot. +func TestRuntimeHistogramStats(t *testing.T) { + tests := []runtimeHistogramTest{ + 0: { + h: metrics.Float64Histogram{ + Counts: []uint64{}, + Buckets: []float64{}, + }, + Count: 0, + Max: 0, + Min: 0, + Sum: 0, + Mean: 0, + Variance: 0, + StdDev: 0, + Percentiles: []float64{0, 0, 0, 0, 0}, + }, + 1: { + // This checks the case where the highest bucket is +Inf. + h: metrics.Float64Histogram{ + Counts: []uint64{0, 1, 2}, + Buckets: []float64{0, 0.5, 1, math.Inf(1)}, + }, + Count: 3, + Max: 1, + Min: 0, + Sum: 3, + Mean: 0.9166666, + Percentiles: []float64{1, 1, 1, 1, 1}, + Variance: 0.020833, + StdDev: 0.144433, + }, + 2: { + h: metrics.Float64Histogram{ + Counts: []uint64{8, 6, 3, 1}, + Buckets: []float64{12, 16, 18, 24, 25}, + }, + Count: 18, + Max: 25, + Min: 12, + Sum: 270, + Mean: 16.75, + Variance: 10.3015, + StdDev: 3.2096, + Percentiles: []float64{16, 18, 18, 24, 24}, + }, + } + + for i, test := range tests { + t.Run(fmt.Sprint(i), func(t *testing.T) { + s := runtimeHistogramSnapshot(test.h) + + if v := s.Count(); v != test.Count { + t.Errorf("Count() = %v, want %v", v, test.Count) + } + if v := s.Min(); v != test.Min { + t.Errorf("Min() = %v, want %v", v, test.Min) + } + if v := s.Max(); v != test.Max { + t.Errorf("Max() = %v, want %v", v, test.Max) + } + if v := s.Sum(); v != test.Sum { + t.Errorf("Sum() = %v, want %v", v, test.Sum) + } + if v := s.Mean(); !approxEqual(v, test.Mean, 0.0001) { + t.Errorf("Mean() = %v, want %v", v, test.Mean) + } + if v := s.Variance(); !approxEqual(v, test.Variance, 0.0001) { + t.Errorf("Variance() = %v, want %v", v, test.Variance) + } + if v := s.StdDev(); !approxEqual(v, test.StdDev, 0.0001) { + t.Errorf("StdDev() = %v, want %v", v, test.StdDev) + } + ps := []float64{.5, .8, .9, .99, .995} + if v := s.Percentiles(ps); !reflect.DeepEqual(v, test.Percentiles) { + t.Errorf("Percentiles(%v) = %v, want %v", ps, v, test.Percentiles) + } + }) + } +} + +func approxEqual(x, y, ε float64) bool { + if math.IsInf(x, -1) && math.IsInf(y, -1) { + return true + } + if math.IsInf(x, 1) && math.IsInf(y, 1) { + return true + } + if math.IsNaN(x) && math.IsNaN(y) { + return true + } + return math.Abs(x-y) < ε +} + +// This test verifies that requesting Percentiles in unsorted order +// returns them in the requested order. +func TestRuntimeHistogramStatsPercentileOrder(t *testing.T) { + p := runtimeHistogramSnapshot{ + Counts: []uint64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + Buckets: []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + } + result := p.Percentiles([]float64{1, 0.2, 0.5, 0.1, 0.2}) + expected := []float64{10, 2, 5, 1, 2} + if !reflect.DeepEqual(result, expected) { + t.Fatal("wrong result:", result) + } +}