From 7f4941a3b48af6d866f4e90cd5a0a37439f8eab2 Mon Sep 17 00:00:00 2001 From: Karsten Weiss Date: Wed, 23 Aug 2017 15:59:57 +0200 Subject: [PATCH 1/4] cpu: Metric 'package_throttles_total' is per package. 'package_throttles_total' is per package, not per cpu. This also reduces the total number of cpu time series a lot (esp for multi core cpus). --- collector/cpu_linux.go | 57 ++++++++++++++++++++++++------- collector/fixtures/e2e-output.txt | 4 +-- collector/fixtures/sys.ttar | 25 ++++++++++++++ 3 files changed, 70 insertions(+), 16 deletions(-) diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index ebf47c721f..a2781bf570 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -17,8 +17,10 @@ package collector import ( "fmt" + "io/ioutil" "os" "path/filepath" + "strings" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" @@ -65,6 +67,7 @@ func NewCPUCollector() (Collector, error) { "Maximum cpu thread frequency in hertz.", []string{"cpu"}, nil, ), + // FIXME: This should be a per core metric, not per cpu! cpuCoreThrottle: prometheus.NewDesc( prometheus.BuildFQName(Namespace, cpuCollectorNamespace, "core_throttles_total"), "Number of times this cpu core has been throttled.", @@ -73,7 +76,7 @@ func NewCPUCollector() (Collector, error) { cpuPackageThrottle: prometheus.NewDesc( prometheus.BuildFQName(Namespace, cpuCollectorNamespace, "package_throttles_total"), "Number of times this cpu package has been throttled.", - []string{"cpu"}, nil, + []string{"node"}, nil, ), }, nil } @@ -98,6 +101,7 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { var value uint64 + // cpu loop for _, cpu := range cpus { _, cpuname := filepath.Split(cpu) @@ -106,17 +110,17 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { } else { // sysfs cpufreq values are kHz, thus multiply by 1000 to export base units (hz). // See https://www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt - if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_cur_freq")); err != nil { + if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_cur_freq")); err != nil { return err } ch <- prometheus.MustNewConstMetric(c.cpuFreq, prometheus.GaugeValue, float64(value)*1000.0, cpuname) - if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_min_freq")); err != nil { + if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_min_freq")); err != nil { return err } ch <- prometheus.MustNewConstMetric(c.cpuFreqMin, prometheus.GaugeValue, float64(value)*1000.0, cpuname) - if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_max_freq")); err != nil { + if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_max_freq")); err != nil { return err } ch <- prometheus.MustNewConstMetric(c.cpuFreqMax, prometheus.GaugeValue, float64(value)*1000.0, cpuname) @@ -124,17 +128,44 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { if _, err := os.Stat(filepath.Join(cpu, "thermal_throttle")); os.IsNotExist(err) { log.Debugf("CPU %q is missing thermal_throttle", cpu) - } else { - if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle/core_throttle_count")); err != nil { - return err - } - ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, prometheus.CounterValue, float64(value), cpuname) + continue + } + if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err != nil { + return err + } + ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, prometheus.CounterValue, float64(value), cpuname) + } - if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle/package_throttle_count")); err != nil { - return err - } - ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), cpuname) + pkgs, err := filepath.Glob(sysFilePath("bus/node/devices/node[0-9]*")) + if err != nil { + return err + } + + // package/node loop + for pkgno, pkg := range pkgs { + if _, err := os.Stat(filepath.Join(pkg, "cpulist")); os.IsNotExist(err) { + log.Debugf("package %q is missing cpulist", pkg) + continue + } + cpulist, err := ioutil.ReadFile(filepath.Join(pkg, "cpulist")) + if err != nil { + log.Debugf("could not read cpulist of package %q", pkg) + return err + } + // cpulist example of one package/node with HT: "0-11,24-35" + firstCPU := strings.Split(string(cpulist), "\n")[0] + if strings.Contains(firstCPU, "-") { + // multi-core: Use first cpu of package + firstCPU = strings.Split(firstCPU, "-")[0] + } + if _, err := os.Stat(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); os.IsNotExist(err) { + log.Debugf("Package %q CPU %q is missing package_throttle", pkg, firstCPU) + continue + } + if value, err = readUintFromFile(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); err != nil { + return err } + ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), fmt.Sprintf("%d", pkgno)) } return nil diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index bd1b2b24a7..93a6a6c79f 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -299,9 +299,7 @@ node_cpu_frequency_min_hertz{cpu="cpu1"} 8e+08 node_cpu_frequency_min_hertz{cpu="cpu3"} 1e+06 # HELP node_cpu_package_throttles_total Number of times this cpu package has been throttled. # TYPE node_cpu_package_throttles_total counter -node_cpu_package_throttles_total{cpu="cpu0"} 30 -node_cpu_package_throttles_total{cpu="cpu1"} 30 -node_cpu_package_throttles_total{cpu="cpu2"} 6 +node_cpu_package_throttles_total{node="0"} 30 # HELP node_disk_bytes_read The total number of bytes read successfully. # TYPE node_disk_bytes_read counter node_disk_bytes_read{device="dm-0"} 5.13708655616e+11 diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 710e04a8bb..1a2119e8a2 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -116,6 +116,31 @@ Lines: 1 1000 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0/cpu0 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0/cpu0/thermal_throttle +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/bus/node/devices/node0/cpu0/thermal_throttle/package_throttle_count +Lines: 1 +30 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/bus/node/devices/node0/cpulist +Lines: 1 +0-3 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From b4f227fd5e762b6b26528d29bc970c370adcbf58 Mon Sep 17 00:00:00 2001 From: Karsten Weiss Date: Thu, 24 Aug 2017 10:23:32 +0200 Subject: [PATCH 2/4] cpu: Better handling of a cpulist edge-case. --- collector/cpu_linux.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index a2781bf570..7a82546e19 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -153,11 +153,10 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { return err } // cpulist example of one package/node with HT: "0-11,24-35" - firstCPU := strings.Split(string(cpulist), "\n")[0] - if strings.Contains(firstCPU, "-") { - // multi-core: Use first cpu of package - firstCPU = strings.Split(firstCPU, "-")[0] - } + line := strings.Split(string(cpulist), "\n")[0] + firstCPU := strings.FieldsFunc(line, func(r rune) bool { + return r == '-' || r == ',' + })[0] if _, err := os.Stat(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); os.IsNotExist(err) { log.Debugf("Package %q CPU %q is missing package_throttle", pkg, firstCPU) continue From cbe1e069821560a1f1c24fad70bc593b7b37f950 Mon Sep 17 00:00:00 2001 From: Karsten Weiss Date: Thu, 24 Aug 2017 11:43:27 +0200 Subject: [PATCH 3/4] cpu: Extract the package number from the directory name. Do not rely on the range index. --- collector/cpu_linux.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index 7a82546e19..dfd1667cea 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -20,6 +20,7 @@ import ( "io/ioutil" "os" "path/filepath" + "regexp" "strings" "github.com/prometheus/client_golang/prometheus" @@ -31,6 +32,10 @@ const ( cpuCollectorNamespace = "cpu" ) +var ( + digitRegexp = regexp.MustCompile("[0-9]+") +) + type cpuCollector struct { cpu *prometheus.Desc cpuFreq *prometheus.Desc @@ -142,7 +147,7 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { } // package/node loop - for pkgno, pkg := range pkgs { + for _, pkg := range pkgs { if _, err := os.Stat(filepath.Join(pkg, "cpulist")); os.IsNotExist(err) { log.Debugf("package %q is missing cpulist", pkg) continue @@ -164,7 +169,8 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { if value, err = readUintFromFile(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); err != nil { return err } - ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), fmt.Sprintf("%d", pkgno)) + pkgno := digitRegexp.FindAllString(pkg, 1)[0] + ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), pkgno) } return nil From 0fc40357a342a2c581c45fa494f6d4a78ef222d1 Mon Sep 17 00:00:00 2001 From: Karsten Weiss Date: Mon, 4 Sep 2017 15:32:12 +0200 Subject: [PATCH 4/4] cpu: Add package_throttle_count for node0 cpu1 This file must be ignored by the cpu collector. --- collector/fixtures/sys.ttar | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 1a2119e8a2..829425abf1 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -136,6 +136,17 @@ Lines: 1 30 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0/cpu1 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0/cpu1/thermal_throttle +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/bus/node/devices/node0/cpu1/thermal_throttle/package_throttle_count +Lines: 1 +30 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/bus/node/devices/node0/cpulist Lines: 1 0-3