From a8d7d1101af6b17650467108d23650e261f5c3d5 Mon Sep 17 00:00:00 2001 From: Karsten Weiss Date: Fri, 10 Nov 2017 15:31:26 +0100 Subject: [PATCH] cpu: Support processor-less (memory-only) NUMA nodes (#734) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cpu: Support processor-less (memory-only) NUMA nodes Processor-less (memory-only) NUMA nodes exist e.g. in systems that use Intel Optane drives for RAM expansion using Intel Memory Drive Technology (IMDT). IMDT RAM expansion supports two modes: * "Unify Remote Memory domains": present a processor-less (memory-only) NUMA domain, which is the default * "Expand local memory domains": to expand each processor’s memory domain with a portion of the memory made available by Optane and IMDT This commit fixes a crash in the first case (when "cpulist" is empty). Here's an example of such a system: $ numastat -m|head -n5 Per-node system memory usage (in MBs): Node 0 Node 1 Node 2 Total --------------- --------------- --------------- --------------- MemTotal 118239.56 130816.00 464384.00 713439.56 $ for i in {0..2}; do echo -n "$i: " ; cat /sys/bus/node/devices/node$i/cpulist ; done 0: 0-7,16-23 1: 8-15,24-31 2: $ /opt/vsmp/bin/vsmpversion -vvv Memory Drive Technology: 8.2.1455.74 (Sep 28 2017 13:09:59) System configuration: Boards: 3 1 x Proc. + I/O + Memory 2 x NVM devices (Intel SSDPED1K375GAQ) Processors: 2, Cores: 16, Threads: 32 Intel(R) Xeon(R) CPU E5-2667 v4 @ 3.20GHz Stepping 01 Memory (MB): 713472 (of 977450), Cache: 251416, Private: 12562 1 x 249088MB [262036/ 678/12270] 1 x 232192MB [357707/125369/ 146] 82:00.0#1 1 x 232192MB [357707/125369/ 146] 83:00.0#1 * cpu: rename some variables (pkg => node) * cpu: Use %v not %q in log.Debugf() format strings --- collector/cpu_linux.go | 35 +++++++++++++++++++++-------------- collector/fixtures/sys.ttar | 8 ++++++++ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index 9cbf2371d8..e3eafbb5b0 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -111,7 +111,7 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { _, cpuname := filepath.Split(cpu) if _, err := os.Stat(filepath.Join(cpu, "cpufreq")); os.IsNotExist(err) { - log.Debugf("CPU %q is missing cpufreq", cpu) + log.Debugf("CPU %v is missing cpufreq", cpu) } else { // sysfs cpufreq values are kHz, thus multiply by 1000 to export base units (hz). // See https://www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt @@ -132,7 +132,7 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { } if _, err := os.Stat(filepath.Join(cpu, "thermal_throttle")); os.IsNotExist(err) { - log.Debugf("CPU %q is missing thermal_throttle", cpu) + log.Debugf("CPU %v is missing thermal_throttle", cpu) continue } if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err != nil { @@ -141,36 +141,43 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, prometheus.CounterValue, float64(value), cpuname) } - pkgs, err := filepath.Glob(sysFilePath("bus/node/devices/node[0-9]*")) + nodes, err := filepath.Glob(sysFilePath("bus/node/devices/node[0-9]*")) if err != nil { return err } - // package/node loop - for _, pkg := range pkgs { - if _, err := os.Stat(filepath.Join(pkg, "cpulist")); os.IsNotExist(err) { - log.Debugf("package %q is missing cpulist", pkg) + // package / NUMA node loop + for _, node := range nodes { + if _, err := os.Stat(filepath.Join(node, "cpulist")); os.IsNotExist(err) { + log.Debugf("NUMA node %v is missing cpulist", node) continue } - cpulist, err := ioutil.ReadFile(filepath.Join(pkg, "cpulist")) + cpulist, err := ioutil.ReadFile(filepath.Join(node, "cpulist")) if err != nil { - log.Debugf("could not read cpulist of package %q", pkg) + log.Debugf("could not read cpulist of NUMA node %v", node) return err } // cpulist example of one package/node with HT: "0-11,24-35" line := strings.Split(string(cpulist), "\n")[0] + if line == "" { + // Skip processor-less (memory-only) NUMA nodes. + // E.g. RAM expansion with Intel Optane Drive(s) using + // Intel Memory Drive Technology (IMDT). + log.Debugf("skipping processor-less (memory-only) NUMA node %v", node) + continue + } firstCPU := strings.FieldsFunc(line, func(r rune) bool { return r == '-' || r == ',' })[0] - if _, err := os.Stat(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); os.IsNotExist(err) { - log.Debugf("Package %q CPU %q is missing package_throttle", pkg, firstCPU) + if _, err := os.Stat(filepath.Join(node, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); os.IsNotExist(err) { + log.Debugf("Node %v CPU %v is missing package_throttle", node, firstCPU) continue } - if value, err = readUintFromFile(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); err != nil { + if value, err = readUintFromFile(filepath.Join(node, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); err != nil { return err } - pkgno := digitRegexp.FindAllString(pkg, 1)[0] - ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), pkgno) + nodeno := digitRegexp.FindAllString(node, 1)[0] + ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), nodeno) } return nil diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 829425abf1..199f62807b 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -150,6 +150,14 @@ Mode: 644 Path: sys/bus/node/devices/node0/cpulist Lines: 1 0-3 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node1 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/bus/node/devices/node1/cpulist +Lines: 1 + Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class