Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Distances field to machine.Node #3179

Merged
merged 3 commits into from
Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/storage/prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,4 @@ Metric name | Type | Description | Unit (where applicable) | option parameter |
`machine_nvm_avg_power_budget_watts` | Gauge | NVM power budget | watts | | libipmctl
`machine_nvm_capacity` | Gauge | NVM capacity value labeled by NVM mode (memory mode or app direct mode) | bytes | | libipmctl
`machine_thread_siblings_count` | Gauge | Number of CPU thread siblings | | cpu_topology |
`machine_node_distance` | Gauge | Distance between NUMA node and target NUMA node | | cpu_topology |
PiotrProkop marked this conversation as resolved.
Show resolved Hide resolved
1 change: 1 addition & 0 deletions info/v1/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ type Node struct {
HugePages []HugePagesInfo `json:"hugepages"`
Cores []Core `json:"cores"`
Caches []Cache `json:"caches"`
Distances []uint64 `json:"distances"`
}

type Core struct {
Expand Down
23 changes: 22 additions & 1 deletion machine/topology_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ func TestTopology(t *testing.T) {
"/fakeSysfs/devices/system/node/node0/cpu11": "1",
}
sysFs.SetPhysicalPackageIDs(physicalPackageIDs, nil)

sysFs.SetDistances("/fakeSysfs/devices/system/node/node0", "10 11", nil)
sysFs.SetDistances("/fakeSysfs/devices/system/node/node1", "11 10", nil)

topology, numCores, err := GetTopology(sysFs)
assert.Nil(t, err)
assert.Equal(t, 12, numCores)
Expand All @@ -217,12 +221,17 @@ func TestTopology(t *testing.T) {
Type: "unified",
Level: 1,
}
distances := [][]uint64{
{10, 11},
{11, 10},
}
for i := 0; i < numNodes; i++ {
node := info.Node{Id: i}
// Copy over Memory from result. TODO(rjnagal): Use memory from fake.
node.Memory = topology[i].Memory
// Copy over HugePagesInfo from result. TODO(ohsewon): Use HugePagesInfo from fake.
node.HugePages = topology[i].HugePages
node.Distances = distances[i]
for j := 0; j < numCoresPerNode; j++ {
core := info.Core{Id: i*numCoresPerNode + j}
core.Caches = append(core.Caches, cache)
Expand Down Expand Up @@ -298,12 +307,13 @@ func TestTopologyWithoutNodes(t *testing.T) {
topologyJSON2, err := json.Marshal(topology[1])
assert.Nil(t, err)

expectedTopology1 := `{"node_id":0,"memory":0,"hugepages":null,"cores":[{"core_id":0,"thread_ids":[0,2],"caches":[{"id":0, "size":32768,"type":"unified","level":0}], "socket_id": 0, "uncore_caches":null}],"caches":null}`
expectedTopology1 := `{"node_id":0,"memory":0,"hugepages":null,"distances":null,"cores":[{"core_id":0,"thread_ids":[0,2],"caches":[{"id":0, "size":32768,"type":"unified","level":0}], "socket_id": 0, "uncore_caches":null}],"caches":null}`
expectedTopology2 := `
{
"node_id":1,
"memory":0,
"hugepages":null,
"distances": null,
"cores":[
{
"core_id":1,
Expand Down Expand Up @@ -359,6 +369,9 @@ func TestTopologyWithNodesWithoutCPU(t *testing.T) {
}
sysFs.SetHugePagesNr(hugePageNr, nil)

sysFs.SetDistances("/fakeSysfs/devices/system/node/node0", "10 11", nil)
sysFs.SetDistances("/fakeSysfs/devices/system/node/node1", "11 10", nil)

topology, numCores, err := GetTopology(sysFs)

assert.Nil(t, err)
Expand All @@ -381,6 +394,10 @@ func TestTopologyWithNodesWithoutCPU(t *testing.T) {
"page_size": 1048576
}
],
"distances": [
10,
11
],
"memory": 33604804608,
"node_id": 0
},
Expand All @@ -397,6 +414,10 @@ func TestTopologyWithNodesWithoutCPU(t *testing.T) {
"page_size": 1048576
}
],
"distances": [
11,
10
],
"memory": 33604804608,
"node_id": 1
}
Expand Down
8 changes: 8 additions & 0 deletions metrics/prometheus_fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,10 @@ func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, erro
},
},
},
Distances: []uint64{
10,
12,
},
},
{
Id: 1,
Expand Down Expand Up @@ -260,6 +264,10 @@ func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, erro
Level: 3,
},
},
Distances: []uint64{
12,
10,
},
},
},
}, nil
Expand Down
40 changes: 33 additions & 7 deletions metrics/prometheus_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ import (
var baseLabelsNames = []string{"machine_id", "system_uuid", "boot_id"}

const (
prometheusModeLabelName = "mode"
prometheusTypeLabelName = "type"
prometheusLevelLabelName = "level"
prometheusNodeLabelName = "node_id"
prometheusCoreLabelName = "core_id"
prometheusThreadLabelName = "thread_id"
prometheusPageSizeLabelName = "page_size"
prometheusModeLabelName = "mode"
prometheusTypeLabelName = "type"
prometheusLevelLabelName = "level"
prometheusNodeLabelName = "node_id"
prometheusCoreLabelName = "core_id"
prometheusThreadLabelName = "thread_id"
prometheusPageSizeLabelName = "page_size"
prometheusTargetNodeLabelName = "target_node_id"

nvmMemoryMode = "memory_mode"
nvmAppDirectMode = "app_direct_mode"
Expand Down Expand Up @@ -191,6 +192,15 @@ func NewPrometheusMachineCollector(i infoProvider, includedMetrics container.Met
return getHugePagesCount(machineInfo)
},
},
{
name: "machine_node_distance",
help: "Distance between NUMA node and target NUMA node.",
valueType: prometheus.GaugeValue,
extraLabels: []string{prometheusNodeLabelName, prometheusTargetNodeLabelName},
getValues: func(machineInfo *info.MachineInfo) metricValues {
return getDistance(machineInfo)
},
},
}...)
}
return c
Expand Down Expand Up @@ -356,3 +366,19 @@ func getCaches(machineInfo *info.MachineInfo) metricValues {
}
return mValues
}

// getDistance converts the per-node NUMA distance lists found in
// machineInfo.Topology into metric values.
//
// One metricValue is produced for every entry of every node's Distances
// slice. Its labels are, in order, the node's Id and the index of the entry
// inside Distances (used as the target node id); its value is the distance
// itself and its timestamp is machineInfo.Timestamp. Nodes with no distances
// contribute nothing.
func getDistance(machineInfo *info.MachineInfo) metricValues {
	// A full distance matrix has one entry per (node, target) pair, so reserve
	// n*n slots. Note that Go's ^ operator is bitwise XOR, not exponentiation,
	// hence the explicit multiplication.
	nodeCount := len(machineInfo.Topology)
	mValues := make(metricValues, 0, nodeCount*nodeCount)
	for _, node := range machineInfo.Topology {
		nodeID := strconv.Itoa(node.Id)
		for i, target := range node.Distances {
			mValues = append(mValues,
				metricValue{
					value:     float64(target),
					labels:    []string{nodeID, strconv.Itoa(i)},
					timestamp: machineInfo.Timestamp,
				})
		}
	}
	return mValues
}
16 changes: 16 additions & 0 deletions metrics/prometheus_machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,22 @@ func TestGetHugePagesCount(t *testing.T) {
assertMetricValues(t, expectedMetricVals, metricVals, "Unexpected information about Node memory")
}

// TestGetDistance verifies that getDistance emits one metric value per
// (node, target node) pair of the fake machine info, labelled with the node id
// and the target index, in topology order.
func TestGetDistance(t *testing.T) {
	machineInfo, err := testSubcontainersInfoProvider{}.GetMachineInfo()
	assert.Nil(t, err)

	metricVals := getDistance(machineInfo)

	timestamp := time.Unix(1395066363, 0)
	expectedMetricVals := []metricValue{
		{value: 10, labels: []string{"0", "0"}, timestamp: timestamp},
		{value: 12, labels: []string{"0", "1"}, timestamp: timestamp},
		{value: 12, labels: []string{"1", "0"}, timestamp: timestamp},
		{value: 10, labels: []string{"1", "1"}, timestamp: timestamp},
	}
	// assertMetricValues indexes the expected values by the position of each
	// actual value, so stop here on a length mismatch instead of risking an
	// out-of-range panic.
	if !assert.Equal(t, 4, len(metricVals)) {
		return
	}
	assertMetricValues(t, expectedMetricVals, metricVals, "Unexpected information about Node distance")
}

func assertMetricValues(t *testing.T, expected metricValues, actual metricValues, message string) {
for i := range actual {
assert.Truef(t, reflect.DeepEqual(expected[i], actual[i]),
Expand Down
6 changes: 6 additions & 0 deletions metrics/testdata/prometheus_machine_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ machine_dimm_count{boot_id="boot-id-test",machine_id="machine-id-test",system_uu
# HELP machine_memory_bytes Amount of memory installed on the machine.
# TYPE machine_memory_bytes gauge
machine_memory_bytes{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test"} 1024 1395066363000
# HELP machine_node_distance Distance between NUMA node and target NUMA node.
# TYPE machine_node_distance gauge
machine_node_distance{boot_id="boot-id-test",machine_id="machine-id-test",node_id="0",system_uuid="system-uuid-test",target_node_id="0"} 10 1395066363000
machine_node_distance{boot_id="boot-id-test",machine_id="machine-id-test",node_id="0",system_uuid="system-uuid-test",target_node_id="1"} 12 1395066363000
machine_node_distance{boot_id="boot-id-test",machine_id="machine-id-test",node_id="1",system_uuid="system-uuid-test",target_node_id="0"} 12 1395066363000
machine_node_distance{boot_id="boot-id-test",machine_id="machine-id-test",node_id="1",system_uuid="system-uuid-test",target_node_id="1"} 10 1395066363000
# HELP machine_node_hugepages_count Numer of hugepages assigned to NUMA node.
# TYPE machine_node_hugepages_count gauge
machine_node_hugepages_count{boot_id="boot-id-test",machine_id="machine-id-test",node_id="0",page_size="1048576",system_uuid="system-uuid-test"} 0 1395066363000
Expand Down
24 changes: 24 additions & 0 deletions utils/sysfs/fakesysfs/fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ type FakeSysFs struct {
hugePagesNr map[string]string
hugePagesNrErr error

distances map[string]string
distancesErr error

onlineCPUs map[string]interface{}
}

Expand Down Expand Up @@ -201,6 +204,27 @@ func (fs *FakeSysFs) GetSystemUUID() (string, error) {
return "1F862619-BA9F-4526-8F85-ECEAF0C97430", nil
}

// GetDistances returns the distances string registered for nodeDir through
// SetDistances. If an error was injected through SetDistances it is returned
// instead; if nothing was registered for nodeDir a "distance not found" error
// is returned. The string result is empty whenever an error is returned.
func (fs *FakeSysFs) GetDistances(nodeDir string) (string, error) {
	if injected := fs.distancesErr; injected != nil {
		return "", injected
	}

	value, found := fs.distances[nodeDir]
	if !found {
		return "", fmt.Errorf("distance not found")
	}
	return value, nil
}

// SetDistances registers the distances string that GetDistances will return
// for nodeDir and records err as the error GetDistances should report.
// The error applies to every node directory, not only to nodeDir, and each
// call overwrites the previously stored error (including with nil).
func (fs *FakeSysFs) SetDistances(nodeDir string, distances string, err error) {
	fs.distancesErr = err

	// Create the map on first use; writing to a nil map would panic.
	if fs.distances == nil {
		fs.distances = make(map[string]string, 1)
	}
	fs.distances[nodeDir] = distances
}

func (fs *FakeSysFs) IsCPUOnline(dir string) bool {
if fs.onlineCPUs == nil {
return true
Expand Down
15 changes: 15 additions & 0 deletions utils/sysfs/sysfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ const (

meminfoFile = "meminfo"

distanceFile = "distance"

sysFsCPUTopology = "topology"

// CPUPhysicalPackageID is a physical package id of cpu#. Typically corresponds to a physical socket number,
Expand Down Expand Up @@ -113,6 +115,10 @@ type SysFs interface {
GetCacheInfo(cpu int, cache string) (CacheInfo, error)

GetSystemUUID() (string, error)

// GetDistances returns the distance information of the given NUMA node directory as a raw string
GetDistances(string) (string, error)

// IsCPUOnline determines if CPU status from kernel hotplug mechanism standpoint.
// See: https://www.kernel.org/doc/html/latest/core-api/cpu_hotplug.html
IsCPUOnline(dir string) bool
Expand Down Expand Up @@ -161,6 +167,15 @@ func (fs *realSysFs) GetMemInfo(nodePath string) (string, error) {
return strings.TrimSpace(string(meminfo)), err
}

// GetDistances reads the distance file located directly under nodePath and
// returns its contents with surrounding whitespace trimmed. If the file
// cannot be read, the read error is returned together with an empty string.
func (fs *realSysFs) GetDistances(nodePath string) (string, error) {
	raw, err := ioutil.ReadFile(fmt.Sprintf("%s/%s", nodePath, distanceFile))
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(raw)), nil
}

func (fs *realSysFs) GetHugePagesInfo(hugePagesDirectory string) ([]os.FileInfo, error) {
return ioutil.ReadDir(hugePagesDirectory)
}
Expand Down
14 changes: 14 additions & 0 deletions utils/sysfs/sysfs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,3 +305,17 @@ func TestUniqueCPUPropertyOnSingleSocketMultipleNUMAsSystem(t *testing.T) {
count = GetUniqueCPUPropertyCount("./testdata_single_socket_many_NUMAs/", CPUCoreID)
assert.Equal(t, 16, count)
}

// TestGetDistances checks that the real sysfs implementation returns the
// contents of the distance file stored under ./testdata/node0.
func TestGetDistances(t *testing.T) {
	const nodeDir = "./testdata/node0"

	got, err := NewRealSysFs().GetDistances(nodeDir)

	assert.Nil(t, err)
	assert.Equal(t, "10 11", got)
}

// TestGetDistancesFileIsMissing checks that GetDistances reports an error and
// an empty string for ./testdata/node1, which presumably contains no distance
// file (as the test name suggests).
func TestGetDistancesFileIsMissing(t *testing.T) {
	const nodeDir = "./testdata/node1"

	got, err := NewRealSysFs().GetDistances(nodeDir)

	assert.NotNil(t, err)
	assert.Equal(t, "", got)
}
1 change: 1 addition & 0 deletions utils/sysfs/testdata/node0/distance
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
10 11
26 changes: 26 additions & 0 deletions utils/sysinfo/sysinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,11 @@ func GetNodesInfo(sysFs sysfs.SysFs) ([]info.Node, int, error) {
return nil, 0, err
}

node.Distances, err = getDistances(sysFs, nodeDir)
if err != nil {
return nil, 0, err
}

nodes = append(nodes, node)
}
return nodes, allLogicalCoresCount, err
Expand Down Expand Up @@ -391,6 +396,27 @@ func getNodeMemInfo(sysFs sysfs.SysFs, nodeDir string) (uint64, error) {
return uint64(memory), nil
}

// getDistances returns the distances between the NUMA node at nodeDir and the
// other NUMA nodes, parsed from the whitespace-separated list of unsigned
// base-10 integers that sysFs reports for that node.
//
// If sysFs cannot provide the distances, a warning is logged and (nil, nil)
// is returned, so a node without distance information is not treated as a
// failure. An error is returned only when one of the reported values cannot
// be parsed as an unsigned integer.
func getDistances(sysFs sysfs.SysFs, nodeDir string) ([]uint64, error) {
	rawDistance, err := sysFs.GetDistances(nodeDir)
	if err != nil {
		// Ignore if per-node info is not available.
		klog.Warningf("Found node without distance information, nodeDir: %s", nodeDir)
		return nil, nil
	}

	// strings.Fields splits on any run of whitespace and never yields empty
	// tokens, so repeated spaces, tabs or an empty string do not turn into
	// spurious parse errors the way splitting on a single space would.
	fields := strings.Fields(rawDistance)
	distances := make([]uint64, 0, len(fields))
	for _, field := range fields {
		distance, err := strconv.ParseUint(field, 10, 64)
		if err != nil {
			return nil, fmt.Errorf("cannot convert %s to int: %w", field, err)
		}
		distances = append(distances, distance)
	}

	return distances, nil
}

// getCoresInfo returns information about physical cores
func getCoresInfo(sysFs sysfs.SysFs, cpuDirs []string) ([]info.Core, error) {
cores := make([]info.Core, 0, len(cpuDirs))
Expand Down
Loading