Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add monitor for sysctl para #3913

Merged
merged 2 commits into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions pkg/daemon/controller_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,31 @@ func (c *Controller) loopEncapIPCheck() {

// ovnMetricsUpdate refreshes the subnet gateway metric and then rebuilds the
// system-parameter gauges: every gauge vector is reset first, after which each
// setter publishes the value it currently observes on the node.
func (c *Controller) ovnMetricsUpdate() {
	c.setOvnSubnetGatewayMetric()

	resetSysParaMetrics()

	// The setters are independent of each other; they run in this fixed order.
	setters := []func(){
		c.setIPLocalPortRangeMetric,
		c.setCheckSumErrMetric,
		c.setCniConfigMetric,
		c.setDNSSearchMetric,
		c.setTCPTwRecycleMetric,
		c.setTCPMtuProbingMetric,
		c.setConntrackTCPLiberalMetric,
		c.setBridgeNfCallIptablesMetric,
		c.setIPv6RouteMaxsizeMetric,
		c.setTCPMemMetric,
	}
	for _, set := range setters {
		set()
	}
}

// resetSysParaMetrics calls Reset on every system-parameter gauge vector so
// the next round of setters starts from an empty set of series.
func resetSysParaMetrics() {
	vectors := []interface{ Reset() }{
		metricIPLocalPortRange,
		metricCheckSumErr,
		metricCniConfig,
		metricDNSSearch,
		metricTCPTwRecycle,
		metricTCPMtuProbing,
		metricConntrackTCPLiberal,
		metricBridgeNfCallIptables,
		metricTCPMem,
		metricIPv6RouteMaxsize,
	}
	for _, vec := range vectors {
		vec.Reset()
	}
}

func rotateLog() {
Expand Down
186 changes: 186 additions & 0 deletions pkg/daemon/exporter_metric.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package daemon

import (
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"

	"k8s.io/klog/v2"

	"github.com/containernetworking/cni/libcni"
	"github.com/docker/docker/libnetwork/resolvconf"
)

// setIPLocalPortRangeMetric reads /proc/sys/net/ipv4/ip_local_port_range and
// publishes its two whitespace-separated fields as the "start" and "end"
// labels of a gauge whose value is fixed at 1. Read failures and content that
// does not contain exactly two fields are logged and nothing is published.
func (c *Controller) setIPLocalPortRangeMetric() {
	raw, err := os.ReadFile("/proc/sys/net/ipv4/ip_local_port_range")
	if err != nil {
		klog.Errorf("failed to get value of ip_local_port_range, err %v", err)
		return
	}

	content := string(raw)
	bounds := strings.Fields(content)
	if len(bounds) == 2 {
		start, end := bounds[0], bounds[1]
		metricIPLocalPortRange.WithLabelValues(c.config.NodeName, start, end).Set(1)
		return
	}
	klog.Errorf("unexpected ip_local_port_range value: %q", content)
}

// setCheckSumErrMetric runs `netstat -us` and publishes the total of all
// "InCsumErrors: <n>" counters found in its output.
//
// The output may contain the counter in more than one protocol section.
// The counts are summed so that a non-zero value in one section is never
// overwritten by a later section. When no counter is present the gauge is
// set to 0. A counter whose value cannot be parsed is logged and left out of
// the total instead of being silently treated as 0.
func (c *Controller) setCheckSumErrMetric() {
	// No shell features are required, so netstat is executed directly.
	output, err := exec.Command("netstat", "-us").CombinedOutput()
	if err != nil {
		klog.Errorf("failed to exec cmd 'netstat -us', err %v", err)
		return
	}

	total := 0
	for _, line := range strings.Split(string(output), "\n") {
		line = strings.TrimSpace(line)
		if !strings.Contains(line, "InCsumErrors") {
			continue
		}

		// Only lines of the exact shape "<name>: <value>" are considered.
		parts := strings.Split(line, ":")
		if len(parts) != 2 {
			continue
		}
		count, err := strconv.Atoi(strings.TrimSpace(parts[1]))
		if err != nil {
			klog.Errorf("failed to parse InCsumErrors value from line %q, err %v", line, err)
			continue
		}
		total += count
	}
	metricCheckSumErr.WithLabelValues(c.config.NodeName).Set(float64(total))
}

// setCniConfigMetric lists the .conf/.conflist files in the CNI config
// directory and publishes one series per file that is not this plugin's own
// config. When no other file exists a single placeholder series labelled
// "no other cni config" is published.
//
// libcni.ConfFiles returns each entry joined with the directory, so the own
// config is recognised by its base name as well as by an exact match; a plain
// string comparison against a bare file name would never match.
func (c *Controller) setCniConfigMetric() {
	files, err := libcni.ConfFiles(c.config.CniConfDir, []string{".conf", ".conflist"})
	if err != nil {
		klog.Errorf("failed to list cni config files in %s: %v", c.config.CniConfDir, err)
		return
	}

	found := false
	for _, file := range files {
		if file == c.config.CniConfName || filepath.Base(file) == c.config.CniConfName {
			continue
		}
		found = true
		metricCniConfig.WithLabelValues(c.config.NodeName, c.config.CniConfName, file).Set(1)
	}
	if !found {
		metricCniConfig.WithLabelValues(c.config.NodeName, c.config.CniConfName, "no other cni config").Set(1)
	}
}

// setDNSSearchMetric publishes one series per search domain obtained from
// resolvconf that does not contain the substring "local". When every domain
// is filtered out (or there are none) a single placeholder series labelled
// "no additional search domain" is published instead.
func (c *Controller) setDNSSearchMetric() {
	resolv, err := resolvconf.Get()
	if err != nil {
		klog.Errorf("failed to get /etc/resolv.conf content: %v", err)
		return
	}

	reported := 0
	for _, search := range resolvconf.GetSearchDomains(resolv.Content) {
		if !strings.Contains(search, "local") {
			metricDNSSearch.WithLabelValues(c.config.NodeName, search).Set(1)
			reported++
		}
	}
	if reported == 0 {
		metricDNSSearch.WithLabelValues(c.config.NodeName, "no additional search domain").Set(1)
	}
}

// setTCPTwRecycleMetric publishes the integer stored in
// /proc/sys/net/ipv4/tcp_tw_recycle. A missing file is skipped silently; any
// other read error, or content that is not an integer, is logged and nothing
// is published (a parse failure is no longer reported as 0).
func (c *Controller) setTCPTwRecycleMetric() {
	output, err := os.ReadFile("/proc/sys/net/ipv4/tcp_tw_recycle")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of tcp_tw_recycle, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("failed to parse value of tcp_tw_recycle %q, err %v", string(output), err)
		return
	}
	metricTCPTwRecycle.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setTCPMtuProbingMetric publishes the integer stored in
// /proc/sys/net/ipv4/tcp_mtu_probing. A missing file is skipped silently; any
// other read error, or content that is not an integer, is logged and nothing
// is published (a parse failure is no longer reported as 0).
func (c *Controller) setTCPMtuProbingMetric() {
	output, err := os.ReadFile("/proc/sys/net/ipv4/tcp_mtu_probing")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of tcp_mtu_probing, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("failed to parse value of tcp_mtu_probing %q, err %v", string(output), err)
		return
	}
	metricTCPMtuProbing.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setConntrackTCPLiberalMetric publishes the integer stored in
// /proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal. A missing file is
// skipped silently; any other read error, or content that is not an integer,
// is logged and nothing is published (a parse failure is no longer reported
// as 0).
func (c *Controller) setConntrackTCPLiberalMetric() {
	output, err := os.ReadFile("/proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of nf_conntrack_tcp_be_liberal, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("failed to parse value of nf_conntrack_tcp_be_liberal %q, err %v", string(output), err)
		return
	}
	metricConntrackTCPLiberal.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setBridgeNfCallIptablesMetric publishes the integer stored in
// /proc/sys/net/bridge/bridge-nf-call-iptables. A missing file is skipped
// silently; any other read error, or content that is not an integer, is
// logged and nothing is published (a parse failure is no longer reported
// as 0).
func (c *Controller) setBridgeNfCallIptablesMetric() {
	output, err := os.ReadFile("/proc/sys/net/bridge/bridge-nf-call-iptables")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of bridge-nf-call-iptables, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("failed to parse value of bridge-nf-call-iptables %q, err %v", string(output), err)
		return
	}
	metricBridgeNfCallIptables.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setIPv6RouteMaxsizeMetric publishes the integer stored in
// /proc/sys/net/ipv6/route/max_size.
//
// A missing file is skipped silently, matching the other sysctl setters in
// this file, so nothing is logged on every update cycle when the file does
// not exist. Any other read error, or content that is not an integer, is
// logged and nothing is published (a parse failure is no longer reported
// as 0).
func (c *Controller) setIPv6RouteMaxsizeMetric() {
	output, err := os.ReadFile("/proc/sys/net/ipv6/route/max_size")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of ipv6 route max_size, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("failed to parse value of ipv6 route max_size %q, err %v", string(output), err)
		return
	}
	metricIPv6RouteMaxsize.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setTCPMemMetric reads /proc/sys/net/ipv4/tcp_mem and publishes its three
// whitespace-separated fields as labels of a gauge whose value is fixed at 1.
// A missing file is skipped silently; other read errors and content without
// exactly three fields are logged and nothing is published.
func (c *Controller) setTCPMemMetric() {
	raw, err := os.ReadFile("/proc/sys/net/ipv4/tcp_mem")
	switch {
	case err == nil:
		// fall through to parsing below
	case os.IsNotExist(err):
		return
	default:
		klog.Errorf("failed to get value of ipv4 tcp_mem, err %v", err)
		return
	}

	content := string(raw)
	thresholds := strings.Fields(content)
	if len(thresholds) == 3 {
		low, pressure, high := thresholds[0], thresholds[1], thresholds[2]
		metricTCPMem.WithLabelValues(c.config.NodeName, low, pressure, high).Set(1)
		return
	}
	klog.Errorf("unexpected tcp_mem value: %q", content)
}
93 changes: 93 additions & 0 deletions pkg/daemon/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,85 @@ var (
"protocol",
},
)

// metricIPLocalPortRange carries the two fields of ip_local_port_range as
// the "start" and "end" labels; setIPLocalPortRangeMetric sets the value to 1.
metricIPLocalPortRange = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "ip_local_port_range",
		Help: "value of system parameter /proc/sys/net/ipv4/ip_local_port_range, which should not conflict with the nodeport range",
	},
	[]string{"hostname", "start", "end"},
)

// metricCheckSumErr holds the InCsumErrors count that setCheckSumErrMetric
// extracts from the output of `netstat -us`, per node.
metricCheckSumErr = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Name: "checksum_err_count",
	Help: "Value of InCsumErrors for cmd `netstat -us`, checksum is error when value is greater than 0",
}, []string{
	"hostname",
})

// metricCniConfig has one series per CNI config file reported by
// setCniConfigMetric: "ovn" is the configured CNI config name and "other" is
// the additional file (or a placeholder text); the value is always 1.
metricCniConfig = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "cni_config_file",
		Help: "cni config file in /etc/cni/net.d/",
	},
	[]string{"hostname", "ovn", "other"},
)

// metricDNSSearch has one series per search domain reported by
// setDNSSearchMetric in the "additional" label; the value is always 1.
metricDNSSearch = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "dns_search_domain",
		Help: "search domain in /etc/resolv.conf",
	},
	[]string{"hostname", "additional"},
)

// metricTCPTwRecycle holds the tcp_tw_recycle sysctl value per node.
metricTCPTwRecycle = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "tcp_tw_recycle",
		Help: "value of system parameter /proc/sys/net/ipv4/tcp_tw_recycle, the recommended value is 0",
	},
	[]string{"hostname"},
)

// metricTCPMtuProbing holds the tcp_mtu_probing sysctl value per node.
metricTCPMtuProbing = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "tcp_mtu_probing",
		Help: "value of system parameter /proc/sys/net/ipv4/tcp_mtu_probing, the recommended value is 1",
	},
	[]string{"hostname"},
)

// metricConntrackTCPLiberal holds the nf_conntrack_tcp_be_liberal sysctl
// value per node.
metricConntrackTCPLiberal = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "nf_conntrack_tcp_be_liberal",
		Help: "value of system parameter /proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal, the recommended value is 1",
	},
	[]string{"hostname"},
)

// metricBridgeNfCallIptables holds the bridge-nf-call-iptables sysctl value
// per node.
metricBridgeNfCallIptables = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "bridge_nf_call_iptables",
		Help: "value of system parameter /proc/sys/net/bridge/bridge-nf-call-iptables, the recommended value is 1 for overlay, and 0 for underlay network",
	},
	[]string{"hostname"},
)

// metricTCPMem carries the three fields of tcp_mem as the "minimum",
// "pressure" and "maximum" labels; setTCPMemMetric sets the value to 1.
metricTCPMem = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "tcp_mem",
		Help: "value of system parameter /proc/sys/net/ipv4/tcp_mem, recommend a large number value",
	},
	[]string{"hostname", "minimum", "pressure", "maximum"},
)

// metricIPv6RouteMaxsize holds the IPv6 route max_size sysctl value per node.
// NOTE(review): the exported name "max_size" does not mention IPv6 or routes;
// confirm whether a more specific name is wanted before dashboards depend on it.
metricIPv6RouteMaxsize = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "max_size",
		Help: "value of system parameter /proc/sys/net/ipv6/route/max_size, recommend a large number value, at least 16384",
	},
	[]string{"hostname"},
)

// reflector metrics

// TODO(directxman12): update these to be histograms once the metrics overhaul KEP
Expand Down Expand Up @@ -151,6 +230,7 @@ func InitMetrics() {
registerReflectorMetrics()
registerClientMetrics()
registerOvnSubnetGatewayMetrics()
registerSystemParameterMetrics()
prometheus.MustRegister(cniOperationHistogram)
prometheus.MustRegister(cniWaitAddressResult)
prometheus.MustRegister(cniConnectivityResult)
Expand All @@ -161,6 +241,19 @@ func registerOvnSubnetGatewayMetrics() {
prometheus.MustRegister(metricOvnSubnetGatewayPackets)
}

// registerSystemParameterMetrics registers every system-parameter gauge
// vector with the default Prometheus registry. MustRegister is variadic and
// registers its arguments in order, panicking on the first failure.
func registerSystemParameterMetrics() {
	prometheus.MustRegister(
		metricIPLocalPortRange,
		metricCheckSumErr,
		metricCniConfig,
		metricDNSSearch,
		metricTCPTwRecycle,
		metricTCPMtuProbing,
		metricConntrackTCPLiberal,
		metricBridgeNfCallIptables,
		metricTCPMem,
		metricIPv6RouteMaxsize,
	)
}

// registerClientMetrics sets up the client latency metrics from client-go
func registerClientMetrics() {
// register the metrics with our registry
Expand Down
Loading