From 5916aba46ba1a9dbb39dea1ac8bc370fbeb75246 Mon Sep 17 00:00:00 2001 From: Jayanth Varavani <1111446+jayanthvn@users.noreply.github.com> Date: Tue, 13 Jul 2021 17:32:26 -0700 Subject: [PATCH] Add metrics for total prefix count and ips used per cidr (#1529) * Add metrics for total prefix count and ips used per cidr * update readme * formatting --- Makefile | 4 +- cmd/cni-metrics-helper/README.md | 2 + cmd/cni-metrics-helper/metrics/cni_metrics.go | 12 ++ cmd/cni-metrics-helper/metrics/cni_test2.data | 194 ++++++++++++++++++ .../metrics/metrics_test.go | 23 +++ pkg/awsutils/awsutils.go | 4 +- pkg/ipamd/datastore/data_store.go | 28 +++ pkg/ipamd/ipamd.go | 2 +- scripts/dockerfiles/Dockerfile.metrics | 1 + 9 files changed, 265 insertions(+), 5 deletions(-) create mode 100644 cmd/cni-metrics-helper/metrics/cni_test2.data diff --git a/Makefile b/Makefile index 59e58bca84..62921353bb 100644 --- a/Makefile +++ b/Makefile @@ -174,7 +174,7 @@ docker-unit-test: build-docker-test ## Run unit tests inside of the testing # Build metrics helper agent. build-metrics: ## Build metrics helper agent. - go build -ldflags="-s -w" -o cni-metrics-helper ./cmd/cni-metrics-helper + go build $(VENDOR_OVERRIDE_FLAG) -ldflags="-s -w" -o cni-metrics-helper ./cmd/cni-metrics-helper # Build metrics helper agent Docker image. docker-metrics: ## Build metrics helper agent Docker image. @@ -188,7 +188,7 @@ docker-metrics: ## Build metrics helper agent Docker image. metrics-unit-test: CGO_ENABLED=1 metrics-unit-test: GOARCH= metrics-unit-test: ## Run metrics helper unit test suite (must be run natively). - go test -v -cover -race -timeout 10s \ + go test -v $(VENDOR_OVERRIDE_FLAG) -cover -race -timeout 10s \ ./cmd/cni-metrics-helper/metrics/... # Run metrics helper unit test suite in a container. diff --git a/cmd/cni-metrics-helper/README.md b/cmd/cni-metrics-helper/README.md index 184dce1384..85cd93ce89 100644 --- a/cmd/cni-metrics-helper/README.md +++ b/cmd/cni-metrics-helper/README.md @@ -34,6 +34,8 @@ Adding the CNI metrics helper will publish the following metrics to CloudWatch: "podENIErr", "reconcileCount", "totalIPAddresses", +"totalIPv4Prefixes", +"totalAssignedIPv4sPerCidr" ``` ### Get cni-metrics-helper logs diff --git a/cmd/cni-metrics-helper/metrics/cni_metrics.go b/cmd/cni-metrics-helper/metrics/cni_metrics.go index 85db5e984f..7520648052 100644 --- a/cmd/cni-metrics-helper/metrics/cni_metrics.go +++ b/cmd/cni-metrics-helper/metrics/cni_metrics.go @@ -40,6 +40,18 @@ var InterestingCNIMetrics = map[string]metricsConvert{ matchFunc: matchAny, actionFunc: metricsAdd, data: &dataPoints{}}}}, + "awscni_total_ipv4_prefixes": { + actions: []metricsAction{ + {cwMetricName: "totalIPv4Prefixes", + matchFunc: matchAny, + actionFunc: metricsAdd, + data: &dataPoints{}}}}, + "awscni_assigned_ip_per_ipv4cidr": { + actions: []metricsAction{ + {cwMetricName: "totalAssignedIPv4sPerCidr", + matchFunc: matchAny, + actionFunc: metricsAdd, + data: &dataPoints{}}}}, "awscni_eni_allocated": { actions: []metricsAction{ {cwMetricName: "eniAllocated", diff --git a/cmd/cni-metrics-helper/metrics/cni_test2.data b/cmd/cni-metrics-helper/metrics/cni_test2.data new file mode 100644 index 0000000000..4cac567982 --- /dev/null +++ b/cmd/cni-metrics-helper/metrics/cni_test2.data @@ -0,0 +1,194 @@ +# HELP awscni_add_ip_req_count The number of add IP address request +# TYPE awscni_add_ip_req_count counter +awscni_add_ip_req_count 100 +# HELP awscni_assigned_ip_addresses The number of IP addresses assigned to pods +# TYPE awscni_assigned_ip_addresses gauge +awscni_assigned_ip_addresses 1 +# HELP awscni_total_ipv4_prefixes The total number of IPv4 prefixes +# TYPE awscni_total_ipv4_prefixes gauge +awscni_total_ipv4_prefixes 1 +# HELP awscni_assigned_ip_per_ipv4cidr The total number of IP addresses assigned per cidr +# TYPE awscni_assigned_ip_per_ipv4cidr gauge +awscni_assigned_ip_per_ipv4cidr 1 +# HELP awscni_aws_api_error_count The number of times AWS API returns an error +# TYPE awscni_aws_api_error_count counter +awscni_aws_api_error_count{api="DeleteNetworkInterface",error="InvalidParameterValue"} 14 +# HELP awscni_aws_api_latency_ms AWS API call latency in ms +# TYPE awscni_aws_api_latency_ms summary +awscni_aws_api_latency_ms{api="AssignPrivateIpAddresses",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="AssignPrivateIpAddresses",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="AssignPrivateIpAddresses",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="AssignPrivateIpAddresses",error="false"} 2938 +awscni_aws_api_latency_ms_count{api="AssignPrivateIpAddresses",error="false"} 10 +awscni_aws_api_latency_ms{api="AttachNetworkInterface",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="AttachNetworkInterface",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="AttachNetworkInterface",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="AttachNetworkInterface",error="false"} 4377 +awscni_aws_api_latency_ms_count{api="AttachNetworkInterface",error="false"} 10 +awscni_aws_api_latency_ms{api="CreateNetworkInterface",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="CreateNetworkInterface",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="CreateNetworkInterface",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="CreateNetworkInterface",error="false"} 1328 +awscni_aws_api_latency_ms_count{api="CreateNetworkInterface",error="false"} 10 +awscni_aws_api_latency_ms{api="CreateTags",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="CreateTags",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="CreateTags",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="CreateTags",error="false"} 1123 +awscni_aws_api_latency_ms_count{api="CreateTags",error="false"} 10 +awscni_aws_api_latency_ms{api="DeleteNetworkInterface",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="DeleteNetworkInterface",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="DeleteNetworkInterface",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="DeleteNetworkInterface",error="false"} 2364 +awscni_aws_api_latency_ms_count{api="DeleteNetworkInterface",error="false"} 9 +awscni_aws_api_latency_ms{api="DeleteNetworkInterface",error="true",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="DeleteNetworkInterface",error="true",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="DeleteNetworkInterface",error="true",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="DeleteNetworkInterface",error="true"} 1806 +awscni_aws_api_latency_ms_count{api="DeleteNetworkInterface",error="true"} 14 +awscni_aws_api_latency_ms{api="DescribeInstances",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="DescribeInstances",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="DescribeInstances",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="DescribeInstances",error="false"} 1330 +awscni_aws_api_latency_ms_count{api="DescribeInstances",error="false"} 10 +awscni_aws_api_latency_ms{api="DescribeNetworkInterfaces",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="DescribeNetworkInterfaces",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="DescribeNetworkInterfaces",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="DescribeNetworkInterfaces",error="false"} 2360 +awscni_aws_api_latency_ms_count{api="DescribeNetworkInterfaces",error="false"} 20 +awscni_aws_api_latency_ms{api="DetachNetworkInterface",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="DetachNetworkInterface",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="DetachNetworkInterface",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="DetachNetworkInterface",error="false"} 1828 +awscni_aws_api_latency_ms_count{api="DetachNetworkInterface",error="false"} 9 +awscni_aws_api_latency_ms{api="GetMetadata",error="false",quantile="0.5"} 0 +awscni_aws_api_latency_ms{api="GetMetadata",error="false",quantile="0.9"} 0 +awscni_aws_api_latency_ms{api="GetMetadata",error="false",quantile="0.99"} 1 +awscni_aws_api_latency_ms_sum{api="GetMetadata",error="false"} 4384 +awscni_aws_api_latency_ms_count{api="GetMetadata",error="false"} 82716 +awscni_aws_api_latency_ms{api="ModifyNetworkInterfaceAttribute",error="false",quantile="0.5"} NaN +awscni_aws_api_latency_ms{api="ModifyNetworkInterfaceAttribute",error="false",quantile="0.9"} NaN +awscni_aws_api_latency_ms{api="ModifyNetworkInterfaceAttribute",error="false",quantile="0.99"} NaN +awscni_aws_api_latency_ms_sum{api="ModifyNetworkInterfaceAttribute",error="false"} 1551 +awscni_aws_api_latency_ms_count{api="ModifyNetworkInterfaceAttribute",error="false"} 10 +# HELP awscni_del_ip_req_count The number of delete IP address request +# TYPE awscni_del_ip_req_count counter +awscni_del_ip_req_count{reason="PodDeleted"} 106 +awscni_del_ip_req_count{reason="SetupNSFailed"} 2 +# HELP awscni_eni_allocated The number of ENIs allocated +# TYPE awscni_eni_allocated gauge +awscni_eni_allocated 2 +# HELP awscni_eni_max The maximum number of ENIs that can be attached to the instance +# TYPE awscni_eni_max gauge +awscni_eni_max 3 +# HELP awscni_ip_max The maximum number of IP addresses that can be allocated to the instance +# TYPE awscni_ip_max gauge +awscni_ip_max 15 +# HELP awscni_ipamd_action_inprogress The number of ipamd actions in progress +# TYPE awscni_ipamd_action_inprogress gauge +awscni_ipamd_action_inprogress{fn="decreaseIPPool"} 0 +awscni_ipamd_action_inprogress{fn="increaseIPPool"} 0 +awscni_ipamd_action_inprogress{fn="nodeIPPoolReconcile"} 0 +awscni_ipamd_action_inprogress{fn="nodeInit"} 0 +awscni_ipamd_action_inprogress{fn="retryAllocENIIP"} 0 +# HELP awscni_total_ip_addresses The total number of IP addresses +# TYPE awscni_total_ip_addresses gauge +awscni_total_ip_addresses 16 +# HELP go_gc_duration_seconds A summary of the GC invocation durations. +# TYPE go_gc_duration_seconds summary +go_gc_duration_seconds{quantile="0"} 1.7901e-05 +go_gc_duration_seconds{quantile="0.25"} 3.2781e-05 +go_gc_duration_seconds{quantile="0.5"} 5.1354e-05 +go_gc_duration_seconds{quantile="0.75"} 0.00013115 +go_gc_duration_seconds{quantile="1"} 0.005550315 +go_gc_duration_seconds_sum 0.797514698 +go_gc_duration_seconds_count 9895 +# HELP go_goroutines Number of goroutines that currently exist. +# TYPE go_goroutines gauge +go_goroutines 25 +# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. +# TYPE go_memstats_alloc_bytes gauge +go_memstats_alloc_bytes 7.710456e+06 +# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed. +# TYPE go_memstats_alloc_bytes_total counter +go_memstats_alloc_bytes_total 1.778966304e+10 +# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. +# TYPE go_memstats_buck_hash_sys_bytes gauge +go_memstats_buck_hash_sys_bytes 1.775926e+06 +# HELP go_memstats_frees_total Total number of frees. +# TYPE go_memstats_frees_total counter +go_memstats_frees_total 1.80537453e+08 +# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. +# TYPE go_memstats_gc_sys_bytes gauge +go_memstats_gc_sys_bytes 712704 +# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use. +# TYPE go_memstats_heap_alloc_bytes gauge +go_memstats_heap_alloc_bytes 7.710456e+06 +# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. +# TYPE go_memstats_heap_idle_bytes gauge +go_memstats_heap_idle_bytes 4.620288e+06 +# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. +# TYPE go_memstats_heap_inuse_bytes gauge +go_memstats_heap_inuse_bytes 1.0387456e+07 +# HELP go_memstats_heap_objects Number of allocated objects. +# TYPE go_memstats_heap_objects gauge +go_memstats_heap_objects 43550 +# HELP go_memstats_heap_released_bytes_total Total number of heap bytes released to OS. +# TYPE go_memstats_heap_released_bytes_total counter +go_memstats_heap_released_bytes_total 1.073152e+06 +# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. +# TYPE go_memstats_heap_sys_bytes gauge +go_memstats_heap_sys_bytes 1.5007744e+07 +# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. +# TYPE go_memstats_last_gc_time_seconds gauge +go_memstats_last_gc_time_seconds 1.5562085013141422e+09 +# HELP go_memstats_lookups_total Total number of pointer lookups. +# TYPE go_memstats_lookups_total counter +go_memstats_lookups_total 226723 +# HELP go_memstats_mallocs_total Total number of mallocs. +# TYPE go_memstats_mallocs_total counter +go_memstats_mallocs_total 1.80581003e+08 +# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. +# TYPE go_memstats_mcache_inuse_bytes gauge +go_memstats_mcache_inuse_bytes 3472 +# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. +# TYPE go_memstats_mcache_sys_bytes gauge +go_memstats_mcache_sys_bytes 16384 +# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. +# TYPE go_memstats_mspan_inuse_bytes gauge +go_memstats_mspan_inuse_bytes 128896 +# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. +# TYPE go_memstats_mspan_sys_bytes gauge +go_memstats_mspan_sys_bytes 163840 +# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. +# TYPE go_memstats_next_gc_bytes gauge +go_memstats_next_gc_bytes 1.1965008e+07 +# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. +# TYPE go_memstats_other_sys_bytes gauge +go_memstats_other_sys_bytes 589762 +# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator. +# TYPE go_memstats_stack_inuse_bytes gauge +go_memstats_stack_inuse_bytes 720896 +# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. +# TYPE go_memstats_stack_sys_bytes gauge +go_memstats_stack_sys_bytes 720896 +# HELP go_memstats_sys_bytes Number of bytes obtained by system. Sum of all system allocations. +# TYPE go_memstats_sys_bytes gauge +go_memstats_sys_bytes 1.8987256e+07 +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. +# TYPE process_cpu_seconds_total counter +process_cpu_seconds_total 1391.13 +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +process_max_fds 65536 +# HELP process_open_fds Number of open file descriptors. +# TYPE process_open_fds gauge +process_open_fds 10 +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +process_resident_memory_bytes 3.9559168e+07 +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. +# TYPE process_start_time_seconds gauge +process_start_time_seconds 1.5550271644e+09 +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +process_virtual_memory_bytes 5.1757056e+07 diff --git a/cmd/cni-metrics-helper/metrics/metrics_test.go b/cmd/cni-metrics-helper/metrics/metrics_test.go index f84fd66eb7..ad9de224a3 100644 --- a/cmd/cni-metrics-helper/metrics/metrics_test.go +++ b/cmd/cni-metrics-helper/metrics/metrics_test.go @@ -94,3 +94,26 @@ func TestAPIServerMetric(t *testing.T) { assert.Equal(t, 1.0, actions[0].data.curSingleDataPoint) assert.Equal(t, 0.0, actions[0].data.lastSingleDataPoint) } + +func TestAPIServerMetricwithPDenabled(t *testing.T) { + testTarget := newTestMetricsTarget("cni_test2.data", InterestingCNIMetrics) + ctx := context.Background() + _, _, _, err := metricsListGrabAggregateConvert(ctx, testTarget) + assert.NoError(t, err) + + actions := InterestingCNIMetrics["awscni_assigned_ip_addresses"].actions + // verify awscni_assigned_ip_addresses value + assert.Equal(t, 1.0, actions[0].data.curSingleDataPoint) + + actions = InterestingCNIMetrics["awscni_total_ip_addresses"].actions + // verify awscni_total_ip_addresses value + assert.Equal(t, 16.0, actions[0].data.curSingleDataPoint) + + actions = InterestingCNIMetrics["awscni_total_ipv4_prefixes"].actions + // verify awscni_total_ipv4_prefixes value + assert.Equal(t, 1.0, actions[0].data.curSingleDataPoint) + + actions = InterestingCNIMetrics["awscni_assigned_ip_per_ipv4cidr"].actions + // verify awscni_assigned_ip_per_ipv4cidr value + assert.Equal(t, 1.0, actions[0].data.curSingleDataPoint) +} diff --git a/pkg/awsutils/awsutils.go b/pkg/awsutils/awsutils.go index 4b2bb7ff87..05dea61df4 100644 --- a/pkg/awsutils/awsutils.go +++ b/pkg/awsutils/awsutils.go @@ -1346,9 +1346,9 @@ func (cache *EC2InstanceMetadataCache) AllocIPAddresses(eniID string, numIPs int "Returning without an error here since we will verify the actual state by calling EC2 to see what addresses have already assigned to this ENI.") return nil } - log.Errorf("Failed to allocate a private IP addresses on ENI %v: %v", eniID, err) + log.Errorf("Failed to allocate a private IP/Prefix addresses on ENI %v: %v", eniID, err) awsAPIErrInc("AssignPrivateIpAddresses", err) - return errors.Wrap(err, "allocate IP address: failed to allocate a private IP address") + return errors.Wrap(err, "allocate IP/Prefix address: failed to allocate a private IP/Prefix address") } if output != nil { if cache.enableIpv4PrefixDelegation { diff --git a/pkg/ipamd/datastore/data_store.go b/pkg/ipamd/datastore/data_store.go index b81a2a4caa..e34ef0b050 100644 --- a/pkg/ipamd/datastore/data_store.go +++ b/pkg/ipamd/datastore/data_store.go @@ -112,6 +112,19 @@ var ( Help: "The number of IPs force removed while they had assigned pods", }, ) + totalPrefixes = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "awscni_total_ipv4_prefixes", + Help: "The total number of IPv4 prefixes", + }, + ) + ipsPerCidr = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "awscni_assigned_ip_per_ipv4cidr", + Help: "The total number of IP addresses assigned per cidr", + }, + []string{"cidr"}, + ) prometheusRegistered = false ) @@ -280,6 +293,8 @@ func prometheusRegister() { prometheus.MustRegister(assignedIPs) prometheus.MustRegister(forceRemovedENIs) prometheus.MustRegister(forceRemovedIPs) + prometheus.MustRegister(totalPrefixes) + prometheus.MustRegister(ipsPerCidr) prometheusRegistered = true } } @@ -391,6 +406,9 @@ func (ds *DataStore) ReadBackingStore() error { cidr.IPv4Addresses[allocation.IPv4] = addr ds.assignPodIPv4AddressUnsafe(allocation.IPAMKey, eni, addr) ds.log.Debugf("Recovered %s => %s/%s", allocation.IPAMKey, eni.ID, addr.Address) + //Update prometheus for ips per cidr + //Secondary IP mode will have /32:1 and Prefix mode will have /28: + ipsPerCidr.With(prometheus.Labels{"cidr": cidr.Cidr.String()}).Inc() break eniloop } } @@ -493,6 +511,7 @@ func (ds *DataStore) AddIPv4CidrToStore(eniID string, ipv4Cidr net.IPNet, isPref ds.total += newCidrInfo.Size() if isPrefix { ds.allocatedPrefix++ + totalPrefixes.Set(float64(ds.allocatedPrefix)) } totalIPs.Set(float64(ds.total)) @@ -541,6 +560,7 @@ func (ds *DataStore) DelIPv4CidrFromStore(eniID string, cidr net.IPNet, force bo ds.total -= deletableCidr.Size() if deletableCidr.IsPrefix { ds.allocatedPrefix-- + totalPrefixes.Set(float64(ds.allocatedPrefix)) } totalIPs.Set(float64(ds.total)) delete(curENI.AvailableIPv4Cidrs, strIPv4Cidr) @@ -579,6 +599,9 @@ func (ds *DataStore) AssignPodIPv4Address(ipamKey IPAMKey) (ipv4address string, if availableCidr.IPv4Addresses == nil { availableCidr.IPv4Addresses = make(map[string]*AddressInfo) } + //Update prometheus for ips per cidr + //Secondary IP mode will have /32:1 and Prefix mode will have /28: + ipsPerCidr.With(prometheus.Labels{"cidr": availableCidr.Cidr.String()}).Inc() } else { //This can happen during upgrade or PD enable/disable knob toggle //ENI can have prefixes attached and no space for SIPs or vice versa @@ -601,6 +624,8 @@ func (ds *DataStore) AssignPodIPv4Address(ipamKey IPAMKey) (ipv4address string, ds.unassignPodIPv4AddressUnsafe(addr) //Remove the IP from eni DB delete(availableCidr.IPv4Addresses, addr.Address) + //Update prometheus for ips per cidr + ipsPerCidr.With(prometheus.Labels{"cidr": availableCidr.Cidr.String()}).Dec() return "", -1, err } return addr.Address, eni.DeviceNumber, nil @@ -857,6 +882,7 @@ func (ds *DataStore) RemoveUnusedENIFromStore(warmIPTarget, minimumIPTarget, war ds.total -= availableCidr.Size() if availableCidr.IsPrefix { ds.allocatedPrefix-- + totalPrefixes.Set(float64(ds.allocatedPrefix)) } } ds.log.Infof("RemoveUnusedENIFromStore %s: IP/Prefix address pool stats: free %d addresses, total: %d, assigned: %d, total prefixes: %d", @@ -964,6 +990,8 @@ func (ds *DataStore) UnassignPodIPv4Address(ipamKey IPAMKey) (e *ENI, ip string, ds.log.Infof("Prefix delegation is enabled and the IP is from secondary pool hence no need to update prefix pool") ds.total-- } + //Update prometheus for ips per cidr + ipsPerCidr.With(prometheus.Labels{"cidr": availableCidr.Cidr.String()}).Dec() ds.log.Infof("UnassignPodIPv4Address: sandbox %s's ipAddr %s, DeviceNumber %d", ipamKey, addr.Address, eni.DeviceNumber) diff --git a/pkg/ipamd/ipamd.go b/pkg/ipamd/ipamd.go index 8d8da5b8ee..a1ee87a523 100644 --- a/pkg/ipamd/ipamd.go +++ b/pkg/ipamd/ipamd.go @@ -169,7 +169,7 @@ var ( reconcileCnt = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "awscni_reconcile_count", - Help: "The number of times ipamd reconciles on ENIs and IP addresses", + Help: "The number of times ipamd reconciles on ENIs and IP/Prefix addresses", }, []string{"fn"}, ) diff --git a/scripts/dockerfiles/Dockerfile.metrics b/scripts/dockerfiles/Dockerfile.metrics index f334797b44..6b41c78d33 100644 --- a/scripts/dockerfiles/Dockerfile.metrics +++ b/scripts/dockerfiles/Dockerfile.metrics @@ -11,6 +11,7 @@ ENV GOPROXY=direct # Copy modules in before the rest of the source to only expire cache on module # changes: COPY go.mod go.sum ./ +COPY vendor/ vendor/ RUN go mod download COPY . ./