Skip to content

Commit

Permalink
fix: paginate DescribeNetworkInterfaces with deep filters (#375)
Browse files Browse the repository at this point in the history
* fix: paginate DescribeNetworkInterfaces with deep filters

* update metrics and address review comments

* minor updates to address comments
  • Loading branch information
sushrk authored Feb 29, 2024
1 parent c10421f commit b5699de
Show file tree
Hide file tree
Showing 11 changed files with 193 additions and 132 deletions.
8 changes: 8 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ func main() {
var healthCheckTimeout int
var enableWindowsPrefixDelegation bool
var region string
var vpcID string

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080",
"The address the metric endpoint binds to.")
Expand Down Expand Up @@ -141,6 +142,7 @@ func main() {
flag.BoolVar(&enableWindowsPrefixDelegation, "enable-windows-prefix-delegation", false,
"Enable the feature flag for Windows prefix delegation")
flag.StringVar(&region, "aws-region", "", "The aws region of the k8s cluster")
flag.StringVar(&vpcID, "vpc-id", "", "The vpc-id where EKS cluster is deployed")

flag.Parse()

Expand Down Expand Up @@ -183,6 +185,11 @@ func main() {
os.Exit(1)
}

if vpcID == "" {
setupLog.Error(fmt.Errorf("vpc-id is a required parameter"), "unable to start the controller")
os.Exit(1)
}

// Profiler disabled by default, to enable set the enableProfiling argument
if enableProfiling {
// To use the profiler - https://golang.org/pkg/net/http/pprof/
Expand Down Expand Up @@ -336,6 +343,7 @@ func main() {
EC2Wrapper: ec2Wrapper,
ClusterName: clusterName,
Log: ctrl.Log.WithName("eni cleaner"),
VPCID: vpcID,
}).SetupWithManager(ctx, mgr, healthzHandler); err != nil {
setupLog.Error(err, "unable to start eni cleaner")
os.Exit(1)
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

116 changes: 64 additions & 52 deletions pkg/aws/ec2/api/eni_cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ type ENICleaner struct {
EC2Wrapper EC2Wrapper
ClusterName string
Log logr.Logger
VPCID string

availableENIs map[string]struct{}
shutdown bool
Expand All @@ -42,16 +43,22 @@ type ENICleaner struct {
}

var (
vpcCniLeakedENICleanupCnt = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "vpc_cni_created_leaked_eni_cleanup_count",
Help: "The number of leaked ENIs created by VPC-CNI that is cleaned up by the controller",
vpccniAvailableENICnt = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "vpc_cni_created_available_eni_count",
Help: "The number of available ENIs created by VPC-CNI that controller will try to delete in each cleanup cycle",
},
)
vpcrcLeakedENICleanupCnt = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "vpc_rc_created_leaked_eni_cleanup_count",
Help: "The number of leaked ENIs created by VPC-RC that is cleaned up by the controller",
vpcrcAvailableENICnt = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "vpc_rc_created_available_eni_count",
Help: "The number of available ENIs created by VPC-RC that controller will try to delete in each cleanup cycle",
},
)
leakedENICnt = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "leaked_eni_count",
Help: "The number of available ENIs that failed to be deleted by the controller in each cleanup cycle",
},
)
)
Expand Down Expand Up @@ -101,6 +108,9 @@ func (e *ENICleaner) Start(ctx context.Context) error {
// interval between cycle 1 and 2 and hence can be safely deleted. And we can also conclude that Interface 1 was
// created but not attached at the the time when 1st cycle ran and hence it should not be deleted.
func (e *ENICleaner) cleanUpAvailableENIs() {
vpcrcAvailableCount := 0
vpccniAvailableCount := 0
leakedENICount := 0
describeNetworkInterfaceIp := &ec2.DescribeNetworkInterfacesInput{
Filters: []*ec2.Filter{
{
Expand All @@ -116,63 +126,65 @@ func (e *ENICleaner) cleanUpAvailableENIs() {
Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue,
config.NetworkInterfaceOwnerVPCCNITagValue}),
},
{
Name: aws.String("vpc-id"),
Values: []*string{aws.String(e.VPCID)},
},
},
}

availableENIs := make(map[string]struct{})

for {
describeNetworkInterfaceOp, err := e.EC2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfaceIp)
if err != nil {
e.Log.Error(err, "failed to describe network interfaces, will retry")
return
}

for _, networkInterface := range describeNetworkInterfaceOp.NetworkInterfaces {
if _, exists := e.availableENIs[*networkInterface.NetworkInterfaceId]; exists {
// Increment promethues metrics for number of leaked ENIs cleaned up
if tagIdx := slices.IndexFunc(networkInterface.TagSet, func(tag *ec2.Tag) bool {
return *tag.Key == config.NetworkInterfaceOwnerTagKey
}); tagIdx != -1 {
switch *networkInterface.TagSet[tagIdx].Value {
case config.NetworkInterfaceOwnerTagValue:
vpcrcLeakedENICleanupCnt.Inc()
case config.NetworkInterfaceOwnerVPCCNITagValue:
vpcCniLeakedENICleanupCnt.Inc()
default:
// We will not hit this case as we only filter for above two tag values, adding it for any future use cases
e.Log.Info("found available ENI not created by VPC-CNI/VPC-RC")
}
}
networkInterfaces, err := e.EC2Wrapper.DescribeNetworkInterfacesPages(describeNetworkInterfaceIp)
if err != nil {
e.Log.Error(err, "failed to describe network interfaces, cleanup will be retried in next cycle")
return
}

// The ENI in available state has been sitting for at least the eni clean up interval and it should
// be removed
_, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{
NetworkInterfaceId: networkInterface.NetworkInterfaceId,
})
if err != nil {
// Log and continue, if the ENI is still present it will be cleaned up in next 2 cycles
e.Log.Error(err, "failed to delete the dangling network interface",
"id", *networkInterface.NetworkInterfaceId)
for _, networkInterface := range networkInterfaces {
if _, exists := e.availableENIs[*networkInterface.NetworkInterfaceId]; exists {
// Increment promethues metrics for number of leaked ENIs cleaned up
if tagIdx := slices.IndexFunc(networkInterface.TagSet, func(tag *ec2.Tag) bool {
return *tag.Key == config.NetworkInterfaceOwnerTagKey
}); tagIdx != -1 {
switch *networkInterface.TagSet[tagIdx].Value {
case config.NetworkInterfaceOwnerTagValue:
vpcrcAvailableCount += 1
case config.NetworkInterfaceOwnerVPCCNITagValue:
vpccniAvailableCount += 1
default:
// We should not hit this case as we only filter for relevant tag values, log error and continue if unexpected ENIs found
e.Log.Error(fmt.Errorf("found available ENI not created by VPC-CNI/VPC-RC"), "eniID", *networkInterface.NetworkInterfaceId)
continue
}
e.Log.Info("deleted dangling ENI successfully",
"eni id", networkInterface.NetworkInterfaceId)
} else {
// Seeing the ENI for the first time, add it to the new list of available network interfaces
availableENIs[*networkInterface.NetworkInterfaceId] = struct{}{}
e.Log.V(1).Info("adding eni to to the map of available ENIs, will be removed if present in "+
"next run too", "id", *networkInterface.NetworkInterfaceId)
}
}

if describeNetworkInterfaceOp.NextToken == nil {
break
// The ENI in available state has been sitting for at least the eni clean up interval and it should
// be removed
_, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{
NetworkInterfaceId: networkInterface.NetworkInterfaceId,
})
if err != nil {
leakedENICount += 1
// Log and continue, if the ENI is still present it will be cleaned up in next 2 cycles
e.Log.Error(err, "failed to delete the dangling network interface",
"id", *networkInterface.NetworkInterfaceId)
continue
}
e.Log.Info("deleted dangling ENI successfully",
"eni id", networkInterface.NetworkInterfaceId)
} else {
// Seeing the ENI for the first time, add it to the new list of available network interfaces
availableENIs[*networkInterface.NetworkInterfaceId] = struct{}{}
e.Log.V(1).Info("adding eni to to the map of available ENIs, will be removed if present in "+
"next run too", "id", *networkInterface.NetworkInterfaceId)
}

describeNetworkInterfaceIp.NextToken = describeNetworkInterfaceOp.NextToken
}

// Update leaked ENI metrics
vpcrcAvailableENICnt.Set(float64(vpcrcAvailableCount))
vpccniAvailableENICnt.Set(float64(vpccniAvailableCount))
leakedENICnt.Set(float64(leakedENICount))
// Set the available ENIs to the list of ENIs seen in the current cycle
e.availableENIs = availableENIs
}
27 changes: 15 additions & 12 deletions pkg/aws/ec2/api/eni_cleanup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ var (
mockNetworkInterfaceId2 = "eni-000000000000001"
mockNetworkInterfaceId3 = "eni-000000000000002"

mockVPCID = "vpc-0000000000000000"

mockDescribeNetworkInterfaceIp = &ec2.DescribeNetworkInterfacesInput{
Filters: []*ec2.Filter{
{
Expand All @@ -52,19 +54,19 @@ var (
Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue,
config.NetworkInterfaceOwnerVPCCNITagValue}),
},
{
Name: aws.String("vpc-id"),
Values: []*string{aws.String(mockVPCID)},
},
},
}
mockDescribeInterfaceOpWith1And2 = &ec2.DescribeNetworkInterfacesOutput{
NetworkInterfaces: []*ec2.NetworkInterface{
{NetworkInterfaceId: &mockNetworkInterfaceId1},
{NetworkInterfaceId: &mockNetworkInterfaceId2},
},
mockDescribeInterfaceOpWith1And2 = []*ec2.NetworkInterface{
{NetworkInterfaceId: &mockNetworkInterfaceId1},
{NetworkInterfaceId: &mockNetworkInterfaceId2},
}
mockDescribeInterfaceOpWith1And3 = &ec2.DescribeNetworkInterfacesOutput{
NetworkInterfaces: []*ec2.NetworkInterface{
{NetworkInterfaceId: &mockNetworkInterfaceId1},
{NetworkInterfaceId: &mockNetworkInterfaceId3},
},
mockDescribeInterfaceOpWith1And3 = []*ec2.NetworkInterface{
{NetworkInterfaceId: &mockNetworkInterfaceId1},
{NetworkInterfaceId: &mockNetworkInterfaceId3},
}
)

Expand All @@ -74,6 +76,7 @@ func getMockENICleaner(ctrl *gomock.Controller) (*ENICleaner, *mock_api.MockEC2W
EC2Wrapper: mockEC2Wrapper,
availableENIs: map[string]struct{}{},
Log: zap.New(zap.UseDevMode(true)),
VPCID: mockVPCID,
clusterNameTagKey: mockClusterNameTagKey,
ctx: context.Background(),
}, mockEC2Wrapper
Expand All @@ -85,10 +88,10 @@ func TestENICleaner_cleanUpAvailableENIs(t *testing.T) {

gomock.InOrder(
// Return network interface 1 and 2 in first cycle
mockWrapper.EXPECT().DescribeNetworkInterfaces(mockDescribeNetworkInterfaceIp).
mockWrapper.EXPECT().DescribeNetworkInterfacesPages(mockDescribeNetworkInterfaceIp).
Return(mockDescribeInterfaceOpWith1And2, nil),
// Return network interface 1 and 3 in the second cycle
mockWrapper.EXPECT().DescribeNetworkInterfaces(mockDescribeNetworkInterfaceIp).
mockWrapper.EXPECT().DescribeNetworkInterfacesPages(mockDescribeNetworkInterfaceIp).
Return(mockDescribeInterfaceOpWith1And3, nil),
// Expect to delete the network interface 1
mockWrapper.EXPECT().DeleteNetworkInterface(
Expand Down
49 changes: 13 additions & 36 deletions pkg/aws/ec2/api/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ type EC2APIHelper interface {
ipResourceCount *config.IPResourceCount, interfaceType *string) (*ec2.NetworkInterface, error)
DeleteNetworkInterface(interfaceId *string) error
GetSubnet(subnetId *string) (*ec2.Subnet, error)
GetBranchNetworkInterface(trunkID *string) ([]*ec2.NetworkInterface, error)
GetBranchNetworkInterface(trunkID *string, subnetID *string) ([]*ec2.NetworkInterface, error)
GetInstanceNetworkInterface(instanceId *string) ([]*ec2.InstanceNetworkInterface, error)
DescribeNetworkInterfaces(nwInterfaceIds []*string) ([]*ec2.NetworkInterface, error)
DescribeTrunkInterfaceAssociation(trunkInterfaceId *string) ([]*ec2.TrunkInterfaceAssociation, error)
Expand Down Expand Up @@ -562,43 +562,20 @@ func (h *ec2APIHelper) UnassignIPv4Resources(eniID string, resourceType config.R
return err
}

func (h *ec2APIHelper) GetBranchNetworkInterface(trunkID *string) ([]*ec2.NetworkInterface, error) {
filters := []*ec2.Filter{{
Name: aws.String("tag:" + config.TrunkENIIDTag),
Values: []*string{trunkID},
}}

describeNetworkInterfacesInput := &ec2.DescribeNetworkInterfacesInput{Filters: filters}
var nwInterfaces []*ec2.NetworkInterface
for {
describeNetworkInterfaceOutput, err := h.ec2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfacesInput)
if err != nil {
return nil, err
}

if describeNetworkInterfaceOutput == nil || describeNetworkInterfaceOutput.NetworkInterfaces == nil ||
len(describeNetworkInterfaceOutput.NetworkInterfaces) == 0 {
// No more interface associated with the trunk, return the result
break
}

// One or more interface associated with the trunk, return the result
for _, nwInterface := range describeNetworkInterfaceOutput.NetworkInterfaces {
// Only attach the required details to avoid consuming extra memory
nwInterfaces = append(nwInterfaces, &ec2.NetworkInterface{
NetworkInterfaceId: nwInterface.NetworkInterfaceId,
TagSet: nwInterface.TagSet,
})
}

if describeNetworkInterfaceOutput.NextToken == nil {
break
}

describeNetworkInterfacesInput.NextToken = describeNetworkInterfaceOutput.NextToken
func (h *ec2APIHelper) GetBranchNetworkInterface(trunkID *string, subnetID *string) ([]*ec2.NetworkInterface, error) {
filters := []*ec2.Filter{
{
Name: aws.String("tag:" + config.TrunkENIIDTag),
Values: []*string{trunkID},
},
{
Name: aws.String("subnet-id"),
Values: []*string{subnetID},
},
}

return nwInterfaces, nil
describeNetworkInterfacesInput := &ec2.DescribeNetworkInterfacesInput{Filters: filters}
return h.ec2Wrapper.DescribeNetworkInterfacesPages(describeNetworkInterfacesInput)
}

// DetachAndDeleteNetworkInterface detaches the network interface first and then deletes it
Expand Down
Loading

0 comments on commit b5699de

Please sign in to comment.