From b96ef961b2fc7540f52569981430508d5330369a Mon Sep 17 00:00:00 2001 From: Jeff Nelson Date: Tue, 25 Apr 2023 22:17:36 +0000 Subject: [PATCH] return error from increaseDatastorePool; parameterize k8s version --- pkg/ipamd/ipamd.go | 36 ++++++++++++------- test/framework/options.go | 2 ++ .../resources/aws/utils/nodegroup.go | 12 ++++--- testdata/amazon-eks-nodegroup.yaml | 1 - 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/pkg/ipamd/ipamd.go b/pkg/ipamd/ipamd.go index d9060ca1fb..1e43141493 100644 --- a/pkg/ipamd/ipamd.go +++ b/pkg/ipamd/ipamd.go @@ -599,7 +599,11 @@ func (c *IPAMContext) nodeInit() error { // On node init, check if datastore pool needs to be increased. If so, attach CIDRs from existing ENIs and attach new ENIs. if !c.disableENIProvisioning && c.isDatastorePoolTooLow() { - c.increaseDatastorePool(ctx) + if err := c.increaseDatastorePool(ctx); err != nil { + // Note that the only error currently returned by increaseDatastorePool is an error attaching CIDRs (other than insufficient IPs) + podENIErrInc("nodeInit") + return errors.New("error while trying to increase datastore pool") + } // If custom networking is enabled and the pool is empty, return an error, as there is a misconfiguration and // the node should not become ready. if c.useCustomNetworking && c.isDatastorePoolEmpty() { @@ -800,7 +804,7 @@ func (c *IPAMContext) tryUnassignCidrsFromAll() { } } -func (c *IPAMContext) increaseDatastorePool(ctx context.Context) { +func (c *IPAMContext) increaseDatastorePool(ctx context.Context) error { log.Debug("Starting to increase pool size") ipamdActionsInprogress.WithLabelValues("increaseDatastorePool").Add(float64(1)) defer ipamdActionsInprogress.WithLabelValues("increaseDatastorePool").Sub(float64(1)) @@ -808,40 +812,41 @@ func (c *IPAMContext) increaseDatastorePool(ctx context.Context) { short, _, warmIPTargetDefined := c.datastoreTargetState() if warmIPTargetDefined && short == 0 { log.Debugf("Skipping increase Datastore pool, warm target reached") - return + return nil } if !warmIPTargetDefined { shortPrefix, warmTargetDefined := c.datastorePrefixTargetState() if warmTargetDefined && shortPrefix == 0 { log.Debugf("Skipping increase Datastore pool, warm prefix target reached") - return + return nil } } if c.isTerminating() { log.Debug("AWS CNI is terminating, will not try to attach any new IPs or ENIs right now") - return + return nil } if !c.manageENIsNonScheduleable && c.isNodeNonSchedulable() { log.Debug("AWS CNI is on a non schedulable node, will not try to attach any new IPs or ENIs right now") - return + return nil } // Try to add more Cidrs to existing ENIs first. if c.inInsufficientCidrCoolingPeriod() { log.Debugf("Recently we had InsufficientCidr error hence will wait for %v before retrying", insufficientCidrErrorCooldown) - return + return nil } increasedPool, err := c.tryAssignCidrs() if err != nil { - log.Errorf(err.Error()) if containsInsufficientCIDRsOrSubnetIPs(err) { log.Errorf("Unable to attach IPs/Prefixes for the ENI, subnet doesn't seem to have enough IPs/Prefixes. Consider using new subnet or carve a reserved range using create-subnet-cidr-reservation") c.lastInsufficientCidrError = time.Now() - return + return nil } + log.Errorf(err.Error()) + return err } if increasedPool { c.updateLastNodeIPPoolAction() @@ -851,16 +856,20 @@ func (c *IPAMContext) increaseDatastorePool(ctx context.Context) { if c.enablePodENI && c.dataStore.GetTrunkENI() == "" { reserveSlotForTrunkENI = 1 } - // If we did not add an IP, try to add an ENI instead. + // If we did not add any IPs, try to allocate an ENI. if c.dataStore.GetENIs() < (c.maxENI - c.unmanagedENI - reserveSlotForTrunkENI) { if err = c.tryAllocateENI(ctx); err == nil { c.updateLastNodeIPPoolAction() + } else { + // Note that no error is returned if ENI allocation fails. This is because ENI allocation failure should not cause node to be "NotReady". + log.Debugf("Error trying to allocate ENI: %v", err) } } else { log.Debugf("Skipping ENI allocation as the max ENI limit of %d is already reached (accounting for %d unmanaged ENIs and %d trunk ENIs)", c.maxENI, c.unmanagedENI, reserveSlotForTrunkENI) } } + return nil } func (c *IPAMContext) updateLastNodeIPPoolAction() { @@ -880,7 +889,6 @@ func (c *IPAMContext) tryAllocateENI(ctx context.Context) error { if c.useCustomNetworking { eniCfg, err := eniconfig.MyENIConfig(ctx, c.cachedK8SClient) - if err != nil { log.Errorf("Failed to get pod ENI config") return err @@ -902,7 +910,6 @@ func (c *IPAMContext) tryAllocateENI(ctx context.Context) error { } resourcesToAllocate := c.GetENIResourcesToAllocate() - _, err = c.awsClient.AllocIPAddresses(eni, resourcesToAllocate) if err != nil { log.Warnf("Failed to allocate %d IP addresses on an ENI: %v", resourcesToAllocate, err) @@ -1349,8 +1356,11 @@ func podENIErrInc(fn string) { // nodeIPPoolReconcile reconcile ENI and IP info from metadata service and IP addresses in datastore func (c *IPAMContext) nodeIPPoolReconcile(ctx context.Context, interval time.Duration) { + // To reduce the number of EC2 API calls, skip reconciliation if IPs were recently added to the datastore. timeSinceLast := time.Since(c.lastNodeIPPoolAction) - if timeSinceLast <= interval { + // Make an exception if node needs a trunk ENI and one is not currently attached. + needsTrunkEni := c.enablePodENI && c.dataStore.GetTrunkENI() == "" + if timeSinceLast <= interval && !needsTrunkEni { return } diff --git a/test/framework/options.go b/test/framework/options.go index 0449a8bec2..d0cbfdc424 100644 --- a/test/framework/options.go +++ b/test/framework/options.go @@ -46,6 +46,7 @@ type Options struct { PrivateSubnets string AvailabilityZones string PublicRouteTableID string + NgK8SVersion string } func (options *Options) BindFlags() { @@ -68,6 +69,7 @@ func (options *Options) BindFlags() { flag.StringVar(&options.PrivateSubnets, "private-subnets", "", "Comma separated list of private subnets (optional, if specified you must specify all of public/private-subnets, public-route-table-id, and availability-zones)") flag.StringVar(&options.AvailabilityZones, "availability-zones", "", "Comma separated list of private subnets (optional, if specified you must specify all of public/private-subnets, public-route-table-id, and availability-zones)") flag.StringVar(&options.PublicRouteTableID, "public-route-table-id", "", "Public route table ID (optional, if specified you must specify all of public/private-subnets, public-route-table-id, and availability-zones)") + flag.StringVar(&options.NgK8SVersion, "ng-kubernetes-version", "1.25", `Kubernetes version for self-managed node groups (optional, default is "1.25")`) } func (options *Options) Validate() error { diff --git a/test/framework/resources/aws/utils/nodegroup.go b/test/framework/resources/aws/utils/nodegroup.go index 26330f65c6..9847621e9a 100644 --- a/test/framework/resources/aws/utils/nodegroup.go +++ b/test/framework/resources/aws/utils/nodegroup.go @@ -29,11 +29,11 @@ import ( "github.com/aws/amazon-vpc-cni-k8s/test/framework/utils" ) -const CreateNodeGroupCFNTemplate = "/testdata/amazon-eks-nodegroup.yaml" - -// Docker will be default, if not specified const ( - CONTAINERD = "containerd" + // Docker will be default, if not specified + CONTAINERD = "containerd" + CreateNodeGroupCFNTemplate = "/testdata/amazon-eks-nodegroup.yaml" + NodeImageIdSSMParam = "/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id" ) type NodeGroupProperties struct { @@ -124,6 +124,10 @@ func CreateAndWaitTillSelfManagedNGReady(f *framework.Framework, properties Node ParameterKey: aws.String("NodeGroupName"), ParameterValue: aws.String(properties.NodeGroupName), }, + { + ParameterKey: aws.String("NodeImageIdSSMParam"), + ParameterValue: aws.String(fmt.Sprintf(NodeImageIdSSMParam, f.Options.NgK8SVersion)), + }, { ParameterKey: aws.String("NodeAutoScalingGroupMinSize"), ParameterValue: aws.String(asgSizeString), diff --git a/testdata/amazon-eks-nodegroup.yaml b/testdata/amazon-eks-nodegroup.yaml index dad122d5fb..1bcee36eb8 100644 --- a/testdata/amazon-eks-nodegroup.yaml +++ b/testdata/amazon-eks-nodegroup.yaml @@ -74,7 +74,6 @@ Parameters: NodeImageIdSSMParam: Type: "AWS::SSM::Parameter::Value" - Default: /aws/service/eks/optimized-ami/1.24/amazon-linux-2/recommended/image_id Description: AWS Systems Manager Parameter Store parameter of the AMI ID for the worker node instances. Change this value to match the version of Kubernetes you are using. DisableIMDSv1: