From 3d3838220b044e57f31cda4fc04108234f337d1d Mon Sep 17 00:00:00 2001 From: Justin Miron Date: Thu, 20 Mar 2025 16:43:55 -0500 Subject: [PATCH] add ImageId to the set of additional cloud provider labels The ImageId is per-node metadata that allows cluster owners to: 1. Identify the set of image ids running a cluster 2. Track rollouts of images through a cluster 3. Quickly identify bad image builds being rolled out to cluster The aws cloud-provider already exposes similar instance-level metadata such as instance type, topology information. This introduces a new label applied to each node in a cluster: `k8s.aws/image-id=`. It does not require any additional API calls as the describe instance call already has the ImageId. --- pkg/providers/v1/aws.go | 4 ++++ pkg/providers/v1/aws_test.go | 9 +++++---- pkg/providers/v1/instances_v2.go | 11 +++++++++-- pkg/providers/v1/instances_v2_test.go | 15 +++++++++------ pkg/providers/v1/well_known_labels.go | 2 ++ 5 files changed, 29 insertions(+), 12 deletions(-) diff --git a/pkg/providers/v1/aws.go b/pkg/providers/v1/aws.go index 5287938ad8..7be6b2a44c 100644 --- a/pkg/providers/v1/aws.go +++ b/pkg/providers/v1/aws.go @@ -1016,6 +1016,10 @@ func (c *Cloud) InstanceType(ctx context.Context, nodeName types.NodeName) (stri return aws.StringValue(inst.InstanceType), nil } +func (c *Cloud) getImageID(instance *ec2.Instance) string { + return aws.StringValue(instance.ImageId) +} + // GetZone implements Zones.GetZone func (c *Cloud) GetZone(ctx context.Context) (cloudprovider.Zone, error) { return cloudprovider.Zone{ diff --git a/pkg/providers/v1/aws_test.go b/pkg/providers/v1/aws_test.go index 428690f0fb..6793f7ff31 100644 --- a/pkg/providers/v1/aws_test.go +++ b/pkg/providers/v1/aws_test.go @@ -554,10 +554,10 @@ func testHasNodeAddress(t *testing.T, addrs []v1.NodeAddress, addressType v1.Nod } func makeMinimalInstance(instanceID string) ec2.Instance { - return makeInstance(instanceID, "", "", "", "", nil, false) + return makeInstance(instanceID, "", "", "", "", nil, false, "") } -func makeInstance(instanceID string, privateIP, publicIP, privateDNSName, publicDNSName string, ipv6s []string, setNetInterface bool) ec2.Instance { +func makeInstance(instanceID string, privateIP, publicIP, privateDNSName, publicDNSName string, ipv6s []string, setNetInterface bool, imageID string) ec2.Instance { var tag ec2.Tag tag.Key = aws.String(TagNameKubernetesClusterLegacy) tag.Value = aws.String(TestClusterID) @@ -570,6 +570,7 @@ func makeInstance(instanceID string, privateIP, publicIP, privateDNSName, public PublicDnsName: aws.String(publicDNSName), PublicIpAddress: aws.String(publicIP), InstanceType: aws.String("c3.large"), + ImageId: aws.String(imageID), Tags: tags, Placement: &ec2.Placement{AvailabilityZone: aws.String("us-west-2a")}, State: &ec2.InstanceState{ @@ -651,7 +652,7 @@ func TestNodeAddressesByProviderID(t *testing.T) { }, } { t.Run(tc.Name, func(t *testing.T) { - instance := makeInstance(tc.InstanceID, tc.PrivateIP, tc.PublicIP, tc.PrivateDNSName, tc.PublicDNSName, tc.Ipv6s, tc.SetNetInterface) + instance := makeInstance(tc.InstanceID, tc.PrivateIP, tc.PublicIP, tc.PrivateDNSName, tc.PublicDNSName, tc.Ipv6s, tc.SetNetInterface, "") aws1, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance}) _, err := aws1.NodeAddressesByProviderID(context.TODO(), "i-xxx") if err == nil { @@ -757,7 +758,7 @@ func TestNodeAddresses(t *testing.T) { }, } { t.Run(tc.Name, func(t *testing.T) { - instance := makeInstance(tc.InstanceID, tc.PrivateIP, tc.PublicIP, tc.PrivateDNSName, tc.PublicDNSName, tc.Ipv6s, tc.SetNetInterface) + instance := makeInstance(tc.InstanceID, tc.PrivateIP, tc.PublicIP, tc.PrivateDNSName, tc.PublicDNSName, tc.Ipv6s, tc.SetNetInterface, "") aws1, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance}) _, err := aws1.NodeAddresses(context.TODO(), "instance-mismatch.ec2.internal") if err == nil { diff --git a/pkg/providers/v1/instances_v2.go b/pkg/providers/v1/instances_v2.go index 53957898e2..a3a8fd13a6 100644 --- a/pkg/providers/v1/instances_v2.go +++ b/pkg/providers/v1/instances_v2.go @@ -68,7 +68,7 @@ func (c *Cloud) InstanceShutdown(ctx context.Context, node *v1.Node) (bool, erro } func (c *Cloud) getAdditionalLabels(ctx context.Context, zoneName string, instanceID string, instanceType string, - region string, existingLabels map[string]string) (map[string]string, error) { + region string, imageID string, existingLabels map[string]string) (map[string]string, error) { additionalLabels := map[string]string{} // If zone ID label is already set, skip. @@ -82,6 +82,11 @@ func (c *Cloud) getAdditionalLabels(ctx context.Context, zoneName string, instan additionalLabels[LabelZoneID] = zoneID } + // If image id label is already set, skip. + if _, ok := existingLabels[LabelImageID]; !ok { + additionalLabels[LabelImageID] = imageID + } + // If topology labels are already set, skip. if _, ok := existingLabels[LabelNetworkNodePrefix+"1"]; !ok { nodeTopology, err := c.instanceTopologyManager.GetNodeTopology(ctx, instanceType, region, instanceID) @@ -127,6 +132,7 @@ func (c *Cloud) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloudprov var ( instanceType string + imageID string zone cloudprovider.Zone nodeAddresses []v1.NodeAddress ) @@ -151,6 +157,7 @@ func (c *Cloud) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloudprov return nil, fmt.Errorf("failed to get instance by ID %s: %w", instanceID, err) } + imageID = c.getImageID(instance) instanceType = c.getInstanceType(instance) zone = c.getInstanceZone(instance) nodeAddresses, err = c.getInstanceNodeAddress(instance) @@ -159,7 +166,7 @@ func (c *Cloud) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloudprov } } - additionalLabels, err := c.getAdditionalLabels(ctx, zone.FailureDomain, string(instanceID), instanceType, zone.Region, node.Labels) + additionalLabels, err := c.getAdditionalLabels(ctx, zone.FailureDomain, string(instanceID), instanceType, zone.Region, imageID, node.Labels) if err != nil { return nil, err } diff --git a/pkg/providers/v1/instances_v2_test.go b/pkg/providers/v1/instances_v2_test.go index bfd3e08bcd..00cc9b5654 100644 --- a/pkg/providers/v1/instances_v2_test.go +++ b/pkg/providers/v1/instances_v2_test.go @@ -167,7 +167,7 @@ func TestInstanceShutdown(t *testing.T) { func TestInstanceMetadata(t *testing.T) { t.Run("Should return populated InstanceMetadata", func(t *testing.T) { - instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true) + instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true, "ami-0000") c, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance}) var mockedTopologyManager resourcemanagers.MockedInstanceTopologyManager c.instanceTopologyManager = &mockedTopologyManager @@ -207,11 +207,12 @@ func TestInstanceMetadata(t *testing.T) { LabelNetworkNodePrefix + "1": "nn-123456789", LabelNetworkNodePrefix + "2": "nn-234567890", LabelNetworkNodePrefix + "3": "nn-345678901", + LabelImageID: "ami-0000", }, result.AdditionalLabels) }) t.Run("Should skip additional labels if already set", func(t *testing.T) { - instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true) + instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true, "ami-0000") c, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance}) var mockedTopologyManager resourcemanagers.MockedInstanceTopologyManager c.instanceTopologyManager = &mockedTopologyManager @@ -226,6 +227,7 @@ func TestInstanceMetadata(t *testing.T) { LabelNetworkNodePrefix + "1": "nn-123456789", LabelNetworkNodePrefix + "2": "nn-234567890", LabelNetworkNodePrefix + "3": "nn-345678901", + LabelImageID: "ami-0000", } result, err := c.InstanceMetadata(context.TODO(), node) @@ -239,7 +241,7 @@ func TestInstanceMetadata(t *testing.T) { }) t.Run("Should swallow errors if getting node topology fails if instance type not expected to be supported", func(t *testing.T) { - instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true) + instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true, "ami-0000") c, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance}) var mockedTopologyManager resourcemanagers.MockedInstanceTopologyManager c.instanceTopologyManager = &mockedTopologyManager @@ -259,12 +261,13 @@ func TestInstanceMetadata(t *testing.T) { mockedTopologyManager.AssertNumberOfCalls(t, "GetNodeTopology", 1) assert.Equal(t, map[string]string{ - LabelZoneID: "az1", + LabelZoneID: "az1", + LabelImageID: "ami-0000", }, result.AdditionalLabels) }) t.Run("Should not swallow errors if getting node topology fails if instance type is expected to be supported", func(t *testing.T) { - instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true) + instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true, "ami-0000") c, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance}) var mockedTopologyManager resourcemanagers.MockedInstanceTopologyManager c.instanceTopologyManager = &mockedTopologyManager @@ -286,7 +289,7 @@ func TestInstanceMetadata(t *testing.T) { }) t.Run("Should limit ec2:DescribeInstances calls to a single request per instance", func(t *testing.T) { - instance := makeInstance("i-00000000000001234", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true) + instance := makeInstance("i-00000000000001234", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true, "ami-0000") c, awsServices := mockInstancesResp(&instance, []*ec2.Instance{&instance}) node := &v1.Node{ Spec: v1.NodeSpec{ diff --git a/pkg/providers/v1/well_known_labels.go b/pkg/providers/v1/well_known_labels.go index be9179c957..a8420f6f21 100644 --- a/pkg/providers/v1/well_known_labels.go +++ b/pkg/providers/v1/well_known_labels.go @@ -24,4 +24,6 @@ const ( // but will be initially applied to nodes. The suffix should be an incremented // integer starting at 1. LabelNetworkNodePrefix = "topology.k8s.aws/network-node-layer-" + // LabelImageID is a machine image label that can be applied to node resources. + LabelImageID = "k8s.aws/image-id" )