From e3942984fbace082e0f10ca7b0959aaf9a145c4a Mon Sep 17 00:00:00 2001 From: Chinmay Gadgil Date: Thu, 9 Dec 2021 14:11:55 -0800 Subject: [PATCH] merge conflicts with cni-metrics-helper chart Fix compilation errors (#1751) add support for running canary script in different regions (#1752) Regenerate pod eni values for new instance types (#1754) * Regenerate pod eni values for new instance types Co-authored-by: Senthil Kumaran Closed issue message (#1761) * closed issue message * update message fix typo in upload script (#1763) Update calico file path Use an unique s3 bucket name (#1760) Update region Workflow to build arm and x86 images (#1764) DataStore.GetStats() refactoring to simplify adding new fields (#1704) * DataStore.GetStats() refactoring to simplify adding new fields * cleanup * cleanup * cleanup * goimports * rename test to TestGetStatsV4 * address comments * fix typo * update * update "IP pool is too low" logging * GetStats() -> GetIpStats() * GetStats() -> GetIpStats() in tests and comments * update test * cleanup test * add logPoolStats comment Fix KOPS_STATE_STORE (#1770) Automation script for running IT (#1759) Update issue template Update issue template with email address Update issue template Update go.mod for integration folder (#1741) * Update go.mod for integration folder - Update go.mod for integration folder * Change integration test to use new K8s test framework * Modify server pod image * Switch to Nginx port 80 for server pod * Switch server port in client test * Remove custom command directive for Nginx pod * Added ping command for host checks README: mention arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy (#1768) Co-authored-by: Shreya027 Add dl1.24xlarge to ENILimits override list (#1777) Chart and Manifest updates (#1771) * Chart and Manifest updates * Update probe timeout values Change workflow to use git install (#1785) - Change workflow to use git install as the go get command was altering go.mod file without updating go.sum file Add HostNetworking Test for PPSG in test agent (#1720) * Add HostNetworking Test for PPSG in test agent * Updated PPSG test to validate vlan.eth link --- .../templates/clusterrole.yaml | 30 +------ charts/cni-metrics-helper/values.yaml | 1 + cmd/cni-metrics-helper/README.md | 88 +++++++++++++++++++ cmd/cni-metrics-helper/main.go | 16 +++- cmd/cni-metrics-helper/metrics/metrics.go | 10 +-- config/master/cni-metrics-helper-cn.yaml | 30 +------ .../cni-metrics-helper-us-gov-east-1.yaml | 30 +------ .../cni-metrics-helper-us-gov-west-1.yaml | 30 +------ config/master/cni-metrics-helper.yaml | 33 +------ config/master/manifests.jsonnet | 32 +------ pkg/publisher/publisher.go | 36 +++++--- pkg/publisher/publisher_test.go | 12 +++ 12 files changed, 155 insertions(+), 193 deletions(-) diff --git a/charts/cni-metrics-helper/templates/clusterrole.yaml b/charts/cni-metrics-helper/templates/clusterrole.yaml index 9078383b4a..9331637d19 100644 --- a/charts/cni-metrics-helper/templates/clusterrole.yaml +++ b/charts/cni-metrics-helper/templates/clusterrole.yaml @@ -5,34 +5,6 @@ metadata: rules: - apiGroups: [""] resources: - - nodes - pods - pods/proxy - - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - - endpoints - verbs: ["list", "watch", "get"] - - apiGroups: ["extensions"] - resources: - - daemonsets - - deployments - - replicasets - verbs: ["list", "watch"] - - apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - - apiGroups: ["batch"] - resources: - - cronjobs - - jobs - verbs: ["list", "watch"] - - apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] \ No newline at end of file + verbs: ["get", "watch", "list"] \ No newline at end of file diff --git a/charts/cni-metrics-helper/values.yaml b/charts/cni-metrics-helper/values.yaml index ab89878633..fe90724ae1 100644 --- a/charts/cni-metrics-helper/values.yaml +++ b/charts/cni-metrics-helper/values.yaml @@ -12,6 +12,7 @@ image: env: USE_CLOUDWATCH: "true" + AWS_CLUSTER_ID: "" fullnameOverride: "cni-metrics-helper" diff --git a/cmd/cni-metrics-helper/README.md b/cmd/cni-metrics-helper/README.md index 85cd93ce89..14eeca84b7 100644 --- a/cmd/cni-metrics-helper/README.md +++ b/cmd/cni-metrics-helper/README.md @@ -13,6 +13,94 @@ The following diagram shows how `cni-metrics-helper` works in a cluster: ![](../../docs/images/cni-metrics-helper.png) +### Using IRSA +As per [AWS EKS Security Best Practice](https://docs.aws.amazon.com/eks/latest/userguide/best-practices-security.html), if you are using IRSA for pods then following requirements must be satisfied to succesfully publish metrics to CloudWatch + +1. The IAM Role for your SA must have following policy attached + +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "cloudwatch:PutMetricData" + ], + "Resource": "*" + } + ] +} +``` + +2. You should have following ClusterRole and ClusterRoleBinding for the IRSA + +``` +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cni-metrics-helper +rules: + - apiGroups: [""] + resources: + - pods + - pods/proxy + verbs: ["get", "watch", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cni-metrics-helper + labels: + app.kubernetes.io/name: cni-metrics-helper + app.kubernetes.io/instance: cni-metrics-helper + app.kubernetes.io/version: "v1.9.3" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cni-metrics-helper +subjects: + - kind: ServiceAccount + name: + namespace: kube-system +``` + +3. Specify this IRSA in the cni-metrics-helper deployment spec alongwith CLUSTER_ID as the metric dimension + +``` +kind: Deployment +apiVersion: apps/v1 +metadata: + name: cni-metrics-helper + namespace: kube-system + labels: + k8s-app: cni-metrics-helper +spec: + selector: + matchLabels: + k8s-app: cni-metrics-helper + template: + metadata: + labels: + k8s-app: cni-metrics-helper + spec: + containers: + - env: + - name: USE_CLOUDWATCH + value: "true" + - name: CLUSTER_ID + value: "demo-cluster" + name: cni-metrics-helper + image: + serviceAccountName: +``` +With IRSA, the above deployment spec will be auto-injected with AWS_REGION parameter and it will be used to fetch Region information. +Possible Scenarios for above configuration +1. If you are not using IRSA, then Region and CLUSTER_ID will be fetched using IMDS (should have access) +2. If you are using IRSA but have not specified CLUSTER_ID, we can still get this information if IMDS access is not blocked +3. If you have blocked IMDS access, then you must specify a value for CLUSTER_ID (metric dimension) in the deployment spec +4. If you have not blocked IMDS access but have specified CLUSTER_ID value, then it will be used. + ### Installing the cni-metrics-helper ``` kubectl apply -f v1.6/cni-metrics-helper.yaml diff --git a/cmd/cni-metrics-helper/main.go b/cmd/cni-metrics-helper/main.go index 02a234c663..4d4cdc1936 100644 --- a/cmd/cni-metrics-helper/main.go +++ b/cmd/cni-metrics-helper/main.go @@ -80,9 +80,23 @@ func main() { } } + // Fetch region, if using IRSA it be will auto injected as env variable in pod spec + // If not found then it will be empty, in which case we will try to fetch it from IMDS (existing approach) + // This can also mean that Cx is not using IRSA and we shouldn't enforce IRSA requirement + region, _ := os.LookupEnv("AWS_REGION") + + // should be name/identifier for the cluster if specified + clusterID, _ := os.LookupEnv("AWS_CLUSTER_ID") + + log.Infof("Using REGION=%s and CLUSTER_ID=%s", region, clusterID) + log.Infof("Starting CNIMetricsHelper. Sending metrics to CloudWatch: %v, LogLevel %s", options.submitCW, logConfig.LogLevel) clientSet, err := k8sapi.GetKubeClientSet() + if err != nil { + log.Fatalf("Error Fetching Kubernetes Client: %s", err) + os.Exit(1) + } rawK8SClient, err := k8sapi.CreateKubeClient() if err != nil { @@ -98,7 +112,7 @@ func main() { var cw publisher.Publisher if options.submitCW { - cw, err = publisher.New(ctx) + cw, err = publisher.New(ctx, region, clusterID) if err != nil { log.Fatalf("Failed to create publisher: %v", err) } diff --git a/cmd/cni-metrics-helper/metrics/metrics.go b/cmd/cni-metrics-helper/metrics/metrics.go index e57d317e0f..4c923b5697 100644 --- a/cmd/cni-metrics-helper/metrics/metrics.go +++ b/cmd/cni-metrics-helper/metrics/metrics.go @@ -238,11 +238,11 @@ func postProcessingHistogram(convert metricsConvert, log logger.Logger) bool { func processMetric(family *dto.MetricFamily, convert metricsConvert, log logger.Logger) (bool, error) { resetDetected := false - mType := family.GetType() + metricType := family.GetType() for _, metric := range family.GetMetric() { for _, act := range convert.actions { if act.matchFunc(metric) { - switch mType { + switch metricType { case dto.MetricType_GAUGE: processGauge(metric, &act) case dto.MetricType_HISTOGRAM: @@ -256,7 +256,7 @@ func processMetric(family *dto.MetricFamily, convert metricsConvert, log logger. } } - switch mType { + switch metricType { case dto.MetricType_COUNTER: curResetDetected := postProcessingCounter(convert, log) if curResetDetected { @@ -316,9 +316,9 @@ func filterMetrics(originalMetrics map[string]*dto.MetricFamily, func produceCloudWatchMetrics(t metricsTarget, families map[string]*dto.MetricFamily, convertDef map[string]metricsConvert, cw publisher.Publisher) { for key, family := range families { convertMetrics := convertDef[key] - mType := family.GetType() + metricType := family.GetType() for _, action := range convertMetrics.actions { - switch mType { + switch metricType { case dto.MetricType_COUNTER: if t.submitCloudWatch() { dataPoint := &cloudwatch.MetricDatum{ diff --git a/config/master/cni-metrics-helper-cn.yaml b/config/master/cni-metrics-helper-cn.yaml index 0c07079ef3..a1e57bb7f4 100644 --- a/config/master/cni-metrics-helper-cn.yaml +++ b/config/master/cni-metrics-helper-cn.yaml @@ -18,37 +18,9 @@ metadata: rules: - apiGroups: [""] resources: - - nodes - pods - pods/proxy - - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - - endpoints - verbs: ["list", "watch", "get"] - - apiGroups: ["extensions"] - resources: - - daemonsets - - deployments - - replicasets - verbs: ["list", "watch"] - - apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - - apiGroups: ["batch"] - resources: - - cronjobs - - jobs - verbs: ["list", "watch"] - - apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] + verbs: ["get", "watch", "list"] --- # Source: cni-metrics-helper/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 diff --git a/config/master/cni-metrics-helper-us-gov-east-1.yaml b/config/master/cni-metrics-helper-us-gov-east-1.yaml index 91a40ed6ca..4276fd3392 100644 --- a/config/master/cni-metrics-helper-us-gov-east-1.yaml +++ b/config/master/cni-metrics-helper-us-gov-east-1.yaml @@ -18,37 +18,9 @@ metadata: rules: - apiGroups: [""] resources: - - nodes - pods - pods/proxy - - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - - endpoints - verbs: ["list", "watch", "get"] - - apiGroups: ["extensions"] - resources: - - daemonsets - - deployments - - replicasets - verbs: ["list", "watch"] - - apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - - apiGroups: ["batch"] - resources: - - cronjobs - - jobs - verbs: ["list", "watch"] - - apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] + verbs: ["get", "watch", "list"] --- # Source: cni-metrics-helper/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 diff --git a/config/master/cni-metrics-helper-us-gov-west-1.yaml b/config/master/cni-metrics-helper-us-gov-west-1.yaml index 6f592e93ff..3adb81ec1d 100644 --- a/config/master/cni-metrics-helper-us-gov-west-1.yaml +++ b/config/master/cni-metrics-helper-us-gov-west-1.yaml @@ -18,37 +18,9 @@ metadata: rules: - apiGroups: [""] resources: - - nodes - pods - pods/proxy - - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - - endpoints - verbs: ["list", "watch", "get"] - - apiGroups: ["extensions"] - resources: - - daemonsets - - deployments - - replicasets - verbs: ["list", "watch"] - - apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - - apiGroups: ["batch"] - resources: - - cronjobs - - jobs - verbs: ["list", "watch"] - - apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] + verbs: ["get", "watch", "list"] --- # Source: cni-metrics-helper/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 diff --git a/config/master/cni-metrics-helper.yaml b/config/master/cni-metrics-helper.yaml index 18404d7f8d..dc1d8f4916 100644 --- a/config/master/cni-metrics-helper.yaml +++ b/config/master/cni-metrics-helper.yaml @@ -18,37 +18,9 @@ metadata: rules: - apiGroups: [""] resources: - - nodes - pods - pods/proxy - - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - - endpoints - verbs: ["list", "watch", "get"] - - apiGroups: ["extensions"] - resources: - - daemonsets - - deployments - - replicasets - verbs: ["list", "watch"] - - apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - - apiGroups: ["batch"] - resources: - - cronjobs - - jobs - verbs: ["list", "watch"] - - apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] + verbs: ["get", "watch", "list"] --- # Source: cni-metrics-helper/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -89,6 +61,9 @@ spec: - env: - name: USE_CLOUDWATCH value: "true" + # Optional: Should be ClusterName/ClusterIdentifier used as the metric dimension + - name: AWS_CLUSTER_ID + value: "" name: cni-metrics-helper image: "602401143452.dkr.ecr.us-west-2.amazonaws.com/cni-metrics-helper:v1.10.1" serviceAccountName: cni-metrics-helper diff --git a/config/master/manifests.jsonnet b/config/master/manifests.jsonnet index 2eb73cdeab..d531f599f3 100644 --- a/config/master/manifests.jsonnet +++ b/config/master/manifests.jsonnet @@ -297,40 +297,11 @@ local metricsHelper = { { apiGroups: [""], resources: [ - "nodes", "pods", - "pods/proxy", - "services", - "resourcequotas", - "replicationcontrollers", - "limitranges", - "persistentvolumeclaims", - "persistentvolumes", - "namespaces", - "endpoints", + "pods/proxy" ], verbs: ["list", "watch", "get"], }, - { - apiGroups: ["extensions"], - resources: ["daemonsets", "deployments", "replicasets"], - verbs: ["list", "watch"], - }, - { - apiGroups: ["apps"], - resources: ["statefulsets"], - verbs: ["list", "watch"], - }, - { - apiGroups: ["batch"], - resources: ["cronjobs", "jobs"], - verbs: ["list", "watch"], - }, - { - apiGroups: ["autoscaling"], - resources: ["horizontalpodautoscalers"], - verbs: ["list", "watch"], - }, ], }, @@ -390,6 +361,7 @@ local metricsHelper = { name: "cni-metrics-helper", env_:: { USE_CLOUDWATCH: "true", + AWS_CLUSTER_ID: "", }, env: [ {name: kv[0]} + if std.isObject(kv[1]) then kv[1] else {value: kv[1]} diff --git a/pkg/publisher/publisher.go b/pkg/publisher/publisher.go index 09524a46ae..7bf59c4a65 100644 --- a/pkg/publisher/publisher.go +++ b/pkg/publisher/publisher.go @@ -84,22 +84,34 @@ type cloudWatchPublisher struct { lock sync.RWMutex } +// Logic to fetch Region and CLUSTER_ID +// Case 1: Cx not using IRSA, we need to get region and clusterID using IMDS +// Case 2: Cx using IRSA but not specified clusterID, we can still get this info if IMDS is not blocked +// Case 3: Cx blocked IMDS access and not using IRSA (which means region == "") AND +// not specified clusterID then its a Cx error // New returns a new instance of `Publisher` -func New(ctx context.Context) (Publisher, error) { +func New(ctx context.Context, region string, clusterID string) (Publisher, error) { sess := awssession.New() - // Get cluster-ID - ec2Client, err := ec2wrapper.NewMetricsClient() - if err != nil { - return nil, errors.Wrap(err, "publisher: unable to obtain EC2 service client") - } - clusterID := getClusterID(ec2Client) - // Get ec2metadata client - ec2MetadataClient := ec2metadatawrapper.New(sess) + // If Customers have explicitly specified clusterID then skip generating it + if clusterID == "" { + ec2Client, err := ec2wrapper.NewMetricsClient() + if err != nil { + return nil, errors.Wrap(err, "publisher: unable to obtain EC2 service client") + } + + clusterID = getClusterID(ec2Client) + } - region, err := ec2MetadataClient.Region() - if err != nil { - return nil, errors.Wrap(err, "publisher: Unable to obtain region") + // Try to fetch region if not available + if region == "" { + // Get ec2metadata client + ec2MetadataClient := ec2metadatawrapper.New(sess) + val, err := ec2MetadataClient.Region() + if err != nil { + return nil, errors.Wrap(err, "publisher: Unable to obtain region") + } + region = val } // Get AWS session diff --git a/pkg/publisher/publisher_test.go b/pkg/publisher/publisher_test.go index 7e2a95cfb7..d1b13845df 100644 --- a/pkg/publisher/publisher_test.go +++ b/pkg/publisher/publisher_test.go @@ -33,6 +33,18 @@ const ( testMonitorDuration = time.Millisecond * 10 ) +func TestCloudWatchPublisherWithNoIMDS(t *testing.T) { + ctx, cancel := context.WithCancel(context.TODO()) + defer cancel() + + region := "us-west-2" + clusterID := testClusterID + + cw, err := New(ctx, region, clusterID) + assert.NoError(t, err) + assert.NotNil(t, cw) +} + func TestCloudWatchPublisherWithSingleDatum(t *testing.T) { cloudwatchPublisher := getCloudWatchPublisher(t)