diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index 3283404eb..017de5af8 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -217,8 +217,7 @@ var testTypeToTestConfig = map[string][]testConfig{ {testDir: "./test/fluent", terraformDir: "terraform/eks/daemon/fluent/windows/2022"}, { testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu", - targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, - instanceType: "g4dn.xlarge", + targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, }, }, "eks_deployment": { diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf index 658b78269..86026fb72 100644 --- a/terraform/eks/daemon/gpu/main.tf +++ b/terraform/eks/daemon/gpu/main.tf @@ -2,13 +2,13 @@ // SPDX-License-Identifier: MIT module "common" { - source = "../../../common" + source = "../common" cwagent_image_repo = var.cwagent_image_repo cwagent_image_tag = var.cwagent_image_tag } module "basic_components" { - source = "../../../basic_components" + source = "../basic_components" region = var.region } diff --git a/terraform/eks/daemon/gpu/variables.tf b/terraform/eks/daemon/gpu/variables.tf index 26a0e6cd0..15602011e 100644 --- a/terraform/eks/daemon/gpu/variables.tf +++ b/terraform/eks/daemon/gpu/variables.tf @@ -8,7 +8,7 @@ variable "region" { variable "test_dir" { type = string - default = "./test/gpu" + default = "../test/gpu" } variable "cwagent_image_repo" { diff --git a/terraform/gpu/main.tf b/terraform/gpu/main.tf new file mode 100644 index 000000000..92db23eb8 --- /dev/null +++ b/terraform/gpu/main.tf @@ -0,0 +1,146 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +module "common" { + source = "../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../basic_components" + + region = var.region +} + + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +locals { + role_arn = format("%s%s", module.basic_components.role_arn, var.beta ? "-eks-beta" : "") + aws_eks = format("%s%s", "aws eks --region ${var.region}", var.beta ? " --endpoint ${var.beta_endpoint}" : "") +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-operator-eks-integ-${module.common.testing_id}" + role_arn = local.role_arn + version = var.k8s_version + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups (AMI type and instance type are taken from var.ami_type / var.instance_type) +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-operator-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = var.ami_type + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = [var.instance_type] + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}" + + assume_role_policy = < pods.txt + kubectl describe pods --all-namespaces > pods_describe.txt + + # Log the contents of the files + cat pods.txt + cat pods_describe.txt + EOT + } +} + diff --git a/terraform/gpu/providers.tf b/terraform/gpu/providers.tf new 
file mode 100644 index 000000000..205375027 --- /dev/null +++ b/terraform/gpu/providers.tf @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +provider "aws" { + region = var.region + endpoints { + eks = var.beta ? var.beta_endpoint : null + } +} + +provider "kubernetes" { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token +} \ No newline at end of file diff --git a/terraform/gpu/variables.tf b/terraform/gpu/variables.tf new file mode 100644 index 000000000..2bc2bec38 --- /dev/null +++ b/terraform/gpu/variables.tf @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +variable "region" { + type = string + default = "us-west-2" +} + +variable "test_dir" { + type = string + default = "../../test/gpu" +} + +variable "addon_name" { + type = string + default = "amazon-cloudwatch-observability" +} + +variable "addon_version" { + type = string + default = "v1.6.0-eksbuild.1" +} + + +variable "cwagent_image_repo" { + type = string + default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" +} + +variable "cwagent_image_tag" { + type = string + default = "latest" +} + +variable "k8s_version" { + type = string + default = "1.28" +} + +variable "ami_type" { + type = string + default = "AL2_x86_64_GPU" +} + +variable "instance_type" { + type = string + default = "g4dn.xlarge" +} + +variable "beta" { + type = bool + default = true +} + +variable "beta_endpoint" { + type = string + default = "https://api.beta.us-west-2.wesley.amazonaws.com" +} \ No newline at end of file diff --git a/test/metric/container_insights_util.go 
b/test/metric/container_insights_util.go index 140c839c1..ff0e2862d 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -58,6 +58,15 @@ func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDim } results = append(results, validateMetricsAvailability(dims, metrics, actual)) for _, m := range metrics { + // rand.Intn panics when n <= 0, so record a failure instead when the metric is missing or has no dimension sets (e.g. metrics are not yet ready in a cluster) + if len(actual[m]) == 0 { + results = append(results, status.TestResult{ + Name: dims, + Status: status.FAILED, + }) + log.Printf("ValidateMetrics failed with missing metric: %s", m) + continue + } // pick a random dimension set to test metric data OR test all dimension sets which might be overkill randIdx := rand.Intn(len(actual[m])) results = append(results, validateMetricValue(m, actual[m][randIdx])) @@ -123,7 +132,7 @@ func validateMetricsAvailability(dims string, expected []string, actual map[stri Name: dims, Status: status.FAILED, } - log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) + log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) if compareMetrics(expected, actual) { testResult.Status = status.SUCCESSFUL } else {