Skip to content

Commit

Permalink
adding e2e
Browse files Browse the repository at this point in the history
  • Loading branch information
Paramadon committed May 17, 2024
1 parent c4a3a4a commit 53ffe75
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 6 deletions.
3 changes: 1 addition & 2 deletions generator/test_case_generator.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,7 @@ var testTypeToTestConfig = map[string][]testConfig{
{testDir: "./test/fluent", terraformDir: "terraform/eks/daemon/fluent/windows/2022"},
{
testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
instanceType: "g4dn.xlarge",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
},
},
"eks_deployment": {
Expand Down
4 changes: 2 additions & 2 deletions terraform/eks/daemon/gpu/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
// SPDX-License-Identifier: MIT

module "common" {
source = "../../../common"
source = "../common"
cwagent_image_repo = var.cwagent_image_repo
cwagent_image_tag = var.cwagent_image_tag
}

module "basic_components" {
source = "../../../basic_components"
source = "../basic_components"

region = var.region
}
Expand Down
2 changes: 1 addition & 1 deletion terraform/eks/daemon/gpu/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ variable "region" {

variable "test_dir" {
type = string
default = "./test/gpu"
default = "../test/gpu"
}

variable "cwagent_image_repo" {
Expand Down
146 changes: 146 additions & 0 deletions terraform/gpu/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

# Shared module that receives the CloudWatch agent image repo and tag.
# This file reads its testing_id output to build unique cluster and IAM role names.
module "common" {
source = "../common"
cwagent_image_repo = var.cwagent_image_repo
cwagent_image_tag = var.cwagent_image_tag
}

# Shared module instantiated for the configured region. This file reads its
# role_arn, public_subnet_ids and security_group outputs.
module "basic_components" {
source = "../basic_components"

region = var.region
}


# Auth data for the cluster created in this file; its token is referenced by
# the kubernetes provider configuration.
data "aws_eks_cluster_auth" "this" {
name = aws_eks_cluster.this.name
}

locals {
  # Cluster role ARN; gets the "-eks-beta" suffix when var.beta is true.
  role_arn = "${module.basic_components.role_arn}${var.beta ? "-eks-beta" : ""}"

  # Common prefix for `aws eks` CLI calls: always pins the region and, when
  # var.beta is true, also points the CLI at the beta endpoint.
  aws_eks = "aws eks --region ${var.region}${var.beta ? " --endpoint ${var.beta_endpoint}" : ""}"
}

# EKS cluster under test. The name embeds the common module's testing_id, the
# role ARN comes from local.role_arn (beta-aware), and the Kubernetes version
# from var.k8s_version.
resource "aws_eks_cluster" "this" {
name = "cwagent-operator-eks-integ-${module.common.testing_id}"
role_arn = local.role_arn
version = var.k8s_version
# Networking is taken from the basic_components module outputs.
vpc_config {
subnet_ids = module.basic_components.public_subnet_ids
security_group_ids = [module.basic_components.security_group]
}
}

# EKS node group: a single on-demand GPU worker node for the test cluster.
resource "aws_eks_node_group" "this" {
  cluster_name    = aws_eks_cluster.this.name
  node_group_name = "cwagent-operator-eks-integ-node"
  node_role_arn   = aws_iam_role.node_role.arn
  subnet_ids      = module.basic_components.public_subnet_ids

  # The group is pinned to exactly one node.
  scaling_config {
    desired_size = 1
    max_size     = 1
    min_size     = 1
  }

  # AMI type and instance type are read from the module variables (defaults:
  # "AL2_x86_64_GPU" and "g4dn.xlarge") rather than duplicated as literals, so
  # overriding the variables actually takes effect.
  ami_type       = var.ami_type
  capacity_type  = "ON_DEMAND"
  disk_size      = 20
  instance_types = [var.instance_type]

  # The node role must have all of its policies attached before nodes launch.
  depends_on = [
    aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
    aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
    aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
    aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy
  ]
}

# EKS Node IAM Role
# Role used by the node group (node_role_arn). The name embeds the common
# module's testing_id.
resource "aws_iam_role" "node_role" {
name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}"

# Trust policy: allows the EC2 service to assume this role.
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}

# Attaches the AmazonEKSWorkerNodePolicy policy to the node role.
resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
role = aws_iam_role.node_role.name
}

# Attaches the AmazonEKS_CNI_Policy policy to the node role.
resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
role = aws_iam_role.node_role.name
}

# Attaches the AmazonEC2ContainerRegistryReadOnly policy to the node role.
resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.node_role.name
}

# Attaches the CloudWatchAgentServerPolicy policy to the node role.
resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" {
policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
role = aws_iam_role.node_role.name
}


# Once the cluster and node group exist, runs local `aws eks` CLI commands:
# update-kubeconfig for the new cluster, then list-clusters and
# describe-cluster with text output. local.aws_eks supplies the region and,
# when var.beta is true, the beta endpoint flag.
resource "null_resource" "kubectl" {
depends_on = [
aws_eks_cluster.this,
aws_eks_node_group.this
]
provisioner "local-exec" {
command = <<-EOT
${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name}
${local.aws_eks} list-clusters --output text
${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text
EOT
}
}

# Installs the add-on named by var.addon_name, at var.addon_version, on the
# test cluster.
resource "aws_eks_addon" "this" {
  cluster_name  = aws_eks_cluster.this.name
  addon_name    = var.addon_name
  addon_version = var.addon_version

  # Ordered after the kubectl null_resource, which itself waits for the
  # cluster and node group.
  depends_on = [null_resource.kubectl]
}

# Runs the GPU integration tests against the new cluster once the node group
# and add-on exist, then dumps pod state to the log for debugging.
resource "null_resource" "validator" {
  depends_on = [
    aws_eks_node_group.this,
    aws_eks_addon.this
  ]

  provisioner "local-exec" {
    # A multi-line shell script exits with the status of its LAST command, so
    # the `go test` status is captured and re-raised after the diagnostics.
    # Without this, a failing test run is masked by the trailing `cat` and the
    # apply reports success.
    command = <<-EOT
      go test ${var.test_dir} -eksClusterName ${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia
      test_status=$?
      # Get all pods and describe them
      kubectl get pods --all-namespaces -o wide > pods.txt
      kubectl describe pods --all-namespaces > pods_describe.txt
      # Log the contents of the files
      cat pods.txt
      cat pods_describe.txt
      exit $test_status
    EOT
  }
}

20 changes: 20 additions & 0 deletions terraform/gpu/providers.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

# AWS provider for the configured region. When var.beta is true the EKS
# service endpoint is overridden with var.beta_endpoint; otherwise it is null.
provider "aws" {
region = var.region
endpoints {
eks = var.beta ? var.beta_endpoint : null
}
}

# Kubernetes provider wired to the cluster created in main.tf: the endpoint
# and CA certificate come from the cluster resource, and credentials are
# supplied both as a token and through an `aws eks get-token` exec plugin.
provider "kubernetes" {
  host                   = aws_eks_cluster.this.endpoint
  cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority[0].data)
  token                  = data.aws_eks_cluster_auth.this.token

  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    args        = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name]
  }
}
58 changes: 58 additions & 0 deletions terraform/gpu/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

# AWS region; used by the aws provider, the basic_components module and the
# `aws eks` CLI prefix.
variable "region" {
type = string
default = "us-west-2"
}

# Go package path passed to `go test` by the validator resource.
variable "test_dir" {
type = string
default = "../../test/gpu"
}

# Name of the EKS add-on installed on the test cluster.
variable "addon_name" {
type = string
default = "amazon-cloudwatch-observability"
}

# Version of the EKS add-on installed on the test cluster.
variable "addon_version" {
type = string
default = "v1.6.0-eksbuild.1"
}


# CloudWatch agent image repository; forwarded to the common module.
variable "cwagent_image_repo" {
type = string
default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent"
}

# CloudWatch agent image tag; forwarded to the common module.
variable "cwagent_image_tag" {
type = string
default = "latest"
}

# Kubernetes version of the EKS cluster.
variable "k8s_version" {
type = string
default = "1.28"
}

# AMI type for the GPU worker nodes; the default equals the node group's
# ami_type value.
variable "ami_type" {
type = string
default = "AL2_x86_64_GPU"
}

# Instance type for the GPU worker nodes; the default equals the node group's
# single instance_types entry.
variable "instance_type" {
type = string
default = "g4dn.xlarge"
}

# When true, the aws provider's EKS endpoint and the `aws eks` CLI calls use
# beta_endpoint, and the cluster role ARN gets the "-eks-beta" suffix.
variable "beta" {
type = bool
default = true
}

# EKS endpoint override applied only when beta is true.
variable "beta_endpoint" {
type = string
default = "https://api.beta.us-west-2.wesley.amazonaws.com"
}
11 changes: 10 additions & 1 deletion test/metric/container_insights_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDim
}
results = append(results, validateMetricsAvailability(dims, metrics, actual))
for _, m := range metrics {
// this is to prevent panic with rand.Intn when metrics are not yet ready in a cluster
if _, ok := actual[m]; !ok {
results = append(results, status.TestResult{
Name: dims,
Status: status.FAILED,
})
log.Printf("ValidateMetrics failed with missing metric: %s", m)
continue
}
// pick a random dimension set to test metric data OR test all dimension sets which might be overkill
randIdx := rand.Intn(len(actual[m]))
results = append(results, validateMetricValue(m, actual[m][randIdx]))
Expand Down Expand Up @@ -123,7 +132,7 @@ func validateMetricsAvailability(dims string, expected []string, actual map[stri
Name: dims,
Status: status.FAILED,
}
log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual))
log.Printf("expected metrics: %d, actual metrics: %d", len(expected), 3*len(actual))
if compareMetrics(expected, actual) {
testResult.Status = status.SUCCESSFUL
} else {
Expand Down

0 comments on commit 53ffe75

Please sign in to comment.