Skip to content

Commit

Permalink
verify cluster stability
Browse files Browse the repository at this point in the history
  • Loading branch information
matmerr committed Sep 27, 2024
1 parent 0046174 commit 318ee7c
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 62 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
*.so
*.dylib

# Avoid checking in keys
*.pem

# Test binary, built with `go test -c`
*.test

Expand Down
19 changes: 16 additions & 3 deletions test/e2e/framework/kubernetes/get-logs.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,16 @@ import (
"k8s.io/client-go/tools/clientcmd"
)

func PrintPodLogs(kubeconfigpath, namespace, labelSelector string) {
// GetPodLogs is a test-framework step that dumps the logs of every pod
// matching LabelSelector in Namespace, reached via the kubeconfig at
// KubeConfigFilePath. It is executed through its Run method.
type GetPodLogs struct {
KubeConfigFilePath string // path to the kubeconfig used to reach the cluster
Namespace string // namespace whose pods' logs are printed
LabelSelector string // label selector identifying the target pods
}

func (p *GetPodLogs) Run() error {
fmt.Printf("printing pod logs for namespace: %s, labelselector: %s\n", p.Namespace, p.LabelSelector)
// Load the kubeconfig file to get the configuration to access the cluster
config, err := clientcmd.BuildConfigFromFlags("", kubeconfigpath)
config, err := clientcmd.BuildConfigFromFlags("", p.KubeConfigFilePath)
if err != nil {
log.Printf("error building kubeconfig: %s\n", err)
}
Expand All @@ -25,8 +32,14 @@ func PrintPodLogs(kubeconfigpath, namespace, labelSelector string) {
log.Printf("error creating clientset: %s\n", err)
}

PrintPodLogs(context.Background(), clientset, p.Namespace, p.LabelSelector)

return nil
}

func PrintPodLogs(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) {
// List all the pods in the namespace
pods, err := clientset.CoreV1().Pods(namespace).List(context.Background(), metav1.ListOptions{
pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
LabelSelector: labelSelector,
})
if err != nil {
Expand Down
26 changes: 24 additions & 2 deletions test/e2e/framework/kubernetes/install-retina-helm.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package kubernetes

import (
"context"
"fmt"
"log"
"os"
Expand All @@ -11,10 +12,12 @@ import (
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart/loader"
"helm.sh/helm/v3/pkg/cli"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
)

const (
createTimeout = 240 * time.Second // windpws is slow
createTimeout = 20 * time.Minute // windows is slow
deleteTimeout = 60 * time.Second
)

Expand All @@ -32,6 +35,8 @@ type InstallHelmChart struct {
}

func (i *InstallHelmChart) Run() error {
ctx, cancel := context.WithTimeout(context.Background(), createTimeout)
defer cancel()
settings := cli.New()
settings.KubeConfig = i.KubeConfigFilePath
actionConfig := new(action.Configuration)
Expand Down Expand Up @@ -97,7 +102,7 @@ func (i *InstallHelmChart) Run() error {
client.WaitForJobs = true

// install the chart here
rel, err := client.Run(chart, chart.Values)
rel, err := client.RunWithContext(ctx, chart, chart.Values)
if err != nil {
return fmt.Errorf("failed to install chart: %w", err)
}
Expand All @@ -106,6 +111,23 @@ func (i *InstallHelmChart) Run() error {
// this will confirm the values set during installation
log.Printf("chart values: %v\n", rel.Config)

// ensure all pods are running, since helm doesn't care about windows
config, err := clientcmd.BuildConfigFromFlags("", i.KubeConfigFilePath)
if err != nil {
return fmt.Errorf("error building kubeconfig: %w", err)
}

clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

labelSelector := "k8s-app=retina"
err = WaitForPodReady(ctx, clientset, "kube-system", labelSelector)
if err != nil {
return fmt.Errorf("error waiting for retina pods to be ready: %w", err)
}

return nil
}

Expand Down
31 changes: 8 additions & 23 deletions test/e2e/framework/kubernetes/no-crashes.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,19 @@ import (
"context"
"fmt"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
)

type NoCrashes struct {
// ErrPodCrashed is a sentinel error wrapped (%w) into the failure returned
// when a pod's container reports a non-zero restart count, so callers can
// detect crashes with errors.Is.
var ErrPodCrashed = fmt.Errorf("pod has crashes")

// EnsureStableCluster is a test-framework step (renamed from NoCrashes in
// this commit) that waits until the pods selected by LabelSelector in
// PodNamespace are ready; the underlying wait also fails if any container
// shows a non-zero restart count.
type EnsureStableCluster struct {
LabelSelector string // label selector for the pods to check
PodNamespace string // namespace containing those pods
KubeConfigFilePath string // path to the kubeconfig used to reach the cluster
}

func (n *NoCrashes) Run() error {
func (n *EnsureStableCluster) Run() error {
config, err := clientcmd.BuildConfigFromFlags("", n.KubeConfigFilePath)
if err != nil {
return fmt.Errorf("error building kubeconfig: %w", err)
Expand All @@ -27,32 +27,17 @@ func (n *NoCrashes) Run() error {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

fieldSelector := fields.Everything()

pods, err := clientset.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
LabelSelector: n.LabelSelector,
FieldSelector: fieldSelector.String(),
})
err = WaitForPodReady(context.TODO(), clientset, n.PodNamespace, n.LabelSelector)
if err != nil {
return fmt.Errorf("error listing pods: %w", err)
return fmt.Errorf("error waiting for retina pods to be ready: %w", err)
}

for _, pod := range pods.Items {
for _, status := range pod.Status.ContainerStatuses {
if status.RestartCount > 0 {
PrintPodLogs(n.KubeConfigFilePath, pod.Namespace, pod.Name)
return fmt.Errorf("Pod %s has %d restarts", pod.Name, status)
}
}
}

return nil
}

func (n *NoCrashes) Prevalidate() error {
// Prevalidate is a no-op: this step requires no setup checks before Run.
// It exists to satisfy the framework's step interface.
func (n *EnsureStableCluster) Prevalidate() error {
return nil
}

func (n *NoCrashes) Stop() error {
// Stop is a no-op: this step holds no resources that need releasing.
// It exists to satisfy the framework's step interface.
func (n *EnsureStableCluster) Stop() error {
return nil
}
17 changes: 16 additions & 1 deletion test/e2e/framework/kubernetes/wait-pod-ready.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@ const (
func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error {
podReadyMap := make(map[string]bool)

printIterator := 0
conditionFunc := wait.ConditionWithContextFunc(func(context.Context) (bool, error) {
defer func() {
printIterator++
}()
var podList *corev1.PodList
podList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
Expand All @@ -40,11 +44,21 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names
return false, fmt.Errorf("error getting Pod: %w", err)
}

for istatus := range pod.Status.ContainerStatuses {
status := &pod.Status.ContainerStatuses[istatus]
if status.RestartCount > 0 {
return false, fmt.Errorf("pod %s has %d restarts: status: %+v: %w", pod.Name, status.RestartCount, status, ErrPodCrashed)
}
}

// Check the Pod phase
if pod.Status.Phase != corev1.PodRunning {
log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", pod.Name)
if printIterator%5 == 0 {
log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", pod.Name)
}
return false, nil
}

if !podReadyMap[pod.Name] {
log.Printf("pod \"%s\" is in Running state\n", pod.Name)
podReadyMap[pod.Name] = true
Expand All @@ -56,6 +70,7 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names

err := wait.PollUntilContextCancel(ctx, RetryIntervalPodsReady, true, conditionFunc)
if err != nil {
PrintPodLogs(ctx, clientset, namespace, labelSelector)
return fmt.Errorf("error waiting for pods in namespace \"%s\" with label \"%s\" to be in Running state: %w", namespace, labelSelector, err)
}
return nil
Expand Down
69 changes: 40 additions & 29 deletions test/e2e/jobs/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,46 @@ import (
tcp "github.com/microsoft/retina/test/e2e/scenarios/tcp"
)

func CreateTestInfra(subID, clusterName, location, kubeConfigFilePath string) *types.Job {
func CreateTestInfra(subID, clusterName, location, kubeConfigFilePath string, createInfra bool) *types.Job {
job := types.NewJob("Create e2e test infrastructure")

job.AddStep(&azure.CreateResourceGroup{
SubscriptionID: subID,
ResourceGroupName: clusterName,
Location: location,
}, nil)

job.AddStep(&azure.CreateVNet{
VnetName: "testvnet",
VnetAddressSpace: "10.0.0.0/9",
}, nil)

job.AddStep(&azure.CreateSubnet{
SubnetName: "testsubnet",
SubnetAddressSpace: "10.0.0.0/12",
}, nil)

job.AddStep(&azure.CreateNPMCluster{
ClusterName: clusterName,
PodCidr: "10.128.0.0/9",
DNSServiceIP: "192.168.0.10",
ServiceCidr: "192.168.0.0/28",
}, nil)

job.AddStep(&azure.GetAKSKubeConfig{
KubeConfigFilePath: kubeConfigFilePath,
}, nil)
if createInfra {
job.AddStep(&azure.CreateResourceGroup{
SubscriptionID: subID,
ResourceGroupName: clusterName,
Location: location,
}, nil)

job.AddStep(&azure.CreateVNet{
VnetName: "testvnet",
VnetAddressSpace: "10.0.0.0/9",
}, nil)

job.AddStep(&azure.CreateSubnet{
SubnetName: "testsubnet",
SubnetAddressSpace: "10.0.0.0/12",
}, nil)

job.AddStep(&azure.CreateNPMCluster{
ClusterName: clusterName,
PodCidr: "10.128.0.0/9",
DNSServiceIP: "192.168.0.10",
ServiceCidr: "192.168.0.0/28",
}, nil)

job.AddStep(&azure.GetAKSKubeConfig{
KubeConfigFilePath: kubeConfigFilePath,
}, nil)

} else {
job.AddStep(&azure.GetAKSKubeConfig{
KubeConfigFilePath: "./test.pem",
ClusterName: clusterName,
SubscriptionID: subID,
ResourceGroupName: clusterName,
Location: location,
}, nil)
}

job.AddStep(&generic.LoadFlags{
TagEnv: generic.DefaultTagEnv,
Expand Down Expand Up @@ -122,7 +133,7 @@ func InstallAndTestRetinaBasicMetrics(kubeConfigFilePath, chartPath string) *typ
job.AddScenario(dns.ValidateBasicDNSMetrics(scenario.name, scenario.req, scenario.resp))
}

job.AddStep(&kubernetes.NoCrashes{
job.AddStep(&kubernetes.EnsureStableCluster{
PodNamespace: "kube-system",
LabelSelector: "k8s-app=retina",
}, nil)
Expand Down Expand Up @@ -189,7 +200,7 @@ func UpgradeAndTestRetinaAdvancedMetrics(kubeConfigFilePath, chartPath, valuesFi

job.AddScenario(latency.ValidateLatencyMetric())

job.AddStep(&kubernetes.NoCrashes{
job.AddStep(&kubernetes.EnsureStableCluster{
PodNamespace: "kube-system",
LabelSelector: "k8s-app=retina",
}, nil)
Expand Down
20 changes: 16 additions & 4 deletions test/e2e/retina_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package retina

import (
"crypto/rand"
"flag"
"math/big"
"os"
"os/user"
Expand All @@ -16,14 +17,23 @@ import (
"github.com/stretchr/testify/require"
)

var locations = []string{"eastus2", "centralus", "southcentralus", "uksouth", "centralindia", "westus2"}
var (
locations = []string{"eastus2", "centralus", "southcentralus", "uksouth", "centralindia", "westus2"}
createInfra = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing")
deleteInfra = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing")
)

// TestE2ERetina tests all e2e scenarios for retina
func TestE2ERetina(t *testing.T) {
curuser, err := user.Current()
require.NoError(t, err)
flag.Parse()

clusterName := curuser.Username + common.NetObsRGtag + strconv.FormatInt(time.Now().Unix(), 10)
clusterName := os.Getenv("CLUSTER_NAME")
if clusterName == "" {
clusterName = curuser.Username + common.NetObsRGtag + strconv.FormatInt(time.Now().Unix(), 10)
t.Logf("CLUSTER_NAME is not set, generating a random cluster name: %s", clusterName)
}

subID := os.Getenv("AZURE_SUBSCRIPTION_ID")
require.NotEmpty(t, subID)
Expand All @@ -49,15 +59,17 @@ func TestE2ERetina(t *testing.T) {
kubeConfigFilePath := filepath.Join(rootDir, "test", "e2e", "test.pem")

// CreateTestInfra
createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, clusterName, location, kubeConfigFilePath))
createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, clusterName, location, kubeConfigFilePath, *createInfra))
createTestInfra.Run()

// Hacky way to ensure that the test infra is deleted even if the test panics
defer func() {
if r := recover(); r != nil {
t.Logf("Recovered in TestE2ERetina, %v", r)
}
_ = jobs.DeleteTestInfra(subID, clusterName, location).Run()
if *deleteInfra {
_ = jobs.DeleteTestInfra(subID, clusterName, location).Run()
}
}()

// Install and test Retina basic metrics
Expand Down

0 comments on commit 318ee7c

Please sign in to comment.