From e0f08bd5acb11aed52d619d7dc0b59181c44130d Mon Sep 17 00:00:00 2001 From: Zhiming Guo Date: Mon, 3 Jun 2024 21:19:22 +1000 Subject: [PATCH] PENT-103-part-1: refactor integration test so it works on clusterd org (#336) --- .gitignore | 3 + DEVELOPMENT.md | 89 ++++++++++++++++++++---- internal/controller/config/config.go | 24 ++++--- internal/integration/integration_test.go | 57 +++++---------- internal/integration/interrupt_test.go | 1 + internal/integration/main_test.go | 8 ++- internal/integration/testcase_test.go | 66 ++++++++++++++++-- justfile | 11 ++- 8 files changed, 185 insertions(+), 74 deletions(-) diff --git a/.gitignore b/.gitignore index 4da0b811..d05c0287 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ Brewfile.lock.json .vscode dist/ + +# For all glorious direnv users. +.envrc diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index c5cf1071..4aed7ea3 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -16,12 +16,72 @@ just --list # Integration Tests +## Architecture + +Agent Stack K8s integration tests depend on a running Buildkite instance. By default, they use the production Buildkite. + +```mermaid +flowchart LR + c((Controller)) -->|create jobs| K + Buildkite <-->|Pull jobs| c + subgraph K8s cluster + K(Kube API) + end +``` + +During test run, the test suites: +1. create ephemeral pipelines and queues for a given [Buildkite Agent Cluster](https://buildkite.com/docs/clusters/overview). +2. Run executor, which will monitor jobs from the target queue in target Buildkite Cluster, + starts new Jobs in a Kubernetes cluster. +3. Test suite will clean up those ephemeral objects in the end. + +To run integration test locally, we recommend you to run individual test. 
For example,
+
+```bash
+just test -run TestWalkingSkeleton
+```
+
 ## Setup
-For running the integration tests you'll need to add some additional scopes to your Buildkite API token:
+
+Any member of the public should be able to run our integration tests as long as you are a user of Buildkite, and you have
+access to a Kubernetes cluster.
+
+Concretely, to get the integration tests running locally, you will need:
+1. A valid Buildkite API token (presuming you are a customer of Buildkite).
+2. A valid Buildkite Agent Token in your target Buildkite Cluster. The agent token needs to be installed in your K8s
+   cluster.
+3. Your organization name in Buildkite and your target Buildkite Cluster UUID.
+4. Depending on the test cases, you may also need an SSH key; please read below.
+5. Your shell environment will need CLI write access to a k8s cluster.
+
+### Use environment variables
+
+We found it convenient to supply the API token, organization name, and cluster UUID as environment variables.
+
+```bash
+export BUILDKITE_TOKEN="bkua_**************"
+export ORG="your-cool-org-slug"
+export CLUSTER_UUID="UUID-UUID-UUID-UUID"
+```
+
+### Token Scopes
+
+Required Buildkite API token scopes:
 - `read_artifacts`
 - `read_build_logs`
 - `write_pipelines`
+- `write_clusters`
+
+### Install Agent Token
+
+The agent token is used by the k8s jobs instead of the controller, so:
+
+```bash
+kubectl create secret generic buildkite-agent-token --from-literal=BUILDKITE_AGENT_TOKEN=my-agent-token
+```
+
+### SSH secret
 
 You'll also need to create an SSH secret in your cluster to run [this test pipeline](internal/integration/fixtures/secretref.yaml).
 This SSH key needs to be associated with your GitHub account to be able to clone this public repo, and must be in a form acceptable to OpenSSH (aka `BEGIN OPENSSH PRIVATE KEY`, not `BEGIN PRIVATE KEY`). 
@@ -34,13 +94,16 @@ The integration tests on the [`kubernetes-agent-stack`](https://buildkite.com/bu
 
 ## Cleanup
 
-These will be deleted automatically for successful tests, but for unsuccessful tests, then will remain after then end of the test job to allow you to debug them.
-However, this means they should be cleaned up manually. To do this run
+
+In general, pipelines and queues will be deleted automatically for successful tests, but for unsuccessful tests, they will remain after the end of the test job to allow you to debug them.
+
+To clean them up:
+
 ```bash
-CLEANUP_PIPELINES=true just cleanup-orphans --org=buildkite-kubernetes-stack --buildkite-token=
+just cleanup-orphans
 ```
 
-The token will need to have graphql access as well as:
+The token will need to have GraphQL access as well as:
 
 - `read_artifacts`
 - `write_pipelines`
@@ -50,19 +113,17 @@ To clean these out you should run the following in a kubernetes context in the n
 kubectl get -o jsonpath='{.items[*].metadata.name}' jobs | xargs -L1 kubectl delete job
 ```
 
-At the time of writing, the CI pipeline is run in an EKS cluster, `agent-stack-k8s-ci` in the `buildkite-agent` AWS account.
-The controller is deployed to the `buildkite` namespace in that cluster.
-See https://docs.aws.amazon.com/eks/latest/userguide/create-kubeconfig.html for how to obtain a kubeconfig for an EKS cluster.
+## CI ❤️ Integration Test
 
-# Run from source
+At the time of writing, the CI pipeline runs in an EKS cluster, `agent-stack-k8s-ci` in the `buildkite-agent` AWS account.
+CI deploys the controller into the `buildkite` namespace in that cluster.
 
-First store the agent token in a Kubernetes secret:
+# Run from source
 
-```bash!
-kubectl create secret generic buildkite-agent-token --from-literal=BUILDKITE_AGENT_TOKEN=my-agent-token
-```
+Running from source can be useful for debugging purposes; you will generally need to meet the same requirements as
+running an integration test. 
-Next start the controller: +In this case, you can choose to supply some inputs via CLI parameters instead of environment variable. ```bash! just run --org my-org --buildkite-token my-api-token --debug diff --git a/internal/controller/config/config.go b/internal/controller/config/config.go index fba7510e..6b7facfe 100644 --- a/internal/controller/config/config.go +++ b/internal/controller/config/config.go @@ -22,17 +22,19 @@ var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version() // mapstructure (the module) supports switching the struct tag to "json", viper does not. So we have // to have the `mapstructure` tag for viper and the `json` tag is used by the mapstructure! type Config struct { - Debug bool `json:"debug"` - JobTTL time.Duration `json:"job-ttl"` - PollInterval time.Duration `json:"poll-interval"` - AgentTokenSecret string `json:"agent-token-secret" validate:"required"` - BuildkiteToken string `json:"buildkite-token" validate:"required"` - Image string `json:"image" validate:"required"` - MaxInFlight int `json:"max-in-flight" validate:"min=0"` - Namespace string `json:"namespace" validate:"required"` - Org string `json:"org" validate:"required"` - Tags stringSlice `json:"tags" validate:"min=1"` - ProfilerAddress string `json:"profiler-address" validate:"omitempty,hostname_port"` + Debug bool `json:"debug"` + JobTTL time.Duration `json:"job-ttl"` + PollInterval time.Duration `json:"poll-interval"` + AgentTokenSecret string `json:"agent-token-secret" validate:"required"` + BuildkiteToken string `json:"buildkite-token" validate:"required"` + Image string `json:"image" validate:"required"` + MaxInFlight int `json:"max-in-flight" validate:"min=0"` + Namespace string `json:"namespace" validate:"required"` + Org string `json:"org" validate:"required"` + Tags stringSlice `json:"tags" validate:"min=1"` + ProfilerAddress string `json:"profiler-address" validate:"omitempty,hostname_port"` + // This field is mandatory for most new orgs. 
+ // Some old orgs allows unclustered setup. ClusterUUID string `json:"cluster-uuid" validate:"omitempty"` AdditionalRedactedVars stringSlice `json:"additional-redacted-vars" validate:"omitempty"` PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"` diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go index 1d9962e3..22dfb6b0 100644 --- a/internal/integration/integration_test.go +++ b/internal/integration/integration_test.go @@ -22,8 +22,7 @@ func TestWalkingSkeleton(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertSuccess(ctx, build) @@ -44,8 +43,7 @@ func TestPodSpecPatchInStep(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) @@ -62,8 +60,7 @@ func TestPodSpecPatchInStepFailsWhenPatchingContainerCommands(t *testing.T) { }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) @@ -80,8 +77,7 @@ func TestPodSpecPatchInController(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) cfg := cfg cfg.PodSpecPatch = &corev1.PodSpec{ Containers: []corev1.Container{ @@ -113,8 +109,7 @@ func TestControllerPicksUpJobsWithSubsetOfAgentTags(t *testing.T) { }.Init() ctx := 
context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) cfg := cfg cfg.Tags = append(cfg.Tags, "foo=bar") // job has queue=, agent has queue= and foo=bar @@ -133,8 +128,7 @@ func TestControllerSetsAdditionalRedactedVars(t *testing.T) { }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) cfg := cfg cfg.AdditionalRedactedVars = []string{"ELEVEN_HERBS_AND_SPICES"} @@ -157,8 +151,7 @@ func TestPrePostCheckoutHooksRun(t *testing.T) { }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) @@ -176,8 +169,7 @@ func TestChown(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertSuccess(ctx, build) @@ -198,8 +190,7 @@ func TestSSHRepoClone(t *testing.T) { Get(ctx, "agent-stack-k8s", metav1.GetOptions{}) require.NoError(t, err, "agent-stack-k8s secret must exist") - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertSuccess(ctx, build) @@ -215,8 +206,7 @@ func TestPluginCloneFailsTests(t *testing.T) { ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertFail(ctx, build) @@ -232,8 +222,7 @@ func 
TestMaxInFlightLimited(t *testing.T) { ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) cfg := cfg cfg.MaxInFlight = 1 tc.StartController(ctx, cfg) @@ -271,8 +260,7 @@ func TestMaxInFlightUnlimited(t *testing.T) { ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) cfg := cfg cfg.MaxInFlight = 0 tc.StartController(ctx, cfg) @@ -315,8 +303,7 @@ func TestSidecars(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertSuccess(ctx, build) @@ -331,8 +318,7 @@ func TestExtraVolumeMounts(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertSuccess(ctx, build) @@ -346,8 +332,7 @@ func TestInvalidPodSpec(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertFail(ctx, build) @@ -365,8 +350,7 @@ func TestInvalidPodJSON(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertFail(ctx, 
build) @@ -384,8 +368,7 @@ func TestEnvVariables(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertSuccess(ctx, build) @@ -400,8 +383,7 @@ func TestImagePullBackOffCancelled(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertFail(ctx, build) @@ -416,8 +398,7 @@ func TestArtifactsUploadFailedJobs(t *testing.T) { GraphQL: api.NewClient(cfg.BuildkiteToken), }.Init() ctx := context.Background() - pipelineID, cleanup := tc.CreatePipeline(ctx) - t.Cleanup(cleanup) + pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx) tc.StartController(ctx, cfg) build := tc.TriggerBuild(ctx, pipelineID) tc.AssertFail(ctx, build) diff --git a/internal/integration/interrupt_test.go b/internal/integration/interrupt_test.go index b154c7e6..0f9831f1 100644 --- a/internal/integration/interrupt_test.go +++ b/internal/integration/interrupt_test.go @@ -40,6 +40,7 @@ func CleanupOnInterrupt(cleanup func()) { // EnsureCleanup will run the provided cleanup function when the test ends, // either via t.Cleanup or on interrupt via CleanupOnInterrupt. +// But this can't cover test timeout case. 
func EnsureCleanup(t *testing.T, cleanup func()) { t.Cleanup(cleanup) CleanupOnInterrupt(cleanup) diff --git a/internal/integration/main_test.go b/internal/integration/main_test.go index 83be7c76..e83d70c9 100644 --- a/internal/integration/main_test.go +++ b/internal/integration/main_test.go @@ -20,9 +20,11 @@ const ( ) var ( - branch string - cfg config.Config - cleanupPipelines bool + branch string + cfg config.Config + cleanupPipelines bool + // Preserve pipelines even if the test passses. + // By default, failed pipeline will always be kept. preservePipelines bool //go:embed fixtures/* diff --git a/internal/integration/testcase_test.go b/internal/integration/testcase_test.go index e8c11350..c316e632 100644 --- a/internal/integration/testcase_test.go +++ b/internal/integration/testcase_test.go @@ -74,14 +74,61 @@ func (t testcase) Init() testcase { return t } -func (t testcase) CreatePipeline(ctx context.Context) (string, func()) { +// Create ephemeral test queues and pipelines, return pipeline's GraphQL ID. +// Register their cleanup as test cleanup. +// So when test ends, those queues and pipelines get deleted. +func (t testcase) PrepareQueueAndPipelineWithCleanup(ctx context.Context) string { + t.Helper() + + var queueName string + if cfg.ClusterUUID == "" { + // TODO: This condition will be removed by subsequent PRs because we aim to eliminate non-clustered accounts. 
+ t.Log("No cluster-id is specified, assuming non clustered setup, skipping cluster queue creation...") + } else { + queue := t.createClusterQueueWithCleanup() + queueName = *queue.Key + } + + if queueName == "" { + queueName = t.ShortPipelineName() + } + p := t.createPipelineWithCleanup(ctx, queueName) + return *p.GraphQLID +} + +func (t testcase) createClusterQueueWithCleanup() *buildkite.ClusterQueue { + t.Helper() + + queueName := t.ShortPipelineName() + queue, _, err := t.Buildkite.ClusterQueues.Create(cfg.Org, cfg.ClusterUUID, &buildkite.ClusterQueueCreate{ + Key: &queueName, + }) + require.NoError(t, err) + + EnsureCleanup(t.T, func() { + if t.preserveEphemeralObjects() { + return + } + + _, err := t.Buildkite.ClusterQueues.Delete(cfg.Org, cfg.ClusterUUID, *queue.ID) + if err != nil { + t.Errorf("Unable to clean up cluster queue %s: %v", *queue.ID, err) + return + } + t.Logf("deleted cluster queue! %s", *queue.ID) + }) + + return queue +} + +func (t testcase) createPipelineWithCleanup(ctx context.Context, queueName string) *buildkite.Pipeline { t.Helper() tpl, err := template.ParseFS(fixtures, fmt.Sprintf("fixtures/%s", t.Fixture)) require.NoError(t, err) var steps bytes.Buffer - require.NoError(t, tpl.Execute(&steps, map[string]string{"queue": t.ShortPipelineName()})) + require.NoError(t, tpl.Execute(&steps, map[string]string{"queue": queueName})) pipeline, _, err := t.Buildkite.Pipelines.Create(cfg.Org, &buildkite.CreatePipeline{ Name: t.PipelineName, Repository: t.Repo, @@ -89,14 +136,20 @@ func (t testcase) CreatePipeline(ctx context.Context) (string, func()) { TriggerMode: strPtr("none"), }, Configuration: steps.String(), + ClusterID: cfg.ClusterUUID, }) require.NoError(t, err) - - return *pipeline.GraphQLID, func() { - if !preservePipelines && !t.Failed() { + EnsureCleanup(t.T, func() { + if !t.preserveEphemeralObjects() { t.deletePipeline(ctx) } - } + }) + + return pipeline +} + +func (t testcase) preserveEphemeralObjects() bool { + return 
preservePipelines || t.Failed() } func (t testcase) StartController(ctx context.Context, cfg config.Config) { @@ -105,6 +158,7 @@ func (t testcase) StartController(ctx context.Context, cfg config.Config) { runCtx, cancel := context.WithCancel(ctx) EnsureCleanup(t.T, cancel) + // TODO: Use queue name created above cfg.Tags = []string{fmt.Sprintf("queue=%s", t.ShortPipelineName())} cfg.Debug = true diff --git a/justfile b/justfile index b26ce451..a021cce5 100644 --- a/justfile +++ b/justfile @@ -62,6 +62,13 @@ deploy *FLAGS: {{FLAGS}} # Invoke with CLEANUP_PIPELINES=true -# pass in --org= --buildkite-token= +# pass in --org= --buildkite-token= or use environment variables per development.md cleanup-orphans *FLAGS: - @go test -v -run TestCleanupOrphanedPipelines ./internal/integration {{FLAGS}} + #!/usr/bin/env bash + set -e + export CLEANUP_PIPELINES=true + go test -v \ + -ldflags="-X github.com/buildkite/agent-stack-k8s/v2/internal/integration_test.branch=${GIT_BRANCH}" \ + -run TestCleanupOrphanedPipelines \ + ./internal/integration \ + {{FLAGS}}