From 2f9794b7ac57deafbcf88699a500658494ca7a06 Mon Sep 17 00:00:00 2001 From: Christian Schlotter Date: Thu, 1 Feb 2024 08:41:12 +0100 Subject: [PATCH] hack: add capv-janitor for automated ci cleanup --- Makefile | 15 ++ hack/tools/janitor/README.md | 10 ++ hack/tools/janitor/janitor.go | 251 ++++++++++++++++++++++++++++++++++ hack/tools/janitor/main.go | 125 +++++++++++++++++ hack/tools/janitor/vsphere.go | 100 ++++++++++++++ 5 files changed, 501 insertions(+) create mode 100644 hack/tools/janitor/README.md create mode 100644 hack/tools/janitor/janitor.go create mode 100644 hack/tools/janitor/main.go create mode 100644 hack/tools/janitor/vsphere.go diff --git a/Makefile b/Makefile index a6995c7f02..4c04226a83 100644 --- a/Makefile +++ b/Makefile @@ -249,6 +249,8 @@ VCSIM_RBAC_ROOT ?= test/infrastructure/vcsim/config/rbac VERSION ?= $(shell cat clusterctl-settings.json | jq .config.nextVersion -r) OVERRIDES_DIR := $(HOME)/.cluster-api/overrides/infrastructure-vsphere/$(VERSION) +JANITOR_DIR ?= ./$(TOOLS_DIR)/janitor + help: # Display this help @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[0-9A-Za-z_-]+:.*?##/ { printf " \033[36m%-50s\033[0m %s\n", $$1, $$2 } /^\$$\([0-9A-Za-z_-]+\):.*?##/ { gsub("_","-", $$1); printf " \033[36m%-50s\033[0m %s\n", tolower(substr($$1, 3, length($$1)-7)), $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) @@ -755,6 +757,19 @@ clean-bin: ## Remove all generated binaries rm -rf $(BIN_DIR) rm -rf $(TOOLS_BIN_DIR) +.PHONY: clean-ci +clean-ci: ## Cleanup orphaned objects in CI + @if [ -z "${GOVC_USERNAME}" ]; then echo "GOVC_USERNAME is not set"; exit 1; fi + @if [ -z "${GOVC_PASSWORD}" ]; then echo "GOVC_PASSWORD is not set"; exit 1; fi + @if [ -z "${GOVC_URL}" ]; then echo "GOVC_URL is not set"; exit 1; fi + go run $(JANITOR_DIR) \ + --dry-run=false \ + --max-age=12h \ + --ipam-namespace=default \ + --folder=/SDDC-Datacenter/vm/Workloads/cluster-api-provider-vsphere \ + 
--folder=/SDDC-Datacenter/vm/Workloads/cloud-provider-vsphere \ + --folder=/SDDC-Datacenter/vm/Workloads/image-builder + .PHONY: clean-temporary clean-temporary: ## Remove all temporary files and folders rm -f minikube.kubeconfig diff --git a/hack/tools/janitor/README.md b/hack/tools/janitor/README.md new file mode 100644 index 0000000000..f5de74aba9 --- /dev/null +++ b/hack/tools/janitor/README.md @@ -0,0 +1,10 @@ +# janitor + +The janitor is a tool for CI to clean up objects left over from failed or killed prowjobs. +It can be run regularly as a prowjob. + +It tries to delete: + +* vSphere: virtual machines in the configured folders which exist longer than the configured `--max-age` flag. +* vSphere: cluster modules which do not refer to any virtual machine +* IPAM: IPAddressClaims which exist longer than the configured `--max-age` flag diff --git a/hack/tools/janitor/janitor.go b/hack/tools/janitor/janitor.go new file mode 100644 index 0000000000..fa76c1a99b --- /dev/null +++ b/hack/tools/janitor/janitor.go @@ -0,0 +1,251 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/pkg/errors"
+	"github.com/vmware/govmomi/find"
+	"github.com/vmware/govmomi/object"
+	govmomicluster "github.com/vmware/govmomi/vapi/cluster"
+	"github.com/vmware/govmomi/vim25/mo"
+	"github.com/vmware/govmomi/vim25/types"
+	kerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/klog/v2"
+	ipamv1 "sigs.k8s.io/cluster-api/exp/ipam/api/v1alpha1"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// newJanitor creates a janitor. Objects created before time.Now()-maxAge are
+// considered eligible for cleanup; with dryRun set, actions are only logged.
+func newJanitor(vSphereClients *vSphereClients, ipamClient client.Client, maxAge time.Duration, ipamNamespace string, dryRun bool) *janitor {
+	return &janitor{
+		dryRun:          dryRun,
+		ipamClient:      ipamClient,
+		ipamNamespace:   ipamNamespace,
+		maxCreationDate: time.Now().Add(-maxAge),
+		vSphereClients:  vSphereClients,
+	}
+}
+
+// janitor bundles the clients and settings used by the cleanup functions.
+type janitor struct {
+	// dryRun disables all mutating calls; the intended actions are still logged.
+	dryRun bool
+	// ipamClient is used to list and delete IPAddressClaims.
+	ipamClient client.Client
+	// ipamNamespace is the namespace in which IPAddressClaims are listed.
+	ipamNamespace string
+	// maxCreationDate is the cut-off: objects created after it are kept.
+	maxCreationDate time.Time
+	// vSphereClients holds the clients used to talk to vSphere.
+	vSphereClients *vSphereClients
+}
+
+// virtualMachine pairs the retrieved properties of a virtual machine with the
+// object handle used to trigger tasks for it.
+type virtualMachine struct {
+	managedObject mo.VirtualMachine
+	object        *object.VirtualMachine
+}
+
+// deleteVSphereVMs deletes all VSphereVMs in a given folder in vSphere if their creation
+// timestamp is before the janitor's configured maxCreationDate.
+//
+// Templates are never deleted, and virtual machines for which no config or
+// creation date could be retrieved are skipped because their age is unknown.
+// Powered-on machines are powered off before they are destroyed. With dryRun
+// set, the actions are logged but no task is created.
+func (s *janitor) deleteVSphereVMs(ctx context.Context, folder string) error {
+	log := ctrl.LoggerFrom(ctx).WithName("vSphereVMs").WithValues("folder", folder)
+	ctx = ctrl.LoggerInto(ctx, log)
+
+	if folder == "" {
+		return fmt.Errorf("cannot use empty string as folder")
+	}
+
+	log.Info("Deleting vSphere VMs in folder")
+
+	// List all virtual machines inside the folder.
+	finder := find.NewFinder(s.vSphereClients.Vim, false)
+	managedObjects, err := finder.ManagedObjectListChildren(ctx, folder+"/...", "VirtualMachine")
+	if err != nil {
+		return err
+	}
+
+	if len(managedObjects) == 0 {
+		return nil
+	}
+
+	// Retrieve information for all found virtual machines.
+	managedObjectReferences := make([]types.ManagedObjectReference, 0, len(managedObjects))
+	for _, obj := range managedObjects {
+		managedObjectReferences = append(managedObjectReferences, obj.Object.Reference())
+	}
+	var managedObjectVMs []mo.VirtualMachine
+	if err := s.vSphereClients.Govmomi.Retrieve(ctx, managedObjectReferences, []string{"config", "summary.runtime.powerState", "summary.config.template"}, &managedObjectVMs); err != nil {
+		return err
+	}
+
+	vmsToDeleteAndPoweroff := []*virtualMachine{}
+	vmsToDelete := []*virtualMachine{}
+
+	// Filter out vms we don't have to cleanup depending on s.maxCreationDate.
+	for _, managedObjectVM := range managedObjectVMs {
+		if managedObjectVM.Summary.Config.Template {
+			// Skip templates for deletion.
+			continue
+		}
+		if managedObjectVM.Config == nil || managedObjectVM.Config.CreateDate == nil {
+			// Both are pointers and may be unset; without a creation date the
+			// age is unknown, so the vm is kept instead of dereferencing nil.
+			log.Info("Skipping vm without config or creation date", "vm", managedObjectVM.Reference())
+			continue
+		}
+		if managedObjectVM.Config.CreateDate.After(s.maxCreationDate) {
+			// Ignore vms created after maxCreationDate
+			continue
+		}
+
+		vm := &virtualMachine{
+			managedObject: managedObjectVM,
+			object:        object.NewVirtualMachine(s.vSphereClients.Vim, managedObjectVM.Reference()),
+		}
+
+		if vm.managedObject.Summary.Runtime.PowerState == types.VirtualMachinePowerStatePoweredOn {
+			vmsToDeleteAndPoweroff = append(vmsToDeleteAndPoweroff, vm)
+			continue
+		}
+		vmsToDelete = append(vmsToDelete, vm)
+	}
+
+	// PowerOff vms which are still running. Triggering PowerOff for a VM results in a task in vSphere.
+	poweroffTasks := []*object.Task{}
+	for _, vm := range vmsToDeleteAndPoweroff {
+		log.Info("Powering off vm in vSphere", "vm", vm.managedObject.Config.Name)
+		if s.dryRun {
+			// Skipping actual PowerOff on dryRun.
+			continue
+		}
+		task, err := vm.object.PowerOff(ctx)
+		if err != nil {
+			return err
+		}
+		log.Info("Created PowerOff task for VM", "vm", vm.managedObject.Config.Name, "task", task.Name())
+		poweroffTasks = append(poweroffTasks, task)
+	}
+	// Wait for all PowerOff tasks to be finished. Errors are intentionally ignored
+	// here because the VM may already have reached the PowerOff state; the helper
+	// logs them as best effort. If a machine did not successfully PowerOff, the
+	// Destroy task below will result in an error.
+	// xref govc: https://github.com/vmware/govmomi/blob/512c168/govc/vm/destroy.go#L94-L96
+	_ = waitForTasksFinished(ctx, poweroffTasks, true)
+
+	destroyTasks := []*object.Task{}
+	for _, vm := range append(vmsToDeleteAndPoweroff, vmsToDelete...) {
+		log.Info("Destroying vm in vSphere", "vm", vm.managedObject.Config.Name)
+		// Use the janitor's own setting, not the package-level flag variable.
+		if s.dryRun {
+			// Skipping actual destroy on dryRun.
+			continue
+		}
+		task, err := vm.object.Destroy(ctx)
+		if err != nil {
+			return err
+		}
+		log.Info("Created Destroy task for VM", "vm", vm.managedObject.Config.Name, "task", task.Name())
+		destroyTasks = append(destroyTasks, task)
+	}
+	// Wait for all destroy tasks to succeed.
+	if err := waitForTasksFinished(ctx, destroyTasks, false); err != nil {
+		return errors.Wrap(err, "failed to wait for vm destroy task to finish")
+	}
+
+	return nil
+}
+
+// waitForTasksFinished waits for every task in tasks, in order.
+// If ignoreErrors is false the first error is returned immediately. If it is
+// true a failing task is logged, waiting continues with the next task, and the
+// function returns nil.
+func waitForTasksFinished(ctx context.Context, tasks []*object.Task, ignoreErrors bool) error {
+	log := ctrl.LoggerFrom(ctx)
+	for _, t := range tasks {
+		err := t.Wait(ctx)
+		if err == nil {
+			continue
+		}
+		if !ignoreErrors {
+			return err
+		}
+		log.Info("Ignoring error for task", "task", t.Name(), "err", err)
+	}
+	return nil
+}
+
+// deleteIPAddressClaims deletes the IPAddressClaims listed for the janitor's
+// ipamNamespace whose creation timestamp is not after maxCreationDate.
+// Deletion errors are collected and returned as one aggregate error.
+func (s *janitor) deleteIPAddressClaims(ctx context.Context) error {
+	log := ctrl.LoggerFrom(ctx).WithName("IPAddressClaims")
+	// Keep the returned context, otherwise the named logger is dropped.
+	ctx = ctrl.LoggerInto(ctx, log)
+	log.Info("Deleting IPAddressClaims")
+
+	// List all existing IPAddressClaims
+	ipAddressClaims := &ipamv1.IPAddressClaimList{}
+	if err := s.ipamClient.List(ctx, ipAddressClaims,
+		client.InNamespace(s.ipamNamespace),
+	); err != nil {
+		return err
+	}
+
+	errList := []error{}
+
+	for _, ipAddressClaim := range ipAddressClaims.Items {
+		// Copy the loop variable because its address is taken below.
+		ipAddressClaim := ipAddressClaim
+		// Skip IPAddressClaims which got created after maxCreationDate.
+		if ipAddressClaim.CreationTimestamp.After(s.maxCreationDate) {
+			continue
+		}
+
+		log.Info("Deleting IPAddressClaim", "IPAddressClaim", klog.KObj(&ipAddressClaim))
+
+		if s.dryRun {
+			// Skipping actual deletion on dryRun.
+			continue
+		}
+
+		if err := s.ipamClient.Delete(ctx, &ipAddressClaim); err != nil {
+			errList = append(errList, err)
+		}
+	}
+
+	return kerrors.NewAggregate(errList)
+}
+
+// deleteVSphereClusterModules deletes every vSphere cluster module which has no
+// members. Errors for single modules are collected and returned as one
+// aggregate error, so one failing module does not stop the cleanup.
+func (s *janitor) deleteVSphereClusterModules(ctx context.Context) error {
+	log := ctrl.LoggerFrom(ctx).WithName("vSphere cluster modules")
+	// Keep the returned context, otherwise the named logger is dropped.
+	ctx = ctrl.LoggerInto(ctx, log)
+	log.Info("Deleting vSphere cluster modules")
+
+	manager := govmomicluster.NewManager(s.vSphereClients.Rest)
+
+	// List all existing modules
+	clusterModules, err := manager.ListModules(ctx)
+	if err != nil {
+		return err
+	}
+
+	errList := []error{}
+	// Check for all modules if they refer members and delete them if they are empty.
+	for _, clusterModule := range clusterModules {
+		members, err := manager.ListModuleMembers(ctx, clusterModule.Module)
+		if err != nil {
+			errList = append(errList, err)
+			continue
+		}
+
+		// Do not attempt to delete if the cluster module still refers virtual machines.
+		if len(members) > 0 {
+			continue
+		}
+
+		log.Info("Deleting empty vSphere cluster module", "clusterModule", clusterModule.Module)
+
+		if s.dryRun {
+			// Skipping actual deletion on dryRun.
+			continue
+		}
+
+		if err := manager.DeleteModule(ctx, clusterModule.Module); err != nil {
+			errList = append(errList, err)
+		}
+	}
+
+	return kerrors.NewAggregate(errList)
+}
diff --git a/hack/tools/janitor/main.go b/hack/tools/janitor/main.go
new file mode 100644
index 0000000000..b982bfc833
--- /dev/null
+++ b/hack/tools/janitor/main.go
@@ -0,0 +1,125 @@
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package main is the main package for capv-janitor.
+package main
+
+import (
+	"context"
+	"flag"
+	"os"
+	"time"
+
+	"github.com/pkg/errors"
+	"github.com/spf13/pflag"
+	"k8s.io/apimachinery/pkg/runtime"
+	kerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/klog/v2"
+	ipamv1 "sigs.k8s.io/cluster-api/exp/ipam/api/v1alpha1"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// ipamScheme is the scheme handed to the controller-runtime client used for IPAM.
+var ipamScheme *runtime.Scheme
+
+func init() {
+	ipamScheme = runtime.NewScheme()
+	// The returned error is deliberately discarded here.
+	// NOTE(review): presumably registering into a fresh scheme cannot fail — confirm.
+	_ = ipamv1.AddToScheme(ipamScheme)
+}
+
+// Values of the command line flags, populated by initFlags and pflag.Parse.
+var (
+	dryRun         bool
+	ipamNamespace  string
+	maxAge         time.Duration
+	vsphereFolders []string
+)
+
+// initFlags registers the janitor's command line flags on fs.
+func initFlags(fs *pflag.FlagSet) {
+	fs.StringArrayVar(&vsphereFolders, "folder", []string{}, "Path to folders in vCenter to cleanup virtual machines.")
+	// NOTE(review): the default is the empty string; presumably that makes the
+	// IPAddressClaim listing span all namespaces — confirm before running without it.
+	fs.StringVar(&ipamNamespace, "ipam-namespace", "", "Namespace for IPAddressClaim cleanup.")
+	fs.DurationVar(&maxAge, "max-age", time.Hour*12, "Maximum age of an object before it is getting deleted.")
+	fs.BoolVar(&dryRun, "dry-run", false, "dry-run results in not deleting anything but printing the actions.")
+}
+
+// main parses the flags, runs the cleanup and exits with status 1 on failure.
+func main() {
+	initFlags(pflag.CommandLine)
+	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
+	pflag.Parse()
+
+	log := klog.Background()
+	ctx := ctrl.LoggerInto(context.Background(), log)
+
+	if err := run(ctx); err != nil {
+		log.Error(err, "Failed running vsphere-janitor")
+		os.Exit(1)
+	}
+
+	log.Info("Finished cleanup.")
+}
+
+// run creates the vSphere and IPAM clients and then cleans up, in this order,
+// virtual machines in the configured folders, IPAddressClaims and cluster
+// modules. A failure in one stage is returned and the later stages are not run.
+func run(ctx context.Context) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	log.Info("Configured settings", "dry-run", dryRun)
+	log.Info("Configured settings", "folders", vsphereFolders)
+	log.Info("Configured settings", "ipam-namespace", ipamNamespace)
+	log.Info("Configured settings", "max-age", maxAge)
+
+	// Create clients for vSphere.
+	vSphereClients, err := newVSphereClients(ctx, getVSphereClientInput{
+		Username:   os.Getenv("GOVC_USERNAME"),
+		Password:   os.Getenv("GOVC_PASSWORD"),
+		Server:     os.Getenv("GOVC_URL"),
+		Thumbprint: os.Getenv("VSPHERE_TLS_THUMBPRINT"),
+		UserAgent:  "capv-janitor",
+	})
+	if err != nil {
+		return errors.Wrap(err, "creating vSphere clients")
+	}
+	defer vSphereClients.logout(ctx)
+
+	// Create controller-runtime client for IPAM.
+	// GetConfig is used instead of GetConfigOrDie so that a missing kubeconfig
+	// is returned as an error and the deferred vSphere logout above still runs.
+	restConfig, err := ctrl.GetConfig()
+	if err != nil {
+		return errors.Wrap(err, "getting kubeconfig for IPAM client")
+	}
+	ipamClient, err := client.New(restConfig, client.Options{Scheme: ipamScheme})
+	if err != nil {
+		return errors.Wrap(err, "creating IPAM client")
+	}
+
+	janitor := newJanitor(vSphereClients, ipamClient, maxAge, ipamNamespace, dryRun)
+
+	// First cleanup old vms to free up IPAddressClaims or cluster modules which are still in-use.
+	errList := []error{}
+	for _, folder := range vsphereFolders {
+		if err := janitor.deleteVSphereVMs(ctx, folder); err != nil {
+			errList = append(errList, errors.Wrapf(err, "cleaning up vSphereVMs for folder %q", folder))
+		}
+	}
+	if err := kerrors.NewAggregate(errList); err != nil {
+		return errors.Wrap(err, "cleaning up vSphereVMs")
+	}
+
+	// Second cleanup IPAddressClaims.
+	if err := janitor.deleteIPAddressClaims(ctx); err != nil {
+		return errors.Wrap(err, "cleaning up IPAddressClaims")
+	}
+
+	// Third cleanup cluster modules.
+	if err := janitor.deleteVSphereClusterModules(ctx); err != nil {
+		return errors.Wrap(err, "cleaning up vSphere cluster modules")
+	}
+
+	return nil
+}
diff --git a/hack/tools/janitor/vsphere.go b/hack/tools/janitor/vsphere.go
new file mode 100644
index 0000000000..5185e6425e
--- /dev/null
+++ b/hack/tools/janitor/vsphere.go
@@ -0,0 +1,100 @@
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"context"
+	"net/url"
+
+	"github.com/vmware/govmomi"
+	"github.com/vmware/govmomi/session"
+	"github.com/vmware/govmomi/vapi/rest"
+	"github.com/vmware/govmomi/vim25"
+	"github.com/vmware/govmomi/vim25/soap"
+	ctrl "sigs.k8s.io/controller-runtime"
+)
+
+// getVSphereClientInput carries the connection settings for newVSphereClients.
+type getVSphereClientInput struct {
+	Password string
+	Server   string
+	// Thumbprint is set on the SOAP client for the server host when non-empty.
+	Thumbprint string
+	UserAgent  string
+	Username   string
+}
+
+// vSphereClients is a collection of different clients for vSphere.
+type vSphereClients struct {
+	Vim     *vim25.Client
+	Govmomi *govmomi.Client
+	Rest    *rest.Client
+}
+
+// logout logs out all clients. It logs errors if the context contains a logger.
+// A failing logout of one client does not prevent the logout of the other.
+func (v *vSphereClients) logout(ctx context.Context) {
+	log := ctrl.LoggerFrom(ctx)
+	if err := v.Govmomi.Logout(ctx); err != nil {
+		log.Error(err, "logging out govmomi client")
+	}
+
+	if err := v.Rest.Logout(ctx); err != nil {
+		log.Error(err, "logging out rest client")
+	}
+}
+
+// newVSphereClients creates a vSphereClients object from the given input.
+func newVSphereClients(ctx context.Context, input getVSphereClientInput) (*vSphereClients, error) {
+	// The same credentials are used for the server URL and for both logins.
+	urlCredentials := url.UserPassword(input.Username, input.Password)
+
+	serverURL, err := soap.ParseURL(input.Server)
+	if err != nil {
+		return nil, err
+	}
+	serverURL.User = urlCredentials
+
+	// Without a thumbprint the SOAP client is created with the second argument
+	// set to true, otherwise with false plus the thumbprint for the server host.
+	// NOTE(review): that argument is presumably govmomi's "insecure" switch,
+	// i.e. no certificate verification when no thumbprint is given — confirm.
+	var soapClient *soap.Client
+	if input.Thumbprint == "" {
+		soapClient = soap.NewClient(serverURL, true)
+	} else {
+		soapClient = soap.NewClient(serverURL, false)
+		soapClient.SetThumbprint(serverURL.Host, input.Thumbprint)
+	}
+	soapClient.UserAgent = input.UserAgent
+
+	vimClient, err := vim25.NewClient(ctx, soapClient)
+	if err != nil {
+		return nil, err
+	}
+
+	govmomiClient := &govmomi.Client{
+		Client:         vimClient,
+		SessionManager: session.NewManager(vimClient),
+	}
+
+	if err := govmomiClient.Login(ctx, urlCredentials); err != nil {
+		return nil, err
+	}
+
+	restClient := rest.NewClient(vimClient)
+	if err := restClient.Login(ctx, urlCredentials); err != nil {
+		// The govmomi client is already logged in at this point. Log it out so a
+		// failing REST login does not leave an orphaned session behind; the
+		// caller gets no clients back and therefore cannot do it.
+		if logoutErr := govmomiClient.Logout(ctx); logoutErr != nil {
+			ctrl.LoggerFrom(ctx).Error(logoutErr, "logging out govmomi client")
+		}
+		return nil, err
+	}
+
+	return &vSphereClients{
+		Vim:     vimClient,
+		Govmomi: govmomiClient,
+		Rest:    restClient,
+	}, nil
+}