diff --git a/cmd/main.go b/cmd/main.go index 07ba8f7f..09831d0a 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -35,7 +35,6 @@ import ( "github.com/uselagoon/remote-controller/internal/harbor" "github.com/uselagoon/remote-controller/internal/helpers" - "github.com/uselagoon/remote-controller/internal/metrics" "github.com/uselagoon/remote-controller/internal/utilities/deletions" "github.com/uselagoon/remote-controller/internal/utilities/pruner" @@ -185,10 +184,10 @@ func main() { var unauthenticatedRegistry string - flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", - "The address the metric endpoint binds to.") - flag.BoolVar(&secureMetrics, "metrics-secure", false, - "If set the metrics endpoint is served securely") + flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ + "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") + flag.BoolVar(&secureMetrics, "metrics-secure", true, + "If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.") flag.BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") @@ -489,10 +488,16 @@ func main() { tlsOpts = append(tlsOpts, disableHTTP2) } metricsServerOptions := metricsserver.Options{ - BindAddress: metricsAddr, - SecureServing: secureMetrics, - TLSOpts: tlsOpts, - FilterProvider: filters.WithAuthenticationAndAuthorization, + BindAddress: metricsAddr, + SecureServing: secureMetrics, + TLSOpts: tlsOpts, + } + if secureMetrics { + // FilterProvider is used to protect the metrics endpoint with authn/authz. + // These configurations ensure that only authorized users and service accounts + // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. 
More info: + // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/filters#WithAuthenticationAndAuthorization + metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization } mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, @@ -1012,10 +1017,6 @@ func main() { } // +kubebuilder:scaffold:builder - setupLog.Info("starting lagoon metrics server") - m := metrics.NewServer(setupLog, ":9912") - defer m.Shutdown(context.Background()) - setupLog.Info("starting manager") if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager") diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index e639c738..60d17e40 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -49,6 +49,8 @@ resources: - ../crd - ../rbac - ../manager +# [METRICS] Expose the controller manager metrics service. +- metrics_service.yaml patches: - path: envs.yaml - path: manager_auth_proxy_patch.yaml diff --git a/config/rbac/auth_proxy_service.yaml b/config/default/metrics_service.yaml similarity index 73% rename from config/rbac/auth_proxy_service.yaml rename to config/default/metrics_service.yaml index 6cf656be..b6e338a6 100644 --- a/config/rbac/auth_proxy_service.yaml +++ b/config/default/metrics_service.yaml @@ -9,6 +9,7 @@ spec: ports: - name: https port: 8443 - targetPort: https + protocol: TCP + targetPort: 8443 selector: - control-plane: controller-manager + control-plane: controller-manager \ No newline at end of file diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 3577dde2..119be019 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -17,4 +17,5 @@ resources: # More info: https://book.kubebuilder.io/reference/metrics.html - metrics_auth_role.yaml - metrics_auth_role_binding.yaml -- metrics_reader_role.yaml \ No newline at end of file +- 
metrics_reader_role.yaml +- metrics_reader_role_binding.yaml \ No newline at end of file diff --git a/config/rbac/metrics_reader_role_binding.yaml b/config/rbac/metrics_reader_role_binding.yaml new file mode 100644 index 00000000..2d14de4d --- /dev/null +++ b/config/rbac/metrics_reader_role_binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: metrics-reader-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: metrics-reader +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system \ No newline at end of file diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index f87e87e0..f4c8c418 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -1,89 +1,85 @@ package metrics import ( - "fmt" - "net/http" - "time" - - "github.com/go-logr/logr" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/prometheus/client_golang/prometheus/promhttp" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) -// NewServer returns a *http.Server serving prometheus metrics in a new -// goroutine. -// Caller should defer Shutdown() for cleanup. 
-func NewServer(log logr.Logger, addr string) *http.Server { - mux := http.NewServeMux() - mux.Handle("/metrics", promhttp.Handler()) - s := http.Server{ - Addr: addr, - Handler: mux, - ReadTimeout: 16 * time.Second, - WriteTimeout: 16 * time.Second, - } - go func() { - if err := s.ListenAndServe(); err != http.ErrServerClosed { - log.Error(fmt.Errorf("metrics server did not shut down cleanly"), err.Error()) - } - }() - return &s -} - var ( // general counters for builds - BuildsRunningGauge = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "lagoon_builds_running_current", - Help: "The total number of Lagoon builds running", - }) - BuildsPendingGauge = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "lagoon_builds_pending_current", - Help: "The total number of Lagoon builds pending or queued", - }) - BuildsStartedCounter = promauto.NewCounter(prometheus.CounterOpts{ - Name: "lagoon_builds_started_total", - Help: "The total number of Lagoon builds started", - }) - BuildsCompletedCounter = promauto.NewCounter(prometheus.CounterOpts{ - Name: "lagoon_builds_completed_total", - Help: "The total number of Lagoon builds completed", - }) - BuildsFailedCounter = promauto.NewCounter(prometheus.CounterOpts{ - Name: "lagoon_builds_failed_total", - Help: "The total number of Lagoon builds failed", - }) - BuildsCancelledCounter = promauto.NewCounter(prometheus.CounterOpts{ - Name: "lagoon_builds_cancelled_total", - Help: "The total number of Lagoon builds cancelled", - }) + BuildsRunningGauge = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "lagoon_builds_running_current", + Help: "The total number of Lagoon builds running", + }, + ) + BuildsPendingGauge = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "lagoon_builds_pending_current", + Help: "The total number of Lagoon builds pending or queued", + }, + ) + BuildsStartedCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "lagoon_builds_started_total", + Help: "The total number of Lagoon builds 
started", + }, + ) + BuildsCompletedCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "lagoon_builds_completed_total", + Help: "The total number of Lagoon builds completed", + }, + ) + BuildsFailedCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "lagoon_builds_failed_total", + Help: "The total number of Lagoon builds failed", + }, + ) + BuildsCancelledCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "lagoon_builds_cancelled_total", + Help: "The total number of Lagoon builds cancelled", + }, + ) // general counters for tasks - TasksRunningGauge = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "lagoon_tasks_running_current", - Help: "The total number of Lagoon tasks running", - }) - TasksStartedCounter = promauto.NewCounter(prometheus.CounterOpts{ - Name: "lagoon_tasks_started_total", - Help: "The total number of Lagoon tasks started", - }) - TasksCompletedCounter = promauto.NewCounter(prometheus.CounterOpts{ - Name: "lagoon_tasks_completed_total", - Help: "The total number of Lagoon tasks completed", - }) - TasksFailedCounter = promauto.NewCounter(prometheus.CounterOpts{ - Name: "lagoon_tasks_failed_total", - Help: "The total number of Lagoon tasks failed", - }) - TasksCancelledCounter = promauto.NewCounter(prometheus.CounterOpts{ - Name: "lagoon_tasks_cancelled_total", - Help: "The total number of Lagoon tasks cancelled", - }) + TasksRunningGauge = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "lagoon_tasks_running_current", + Help: "The total number of Lagoon tasks running", + }, + ) + TasksStartedCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "lagoon_tasks_started_total", + Help: "The total number of Lagoon tasks started", + }, + ) + TasksCompletedCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "lagoon_tasks_completed_total", + Help: "The total number of Lagoon tasks completed", + }, + ) + TasksFailedCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + 
Name: "lagoon_tasks_failed_total", + Help: "The total number of Lagoon tasks failed", + }, + ) + TasksCancelledCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "lagoon_tasks_cancelled_total", + Help: "The total number of Lagoon tasks cancelled", + }, + ) // buildStatus will count the build transisiton steps // when the build step changes, the count is removed and the new step metric is created // this is useful to gauge how long particular steps take in a build - BuildStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{ + BuildStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "lagoon_build_status", Help: "The status of running Lagoon builds", }, @@ -97,7 +93,7 @@ var ( // RunningStatus will count when a build or task is running // when the build or task is complete, the count is removed // this is useful to gauge how long a build or task runs for - BuildRunningStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{ + BuildRunningStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "lagoon_build_running_status", Help: "The duration of running Lagoon builds", }, @@ -106,7 +102,7 @@ var ( "build_namespace", }, ) - TaskRunningStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{ + TaskRunningStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "lagoon_task_running_status", Help: "The duration of running Lagoon tasks", }, @@ -116,3 +112,23 @@ var ( }, ) ) + +func init() { + // Register the Lagoon metrics with the controller-runtime metrics registry (metrics.Registry), not the default prometheus registry + metrics.Registry.MustRegister( + BuildsRunningGauge, + BuildsPendingGauge, + BuildsStartedCounter, + BuildsCompletedCounter, + BuildsFailedCounter, + BuildsCancelledCounter, + TasksRunningGauge, + TasksStartedCounter, + TasksCompletedCounter, + TasksFailedCounter, + TasksCancelledCounter, + BuildStatus, + BuildRunningStatus, + TaskRunningStatus, + ) +} diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index b228eddc..065029fb 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -40,6
+40,20 @@ var ( duration = 600 * time.Second interval = 1 * time.Second + + metricLabels = []string{ + "lagoon_builds_cancelled_total", + "lagoon_builds_completed_total", + "lagoon_builds_failed_total", + "lagoon_builds_pending_current", + "lagoon_builds_running_current", + "lagoon_builds_started_total", + "lagoon_tasks_cancelled_total", + "lagoon_tasks_completed_total", + "lagoon_tasks_failed_total", + "lagoon_tasks_running_current", + "lagoon_tasks_started_total", + } ) func init() { @@ -59,16 +73,20 @@ var _ = Describe("controller", Ordered, func() { // comment to prevent cleaning up controller namespace and local services AfterAll(func() { - By("removing manager namespace") - cmd := exec.Command("kubectl", "delete", "ns", namespace) - _, _ = utils.Run(cmd) + By("stop metrics consumer") + utils.StopMetricsConsumer() + + // By("removing manager namespace") + // cmd := exec.Command("kubectl", "delete", "ns", namespace) + // _, _ = utils.Run(cmd) - By("stop local services") - utils.StopLocalServices() + // By("stop local services") + // utils.StopLocalServices() }) Context("Operator", func() { It("should run successfully", func() { + // start tests var controllerPodName string var err error @@ -116,7 +134,6 @@ var _ = Describe("controller", Ordered, func() { controllerPodName = podNames[0] ExpectWithOffset(2, controllerPodName).Should(ContainSubstring("controller-manager")) - // Validate pod status cmd = exec.Command("kubectl", "get", "pods", controllerPodName, "-o", "jsonpath={.status.phase}", "-n", namespace, @@ -130,6 +147,11 @@ var _ = Describe("controller", Ordered, func() { } EventuallyWithOffset(1, verifyControllerUp, time.Minute, time.Second).Should(Succeed()) + By("start metrics consumer") + Expect(utils.StartMetricsConsumer()).To(Succeed()) + + time.Sleep(30 * time.Second) + By("validating that lagoonbuilds are working") for _, name := range []string{"7m5zypx", "8m5zypx", "9m5zypx", "1m5zypx"} { if name == "9m5zypx" { @@ -363,7 +385,6 @@ var _ =
Describe("controller", Ordered, func() { controllerPodName = podNames[0] ExpectWithOffset(2, controllerPodName).Should(ContainSubstring("controller-manager")) verifyRobotCredentialsRotate := func() error { - // Validate pod status cmd = exec.Command("kubectl", "logs", controllerPodName, "-c", "manager", "-n", namespace, @@ -411,9 +432,15 @@ var _ = Describe("controller", Ordered, func() { return nil } EventuallyWithOffset(1, verifyNamespaceRemoved, duration, interval).Should(Succeed()) + + By("validating that there are metrics") + runCmd := `curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://remote-controller-controller-manager-metrics-service.remote-controller-system.svc.cluster.local:8443/metrics | grep -v "#" | grep "lagoon_"` + output, err := utils.RunCommonsCommand(namespace, runCmd) + ExpectWithOffset(2, err).NotTo(HaveOccurred()) + fmt.Printf("metrics: %s", string(output)) + err = utils.CheckStringContainsStrings(string(output), metricLabels) + ExpectWithOffset(2, err).NotTo(HaveOccurred()) + // End tests }) - // uncomment to debug ... 
- // time.Sleep(5 * time.Minute) }) - }) diff --git a/test/e2e/testdata/metrics-consumer.yaml b/test/e2e/testdata/metrics-consumer.yaml new file mode 100644 index 00000000..97969413 --- /dev/null +++ b/test/e2e/testdata/metrics-consumer.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Pod +metadata: + name: metrics-consumer + namespace: remote-controller-system + labels: + app: metrics-consumer +spec: + serviceAccountName: remote-controller-controller-manager + containers: + - name: metrics-consumer + image: uselagoon/commons:latest + command: ["/bin/sh"] + args: ["-c", "sleep 3000"] diff --git a/test/utils/utils.go b/test/utils/utils.go index 9a7a7a33..832a4eb9 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -51,6 +51,27 @@ func InstallBulkStorage() error { return err } +// StartMetricsConsumer applies the metrics-consumer pod manifest with kubectl and returns any error from Run. +func StartMetricsConsumer() error { + cmd := exec.Command("kubectl", "apply", "-f", "test/e2e/testdata/metrics-consumer.yaml") + _, err := Run(cmd) + return err +} + +// StopMetricsConsumer deletes the metrics-consumer pod manifest with kubectl; a failure is passed to warnError rather than returned. +func StopMetricsConsumer() { + cmd := exec.Command("kubectl", "delete", "-f", "test/e2e/testdata/metrics-consumer.yaml") + if _, err := Run(cmd); err != nil { + warnError(err) + } +} + +// RunCommonsCommand runs runCmd through "sh -c" inside the metrics-consumer pod in namespace ns and returns the result of Run. +func RunCommonsCommand(ns, runCmd string) ([]byte, error) { + cmd := exec.Command("kubectl", "-n", ns, "exec", "metrics-consumer", "--", "sh", "-c", runCmd) + return Run(cmd) +} + // Run executes the provided command within this context func Run(cmd *exec.Cmd) ([]byte, error) { dir, _ := GetProjectDir() @@ -106,3 +127,13 @@ func GetProjectDir() (string, error) { wd = strings.Replace(wd, "/test/e2e", "", -1) return wd, nil } + +// CheckStringContainsStrings returns an error naming the first entry of strs that is not a substring of str, or nil when every entry is present. +func CheckStringContainsStrings(str string, strs []string) error { + for _, s := range strs { + if !strings.Contains(str, s) { + return fmt.Errorf("substring %q not found in input string", s) + } + } + return nil +}