From 4b9963e1afd61bf0e3853b927c13f3d8ea6501f5 Mon Sep 17 00:00:00 2001 From: Luke Kysow <1034429+lkysow@users.noreply.github.com> Date: Mon, 5 Jul 2021 12:22:07 -0700 Subject: [PATCH] Change error handling when scraping metrics * If Envoy returns an error then also respond with a 500 in our merged metrics response so that Prometheus will know that we had an error, not that there are no metrics. * If the service metrics return with a non-2xx status code then don't include the response body in the merged metrics. This will stop issues where users accidentally turn on metrics merging but they don't have an exporter and so their metrics endpoint returns 404. I could have responded with a 500 in this case in order to indicate that there is an error, however I think it's more likely that users are accidentally turning on metrics merging and the error indication is accomplished via a new metric (see below). * Append a new metric that indicates the success of the service scraping. This can be used for alerting by users since the response code of the service metrics response is discarded: * success: consul_metrics_merging_service_metrics_success 1 * fail: consul_metrics_merging_service_metrics_success 0 * modify logging to use key/value pairs * Fixes #546 --- CHANGELOG.md | 4 + .../subcommand/consul-sidecar/command.go | 80 +++++++++++--- .../subcommand/consul-sidecar/command_test.go | 100 +++++++++++------- 3 files changed, 131 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 873d49b793..d977798e5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ BREAKING CHANGES: * Control Plane * Update minimum go version for project to 1.17 [[GH-878](https://github.com/hashicorp/consul-k8s/pull/878)] + * Add boolean metric to merged metrics response `consul_merged_service_metrics_success` to indicate if service metrics were + scraped successfully. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)] IMPROVEMENTS: * CLI @@ -16,6 +18,8 @@ IMPROVEMENTS: BUG FIXES: * Control Plane: * Add a workaround to check that the ACL token is replicated to other Consul servers. [[GH-862](https://github.com/hashicorp/consul-k8s/issues/862)] + * Return 500 on prometheus response if unable to get metrics from Envoy. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)] + * Don't include body of failed service metrics calls in merged metrics response. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)] * Helm Chart * Admin Partitions **(Consul Enterprise only)**: Do not mount Consul CA certs to partition-init job if `externalServers.useSystemRoots` is `true`. [[GH-885](https://github.com/hashicorp/consul-k8s/pull/885)] diff --git a/control-plane/subcommand/consul-sidecar/command.go b/control-plane/subcommand/consul-sidecar/command.go index dcb05199ba..b6ff0d9d39 100644 --- a/control-plane/subcommand/consul-sidecar/command.go +++ b/control-plane/subcommand/consul-sidecar/command.go @@ -21,8 +21,13 @@ import ( "github.com/mitchellh/cli" ) -const metricsServerShutdownTimeout = 5 * time.Second -const envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus" +const ( + metricsServerShutdownTimeout = 5 * time.Second + envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus" + // prometheusServiceMetricsSuccessKey is the key of the prometheus metric used to + // indicate if service metrics were scraped successfully. + prometheusServiceMetricsSuccessKey = "consul_merged_service_metrics_success" +) type Command struct { UI cli.Ui @@ -240,27 +245,36 @@ func (c *Command) createMergedMetricsServer() *http.Server { mergedMetricsServerAddr := fmt.Sprintf("127.0.0.1:%s", c.flagMergedMetricsPort) server := &http.Server{Addr: mergedMetricsServerAddr, Handler: mux} + // http.Client satisfies the metricsGetter interface. // The default http.Client timeout is indefinite, so adding a timeout makes // sure that requests don't hang. client := &http.Client{ Timeout: time.Second * 10, } - // http.Client satisfies the metricsGetter interface. - c.envoyMetricsGetter = client - c.serviceMetricsGetter = client + + // During tests these may already be set to mocks. + if c.envoyMetricsGetter == nil { + c.envoyMetricsGetter = client + } + if c.serviceMetricsGetter == nil { + c.serviceMetricsGetter = client + } return server } // mergedMetricsHandler has the logic to append both Envoy and service metrics // together, logging if it's unsuccessful at either. +// If the Envoy scrape fails, we respond with a 500 code which follows the Prometheus +// exporter guidelines. If the service scrape fails, we respond with a 200 so +// that the Envoy metrics are still scraped. +// We also include a metric line in each response indicating the success or +// failure of the service metric scraping. func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) { - envoyMetrics, err := c.envoyMetricsGetter.Get(envoyMetricsAddr) if err != nil { - // If there is an error scraping Envoy, we want the handler to return - // without writing anything to the response, and log the error. - c.logger.Error(fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err.Error())) + c.logger.Error("Error scraping Envoy proxy metrics", "err", err) + http.Error(rw, fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err), http.StatusInternalServerError) return } @@ -273,18 +287,22 @@ func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) }() envoyMetricsBody, err := ioutil.ReadAll(envoyMetrics.Body) if err != nil { - c.logger.Error(fmt.Sprintf("Couldn't read Envoy proxy metrics: %s", err.Error())) + c.logger.Error("Could not read Envoy proxy metrics", "err", err) + http.Error(rw, fmt.Sprintf("Could not read Envoy proxy metrics: %s", err), http.StatusInternalServerError) return } - _, err = rw.Write(envoyMetricsBody) - if err != nil { - c.logger.Error(fmt.Sprintf("Error writing envoy metrics body: %s", err.Error())) + if non2xxCode(envoyMetrics.StatusCode) { + c.logger.Error("Received non-2xx status code scraping Envoy proxy metrics", "code", envoyMetrics.StatusCode, "response", string(envoyMetricsBody)) + http.Error(rw, fmt.Sprintf("Received non-2xx status code scraping Envoy proxy metrics: %d: %s", envoyMetrics.StatusCode, string(envoyMetricsBody)), http.StatusInternalServerError) + return } + writeResponse(rw, envoyMetricsBody, "envoy metrics", c.logger) serviceMetricsAddr := fmt.Sprintf("http://127.0.0.1:%s%s", c.flagServiceMetricsPort, c.flagServiceMetricsPath) serviceMetrics, err := c.serviceMetricsGetter.Get(serviceMetricsAddr) if err != nil { - c.logger.Warn(fmt.Sprintf("Error scraping service metrics: %s", err.Error())) + c.logger.Warn("Error scraping service metrics", "err", err) + writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger) // Since we've already written the Envoy metrics to the response, we can // return at this point if we were unable to get service metrics. return @@ -300,12 +318,25 @@ func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) }() serviceMetricsBody, err := ioutil.ReadAll(serviceMetrics.Body) if err != nil { - c.logger.Error(fmt.Sprintf("Couldn't read service metrics: %s", err.Error())) + c.logger.Error("Could not read service metrics", "err", err) + writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger) return } - _, err = rw.Write(serviceMetricsBody) + if non2xxCode(serviceMetrics.StatusCode) { + c.logger.Error("Received non-2xx status code scraping service metrics", "code", serviceMetrics.StatusCode, "response", string(serviceMetricsBody)) + writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger) + return + } + writeResponse(rw, serviceMetricsBody, "service metrics", c.logger) + writeResponse(rw, serviceMetricSuccess(true), "service metrics success", c.logger) +} + +// writeResponse is a helper method to write resp to rw and log if there is an error writing. +// respName is the name of this response that will be used in the error log. +func writeResponse(rw http.ResponseWriter, resp []byte, respName string, logger hclog.Logger) { + _, err := rw.Write(resp) if err != nil { - c.logger.Error(fmt.Sprintf("Error writing service metrics body: %s", err.Error())) + logger.Error(fmt.Sprintf("Error writing %s: %s", respName, err.Error())) } } @@ -339,6 +370,21 @@ func (c *Command) validateFlags() error { return nil } +// non2xxCode returns true if code is not in the range of 200-299 inclusive. +func non2xxCode(code int) bool { + return code < 200 || code >= 300 +} + +// serviceMetricSuccess returns a prometheus metric line indicating +// the success of the metrics merging. +func serviceMetricSuccess(success bool) []byte { + boolAsInt := 0 + if success { + boolAsInt = 1 + } + return []byte(fmt.Sprintf("%s %d\n", prometheusServiceMetricsSuccessKey, boolAsInt)) +} + // parseConsulFlags creates Consul client command flags // from command's HTTP flags and returns them as an array of strings. func (c *Command) parseConsulFlags() []string { diff --git a/control-plane/subcommand/consul-sidecar/command_test.go b/control-plane/subcommand/consul-sidecar/command_test.go index 6ee55f8ea0..370b4afd8f 100644 --- a/control-plane/subcommand/consul-sidecar/command_test.go +++ b/control-plane/subcommand/consul-sidecar/command_test.go @@ -214,50 +214,88 @@ func TestRunSignalHandlingAllProcessesEnabled(t *testing.T) { } } -type envoyMetrics struct { +type mockEnvoyMetricsGetter struct { + respStatusCode int } -func (em *envoyMetrics) Get(url string) (resp *http.Response, err error) { +func (em *mockEnvoyMetricsGetter) Get(_ string) (resp *http.Response, err error) { response := &http.Response{} + response.StatusCode = em.respStatusCode response.Body = ioutil.NopCloser(bytes.NewReader([]byte("envoy metrics\n"))) return response, nil } -type serviceMetrics struct { - url string +// mockServiceMetricsGetter +type mockServiceMetricsGetter struct { + // reqURL is the last URL that was passed to Get(url) + reqURL string + + // respStatusCode is the status code to use for the response. + respStatusCode int } -func (sm *serviceMetrics) Get(url string) (resp *http.Response, err error) { +func (sm *mockServiceMetricsGetter) Get(url string) (resp *http.Response, err error) { + // Record the URL that we were called with. + sm.reqURL = url + response := &http.Response{} response.Body = ioutil.NopCloser(bytes.NewReader([]byte("service metrics\n"))) - sm.url = url + response.StatusCode = sm.respStatusCode + return response, nil } func TestMergedMetricsServer(t *testing.T) { cases := []struct { - name string - runEnvoyMetricsServer bool - runServiceMetricsServer bool - expectedOutput string + name string + envoyMetricsGetter *mockEnvoyMetricsGetter + serviceMetricsGetter *mockServiceMetricsGetter + expectedStatusCode int + expectedOutput string }{ { - name: "happy path: envoy and service metrics are merged", - runEnvoyMetricsServer: true, - runServiceMetricsServer: true, - expectedOutput: "envoy metrics\nservice metrics\n", + name: "happy path: envoy and service metrics are merged", + envoyMetricsGetter: &mockEnvoyMetricsGetter{ + respStatusCode: 200, + }, + serviceMetricsGetter: &mockServiceMetricsGetter{ + respStatusCode: 200, + }, + expectedStatusCode: 200, + expectedOutput: "envoy metrics\nservice metrics\nconsul_merged_service_metrics_success 1\n", }, { - name: "no service metrics", - runEnvoyMetricsServer: true, - runServiceMetricsServer: false, - expectedOutput: "envoy metrics\n", + name: "service metrics non-200", + envoyMetricsGetter: &mockEnvoyMetricsGetter{ + respStatusCode: 200, + }, + serviceMetricsGetter: &mockServiceMetricsGetter{ + respStatusCode: 404, + }, + expectedStatusCode: 200, + expectedOutput: "envoy metrics\nconsul_merged_service_metrics_success 0\n", }, { - name: "no envoy metrics", - runEnvoyMetricsServer: false, - runServiceMetricsServer: true, - expectedOutput: "", + name: "envoy metrics non-200", + envoyMetricsGetter: &mockEnvoyMetricsGetter{ + respStatusCode: 404, + }, + serviceMetricsGetter: &mockServiceMetricsGetter{ + respStatusCode: 200, + }, + expectedStatusCode: 500, + expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 404: envoy metrics\n\n", + }, + { + name: "envoy and service metrics non-200", + envoyMetricsGetter: &mockEnvoyMetricsGetter{ + respStatusCode: 500, + }, + serviceMetricsGetter: &mockServiceMetricsGetter{ + respStatusCode: 500, + }, + expectedStatusCode: 500, + expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 500: envoy metrics\n\n", }, } @@ -272,21 +310,11 @@ func TestMergedMetricsServer(t *testing.T) { flagServiceMetricsPort: fmt.Sprint(randomPorts[1]), flagServiceMetricsPath: "/metrics", logger: hclog.Default(), + envoyMetricsGetter: c.envoyMetricsGetter, + serviceMetricsGetter: c.serviceMetricsGetter, } server := cmd.createMergedMetricsServer() - - // Override the cmd's envoyMetricsGetter and serviceMetricsGetter - // with stubs. - em := &envoyMetrics{} - sm := &serviceMetrics{} - if c.runEnvoyMetricsServer { - cmd.envoyMetricsGetter = em - } - if c.runServiceMetricsServer { - cmd.serviceMetricsGetter = sm - } - go func() { _ = server.ListenAndServe() }() @@ -304,8 +332,8 @@ func TestMergedMetricsServer(t *testing.T) { // Verify the correct service metrics url was used. The service // metrics endpoint is only called if the Envoy metrics endpoint // call succeeds. - if c.runServiceMetricsServer && c.runEnvoyMetricsServer { - require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), sm.url) + if c.envoyMetricsGetter.respStatusCode == 200 { + require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), c.serviceMetricsGetter.reqURL) } }) })