diff --git a/server/embed/etcd.go b/server/embed/etcd.go index 188d73f4496c..b889ba094869 100644 --- a/server/embed/etcd.go +++ b/server/embed/etcd.go @@ -744,6 +744,7 @@ func (e *Etcd) serveClients() (err error) { etcdhttp.HandleVersion(mux, e.Server) etcdhttp.HandleMetrics(mux) etcdhttp.HandleHealth(e.cfg.logger, mux, e.Server) + etcdhttp.InstallHealthEndpoints(e.cfg.logger, mux, e.Server) var gopts []grpc.ServerOption if e.cfg.GRPCKeepAliveMinTime > time.Duration(0) { diff --git a/server/etcdserver/api/etcdhttp/readiness.go b/server/etcdserver/api/etcdhttp/readiness.go new file mode 100644 index 000000000000..3861a95b5711 --- /dev/null +++ b/server/etcdserver/api/etcdhttp/readiness.go @@ -0,0 +1,211 @@ +// Copyright 2023 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package etcdhttp + +import ( + "bytes" + "context" + "fmt" + "net/http" + "strings" + "time" + + "go.uber.org/zap" + + "go.etcd.io/etcd/api/v3/etcdserverpb" + "go.etcd.io/etcd/server/v3/auth" +) + +func InstallHealthEndpoints(lg *zap.Logger, mux *http.ServeMux, server ServerHealth) { + readyz := ReadyzCheckRegistry(server) + readyz.InstallHttpEndpoints(mux, lg) + livez := LivezCheckRegistry(server) + livez.InstallHttpEndpoints(mux, lg) +} + +type HealthCheck func(ctx context.Context) error + +type CheckRegistry struct { + path string + + checks map[string]HealthCheck +} + +func LivezCheckRegistry(server ServerHealth) *CheckRegistry { + reg := CheckRegistry{path: "/livez", checks: make(map[string]HealthCheck)} + reg.Register("ping", ping) + reg.Register("serializable_read", checkSerializableRead(server)) + return ® +} + +func ReadyzCheckRegistry(server ServerHealth) *CheckRegistry { + reg := CheckRegistry{path: "/readyz", checks: make(map[string]HealthCheck)} + reg.Register("ping", ping) + reg.Register("data_corruption", checkAlarm(server, etcdserverpb.AlarmType_CORRUPT)) + reg.Register("serializable_read", checkSerializableRead(server)) + return ® +} + +func (reg *CheckRegistry) Register(name string, check HealthCheck) { + reg.checks[name] = check +} + +func (reg *CheckRegistry) InstallHttpEndpoints(mux *http.ServeMux, lg *zap.Logger) { + lg.Info(fmt.Sprintf("Installing %s http endpoint.", reg.path)) + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + cancel() + mux.Handle(reg.path, reg.handleRootHealth(ctx, lg)) + for checkName, check := range reg.checks { + mux.Handle(fmt.Sprintf("%s/%v", reg.path, checkName), adaptCheckToHandler(ctx, check)) + } +} + +// handleRootHealth returns an http.HandlerFunc that serves the provided checks. +func (reg *CheckRegistry) handleRootHealth(ctx context.Context, lg *zap.Logger) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + // extracts the health check names to be excludeList from the query param + excludeList, _ := r.URL.Query()["exclude"] + // extracts the health check names to be allowList from the query param + allowList, _ := r.URL.Query()["allowlist"] + + if len(excludeList) > 0 && len(allowList) > 0 { + http.Error(w, fmt.Sprintf("do not expect both allowlist: %v and exclude list: %v to be both specified in %s.", allowList, excludeList, reg.path), http.StatusBadRequest) + return + } + excluded := listToStringSet(excludeList) + included := listToStringSet(allowList) + failedChecks := []string{} + var individualCheckOutput bytes.Buffer + for checkName, check := range reg.checks { + if len(allowList) > 0 { + if _, found := included[checkName]; !found { + fmt.Fprintf(&individualCheckOutput, "[+]%s not included: ok\n", checkName) + continue + } + delete(included, checkName) + } else { + // no-op the check if we've specified we want to exclude the check + if _, found := excluded[checkName]; found { + delete(excluded, checkName) + fmt.Fprintf(&individualCheckOutput, "[+]%s excluded: ok\n", checkName) + continue + } + } + if err := check(ctx); err != nil { + fmt.Fprintf(&individualCheckOutput, "[-]%s failed: %v\n", checkName, err) + failedChecks = append(failedChecks, checkName) + } else { + fmt.Fprintf(&individualCheckOutput, "[+]%s ok\n", checkName) + } + } + if len(excluded) > 0 { + fmt.Fprintf(&individualCheckOutput, "warn: some health checks cannot be excluded: no matches for %s\n", formatQuoted(excluded.List()...)) + } + if len(included) > 0 { + fmt.Fprintf(&individualCheckOutput, "warn: some health checks cannot be included: no matches for %s\n", formatQuoted(included.List()...)) + } + // always be verbose on failure + if len(failedChecks) > 0 { + http.Error(w, fmt.Sprintf("%s%s check failed", individualCheckOutput.String(), reg.path), http.StatusInternalServerError) + lg.Error(fmt.Sprintf("%s%s check failed", individualCheckOutput.String(), reg.path)) + return + } + lg.Info(fmt.Sprintf("%s check passed:\n%s", reg.path, individualCheckOutput.String())) + + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + w.Header().Set("X-Content-Type-Options", "nosniff") + if _, found := r.URL.Query()["verbose"]; !found { + fmt.Fprint(w, "ok") + return + } + individualCheckOutput.WriteTo(w) + fmt.Fprintf(w, "%s check passed\n", reg.path) + } +} + +// adaptCheckToHandler returns an http.HandlerFunc that serves the provided checks. +func adaptCheckToHandler(ctx context.Context, c HealthCheck) http.HandlerFunc { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + err := c(ctx) + if err != nil { + http.Error(w, fmt.Sprintf("internal server error: %v", err), http.StatusInternalServerError) + } else { + fmt.Fprint(w, "ok") + } + }) +} + +// formatQuoted returns a formatted string of the health check names, +// preserving the order passed in. +func formatQuoted(names ...string) string { + quoted := make([]string, 0, len(names)) + for _, name := range names { + quoted = append(quoted, fmt.Sprintf("%q", name)) + } + return strings.Join(quoted, ",") +} + +type stringSet map[string]struct{} + +func (s stringSet) List() []string { + keys := make([]string, len(s)) + + i := 0 + for k := range s { + keys[i] = k + i++ + } + return keys +} + +func listToStringSet(list []string) stringSet { + set := make(map[string]struct{}) + for _, s := range list { + if len(s) == 0 { + continue + } + set[s] = struct{}{} + } + return set +} + +func ping(context.Context) error { return nil } + +// checkAlarm checks if a specific alarm type is active in the server. +func checkAlarm(srv ServerHealth, at etcdserverpb.AlarmType) func(context.Context) error { + return func(ctx context.Context) error { + as := srv.Alarms() + if len(as) > 0 { + for _, v := range as { + if v.Alarm == at { + return fmt.Errorf("Alarm active:%s", at.String()) + } + } + } + return nil + } +} + +func checkSerializableRead(srv ServerHealth) func(ctx context.Context) error { + return func(ctx context.Context) error { + checkCtx, cancel := context.WithTimeout(ctx, time.Second) + _, err := srv.Range(checkCtx, &etcdserverpb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: true}) + cancel() + if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied { + return fmt.Errorf("RANGE ERROR:%s", err) + } + return nil + } +} diff --git a/server/etcdserver/api/etcdhttp/readiness_test.go b/server/etcdserver/api/etcdhttp/readiness_test.go new file mode 100644 index 000000000000..527e47d435d5 --- /dev/null +++ b/server/etcdserver/api/etcdhttp/readiness_test.go @@ -0,0 +1,193 @@ +// Copyright 2023 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package etcdhttp + +import ( + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "go.uber.org/zap/zaptest" + + pb "go.etcd.io/etcd/api/v3/etcdserverpb" + "go.etcd.io/etcd/client/pkg/v3/testutil" + "go.etcd.io/etcd/server/v3/auth" +) + +type healthTestCase struct { + name string + healthCheckURL string + expectStatusCode int + inResult []string + notInResult []string + + alarms []*pb.AlarmMember + apiError error +} + +func TestDataCorruptionCheck(t *testing.T) { + tests := []healthTestCase{ + { + name: "Live if CORRUPT alarm is on", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_CORRUPT}}, + healthCheckURL: "/livez", + expectStatusCode: http.StatusOK, + notInResult: []string{"data_corruption"}, + }, + { + name: "Not ready if CORRUPT alarm is on", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_CORRUPT}}, + healthCheckURL: "/readyz", + expectStatusCode: http.StatusInternalServerError, + inResult: []string{"[-]data_corruption failed: Alarm active:CORRUPT", "[+]ping ok", "readyz check failed"}, + }, + { + name: "ready if CORRUPT alarm is not on", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/readyz", + expectStatusCode: http.StatusOK, + }, + { + name: "ready if CORRUPT alarm is excluded", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_CORRUPT}, {MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/readyz?exclude=data_corruption&exclude=ping&verbose", + expectStatusCode: http.StatusOK, + inResult: []string{"[+]data_corruption excluded: ok", "[+]ping excluded: ok", "[+]serializable_read ok", "readyz check passed"}, + }, + { + name: "ready if CORRUPT alarm is not allowlisted", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_CORRUPT}, {MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/readyz?allowlist=ping&verbose", + expectStatusCode: http.StatusOK, + inResult: []string{"[+]data_corruption not included: ok", "[+]ping ok", "[+]serializable_read not included: ok", "readyz check passed"}, + }, + { + name: "/ready/data_corruption error if CORRUPT alarm is on", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_CORRUPT}, {MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/readyz/data_corruption", + expectStatusCode: http.StatusInternalServerError, + }, + { + name: "/ready/ping ok if CORRUPT alarm is on", + alarms: []*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_CORRUPT}, {MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}}, + healthCheckURL: "/readyz/ping", + expectStatusCode: http.StatusOK, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mux := http.NewServeMux() + s := &fakeHealthServer{} + logger := zaptest.NewLogger(t) + InstallHealthEndpoints(logger, mux, s) + ts := httptest.NewServer(mux) + defer ts.Close() + // OK before alarms are activated. + checkHttpResponse(t, ts, tt.healthCheckURL, http.StatusOK, nil, nil) + // Activate the alarms. + s.alarms = tt.alarms + checkHttpResponse(t, ts, tt.healthCheckURL, tt.expectStatusCode, tt.inResult, tt.notInResult) + }) + } +} + +func TestSerializableReadCheck(t *testing.T) { + tests := []healthTestCase{ + { + name: "Alive even if authentication failed", + healthCheckURL: "/livez", + apiError: auth.ErrUserEmpty, + expectStatusCode: http.StatusOK, + }, + { + name: "Alive even if authorization failed", + healthCheckURL: "/livez", + apiError: auth.ErrPermissionDenied, + expectStatusCode: http.StatusOK, + }, + { + name: "Not alive if range api is not available", + healthCheckURL: "/livez", + apiError: fmt.Errorf("Unexpected error"), + expectStatusCode: http.StatusInternalServerError, + }, + { + name: "Not ready if range api is not available", + healthCheckURL: "/readyz", + apiError: fmt.Errorf("Unexpected error"), + expectStatusCode: http.StatusInternalServerError, + }, + { + name: "ok if allowlist a non registered check", + healthCheckURL: "/readyz?allowlist=non_exist&verbose", + apiError: fmt.Errorf("Unexpected error"), + expectStatusCode: http.StatusOK, + inResult: []string{"[+]ping not included: ok", "[+]serializable_read not included: ok", "warn: some health checks cannot be included: no matches for \"non_exist\"", "readyz check passed"}, + }, + { + name: "Cannot specify both allowlist and exclude at the same time", + healthCheckURL: "/readyz?allowlist=ping&exclude=serializable_read", + apiError: fmt.Errorf("Unexpected error"), + expectStatusCode: http.StatusBadRequest, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mux := http.NewServeMux() + s := &fakeHealthServer{apiError: tt.apiError} + logger := zaptest.NewLogger(t) + InstallHealthEndpoints(logger, mux, s) + ts := httptest.NewServer(mux) + defer ts.Close() + checkHttpResponse(t, ts, tt.healthCheckURL, tt.expectStatusCode, tt.inResult, tt.notInResult) + }) + } +} + +func checkHttpResponse(t *testing.T, ts *httptest.Server, url string, expectStatusCode int, inResult []string, notInResult []string) { + res, err := ts.Client().Do(&http.Request{Method: http.MethodGet, URL: testutil.MustNewURL(t, ts.URL+url)}) + if err != nil { + t.Errorf("fail serve http request %s %v", url, err) + } + if res == nil { + t.Errorf("got nil http response with http request %s", url) + return + } + if res.StatusCode != expectStatusCode { + t.Errorf("want statusCode %d but got %d", expectStatusCode, res.StatusCode) + } + defer res.Body.Close() + b, err := io.ReadAll(res.Body) + if err != nil { + t.Errorf("Failed to read response for %s", url) + return + } + result := string(b) + for _, substr := range inResult { + if !strings.Contains(result, substr) { + t.Errorf("Could not find substring : %s, in response: %s", substr, result) + return + } + } + for _, substr := range notInResult { + if strings.Contains(result, substr) { + t.Errorf("Do not expect substring : %s, in response: %s", substr, result) + return + } + } +}