From ca3780eb922da480b736aeb755cb25d8f4ee6918 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Fri, 12 Jan 2024 16:18:47 -0600 Subject: [PATCH] backport of commit e1e80f383ec5a784a9bfad11937ba3fe143b4ec3 (#19730) Co-authored-by: Luiz Aoqui --- .changelog/19720.txt | 3 + api/operator.go | 60 ++++++ command/agent/http.go | 1 + command/agent/operator_endpoint.go | 29 +++ command/agent/operator_endpoint_test.go | 39 ++++ command/setup_vault.go | 180 ++++++++++++++++- command/setup_vault_test.go | 153 ++++++++++++++ nomad/operator_endpoint.go | 95 +++++++++ nomad/operator_endpoint_test.go | 181 +++++++++++++++++ nomad/structs/operator.go | 12 ++ nomad/structs/workload_id.go | 7 + .../api-docs/operator/upgrade-check.mdx | 187 ++++++++++++++++++ website/content/docs/commands/setup/vault.mdx | 62 ++++++ .../content/docs/integrations/vault/acl.mdx | 61 ++++-- website/data/api-docs-nav-data.json | 4 + website/redirects.js | 11 ++ 16 files changed, 1070 insertions(+), 15 deletions(-) create mode 100644 .changelog/19720.txt create mode 100644 command/setup_vault_test.go create mode 100644 website/content/api-docs/operator/upgrade-check.mdx diff --git a/.changelog/19720.txt b/.changelog/19720.txt new file mode 100644 index 00000000000..9823f3a0352 --- /dev/null +++ b/.changelog/19720.txt @@ -0,0 +1,3 @@ +```release-note:improvement +cli: Add new option `nomad setup vault -check` to help cluster operators migrate to workload identities for Vault +``` diff --git a/api/operator.go b/api/operator.go index e9823b2f541..a6f11f45e4d 100644 --- a/api/operator.go +++ b/api/operator.go @@ -411,3 +411,63 @@ type LeadershipTransferResponse struct { WriteMeta } + +// VaultWorkloadIdentityUpgradeCheck is the result of verifying if the cluster +// is ready to switch to workload identities for Vault. 
+type VaultWorkloadIdentityUpgradeCheck struct { + // JobsWithoutVaultIdentity is the list of jobs that have a `vault` block + // but do not have an `identity` for Vault. + JobsWithoutVaultIdentity []*JobListStub + + // OutdatedNodes is the list of nodes running a version of Nomad that does + // not support workload identities for Vault. + OutdatedNodes []*NodeListStub + + // VaultTokens is the list of Vault ACL token accessors that Nomad created + // and will no longer manage after the cluster is migrated to workload + // identities. + VaultTokens []*VaultAccessor +} + +// Ready returns true if the cluster is ready to migrate to workload identities +// with Vault. +func (v *VaultWorkloadIdentityUpgradeCheck) Ready() bool { + return v != nil && + len(v.VaultTokens) == 0 && + len(v.OutdatedNodes) == 0 && + len(v.JobsWithoutVaultIdentity) == 0 +} + +// VaultAccessor is a Vault ACL token created by Nomad for a task to access +// Vault using the legacy authentication flow. +type VaultAccessor struct { + // AllocID is the ID of the allocation that requested this token. + AllocID string + + // Task is the name of the task that requested this token. + Task string + + // NodeID is the ID of the node running the allocation that requested this + // token. + NodeID string + + // Accessor is the Vault ACL token accessor ID. + Accessor string + + // CreationTTL is the TTL set when the token was created. + CreationTTL int + + // CreateIndex is the Raft index when the token was created. + CreateIndex uint64 +} + +// UpgradeCheckVaultWorkloadIdentity retrieves the cluster status for migrating +// to workload identities with Vault. 
+func (op *Operator) UpgradeCheckVaultWorkloadIdentity(q *QueryOptions) (*VaultWorkloadIdentityUpgradeCheck, *QueryMeta, error) { + var resp VaultWorkloadIdentityUpgradeCheck + qm, err := op.c.query("/v1/operator/upgrade-check/vault-workload-identity", &resp, q) + if err != nil { + return nil, nil, err + } + return &resp, qm, nil +} diff --git a/command/agent/http.go b/command/agent/http.go index 4f7e20f07b3..35e3ccd557e 100644 --- a/command/agent/http.go +++ b/command/agent/http.go @@ -487,6 +487,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) { s.mux.HandleFunc("/v1/operator/autopilot/configuration", s.wrap(s.OperatorAutopilotConfiguration)) s.mux.HandleFunc("/v1/operator/autopilot/health", s.wrap(s.OperatorServerHealth)) s.mux.HandleFunc("/v1/operator/snapshot", s.wrap(s.SnapshotRequest)) + s.mux.HandleFunc("/v1/operator/upgrade-check/", s.wrap(s.UpgradeCheckRequest)) s.mux.HandleFunc("/v1/system/gc", s.wrap(s.GarbageCollectRequest)) s.mux.HandleFunc("/v1/system/reconcile/summaries", s.wrap(s.ReconcileJobSummaries)) diff --git a/command/agent/operator_endpoint.go b/command/agent/operator_endpoint.go index c79641e02eb..09d4343ddf5 100644 --- a/command/agent/operator_endpoint.go +++ b/command/agent/operator_endpoint.go @@ -521,3 +521,32 @@ func (s *HTTPServer) snapshotRestoreRequest(resp http.ResponseWriter, req *http. 
return nil, codedErr } + +func (s *HTTPServer) UpgradeCheckRequest(resp http.ResponseWriter, req *http.Request) (any, error) { + path := strings.TrimPrefix(req.URL.Path, "/v1/operator/upgrade-check") + switch { + case strings.HasSuffix(path, "/vault-workload-identity"): + return s.upgradeCheckVaultWorkloadIdentity(resp, req) + default: + return nil, CodedError(http.StatusNotFound, fmt.Sprintf("Path %s not found", req.URL.Path)) + } +} + +func (s *HTTPServer) upgradeCheckVaultWorkloadIdentity(resp http.ResponseWriter, req *http.Request) (any, error) { + if req.Method != http.MethodGet { + return nil, CodedError(405, ErrInvalidMethod) + } + + args := structs.UpgradeCheckVaultWorkloadIdentityRequest{} + if s.parse(resp, req, &args.Region, &args.QueryOptions) { + return nil, nil + } + + var out structs.UpgradeCheckVaultWorkloadIdentityResponse + if err := s.agent.RPC("Operator.UpgradeCheckVaultWorkloadIdentity", &args, &out); err != nil { + return nil, err + } + + setMeta(resp, &out.QueryMeta) + return out, nil +} diff --git a/command/agent/operator_endpoint_test.go b/command/agent/operator_endpoint_test.go index 2d5590ffd06..ba0666620b1 100644 --- a/command/agent/operator_endpoint_test.go +++ b/command/agent/operator_endpoint_test.go @@ -661,3 +661,42 @@ func TestOperator_SnapshotRequests(t *testing.T) { require.True(t, jobExists()) }) } + +func TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity(t *testing.T) { + ci.Parallel(t) + httpTest(t, func(c *Config) { + c.Vaults[0].Enabled = pointer.Of(true) + c.Vaults[0].Name = "default" + }, func(s *TestAgent) { + // Create a test job with a Vault block but without an identity. 
+ job := mock.Job() + job.TaskGroups[0].Tasks[0].Vault = &structs.Vault{ + Cluster: "default", + Policies: []string{"test"}, + } + + args := structs.JobRegisterRequest{ + Job: job, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var resp structs.JobRegisterResponse + err := s.Agent.RPC("Job.Register", &args, &resp) + must.NoError(t, err) + + // Make HTTP request to retrieve + req, err := http.NewRequest(http.MethodGet, "/v1/operator/upgrade-check/vault-workload-identity", nil) + must.NoError(t, err) + respW := httptest.NewRecorder() + + obj, err := s.Server.UpgradeCheckRequest(respW, req) + must.NoError(t, err) + must.NotEq(t, "", respW.Header().Get("X-Nomad-Index")) + must.NotEq(t, "", respW.Header().Get("X-Nomad-LastContact")) + must.Eq(t, "true", respW.Header().Get("X-Nomad-KnownLeader")) + + upgradeCheck := obj.(structs.UpgradeCheckVaultWorkloadIdentityResponse) + must.Len(t, 1, upgradeCheck.JobsWithoutVaultIdentity) + must.Len(t, 0, upgradeCheck.VaultTokens) + must.Eq(t, job.ID, upgradeCheck.JobsWithoutVaultIdentity[0].ID) + }) +} diff --git a/command/setup_vault.go b/command/setup_vault.go index e052f0c7c90..a0557063b5c 100644 --- a/command/setup_vault.go +++ b/command/setup_vault.go @@ -12,6 +12,7 @@ import ( "slices" "strings" + "github.com/dustin/go-humanize/english" "github.com/hashicorp/vault/api" "github.com/mitchellh/cli" "github.com/posener/complete" @@ -48,6 +49,12 @@ type SetupVaultCommand struct { destroy bool autoYes bool + + // Options for -check. + check bool + json bool + tmpl string + verbose bool } // Help satisfies the cli.Command Help function. @@ -62,6 +69,10 @@ Usage: nomad setup vault [options] VAULT_TOKEN, VAULT_ADDR, and other Vault-related environment variables as documented in https://developer.hashicorp.com/vault/docs/commands#environment-variables. + The -check option can be used to verify if the Nomad cluster is ready to + migrate to use Workload Identities with Vault. 
This option requires + operator:read permission for Nomad. + WARNING: This command is an experimental feature and may change its behavior in future versions of Nomad. @@ -79,7 +90,22 @@ Setup Vault options: Automatically answers "yes" to all the questions, making the setup non-interactive. Defaults to "false". -` + -check + Verify if the Nomad cluster is ready to migrate to Workload Identities. + +Setup Vault options when using -check: + + -json + Output migration status information in its JSON format. + + -t + Format and display migration status information using a Go template. + + -verbose + Display full information. + + ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + return strings.TrimSpace(helpText) } @@ -89,6 +115,12 @@ func (s *SetupVaultCommand) AutocompleteFlags() complete.Flags { "-jwks-url": complete.PredictAnything, "-destroy": complete.PredictSet("true", "false"), "-y": complete.PredictSet("true", "false"), + + // Options for -check. + "-check": complete.PredictSet("true", "false"), + "-json": complete.PredictSet("true", "false"), + "-verbose": complete.PredictSet("true", "false"), + "-t": complete.PredictAnything, }) } @@ -110,6 +142,13 @@ func (s *SetupVaultCommand) Run(args []string) int { flags.BoolVar(&s.destroy, "destroy", false, "") flags.BoolVar(&s.autoYes, "y", false, "") flags.StringVar(&s.jwksURL, "jwks-url", "http://localhost:4646/.well-known/jwks.json", "") + + // Options for -check. + flags.BoolVar(&s.check, "check", false, "") + flags.BoolVar(&s.json, "json", false, "") + flags.BoolVar(&s.verbose, "verbose", false, "") + flags.StringVar(&s.tmpl, "t", "", "") + if err := flags.Parse(args); err != nil { return 1 } @@ -121,6 +160,32 @@ func (s *SetupVaultCommand) Run(args []string) int { return 1 } + if s.check { + return s.checkUpgrade() + } else { + // Verify that -check flags are not set. 
+ var invalid []string + if s.json { + invalid = append(invalid, "-json") + } + if s.verbose { + invalid = append(invalid, "-verbose") + } + if s.tmpl != "" { + invalid = append(invalid, "-t") + } + + if len(invalid) > 0 { + s.Ui.Error(fmt.Sprintf( + "The %s %s can only be used with -check", + english.OxfordWordSeries(invalid, "and"), + english.PluralWord(len(invalid), "option", "options"), + )) + s.Ui.Error(commandErrorText(s)) + return 1 + } + } + if !isTty() && !s.autoYes { s.Ui.Error("This command requires -y option when running in non-interactive mode") return 1 @@ -216,7 +281,7 @@ a namespace %q and create all configuration within that namespace. */ s.Ui.Output(` We will now enable the JWT credential backend and create a JWT auth method that -Nomad workloads will use. +Nomad workloads will use. `) if s.authMethodExists() { @@ -606,6 +671,117 @@ func (s *SetupVaultCommand) removeConfiguredComponents() int { return exitCode } +func (s *SetupVaultCommand) checkUpgrade() int { + length := shortId + if s.verbose { + length = fullId + } + + client, err := s.Meta.Client() + if err != nil { + s.Ui.Error(fmt.Sprintf("Error initializing client: %s", err)) + return 1 + } + + resp, _, err := client.Operator().UpgradeCheckVaultWorkloadIdentity(nil) + if err != nil { + s.Ui.Error(fmt.Sprintf("Error querying scheduler configuration: %s", err)) + return 1 + } + + // Output formatted option if requested. + if s.json || len(s.tmpl) > 0 { + out, err := Format(s.json, s.tmpl, resp) + if err != nil { + s.Ui.Error(err.Error()) + return 1 + } + + s.Ui.Output(out) + return 0 + } + + if resp.Ready() { + s.Ui.Output("Nomad cluster is ready to use workload identities with Vault.") + return 0 + } + + if len(resp.JobsWithoutVaultIdentity) != 0 { + s.Ui.Output(s.Colorize().Color(` +[bold]Jobs Without Workload Identity for Vault[reset] +The following jobs access Vault but are not configured for workload identity. 
+ +You should redeploy them before fully migrating to workload identities with +Vault to prevent unexpected errors if their tokens need to be recreated. + +Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration +for more information. +`)) + out := make([]string, len(resp.JobsWithoutVaultIdentity)+1) + out[0] = "ID|Namespace|Type|Status" + for i, job := range resp.JobsWithoutVaultIdentity { + out[i+1] = fmt.Sprintf("%s|%s|%s|%s", + limit(job.ID, length), + job.Namespace, + job.Type, + job.Status, + ) + } + s.Ui.Output(formatList(out)) + } + + if len(resp.OutdatedNodes) != 0 { + s.Ui.Output(s.Colorize().Color(` +[bold]Outdated Nodes[reset] +The following nodes are running a version of Nomad that does not support using +workload identities with Vault. + +You should upgrade them to Nomad 1.7 before fully migrating to workload +identities with Vault to prevent unexpected errors if they receive allocations +for jobs that use Vault. + +Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration +for more information. +`)) + out := make([]string, len(resp.OutdatedNodes)+1) + out[0] = "ID|Name|Address|Version|Drain|Eligibility|Status" + for i, node := range resp.OutdatedNodes { + out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s|%s", + limit(node.ID, length), + node.Name, + node.Address, + node.Version, + node.Drain, + node.SchedulingEligibility, + node.Status, + ) + } + s.Ui.Output(formatList(out)) + } + + if len(resp.VaultTokens) != 0 { + s.Ui.Output(s.Colorize().Color(` +[bold]Vault Tokens[reset] +The following Vault ACL tokens were created by Nomad but will not be +automatically revoked after migrating to workload identities. They will expire +once their TTL reaches zero. 
+`)) + out := make([]string, len(resp.VaultTokens)+1) + out[0] = "Accessor ID|Allocation ID|Node ID|Configured TTL" + for i, token := range resp.VaultTokens { + out[i+1] = fmt.Sprintf("%s|%s|%s|%d", + token.Accessor, + limit(token.AllocID, length), + limit(token.NodeID, length), + token.CreationTTL, + ) + } + s.Ui.Output(formatList(out)) + } + + return 0 +} + func printMapOfStrings(m map[string]string) string { var output string diff --git a/command/setup_vault_test.go b/command/setup_vault_test.go new file mode 100644 index 00000000000..9bce8d8b2a5 --- /dev/null +++ b/command/setup_vault_test.go @@ -0,0 +1,153 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package command + +import ( + "fmt" + "testing" + + "github.com/mitchellh/cli" + "github.com/shoenig/test/must" + + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/command/agent" + "github.com/hashicorp/nomad/helper/pointer" +) + +func TestSetupVaultCommand_Run(t *testing.T) { + ci.Parallel(t) + + // Start in dev mode so we get a node registration + srv, client, url := testServer(t, true, func(c *agent.Config) { + c.DevMode = true + c.Vaults[0].Name = "default" + c.Vaults[0].Enabled = pointer.Of(true) + }) + defer srv.Shutdown() + + // Register a job with a vault block but without an identity for Vault. 
+ job := testJob("test") + job.TaskGroups[0].Tasks[0].Vault = &api.Vault{ + Cluster: "default", + Policies: []string{"test"}, + } + _, _, err := client.Jobs().Register(job, nil) + must.NoError(t, err) + + job, _, err = client.Jobs().Info(*job.ID, nil) + must.NoError(t, err) + + testCases := []struct { + name string + args []string + expectedErr string + expectedRC int + expectedOut string + }{ + { + name: "-check flags", + args: []string{ + "-json", + "-t", "{{.}}", + "-verbose", + }, + expectedRC: 1, + expectedErr: "The -json, -verbose, and -t options can only be used with -check", + }, + { + name: "-check", + args: []string{ + "-check", + "-address", url, + }, + expectedRC: 0, + expectedOut: ` +Jobs Without Workload Identity for Vault +The following jobs access Vault but are not configured for workload identity. + +You should redeploy them before fully migrating to workload identities with +Vault to prevent unexpected errors if their tokens need to be recreated. + +Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration +for more information. 
+ +ID Namespace Type Status +test default batch pending +`, + }, + { + name: "-check with -json", + args: []string{ + "-check", + "-json", + "-address", url, + }, + expectedRC: 0, + expectedOut: fmt.Sprintf(`{ + "JobsWithoutVaultIdentity": [ + { + "CreateIndex": 10, + "Datacenters": [ + "dc1" + ], + "ID": "test", + "JobModifyIndex": %d, + "JobSummary": null, + "ModifyIndex": %d, + "Name": "test", + "Namespace": "default", + "ParameterizedJob": false, + "ParentID": "", + "Periodic": false, + "Priority": 1, + "Status": "pending", + "StatusDescription": "", + "Stop": false, + "SubmitTime": %d, + "Type": "batch" + } + ], + "OutdatedNodes": [], + "VaultTokens": [] +} +`, *job.CreateIndex, *job.ModifyIndex, *job.SubmitTime), + }, + { + name: "-check with -t", + args: []string{ + "-check", + "-t", "{{with index .JobsWithoutVaultIdentity 0}}{{.ID}}{{end}}", + "-address", url, + }, + expectedRC: 0, + expectedOut: "test\n", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ui := cli.NewMockUi() + meta := Meta{Ui: ui} + + defer func() { + if t.Failed() { + fmt.Println(ui.ErrorWriter.String()) + fmt.Println(ui.OutputWriter.String()) + } + }() + + cmd := &SetupVaultCommand{Meta: meta} + got := cmd.Run(tc.args) + must.Eq(t, tc.expectedRC, got) + + if tc.expectedErr != "" { + must.StrContains(t, ui.ErrorWriter.String(), tc.expectedErr) + } else { + must.Eq(t, ui.ErrorWriter.String(), "") + must.Eq(t, ui.OutputWriter.String(), tc.expectedOut) + } + }) + } +} diff --git a/nomad/operator_endpoint.go b/nomad/operator_endpoint.go index 982730b9732..0bdc8a60766 100644 --- a/nomad/operator_endpoint.go +++ b/nomad/operator_endpoint.go @@ -12,7 +12,9 @@ import ( "time" "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-msgpack/codec" + version "github.com/hashicorp/go-version" "github.com/hashicorp/raft" "github.com/hashicorp/serf/serf" @@ -787,6 +789,99 @@ func (op *Operator) snapshotRestore(conn 
io.ReadWriteCloser) { encoder.Encode(reply) } +func (op *Operator) UpgradeCheckVaultWorkloadIdentity( + args *structs.UpgradeCheckVaultWorkloadIdentityRequest, + reply *structs.UpgradeCheckVaultWorkloadIdentityResponse, +) error { + authErr := op.srv.Authenticate(op.ctx, args) + if done, err := op.srv.forward("Operator.UpgradeCheckVaultWorkloadIdentity", args, args, reply); done { + return err + } + op.srv.MeasureRPCRate("operator", structs.RateMetricRead, args) + if authErr != nil { + return structs.ErrPermissionDenied + } + + // This action requires operator read access. + rule, err := op.srv.ResolveACL(args) + if err != nil { + return err + } else if rule != nil && !rule.AllowOperatorRead() { + return structs.ErrPermissionDenied + } + + state := op.srv.fsm.State() + ws := memdb.NewWatchSet() + + // Check for jobs that use Vault but don't have an identity for Vault. + jobsIter, err := state.Jobs(ws) + if err != nil { + return fmt.Errorf("failed to retrieve jobs: %w", err) + } + + jobs := []*structs.JobListStub{} + for raw := jobsIter.Next(); raw != nil; raw = jobsIter.Next() { + job := raw.(*structs.Job) + + TG_LOOP: + for _, tg := range job.TaskGroups { + for _, t := range tg.Tasks { + if t.Vault == nil { + continue + } + + foundWID := false + for _, wid := range t.Identities { + if wid.IsVault() { + foundWID = true + break + } + } + if !foundWID { + jobs = append(jobs, job.Stub(nil, nil)) + break TG_LOOP + } + } + } + } + reply.JobsWithoutVaultIdentity = jobs + + // Find nodes that don't support workload identities for Vault. 
+ nodesIter, err := state.Nodes(ws) + if err != nil { + return fmt.Errorf("failed to retrieve nodes: %w", err) + } + + nodes := []*structs.NodeListStub{} + for raw := nodesIter.Next(); raw != nil; raw = nodesIter.Next() { + node := raw.(*structs.Node) + + v, err := version.NewVersion(node.Attributes["nomad.version"]) + if err != nil || v.LessThan(structs.MinNomadVersionVaultWID) { + nodes = append(nodes, node.Stub(nil)) + continue + } + } + reply.OutdatedNodes = nodes + + // Retrieve Vault tokens that were created by Nomad servers. + vaultTokensIter, err := state.VaultAccessors(ws) + if err != nil { + return fmt.Errorf("failed to retrieve Vault token accessors: %w", err) + } + + vaultTokens := []*structs.VaultAccessor{} + for raw := vaultTokensIter.Next(); raw != nil; raw = vaultTokensIter.Next() { + vaultTokens = append(vaultTokens, raw.(*structs.VaultAccessor)) + } + reply.VaultTokens = vaultTokens + + reply.QueryMeta.Index, _ = op.srv.State().LatestIndex() + op.srv.setQueryMeta(&reply.QueryMeta) + + return nil +} + func decodeStreamOutput(decoder *codec.Decoder) (io.Reader, <-chan error) { pr, pw := io.Pipe() errCh := make(chan error, 1) diff --git a/nomad/operator_endpoint_test.go b/nomad/operator_endpoint_test.go index fdb829b2313..822f7757e4a 100644 --- a/nomad/operator_endpoint_test.go +++ b/nomad/operator_endpoint_test.go @@ -17,6 +17,7 @@ import ( "testing" "time" + "github.com/google/go-cmp/cmp/cmpopts" "github.com/hashicorp/go-msgpack/codec" msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/acl" @@ -1188,3 +1189,183 @@ func TestOperator_SnapshotRestore_ACL(t *testing.T) { }) } } + +func TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity(t *testing.T) { + ci.Parallel(t) + + s1, cleanupS1 := TestServer(t, nil) + defer cleanupS1() + testutil.WaitForLeader(t, s1.RPC) + + codec := rpcClient(t, s1) + state := s1.fsm.State() + + // Register mock nodes, one pre-1.7. 
+ node := mock.Node() + node.Attributes["nomad.version"] = "1.7.2" + err := state.UpsertNode(structs.MsgTypeTestSetup, 1000, node) + must.NoError(t, err) + + outdatedNode := mock.Node() + outdatedNode.Attributes["nomad.version"] = "1.6.4" + err = state.UpsertNode(structs.MsgTypeTestSetup, 1001, outdatedNode) + must.NoError(t, err) + + // Create non-default namespace. + ns := mock.Namespace() + state.UpsertNamespaces(1002, []*structs.Namespace{ns}) + + // Register Vault jobs, one with and another without workload identity. + jobNoWID := mock.Job() + jobNoWID.TaskGroups[0].Tasks[0].Vault = &structs.Vault{ + Cluster: "default", + Policies: []string{"test"}, + } + // Add multiple tasks and groups to make sure we don't have duplicate jobs + // in the result. + jobNoWID.TaskGroups[0].Tasks = append(jobNoWID.TaskGroups[0].Tasks, jobNoWID.TaskGroups[0].Tasks[0].Copy()) + jobNoWID.TaskGroups[0].Tasks[1].Name = "task-1" + jobNoWID.TaskGroups = append(jobNoWID.TaskGroups, jobNoWID.TaskGroups[0].Copy()) + jobNoWID.TaskGroups[1].Name = "tg-1" + + err = state.UpsertJob(structs.MsgTypeTestSetup, 1003, nil, jobNoWID) + must.NoError(t, err) + + jobNoWIDNonDefaultNS := mock.Job() + jobNoWIDNonDefaultNS.Namespace = ns.Name + jobNoWIDNonDefaultNS.TaskGroups[0].Tasks[0].Vault = &structs.Vault{ + Cluster: "default", + Policies: []string{"test"}, + } + err = state.UpsertJob(structs.MsgTypeTestSetup, 1004, nil, jobNoWIDNonDefaultNS) + must.NoError(t, err) + + jobWithWID := mock.Job() + jobWithWID.TaskGroups[0].Tasks[0].Vault = &structs.Vault{ + Cluster: "default", + } + jobWithWID.TaskGroups[0].Tasks[0].Identities = []*structs.WorkloadIdentity{{ + Name: "vault_default", + }} + err = state.UpsertJob(structs.MsgTypeTestSetup, 1005, nil, jobWithWID) + must.NoError(t, err) + + // Create allocs for the jobs. 
+ allocJobNoWID := mock.Alloc() + allocJobNoWID.Job = jobNoWID + allocJobNoWID.JobID = jobNoWID.ID + allocJobNoWID.NodeID = node.ID + + allocJobWithWID := mock.Alloc() + allocJobWithWID.Job = jobWithWID + allocJobWithWID.JobID = jobWithWID.ID + allocJobWithWID.NodeID = node.ID + + err = state.UpsertAllocs(structs.MsgTypeTestSetup, 1006, []*structs.Allocation{allocJobNoWID, allocJobWithWID}) + must.NoError(t, err) + + // Create Vault token accessor for job without Vault identity and one that + // is no longer used. + tokenJobNoWID := mock.VaultAccessor() + tokenJobNoWID.AllocID = allocJobNoWID.ID + tokenJobNoWID.NodeID = node.ID + + tokenUnused := mock.VaultAccessor() + err = state.UpsertVaultAccessor(1007, []*structs.VaultAccessor{tokenJobNoWID, tokenUnused}) + must.NoError(t, err) + + // Make request. + args := &structs.UpgradeCheckVaultWorkloadIdentityRequest{ + QueryOptions: structs.QueryOptions{ + Region: "global", + AuthToken: node.SecretID, + }, + } + var resp structs.UpgradeCheckVaultWorkloadIdentityResponse + err = msgpackrpc.CallWithCodec(codec, "Operator.UpgradeCheckVaultWorkloadIdentity", args, &resp) + must.NoError(t, err) + must.Eq(t, 1007, resp.Index) + + // Verify only jobs without Vault identity are returned. + must.Len(t, 2, resp.JobsWithoutVaultIdentity) + must.SliceContains(t, resp.JobsWithoutVaultIdentity, jobNoWID.Stub(nil, nil), must.Cmp(cmpopts.IgnoreFields( + structs.JobListStub{}, + "Status", + "ModifyIndex", + ))) + must.SliceContains(t, resp.JobsWithoutVaultIdentity, jobNoWIDNonDefaultNS.Stub(nil, nil), must.Cmp(cmpopts.IgnoreFields( + structs.JobListStub{}, + "Status", + "ModifyIndex", + ))) + + // Verify only outdated nodes are returned. + must.Len(t, 1, resp.OutdatedNodes) + must.SliceContains(t, resp.OutdatedNodes, outdatedNode.Stub(nil)) + + // Verify Vault ACL tokens are returned. 
+ must.Len(t, 2, resp.VaultTokens) + must.SliceContains(t, resp.VaultTokens, tokenJobNoWID) + must.SliceContains(t, resp.VaultTokens, tokenUnused) +} + +func TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity_ACL(t *testing.T) { + ci.Parallel(t) + + s1, root, cleanupS1 := TestACLServer(t, nil) + defer cleanupS1() + testutil.WaitForLeader(t, s1.RPC) + + codec := rpcClient(t, s1) + state := s1.fsm.State() + + // Create test tokens and policies. + allowed := mock.CreatePolicyAndToken(t, state, 1000, "allowed", `operator {policy = "read"}`) + notAllowed := mock.CreatePolicyAndToken(t, state, 1002, "not-allowed", mock.NamespacePolicy("default", "write", nil)) + + testCases := []struct { + name string + token string + expectedErr string + }{ + { + name: "root token is allowed", + token: root.SecretID, + expectedErr: "", + }, + { + name: "operator read token is allowed", + token: allowed.SecretID, + expectedErr: "", + }, + { + name: "token not allowed", + token: notAllowed.SecretID, + expectedErr: structs.ErrPermissionDenied.Error(), + }, + { + name: "missing token not allowed", + token: "", + expectedErr: structs.ErrPermissionDenied.Error(), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Make request. 
+ args := &structs.UpgradeCheckVaultWorkloadIdentityRequest{ + QueryOptions: structs.QueryOptions{ + Region: "global", + AuthToken: tc.token, + }, + } + var resp structs.UpgradeCheckVaultWorkloadIdentityResponse + err := msgpackrpc.CallWithCodec(codec, "Operator.UpgradeCheckVaultWorkloadIdentity", args, &resp) + if tc.expectedErr == "" { + must.NoError(t, err) + } else { + must.ErrorContains(t, err, tc.expectedErr) + } + }) + } +} diff --git a/nomad/structs/operator.go b/nomad/structs/operator.go index c68bd668d90..09246c83da0 100644 --- a/nomad/structs/operator.go +++ b/nomad/structs/operator.go @@ -373,3 +373,15 @@ type SnapshotRestoreResponse struct { QueryMeta } + +type UpgradeCheckVaultWorkloadIdentityRequest struct { + QueryOptions +} + +type UpgradeCheckVaultWorkloadIdentityResponse struct { + JobsWithoutVaultIdentity []*JobListStub + OutdatedNodes []*NodeListStub + VaultTokens []*VaultAccessor + + QueryMeta +} diff --git a/nomad/structs/workload_id.go b/nomad/structs/workload_id.go index 144103ab5b9..368acb371d4 100644 --- a/nomad/structs/workload_id.go +++ b/nomad/structs/workload_id.go @@ -11,6 +11,7 @@ import ( "time" "github.com/hashicorp/go-multierror" + "github.com/hashicorp/go-version" ) const ( @@ -52,6 +53,12 @@ var ( // validIdentityName is used to validate workload identity Name fields. Must // be safe to use in filenames. validIdentityName = regexp.MustCompile("^[a-zA-Z0-9-_]{1,128}$") + + // MinNomadVersionVaultWID is the minimum version of Nomad that supports + // workload identities for Vault. 
+ // "-a" is used here so that it is "less than" all pre-release versions of + // Nomad 1.7.0 as well + MinNomadVersionVaultWID = version.Must(version.NewVersion("1.7.0-a")) ) // WorkloadIdentity is the jobspec block which determines if and how a workload diff --git a/website/content/api-docs/operator/upgrade-check.mdx b/website/content/api-docs/operator/upgrade-check.mdx new file mode 100644 index 00000000000..8659c6b1359 --- /dev/null +++ b/website/content/api-docs/operator/upgrade-check.mdx @@ -0,0 +1,187 @@ +--- +layout: api +page_title: Upgrade Check - Operator - HTTP API +description: |- + The /operator/upgrade-check endpoints provide tools for verifying the state + of the cluster prior to upgrades. +--- + +# Upgrade Check Operator HTTP API + +The `/operator/upgrade-check` endpoints provide some predefined verifications +that can be useful prior to upgrades and changes to Nomad configuration. + + + +These endpoints are meant to target specific releases of Nomad and may be +removed or modified without notice. + + + +## Vault Workload Identity + +This endpoint retrieves jobs, nodes, and Vault ACL tokens that may be affected +when migrating a Nomad cluster to use [workload identities for +Vault][nomad_acl_vault_wid]. + +| Method | Path | Produces | +| ------ | ---------------------------------------------------- | ------------------ | +| `GET` | `/v1/operator/upgrade-check/vault-workload-identity` | `application/json` | + +The table below shows this endpoint's support for +[blocking queries](/nomad/api-docs#blocking-queries) and +[required ACLs](/nomad/api-docs#acls). 
+ +| Blocking Queries | ACL Required | +| ---------------- | --------------- | +| `NO` | `operator:read` | + +### Sample Request + +```shell-session +$ nomad operator api \ + /v1/operator/upgrade-check/vault-workload-identity +``` + +### Sample Response + +```json +{ + "Index": 20, + "JobsWithoutVaultIdentity": [ + { + "CreateIndex": 11, + "Datacenters": [ + "*" + ], + "ID": "example", + "JobModifyIndex": 11, + "JobSummary": null, + "ModifyIndex": 19, + "Multiregion": null, + "Name": "example", + "Namespace": "default", + "NodePool": "default", + "ParameterizedJob": false, + "ParentID": "", + "Periodic": false, + "Priority": 50, + "Status": "running", + "StatusDescription": "", + "Stop": false, + "SubmitTime": 1704995322434188000, + "Type": "service" + } + ], + "KnownLeader": true, + "LastContact": 0, + "NextToken": "", + "OutdatedNodes": [ + { + "Address": "192.168.0.186", + "CreateIndex": 8, + "Datacenter": "dc1", + "Drain": false, + "Drivers": { + "qemu": { + "Attributes": { + "driver.qemu": "true", + "driver.qemu.version": "8.1.1" + }, + "Detected": true, + "HealthDescription": "Healthy", + "Healthy": true, + "UpdateTime": "2024-01-11T12:48:35.993541-05:00" + }, + "exec": { + "Attributes": {}, + "Detected": false, + "HealthDescription": "exec driver unsupported on client OS", + "Healthy": false, + "UpdateTime": "2024-01-11T12:48:35.958495-05:00" + }, + "raw_exec": { + "Attributes": { + "driver.raw_exec": "true" + }, + "Detected": true, + "HealthDescription": "Healthy", + "Healthy": true, + "UpdateTime": "2024-01-11T12:48:35.958539-05:00" + }, + "java": { + "Attributes": {}, + "Detected": false, + "HealthDescription": "", + "Healthy": false, + "UpdateTime": "2024-01-11T12:48:35.97141-05:00" + }, + "docker": { + "Attributes": { + "driver.docker.bridge_ip": "172.17.0.1", + "driver.docker.runtimes": "io.containerd.runc.v2,runc", + "driver.docker.os_type": "linux", + "driver.docker": "true", + "driver.docker.version": "24.0.7" + }, + "Detected": true, +
"HealthDescription": "Healthy", + "Healthy": true, + "UpdateTime": "2024-01-11T12:48:35.989993-05:00" + } + }, + "HostVolumes": null, + "ID": "049f7683-0cde-727f-428a-913a89f92bd8", + "LastDrain": null, + "ModifyIndex": 10, + "Name": "client-1", + "NodeClass": "", + "NodePool": "default", + "SchedulingEligibility": "eligible", + "Status": "ready", + "StatusDescription": "", + "Version": "1.6.4" + } + ], + "VaultTokens": [ + { + "Accessor": "czh9MPcRXzAhxBL9XKyb3Kh1", + "AllocID": "f00893d4-d9ef-4937-6a7a-ab495b68a971", + "CreateIndex": 14, + "CreationTTL": 60, + "NodeID": "049f7683-0cde-727f-428a-913a89f92bd8", + "Task": "redis" + } + ] +} +``` + +#### Field Reference + +- `JobsWithoutVaultIdentity` `(array)` - The list of jobs that have a + [`vault`][] block but do not have an [`identity`][] for Vault + authentication. These jobs can fail if they are not redeployed with an + identity for Vault before the configuration for Nomad servers is updated and + their access to Vault is removed. + +- `OutdatedNodes` `(array)` - The list of nodes running a version of + Nomad that does not support workload identity authentication for Vault. + Allocations placed in these nodes will use the deprecated legacy flow to + retrieve Vault tokens. If the Nomad servers' configuration is updated to remove + their access to Vault before these nodes are upgraded, these allocations will + fail. Allocations that use workload identity for Vault will not be able to be + placed in these nodes until they are upgraded. + +- `VaultTokens` `(array)` - The list of Vault ACL tokens created + by Nomad servers using the deprecated legacy flow. They will continue to work + even after the migration to the workload identities, but they may not be + automatically revoked by Nomad and will only expire once their TTL reaches + zero. + +Refer to [Migrating to Using Workload Identity with +Vault][nomad_acl_vault_wid_migrate] for more information.
+ +[`identity`]: /nomad/docs/job-specification/identity +[`vault`]: /nomad/docs/job-specification/vault +[nomad_acl_vault_wid]: /nomad/docs/integrations/vault/acl#nomad-workload-identities +[nomad_acl_vault_wid_migrate]: /nomad/docs/integrations/vault/acl#migrating-to-using-workload-identity-with-vault diff --git a/website/content/docs/commands/setup/vault.mdx b/website/content/docs/commands/setup/vault.mdx index ac5da6fa485..a016ea0366a 100644 --- a/website/content/docs/commands/setup/vault.mdx +++ b/website/content/docs/commands/setup/vault.mdx @@ -14,6 +14,13 @@ This command requires `acl:write` permissions for Vault and respects `VAULT_TOKEN`, `VAULT_ADDR`, and other [Vault-related environment variables][vaultenv]. +The `-check` option can be used to verify if the Nomad cluster is ready to +migrate to use Workload Identities with Vault. This option requires +`operator:read` permission for Nomad. + +Refer to [Migrating to Using Workload Identity with +Vault][nomad_acl_vault_wid_migrate] for more information. + This command is an experimental feature and may change its behavior in future @@ -38,6 +45,19 @@ nomad setup vault [options] - `-y`: Automatically answers `yes` to all the questions, making the setup non-interactive. Defaults to `false`. +- `-check`: Verify if the Nomad cluster is ready to migrate to Workload + Identities. + +### Setup Vault Options When Using `-check`: + +- `-json`: Output migration status information in its JSON format. + +- `-t`: Format and display migration status information using a Go template. + +- `-verbose`: Display full information. + +@include 'general_options_no_namespace.mdx' + ## Examples Below is an example of an interactive session with default options, interrupted @@ -145,4 +165,46 @@ services using workload identities. Run the command again to finish the configuration process. ``` +The `-check` option can be used to verify if a cluster is ready to migrate to using +workload identities with Vault. 
+ +``` +$ nomad setup vault -check + +Jobs Without Workload Identity for Vault +The following jobs access Vault but are not configured for workload identity. + +You should redeploy them before fully migrating to workload identities with +Vault to prevent unexpected errors if their tokens need to be recreated. + +Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration +for more information. + +ID Namespace Type Status +example default service running + +Outdated Nodes +The following nodes are running a version of Nomad that does not support using +workload identities with Vault. + +You should upgrade them to Nomad 1.7 before fully migrating to workload +identities with Vault to prevent unexpected errors if they receive allocations +for jobs that use Vault. + +Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration +for more information. + +ID Name Address Version Drain Eligibility Status +049f7683 client-1 192.168.0.186 1.6.4 false eligible ready + +Vault Tokens +The following Vault ACL tokens were created by Nomad but will not be +automatically revoked after migrating to workload identities. They will expire +once their TTL reaches zero. 
+ +Accessor ID Allocation ID Node ID Configured TTL +czh9MPcRXzAhxBL9XKyb3Kh1 f00893d4 049f7683 60 +``` + [vaultenv]: /vault/docs/commands#environment-variables +[nomad_acl_vault_wid_migrate]: /nomad/docs/integrations/vault/acl#migrating-to-using-workload-identity-with-vault diff --git a/website/content/docs/integrations/vault/acl.mdx b/website/content/docs/integrations/vault/acl.mdx index 6a6ce4dc703..1f1c7fdb28e 100644 --- a/website/content/docs/integrations/vault/acl.mdx +++ b/website/content/docs/integrations/vault/acl.mdx @@ -763,25 +763,60 @@ $ VAULT_TOKEN=s.H39hfS7eHSbb1GpkdzOQLTmz.fvuLy nomad job run vault.nomad Migrating from the legacy (pre-1.7) workflow where workloads use the agent's Vault token requires configuration on your Vault cluster and your Nomad server -agents. It does not require updating your running Nomad jobs unless you wish to -specify a non-default role. To migrate: +agents. + +Once the migration is fully complete, Nomad servers will no longer have access +to Vault, as was required in the deprecated legacy workflow. This also means +that they will no longer be able to fulfill some of their responsibilities from +the legacy workflow, such as generating and revoking Vault ACL tokens. + +Before removing Vault connectivity configuration from Nomad servers, you must +make sure the rest of the cluster is ready to support workload identities for +Vault. You can run the [`nomad setup vault -check`][nomad_cli_setup_vault] +command to verify what changes are still necessary. + +Before removing Nomad servers' access to Vault, you must: + + * Redeploy the jobs listed in the section `Jobs Without Workload Identity for + Vault` with an identity for Vault. You can specify this identity [directly + in the job][jobspec_identity_vault] or redeploy the job without changes to + use the default value from the server [`vault.default_identity`][] + configuration if set. 
+ * Upgrade nodes listed in the section `Outdated Nodes` to Nomad version + 1.7.0 or later. + +There is no action required for the Vault ACL tokens listed under `Vault +Tokens`. Nomad will revoke them as you redeploy jobs to use workload identities +but there may be some leftovers. You can still proceed with the migration +process, but Nomad will not revoke them once access to Vault is removed from +Nomad servers. They will expire once their TTL reaches zero, or you may +manually revoke them if they are no longer needed by an allocation. + +The migration process can happen over time. As long as all servers are upgraded +to Nomad 1.7+ and still retain access to Vault, jobs can still use either the +new workload identity flow or the deprecated legacy flow. + +To summarize the migration process: -* Create the Vault auth method, default role, and policies on your Vault - cluster. * Enable [`vault.default_identity`][] blocks in your Nomad server agent configurations, but **do not modify any of the existing Vault configuration**. * Upgrade your cluster following the documented [Upgrade Process][docs_upgrade]. -* Resubmit Nomad jobs that need access to Vault to redeploy them with a new - workload identity for Vault. - * (Optionally) Add [`vault.role`][] fields to any Nomad jobs that will not - use the default role. - * (Optionally) add [`identity`][] blocks to your jobs if you want to use a - different identity because of how your auth method and roles are - configured. -* Once all jobs have been resubmitted, you may remove parameters no longer used - by the Nomad server agents from the [`vault`][config] configuration block. +* Create the Vault auth method, default role, and policies on your Vault + cluster. +* Run the `nomad setup vault -check` command to verify if the cluster is ready + to migrate to workload identity access to Vault. + * Resubmit Nomad jobs that need access to Vault to redeploy them with a new + workload identity for Vault. 
+ * (Optionally) Add [`vault.role`][] fields to any Nomad jobs that will not + use the default role. + * (Optionally) add [`identity`][] blocks to your jobs if you want to use a + different identity because of how your auth method and roles are + configured. + * Upgrade any remaining clients to Nomad 1.7+. +* Remove parameters no longer used by the Nomad server agents from the + [`vault`][config] configuration block. [Variables]: /nomad/docs/concepts/variables [Vault Namespaces]: /vault/docs/enterprise/namespaces diff --git a/website/data/api-docs-nav-data.json b/website/data/api-docs-nav-data.json index 0ba76dac667..e89bec40ac8 100644 --- a/website/data/api-docs-nav-data.json +++ b/website/data/api-docs-nav-data.json @@ -132,6 +132,10 @@ { "title": "Snapshot", "path": "operator/snapshot" + }, + { + "title": "Upgrade Check", + "path": "operator/upgrade-check" } ] }, diff --git a/website/redirects.js b/website/redirects.js index e03c6cdc854..69ec026686f 100644 --- a/website/redirects.js +++ b/website/redirects.js @@ -29,6 +29,17 @@ module.exports = [ permanent: true, }, */ + + /** + * /s/* redirects for useful links that need a stable URL but we may need to + * change its destination in the future. + */ + { + source: '/nomad/s/vault-workload-identity-migration', + destination: + 'https://developer.hashicorp.com/nomad/docs/integrations/vault/acl#migrating-to-using-workload-identity-with-vault', + permanent: false, + }, // Rename and re-arrange Autoscaling Internals section { source: '/nomad/tools/autoscaling/internals/:path*',