diff --git a/.changelog/19720.txt b/.changelog/19720.txt
new file mode 100644
index 00000000000..9823f3a0352
--- /dev/null
+++ b/.changelog/19720.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+cli: Add new option `nomad setup vault -check` to help cluster operators migrate to workload identities for Vault
+```
diff --git a/api/operator.go b/api/operator.go
index e9823b2f541..a6f11f45e4d 100644
--- a/api/operator.go
+++ b/api/operator.go
@@ -411,3 +411,63 @@ type LeadershipTransferResponse struct {
WriteMeta
}
+
+// VaultWorkloadIdentityUpgradeCheck is the result of verifying if the cluster
+// is ready to switch to workload identities for Vault. The Ready method
+// reports true only when all of the lists below are empty.
+type VaultWorkloadIdentityUpgradeCheck struct {
+ // JobsWithoutVaultIdentity is the list of jobs that have a `vault` block
+ // but do not have an `identity` for Vault.
+ JobsWithoutVaultIdentity []*JobListStub
+
+ // OutdatedNodes is the list of nodes running a version of Nomad that does
+ // not support workload identities for Vault.
+ OutdatedNodes []*NodeListStub
+
+ // VaultTokens is the list of Vault ACL token accessors that Nomad created
+ // and will no longer manage after the cluster is migrated to workload
+ // identities.
+ VaultTokens []*VaultAccessor
+}
+
+// Ready reports whether the cluster can be migrated to workload identities
+// with Vault: the check result must be non-nil and must not list any Vault
+// tokens, outdated nodes, or jobs without a Vault identity.
+func (v *VaultWorkloadIdentityUpgradeCheck) Ready() bool {
+	if v == nil {
+		return false
+	}
+
+	pending := len(v.VaultTokens) + len(v.OutdatedNodes) + len(v.JobsWithoutVaultIdentity)
+	return pending == 0
+}
+
+// VaultAccessor is a Vault ACL token created by Nomad for a task to access
+// Vault using the legacy authentication flow.
+type VaultAccessor struct {
+ // AllocID is the ID of the allocation that requested this token.
+ AllocID string
+
+ // Task is the name of the task that requested this token.
+ Task string
+
+ // NodeID is the ID of the node running the allocation that requested this
+ // token.
+ NodeID string
+
+ // Accessor is the Vault ACL token accessor ID.
+ Accessor string
+
+ // CreationTTL is the TTL set when the token was created.
+ // NOTE(review): the unit is presumably seconds — confirm against the
+ // server-side VaultAccessor definition.
+ CreationTTL int
+
+ // CreateIndex is the Raft index when the token was created.
+ CreateIndex uint64
+}
+
+// UpgradeCheckVaultWorkloadIdentity retrieves the cluster status for migrating
+// to workload identities with Vault.
+func (op *Operator) UpgradeCheckVaultWorkloadIdentity(q *QueryOptions) (*VaultWorkloadIdentityUpgradeCheck, *QueryMeta, error) {
+ var resp VaultWorkloadIdentityUpgradeCheck
+ qm, err := op.c.query("/v1/operator/upgrade-check/vault-workload-identity", &resp, q)
+ if err != nil {
+ return nil, nil, err
+ }
+ return &resp, qm, nil
+}
diff --git a/command/agent/http.go b/command/agent/http.go
index 4f7e20f07b3..35e3ccd557e 100644
--- a/command/agent/http.go
+++ b/command/agent/http.go
@@ -487,6 +487,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/operator/autopilot/configuration", s.wrap(s.OperatorAutopilotConfiguration))
s.mux.HandleFunc("/v1/operator/autopilot/health", s.wrap(s.OperatorServerHealth))
s.mux.HandleFunc("/v1/operator/snapshot", s.wrap(s.SnapshotRequest))
+ s.mux.HandleFunc("/v1/operator/upgrade-check/", s.wrap(s.UpgradeCheckRequest))
s.mux.HandleFunc("/v1/system/gc", s.wrap(s.GarbageCollectRequest))
s.mux.HandleFunc("/v1/system/reconcile/summaries", s.wrap(s.ReconcileJobSummaries))
diff --git a/command/agent/operator_endpoint.go b/command/agent/operator_endpoint.go
index c79641e02eb..09d4343ddf5 100644
--- a/command/agent/operator_endpoint.go
+++ b/command/agent/operator_endpoint.go
@@ -521,3 +521,32 @@ func (s *HTTPServer) snapshotRestoreRequest(resp http.ResponseWriter, req *http.
return nil, codedErr
}
+
+// UpgradeCheckRequest dispatches requests under /v1/operator/upgrade-check/
+// to the handler of the requested check. Only the exact check name is
+// accepted, so paths with extra segments before the check name are rejected
+// with a 404 instead of being routed by suffix.
+func (s *HTTPServer) UpgradeCheckRequest(resp http.ResponseWriter, req *http.Request) (any, error) {
+	check := strings.TrimPrefix(req.URL.Path, "/v1/operator/upgrade-check")
+	switch check {
+	case "/vault-workload-identity":
+		return s.upgradeCheckVaultWorkloadIdentity(resp, req)
+	default:
+		return nil, CodedError(http.StatusNotFound, fmt.Sprintf("Path %s not found", req.URL.Path))
+	}
+}
+
+// upgradeCheckVaultWorkloadIdentity serves GET requests for the Vault workload
+// identity upgrade check by calling the Operator RPC endpoint of the same
+// name and copying the query metadata into the response headers.
+func (s *HTTPServer) upgradeCheckVaultWorkloadIdentity(resp http.ResponseWriter, req *http.Request) (any, error) {
+	if req.Method != http.MethodGet {
+		return nil, CodedError(http.StatusMethodNotAllowed, ErrInvalidMethod)
+	}
+
+	var rpcReq structs.UpgradeCheckVaultWorkloadIdentityRequest
+	if done := s.parse(resp, req, &rpcReq.Region, &rpcReq.QueryOptions); done {
+		return nil, nil
+	}
+
+	var rpcResp structs.UpgradeCheckVaultWorkloadIdentityResponse
+	err := s.agent.RPC("Operator.UpgradeCheckVaultWorkloadIdentity", &rpcReq, &rpcResp)
+	if err != nil {
+		return nil, err
+	}
+
+	setMeta(resp, &rpcResp.QueryMeta)
+	return rpcResp, nil
+}
diff --git a/command/agent/operator_endpoint_test.go b/command/agent/operator_endpoint_test.go
index 2d5590ffd06..ba0666620b1 100644
--- a/command/agent/operator_endpoint_test.go
+++ b/command/agent/operator_endpoint_test.go
@@ -661,3 +661,42 @@ func TestOperator_SnapshotRequests(t *testing.T) {
require.True(t, jobExists())
})
}
+
+// TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity registers a job with
+// a Vault block but no Vault identity and verifies that the HTTP upgrade check
+// handler reports that job and sets the query metadata headers.
+func TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity(t *testing.T) {
+ ci.Parallel(t)
+ httpTest(t, func(c *Config) {
+ c.Vaults[0].Enabled = pointer.Of(true)
+ c.Vaults[0].Name = "default"
+ }, func(s *TestAgent) {
+ // Create a test job with a Vault block but without an identity.
+ job := mock.Job()
+ job.TaskGroups[0].Tasks[0].Vault = &structs.Vault{
+ Cluster: "default",
+ Policies: []string{"test"},
+ }
+
+ args := structs.JobRegisterRequest{
+ Job: job,
+ WriteRequest: structs.WriteRequest{Region: "global"},
+ }
+ var resp structs.JobRegisterResponse
+ err := s.Agent.RPC("Job.Register", &args, &resp)
+ must.NoError(t, err)
+
+ // Make HTTP request to retrieve the upgrade check result.
+ req, err := http.NewRequest(http.MethodGet, "/v1/operator/upgrade-check/vault-workload-identity", nil)
+ must.NoError(t, err)
+ respW := httptest.NewRecorder()
+
+ obj, err := s.Server.UpgradeCheckRequest(respW, req)
+ must.NoError(t, err)
+ must.NotEq(t, "", respW.Header().Get("X-Nomad-Index"))
+ must.NotEq(t, "", respW.Header().Get("X-Nomad-LastContact"))
+ must.Eq(t, "true", respW.Header().Get("X-Nomad-KnownLeader"))
+
+ // Only the job registered above is expected and no Vault tokens exist.
+ upgradeCheck := obj.(structs.UpgradeCheckVaultWorkloadIdentityResponse)
+ must.Len(t, 1, upgradeCheck.JobsWithoutVaultIdentity)
+ must.Len(t, 0, upgradeCheck.VaultTokens)
+ must.Eq(t, job.ID, upgradeCheck.JobsWithoutVaultIdentity[0].ID)
+ })
+}
diff --git a/command/setup_vault.go b/command/setup_vault.go
index e052f0c7c90..a0557063b5c 100644
--- a/command/setup_vault.go
+++ b/command/setup_vault.go
@@ -12,6 +12,7 @@ import (
"slices"
"strings"
+ "github.com/dustin/go-humanize/english"
"github.com/hashicorp/vault/api"
"github.com/mitchellh/cli"
"github.com/posener/complete"
@@ -48,6 +49,12 @@ type SetupVaultCommand struct {
destroy bool
autoYes bool
+
+ // Options for -check.
+ check bool
+ json bool
+ tmpl string
+ verbose bool
}
// Help satisfies the cli.Command Help function.
@@ -62,6 +69,10 @@ Usage: nomad setup vault [options]
VAULT_TOKEN, VAULT_ADDR, and other Vault-related environment variables
as documented in https://developer.hashicorp.com/vault/docs/commands#environment-variables.
+ The -check option can be used to verify if the Nomad cluster is ready to
+ migrate to use Workload Identities with Vault. This option requires
+ operator:read permission for Nomad.
+
WARNING: This command is an experimental feature and may change its behavior
in future versions of Nomad.
@@ -79,7 +90,22 @@ Setup Vault options:
Automatically answers "yes" to all the questions, making the setup
non-interactive. Defaults to "false".
-`
+ -check
+ Verify if the Nomad cluster is ready to migrate to Workload Identities.
+
+Setup Vault options when using -check:
+
+ -json
+ Output migration status information in its JSON format.
+
+ -t
+ Format and display migration status information using a Go template.
+
+ -verbose
+ Display full information.
+
+ ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace)
+
return strings.TrimSpace(helpText)
}
@@ -89,6 +115,12 @@ func (s *SetupVaultCommand) AutocompleteFlags() complete.Flags {
"-jwks-url": complete.PredictAnything,
"-destroy": complete.PredictSet("true", "false"),
"-y": complete.PredictSet("true", "false"),
+
+ // Options for -check.
+ "-check": complete.PredictSet("true", "false"),
+ "-json": complete.PredictSet("true", "false"),
+ "-verbose": complete.PredictSet("true", "false"),
+ "-t": complete.PredictAnything,
})
}
@@ -110,6 +142,13 @@ func (s *SetupVaultCommand) Run(args []string) int {
flags.BoolVar(&s.destroy, "destroy", false, "")
flags.BoolVar(&s.autoYes, "y", false, "")
flags.StringVar(&s.jwksURL, "jwks-url", "http://localhost:4646/.well-known/jwks.json", "")
+
+ // Options for -check.
+ flags.BoolVar(&s.check, "check", false, "")
+ flags.BoolVar(&s.json, "json", false, "")
+ flags.BoolVar(&s.verbose, "verbose", false, "")
+ flags.StringVar(&s.tmpl, "t", "", "")
+
if err := flags.Parse(args); err != nil {
return 1
}
@@ -121,6 +160,32 @@ func (s *SetupVaultCommand) Run(args []string) int {
return 1
}
+ if s.check {
+ return s.checkUpgrade()
+ } else {
+ // Verify that -check flags are not set.
+ var invalid []string
+ if s.json {
+ invalid = append(invalid, "-json")
+ }
+ if s.verbose {
+ invalid = append(invalid, "-verbose")
+ }
+ if s.tmpl != "" {
+ invalid = append(invalid, "-t")
+ }
+
+ if len(invalid) > 0 {
+ s.Ui.Error(fmt.Sprintf(
+ "The %s %s can only be used with -check",
+ english.OxfordWordSeries(invalid, "and"),
+ english.PluralWord(len(invalid), "option", "options"),
+ ))
+ s.Ui.Error(commandErrorText(s))
+ return 1
+ }
+ }
+
if !isTty() && !s.autoYes {
s.Ui.Error("This command requires -y option when running in non-interactive mode")
return 1
@@ -216,7 +281,7 @@ a namespace %q and create all configuration within that namespace.
*/
s.Ui.Output(`
We will now enable the JWT credential backend and create a JWT auth method that
-Nomad workloads will use.
+Nomad workloads will use.
`)
if s.authMethodExists() {
@@ -606,6 +671,117 @@ func (s *SetupVaultCommand) removeConfiguredComponents() int {
return exitCode
}
+// checkUpgrade implements the -check option. It queries the Nomad API for the
+// Vault workload identity upgrade check and prints the result: formatted as
+// JSON or with a Go template when -json or -t are set, otherwise as one table
+// per non-empty list of jobs, nodes, and Vault tokens. It returns the command
+// exit code, which is 1 when the client or the query fails and 0 otherwise,
+// including when the cluster is not yet ready to migrate.
+func (s *SetupVaultCommand) checkUpgrade() int {
+	// IDs are truncated unless -verbose is set.
+	length := shortId
+	if s.verbose {
+		length = fullId
+	}
+
+	client, err := s.Meta.Client()
+	if err != nil {
+		s.Ui.Error(fmt.Sprintf("Error initializing client: %s", err))
+		return 1
+	}
+
+	resp, _, err := client.Operator().UpgradeCheckVaultWorkloadIdentity(nil)
+	if err != nil {
+		s.Ui.Error(fmt.Sprintf("Error querying Vault workload identity upgrade check: %s", err))
+		return 1
+	}
+
+	// Output formatted option if requested.
+	if s.json || len(s.tmpl) > 0 {
+		out, err := Format(s.json, s.tmpl, resp)
+		if err != nil {
+			s.Ui.Error(err.Error())
+			return 1
+		}
+
+		s.Ui.Output(out)
+		return 0
+	}
+
+	if resp.Ready() {
+		s.Ui.Output("Nomad cluster is ready to use workload identities with Vault.")
+		return 0
+	}
+
+	if len(resp.JobsWithoutVaultIdentity) != 0 {
+		s.Ui.Output(s.Colorize().Color(`
+[bold]Jobs Without Workload Identity for Vault[reset]
+The following jobs access Vault but are not configured for workload identity.
+
+You should redeploy them before fully migrating to workload identities with
+Vault to prevent unexpected errors if their tokens need to be recreated.
+
+Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration
+for more information.
+`))
+		out := make([]string, len(resp.JobsWithoutVaultIdentity)+1)
+		out[0] = "ID|Namespace|Type|Status"
+		for i, job := range resp.JobsWithoutVaultIdentity {
+			out[i+1] = fmt.Sprintf("%s|%s|%s|%s",
+				limit(job.ID, length),
+				job.Namespace,
+				job.Type,
+				job.Status,
+			)
+		}
+		s.Ui.Output(formatList(out))
+	}
+
+	if len(resp.OutdatedNodes) != 0 {
+		s.Ui.Output(s.Colorize().Color(`
+[bold]Outdated Nodes[reset]
+The following nodes are running a version of Nomad that does not support using
+workload identities with Vault.
+
+You should upgrade them to Nomad 1.7 before fully migrating to workload
+identities with Vault to prevent unexpected errors if they receive allocations
+for jobs that use Vault.
+
+Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration
+for more information.
+`))
+		out := make([]string, len(resp.OutdatedNodes)+1)
+		out[0] = "ID|Name|Address|Version|Drain|Eligibility|Status"
+		for i, node := range resp.OutdatedNodes {
+			out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s|%s",
+				limit(node.ID, length),
+				node.Name,
+				node.Address,
+				node.Version,
+				node.Drain,
+				node.SchedulingEligibility,
+				node.Status,
+			)
+		}
+		s.Ui.Output(formatList(out))
+	}
+
+	if len(resp.VaultTokens) != 0 {
+		s.Ui.Output(s.Colorize().Color(`
+[bold]Vault Tokens[reset]
+The following Vault ACL tokens were created by Nomad but will not be
+automatically revoked after migrating to workload identities. They will expire
+once their TTL reaches zero.
+`))
+		out := make([]string, len(resp.VaultTokens)+1)
+		out[0] = "Accessor ID|Allocation ID|Node ID|Configured TTL"
+		for i, token := range resp.VaultTokens {
+			out[i+1] = fmt.Sprintf("%s|%s|%s|%d",
+				token.Accessor,
+				limit(token.AllocID, length),
+				limit(token.NodeID, length),
+				token.CreationTTL,
+			)
+		}
+		s.Ui.Output(formatList(out))
+	}
+
+	return 0
+}
+
func printMapOfStrings(m map[string]string) string {
var output string
diff --git a/command/setup_vault_test.go b/command/setup_vault_test.go
new file mode 100644
index 00000000000..9bce8d8b2a5
--- /dev/null
+++ b/command/setup_vault_test.go
@@ -0,0 +1,153 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package command
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/mitchellh/cli"
+ "github.com/shoenig/test/must"
+
+ "github.com/hashicorp/nomad/api"
+ "github.com/hashicorp/nomad/ci"
+ "github.com/hashicorp/nomad/command/agent"
+ "github.com/hashicorp/nomad/helper/pointer"
+)
+
+// TestSetupVaultCommand_Run exercises the -check option of the setup vault
+// command against a dev agent that has one registered job with a vault block
+// but no identity for Vault. It covers flag validation and the default, JSON,
+// and Go template outputs.
+func TestSetupVaultCommand_Run(t *testing.T) {
+	ci.Parallel(t)
+
+	// Start in dev mode so we get a node registration
+	srv, client, url := testServer(t, true, func(c *agent.Config) {
+		c.DevMode = true
+		c.Vaults[0].Name = "default"
+		c.Vaults[0].Enabled = pointer.Of(true)
+	})
+	defer srv.Shutdown()
+
+	// Register a job with a vault block but without an identity for Vault.
+	job := testJob("test")
+	job.TaskGroups[0].Tasks[0].Vault = &api.Vault{
+		Cluster:  "default",
+		Policies: []string{"test"},
+	}
+	_, _, err := client.Jobs().Register(job, nil)
+	must.NoError(t, err)
+
+	// Read the job back so the expected JSON output can use the indexes and
+	// submit time assigned by the server.
+	job, _, err = client.Jobs().Info(*job.ID, nil)
+	must.NoError(t, err)
+
+	testCases := []struct {
+		name        string
+		args        []string
+		expectedErr string
+		expectedRC  int
+		expectedOut string
+	}{
+		{
+			name: "-check flags",
+			args: []string{
+				"-json",
+				"-t", "{{.}}",
+				"-verbose",
+			},
+			expectedRC:  1,
+			expectedErr: "The -json, -verbose, and -t options can only be used with -check",
+		},
+		{
+			name: "-check",
+			args: []string{
+				"-check",
+				"-address", url,
+			},
+			expectedRC: 0,
+			expectedOut: `
+Jobs Without Workload Identity for Vault
+The following jobs access Vault but are not configured for workload identity.
+
+You should redeploy them before fully migrating to workload identities with
+Vault to prevent unexpected errors if their tokens need to be recreated.
+
+Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration
+for more information.
+
+ID Namespace Type Status
+test default batch pending
+`,
+		},
+		{
+			name: "-check with -json",
+			args: []string{
+				"-check",
+				"-json",
+				"-address", url,
+			},
+			expectedRC: 0,
+			// Every server-assigned value is taken from the job read back
+			// above so the expectation does not depend on a fixed Raft index.
+			expectedOut: fmt.Sprintf(`{
+ "JobsWithoutVaultIdentity": [
+ {
+ "CreateIndex": %d,
+ "Datacenters": [
+ "dc1"
+ ],
+ "ID": "test",
+ "JobModifyIndex": %d,
+ "JobSummary": null,
+ "ModifyIndex": %d,
+ "Name": "test",
+ "Namespace": "default",
+ "ParameterizedJob": false,
+ "ParentID": "",
+ "Periodic": false,
+ "Priority": 1,
+ "Status": "pending",
+ "StatusDescription": "",
+ "Stop": false,
+ "SubmitTime": %d,
+ "Type": "batch"
+ }
+ ],
+ "OutdatedNodes": [],
+ "VaultTokens": []
+}
+`, *job.CreateIndex, *job.JobModifyIndex, *job.ModifyIndex, *job.SubmitTime),
+		},
+		{
+			name: "-check with -t",
+			args: []string{
+				"-check",
+				"-t", "{{with index .JobsWithoutVaultIdentity 0}}{{.ID}}{{end}}",
+				"-address", url,
+			},
+			expectedRC:  0,
+			expectedOut: "test\n",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			ui := cli.NewMockUi()
+			meta := Meta{Ui: ui}
+
+			// Dump the command output on failure to ease debugging.
+			defer func() {
+				if t.Failed() {
+					fmt.Println(ui.ErrorWriter.String())
+					fmt.Println(ui.OutputWriter.String())
+				}
+			}()
+
+			cmd := &SetupVaultCommand{Meta: meta}
+			got := cmd.Run(tc.args)
+			must.Eq(t, tc.expectedRC, got)
+
+			if tc.expectedErr != "" {
+				must.StrContains(t, ui.ErrorWriter.String(), tc.expectedErr)
+			} else {
+				// must.Eq takes the expected value first.
+				must.Eq(t, "", ui.ErrorWriter.String())
+				must.Eq(t, tc.expectedOut, ui.OutputWriter.String())
+			}
+		})
+	}
+}
diff --git a/nomad/operator_endpoint.go b/nomad/operator_endpoint.go
index 982730b9732..0bdc8a60766 100644
--- a/nomad/operator_endpoint.go
+++ b/nomad/operator_endpoint.go
@@ -12,7 +12,9 @@ import (
"time"
"github.com/hashicorp/go-hclog"
+ "github.com/hashicorp/go-memdb"
"github.com/hashicorp/go-msgpack/codec"
+ version "github.com/hashicorp/go-version"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
@@ -787,6 +789,99 @@ func (op *Operator) snapshotRestore(conn io.ReadWriteCloser) {
encoder.Encode(reply)
}
+// UpgradeCheckVaultWorkloadIdentity reports the jobs, nodes, and Vault token
+// accessors that may be affected by migrating the cluster to workload
+// identities for Vault. The caller must have operator read access.
+//
+// The reply lists the jobs with at least one task that has a vault block but
+// no identity for Vault, the nodes whose "nomad.version" attribute is missing,
+// unparsable, or lower than structs.MinNomadVersionVaultWID, and every Vault
+// token accessor found in the state store.
+func (op *Operator) UpgradeCheckVaultWorkloadIdentity(
+	args *structs.UpgradeCheckVaultWorkloadIdentityRequest,
+	reply *structs.UpgradeCheckVaultWorkloadIdentityResponse,
+) error {
+	authErr := op.srv.Authenticate(op.ctx, args)
+	if done, err := op.srv.forward("Operator.UpgradeCheckVaultWorkloadIdentity", args, args, reply); done {
+		return err
+	}
+	op.srv.MeasureRPCRate("operator", structs.RateMetricRead, args)
+	if authErr != nil {
+		return structs.ErrPermissionDenied
+	}
+
+	// This action requires operator read access.
+	rule, err := op.srv.ResolveACL(args)
+	if err != nil {
+		return err
+	} else if rule != nil && !rule.AllowOperatorRead() {
+		return structs.ErrPermissionDenied
+	}
+
+	state := op.srv.fsm.State()
+	ws := memdb.NewWatchSet()
+
+	// Check for jobs that use Vault but don't have an identity for Vault.
+	jobsIter, err := state.Jobs(ws)
+	if err != nil {
+		return fmt.Errorf("failed to retrieve jobs: %w", err)
+	}
+
+	jobs := []*structs.JobListStub{}
+	for raw := jobsIter.Next(); raw != nil; raw = jobsIter.Next() {
+		job := raw.(*structs.Job)
+
+		// Stop at the first offending task so each job is listed only once.
+	TG_LOOP:
+		for _, tg := range job.TaskGroups {
+			for _, t := range tg.Tasks {
+				if t.Vault == nil {
+					continue
+				}
+
+				foundWID := false
+				for _, wid := range t.Identities {
+					if wid.IsVault() {
+						foundWID = true
+						break
+					}
+				}
+				if !foundWID {
+					jobs = append(jobs, job.Stub(nil, nil))
+					break TG_LOOP
+				}
+			}
+		}
+	}
+	reply.JobsWithoutVaultIdentity = jobs
+
+	// Find nodes that don't support workload identities for Vault.
+	nodesIter, err := state.Nodes(ws)
+	if err != nil {
+		return fmt.Errorf("failed to retrieve nodes: %w", err)
+	}
+
+	nodes := []*structs.NodeListStub{}
+	for raw := nodesIter.Next(); raw != nil; raw = nodesIter.Next() {
+		node := raw.(*structs.Node)
+
+		// Nodes with a missing or unparsable version are reported as
+		// outdated as well.
+		v, err := version.NewVersion(node.Attributes["nomad.version"])
+		if err != nil || v.LessThan(structs.MinNomadVersionVaultWID) {
+			nodes = append(nodes, node.Stub(nil))
+		}
+	}
+	reply.OutdatedNodes = nodes
+
+	// Retrieve Vault tokens that were created by Nomad servers.
+	vaultTokensIter, err := state.VaultAccessors(ws)
+	if err != nil {
+		return fmt.Errorf("failed to retrieve Vault token accessors: %w", err)
+	}
+
+	vaultTokens := []*structs.VaultAccessor{}
+	for raw := vaultTokensIter.Next(); raw != nil; raw = vaultTokensIter.Next() {
+		vaultTokens = append(vaultTokens, raw.(*structs.VaultAccessor))
+	}
+	reply.VaultTokens = vaultTokens
+
+	// Report a failure to read the index instead of silently replying with
+	// index zero.
+	index, err := op.srv.State().LatestIndex()
+	if err != nil {
+		return fmt.Errorf("failed to retrieve latest index: %w", err)
+	}
+	reply.QueryMeta.Index = index
+	op.srv.setQueryMeta(&reply.QueryMeta)
+
+	return nil
+}
+
func decodeStreamOutput(decoder *codec.Decoder) (io.Reader, <-chan error) {
pr, pw := io.Pipe()
errCh := make(chan error, 1)
diff --git a/nomad/operator_endpoint_test.go b/nomad/operator_endpoint_test.go
index fdb829b2313..822f7757e4a 100644
--- a/nomad/operator_endpoint_test.go
+++ b/nomad/operator_endpoint_test.go
@@ -17,6 +17,7 @@ import (
"testing"
"time"
+ "github.com/google/go-cmp/cmp/cmpopts"
"github.com/hashicorp/go-msgpack/codec"
msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/nomad/acl"
@@ -1188,3 +1189,183 @@ func TestOperator_SnapshotRestore_ACL(t *testing.T) {
})
}
}
+
+// TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity populates the state
+// store with nodes, jobs, allocations, and Vault token accessors and verifies
+// that the upgrade check RPC returns only the jobs without a Vault identity,
+// only the outdated nodes, and all Vault token accessors.
+func TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity(t *testing.T) {
+	ci.Parallel(t)
+
+	s1, cleanupS1 := TestServer(t, nil)
+	defer cleanupS1()
+	testutil.WaitForLeader(t, s1.RPC)
+
+	codec := rpcClient(t, s1)
+	state := s1.fsm.State()
+
+	// Register mock nodes, one pre-1.7.
+	node := mock.Node()
+	node.Attributes["nomad.version"] = "1.7.2"
+	err := state.UpsertNode(structs.MsgTypeTestSetup, 1000, node)
+	must.NoError(t, err)
+
+	outdatedNode := mock.Node()
+	outdatedNode.Attributes["nomad.version"] = "1.6.4"
+	err = state.UpsertNode(structs.MsgTypeTestSetup, 1001, outdatedNode)
+	must.NoError(t, err)
+
+	// Create non-default namespace.
+	ns := mock.Namespace()
+	err = state.UpsertNamespaces(1002, []*structs.Namespace{ns})
+	must.NoError(t, err)
+
+	// Register Vault jobs, one with and another without workload identity.
+	jobNoWID := mock.Job()
+	jobNoWID.TaskGroups[0].Tasks[0].Vault = &structs.Vault{
+		Cluster:  "default",
+		Policies: []string{"test"},
+	}
+	// Add multiple tasks and groups to make sure we don't have duplicate jobs
+	// in the result.
+	jobNoWID.TaskGroups[0].Tasks = append(jobNoWID.TaskGroups[0].Tasks, jobNoWID.TaskGroups[0].Tasks[0].Copy())
+	jobNoWID.TaskGroups[0].Tasks[1].Name = "task-1"
+	jobNoWID.TaskGroups = append(jobNoWID.TaskGroups, jobNoWID.TaskGroups[0].Copy())
+	jobNoWID.TaskGroups[1].Name = "tg-1"
+
+	err = state.UpsertJob(structs.MsgTypeTestSetup, 1003, nil, jobNoWID)
+	must.NoError(t, err)
+
+	jobNoWIDNonDefaultNS := mock.Job()
+	jobNoWIDNonDefaultNS.Namespace = ns.Name
+	jobNoWIDNonDefaultNS.TaskGroups[0].Tasks[0].Vault = &structs.Vault{
+		Cluster:  "default",
+		Policies: []string{"test"},
+	}
+	err = state.UpsertJob(structs.MsgTypeTestSetup, 1004, nil, jobNoWIDNonDefaultNS)
+	must.NoError(t, err)
+
+	jobWithWID := mock.Job()
+	jobWithWID.TaskGroups[0].Tasks[0].Vault = &structs.Vault{
+		Cluster: "default",
+	}
+	jobWithWID.TaskGroups[0].Tasks[0].Identities = []*structs.WorkloadIdentity{{
+		Name: "vault_default",
+	}}
+	err = state.UpsertJob(structs.MsgTypeTestSetup, 1005, nil, jobWithWID)
+	must.NoError(t, err)
+
+	// Create allocs for the jobs.
+	allocJobNoWID := mock.Alloc()
+	allocJobNoWID.Job = jobNoWID
+	allocJobNoWID.JobID = jobNoWID.ID
+	allocJobNoWID.NodeID = node.ID
+
+	allocJobWithWID := mock.Alloc()
+	allocJobWithWID.Job = jobWithWID
+	allocJobWithWID.JobID = jobWithWID.ID
+	allocJobWithWID.NodeID = node.ID
+
+	err = state.UpsertAllocs(structs.MsgTypeTestSetup, 1006, []*structs.Allocation{allocJobNoWID, allocJobWithWID})
+	must.NoError(t, err)
+
+	// Create Vault token accessor for job without Vault identity and one that
+	// is no longer used.
+	tokenJobNoWID := mock.VaultAccessor()
+	tokenJobNoWID.AllocID = allocJobNoWID.ID
+	tokenJobNoWID.NodeID = node.ID
+
+	tokenUnused := mock.VaultAccessor()
+	err = state.UpsertVaultAccessor(1007, []*structs.VaultAccessor{tokenJobNoWID, tokenUnused})
+	must.NoError(t, err)
+
+	// Make request.
+	args := &structs.UpgradeCheckVaultWorkloadIdentityRequest{
+		QueryOptions: structs.QueryOptions{
+			Region:    "global",
+			AuthToken: node.SecretID,
+		},
+	}
+	var resp structs.UpgradeCheckVaultWorkloadIdentityResponse
+	err = msgpackrpc.CallWithCodec(codec, "Operator.UpgradeCheckVaultWorkloadIdentity", args, &resp)
+	must.NoError(t, err)
+	must.Eq(t, 1007, resp.Index)
+
+	// Verify only jobs without Vault identity are returned.
+	must.Len(t, 2, resp.JobsWithoutVaultIdentity)
+	must.SliceContains(t, resp.JobsWithoutVaultIdentity, jobNoWID.Stub(nil, nil), must.Cmp(cmpopts.IgnoreFields(
+		structs.JobListStub{},
+		"Status",
+		"ModifyIndex",
+	)))
+	must.SliceContains(t, resp.JobsWithoutVaultIdentity, jobNoWIDNonDefaultNS.Stub(nil, nil), must.Cmp(cmpopts.IgnoreFields(
+		structs.JobListStub{},
+		"Status",
+		"ModifyIndex",
+	)))
+
+	// Verify only outdated nodes are returned.
+	must.Len(t, 1, resp.OutdatedNodes)
+	must.SliceContains(t, resp.OutdatedNodes, outdatedNode.Stub(nil))
+
+	// Verify Vault ACL tokens are returned.
+	must.Len(t, 2, resp.VaultTokens)
+	must.SliceContains(t, resp.VaultTokens, tokenJobNoWID)
+	must.SliceContains(t, resp.VaultTokens, tokenUnused)
+}
+
+// TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity_ACL verifies that the
+// upgrade check RPC succeeds for the root token and for a token with operator
+// read access, and is denied for a namespace-only token and for requests
+// without a token.
+func TestOperator_UpgradeCheckRequest_VaultWorkloadIdentity_ACL(t *testing.T) {
+	ci.Parallel(t)
+
+	s1, root, cleanupS1 := TestACLServer(t, nil)
+	defer cleanupS1()
+	testutil.WaitForLeader(t, s1.RPC)
+
+	codec := rpcClient(t, s1)
+	state := s1.fsm.State()
+
+	// Tokens used by the cases below: one with operator read access and one
+	// limited to namespace write access.
+	operatorRead := mock.CreatePolicyAndToken(t, state, 1000, "allowed", `operator {policy = "read"}`)
+	namespaceOnly := mock.CreatePolicyAndToken(t, state, 1002, "not-allowed", mock.NamespacePolicy("default", "write", nil))
+
+	denied := structs.ErrPermissionDenied.Error()
+
+	cases := []struct {
+		name    string
+		token   string
+		wantErr string
+	}{
+		{name: "root token is allowed", token: root.SecretID, wantErr: ""},
+		{name: "operator read token is allowed", token: operatorRead.SecretID, wantErr: ""},
+		{name: "token not allowed", token: namespaceOnly.SecretID, wantErr: denied},
+		{name: "missing token not allowed", token: "", wantErr: denied},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Issue the RPC authenticated with the token under test.
+			req := &structs.UpgradeCheckVaultWorkloadIdentityRequest{
+				QueryOptions: structs.QueryOptions{
+					Region:    "global",
+					AuthToken: tc.token,
+				},
+			}
+			var reply structs.UpgradeCheckVaultWorkloadIdentityResponse
+			err := msgpackrpc.CallWithCodec(codec, "Operator.UpgradeCheckVaultWorkloadIdentity", req, &reply)
+
+			if tc.wantErr != "" {
+				must.ErrorContains(t, err, tc.wantErr)
+				return
+			}
+			must.NoError(t, err)
+		})
+	}
+}
diff --git a/nomad/structs/operator.go b/nomad/structs/operator.go
index c68bd668d90..09246c83da0 100644
--- a/nomad/structs/operator.go
+++ b/nomad/structs/operator.go
@@ -373,3 +373,15 @@ type SnapshotRestoreResponse struct {
QueryMeta
}
+
+// UpgradeCheckVaultWorkloadIdentityRequest is the request sent to the
+// Operator.UpgradeCheckVaultWorkloadIdentity RPC.
+type UpgradeCheckVaultWorkloadIdentityRequest struct {
+ QueryOptions
+}
+
+// UpgradeCheckVaultWorkloadIdentityResponse is the reply of the
+// Operator.UpgradeCheckVaultWorkloadIdentity RPC.
+type UpgradeCheckVaultWorkloadIdentityResponse struct {
+ // JobsWithoutVaultIdentity lists the jobs with a task that has a vault
+ // block but no identity for Vault.
+ JobsWithoutVaultIdentity []*JobListStub
+ // OutdatedNodes lists the nodes whose Nomad version is lower than
+ // MinNomadVersionVaultWID or could not be parsed.
+ OutdatedNodes []*NodeListStub
+ // VaultTokens lists the Vault token accessors stored in the state store.
+ VaultTokens []*VaultAccessor
+
+ QueryMeta
+}
diff --git a/nomad/structs/workload_id.go b/nomad/structs/workload_id.go
index 144103ab5b9..368acb371d4 100644
--- a/nomad/structs/workload_id.go
+++ b/nomad/structs/workload_id.go
@@ -11,6 +11,7 @@ import (
"time"
"github.com/hashicorp/go-multierror"
+ "github.com/hashicorp/go-version"
)
const (
@@ -52,6 +53,12 @@ var (
// validIdentityName is used to validate workload identity Name fields. Must
// be safe to use in filenames.
validIdentityName = regexp.MustCompile("^[a-zA-Z0-9-_]{1,128}$")
+
+ // MinNomadVersionVaultWID is the minimum version of Nomad that supports
+ // workload identities for Vault.
+ // "-a" is used here so that it is "less than" all pre-release versions of
+ // Nomad 1.7.0 as well.
+ MinNomadVersionVaultWID = version.Must(version.NewVersion("1.7.0-a"))
)
// WorkloadIdentity is the jobspec block which determines if and how a workload
diff --git a/website/content/api-docs/operator/upgrade-check.mdx b/website/content/api-docs/operator/upgrade-check.mdx
new file mode 100644
index 00000000000..8659c6b1359
--- /dev/null
+++ b/website/content/api-docs/operator/upgrade-check.mdx
@@ -0,0 +1,187 @@
+---
+layout: api
+page_title: Upgrade Check - Operator - HTTP API
+description: |-
+ The /operator/upgrade-check endpoints provide tools for verifying the state
+ of the cluster prior to upgrades.
+---
+
+# Upgrade Check Operator HTTP API
+
+The `/operator/upgrade-check` endpoints provide some predefined verifications
+that can be useful prior to upgrades and changes to Nomad configuration.
+
+
+
+These endpoints are meant to target specific releases of Nomad and may be
+removed or modified without notice.
+
+
+
+## Vault Workload Identity
+
+This endpoint retrieves jobs, nodes, and Vault ACL tokens that may be affected
+when migrating a Nomad cluster to use [workload identities for
+Vault][nomad_acl_vault_wid].
+
+| Method | Path | Produces |
+| ------ | ---------------------------------------------------- | ------------------ |
+| `GET` | `/v1/operator/upgrade-check/vault-workload-identity` | `application/json` |
+
+The table below shows this endpoint's support for
+[blocking queries](/nomad/api-docs#blocking-queries) and
+[required ACLs](/nomad/api-docs#acls).
+
+| Blocking Queries | ACL Required |
+| ---------------- | --------------- |
+| `NO` | `operator:read` |
+
+### Sample Request
+
+```shell-session
+$ nomad operator api \
+ /v1/operator/upgrade-check/vault-workload-identity
+```
+
+### Sample Response
+
+```json
+{
+ "Index": 20,
+ "JobsWithoutVaultIdentity": [
+ {
+ "CreateIndex": 11,
+ "Datacenters": [
+ "*"
+ ],
+ "ID": "example",
+ "JobModifyIndex": 11,
+ "JobSummary": null,
+ "ModifyIndex": 19,
+ "Multiregion": null,
+ "Name": "example",
+ "Namespace": "default",
+ "NodePool": "default",
+ "ParameterizedJob": false,
+ "ParentID": "",
+ "Periodic": false,
+ "Priority": 50,
+ "Status": "running",
+ "StatusDescription": "",
+ "Stop": false,
+ "SubmitTime": 1704995322434188000,
+ "Type": "service"
+ }
+ ],
+ "KnownLeader": true,
+ "LastContact": 0,
+ "NextToken": "",
+ "OutdatedNodes": [
+ {
+ "Address": "192.168.0.186",
+ "CreateIndex": 8,
+ "Datacenter": "dc1",
+ "Drain": false,
+ "Drivers": {
+ "qemu": {
+ "Attributes": {
+ "driver.qemu": "true",
+ "driver.qemu.version": "8.1.1"
+ },
+ "Detected": true,
+ "HealthDescription": "Healthy",
+ "Healthy": true,
+ "UpdateTime": "2024-01-11T12:48:35.993541-05:00"
+ },
+ "exec": {
+ "Attributes": {},
+ "Detected": false,
+ "HealthDescription": "exec driver unsupported on client OS",
+ "Healthy": false,
+ "UpdateTime": "2024-01-11T12:48:35.958495-05:00"
+ },
+ "raw_exec": {
+ "Attributes": {
+ "driver.raw_exec": "true"
+ },
+ "Detected": true,
+ "HealthDescription": "Healthy",
+ "Healthy": true,
+ "UpdateTime": "2024-01-11T12:48:35.958539-05:00"
+ },
+ "java": {
+ "Attributes": {},
+ "Detected": false,
+ "HealthDescription": "",
+ "Healthy": false,
+ "UpdateTime": "2024-01-11T12:48:35.97141-05:00"
+ },
+ "docker": {
+ "Attributes": {
+ "driver.docker.bridge_ip": "172.17.0.1",
+ "driver.docker.runtimes": "io.containerd.runc.v2,runc",
+ "driver.docker.os_type": "linux",
+ "driver.docker": "true",
+ "driver.docker.version": "24.0.7"
+ },
+ "Detected": true,
+ "HealthDescription": "Healthy",
+ "Healthy": true,
+ "UpdateTime": "2024-01-11T12:48:35.989993-05:00"
+ }
+ },
+ "HostVolumes": null,
+ "ID": "049f7683-0cde-727f-428a-913a89f92bd8",
+ "LastDrain": null,
+ "ModifyIndex": 10,
+ "Name": "client-1",
+ "NodeClass": "",
+ "NodePool": "default",
+ "SchedulingEligibility": "eligible",
+ "Status": "ready",
+ "StatusDescription": "",
+ "Version": "1.6.4"
+ }
+ ],
+ "VaultTokens": [
+ {
+ "Accessor": "czh9MPcRXzAhxBL9XKyb3Kh1",
+ "AllocID": "f00893d4-d9ef-4937-6a7a-ab495b68a971",
+ "CreateIndex": 14,
+ "CreationTTL": 60,
+ "NodeID": "049f7683-0cde-727f-428a-913a89f92bd8",
+ "Task": "redis"
+ }
+ ]
+}
+```
+
+#### Field Reference
+
+- `JobsWithoutVaultIdentity` `(array)` - The list of jobs that have a
+ [`vault`][] block but do not have an [`identity`][] for Vault
+ authentication. These jobs can fail if they are not redeployed with an
+ identity for Vault before the configuration for Nomad servers is updated and
+ their access to Vault is removed.
+
+- `OutdatedNodes` `(array)` - The list of nodes running a version of
+ Nomad that does not support workload identity authentication for Vault.
+ Allocations placed on these nodes will use the deprecated legacy flow to
+ retrieve Vault tokens. If the Nomad servers configuration is updated to remove
+ their access to Vault before these nodes are upgraded, these allocations will
+ fail. Allocations that use workload identity for Vault cannot be placed on
+ these nodes until they are upgraded.
+
+- `VaultTokens` `(array)` - The list of Vault ACL tokens created
+ by Nomad servers using the deprecated legacy flow. They will continue to work
+ even after the migration to the workload identities, but they may not be
+ automatically revoked by Nomad and will only expire once their TTL reaches
+ zero.
+
+Refer to [Migrating to Using Workload Identity with
+Vault][nomad_acl_vault_wid_migrate] for more information.
+
+[`identity`]: /nomad/docs/job-specification/identity
+[`vault`]: /nomad/docs/job-specification/vault
+[nomad_acl_vault_wid]: /nomad/docs/integrations/vault/acl#nomad-workload-identities
+[nomad_acl_vault_wid_migrate]: /nomad/docs/integrations/vault/acl#migrating-to-using-workload-identity-with-vault
diff --git a/website/content/docs/commands/setup/vault.mdx b/website/content/docs/commands/setup/vault.mdx
index ac5da6fa485..a016ea0366a 100644
--- a/website/content/docs/commands/setup/vault.mdx
+++ b/website/content/docs/commands/setup/vault.mdx
@@ -14,6 +14,13 @@ This command requires `acl:write` permissions for Vault and respects
`VAULT_TOKEN`, `VAULT_ADDR`, and other [Vault-related environment
variables][vaultenv].
+The `-check` option can be used to verify if the Nomad cluster is ready to
+migrate to use Workload Identities with Vault. This option requires
+`operator:read` permission for Nomad.
+
+Refer to [Migrating to Using Workload Identity with
+Vault][nomad_acl_vault_wid_migrate] for more information.
+
This command is an experimental feature and may change its behavior in future
@@ -38,6 +45,19 @@ nomad setup vault [options]
- `-y`: Automatically answers `yes` to all the questions, making the setup
non-interactive. Defaults to `false`.
+- `-check`: Verify if the Nomad cluster is ready to migrate to Workload
+ Identities.
+
+### Setup Vault Options When Using `-check`:
+
+- `-json`: Output migration status information in its JSON format.
+
+- `-t`: Format and display migration status information using a Go template.
+
+- `-verbose`: Display full information.
+
+@include 'general_options_no_namespace.mdx'
+
## Examples
Below is an example of an interactive session with default options, interrupted
@@ -145,4 +165,46 @@ services using workload identities.
Run the command again to finish the configuration process.
```
+The `-check` option can be used to verify if a cluster is ready to migrate to
+using workload identities with Vault.
+
+```
+$ nomad setup vault -check
+
+Jobs Without Workload Identity for Vault
+The following jobs access Vault but are not configured for workload identity.
+
+You should redeploy them before fully migrating to workload identities with
+Vault to prevent unexpected errors if their tokens need to be recreated.
+
+Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration
+for more information.
+
+ID Namespace Type Status
+example default service running
+
+Outdated Nodes
+The following nodes are running a version of Nomad that does not support using
+workload identities with Vault.
+
+You should upgrade them to Nomad 1.7 before fully migrating to workload
+identities with Vault to prevent unexpected errors if they receive allocations
+for jobs that use Vault.
+
+Refer to https://developer.hashicorp.com/nomad/s/vault-workload-identity-migration
+for more information.
+
+ID Name Address Version Drain Eligibility Status
+049f7683 client-1 192.168.0.186 1.6.4 false eligible ready
+
+Vault Tokens
+The following Vault ACL tokens were created by Nomad but will not be
+automatically revoked after migrating to workload identities. They will expire
+once their TTL reaches zero.
+
+Accessor ID Allocation ID Node ID Configured TTL
+czh9MPcRXzAhxBL9XKyb3Kh1 f00893d4 049f7683 60
+```
+
[vaultenv]: /vault/docs/commands#environment-variables
+[nomad_acl_vault_wid_migrate]: /nomad/docs/integrations/vault/acl#migrating-to-using-workload-identity-with-vault
diff --git a/website/content/docs/integrations/vault/acl.mdx b/website/content/docs/integrations/vault/acl.mdx
index 6a6ce4dc703..1f1c7fdb28e 100644
--- a/website/content/docs/integrations/vault/acl.mdx
+++ b/website/content/docs/integrations/vault/acl.mdx
@@ -763,25 +763,60 @@ $ VAULT_TOKEN=s.H39hfS7eHSbb1GpkdzOQLTmz.fvuLy nomad job run vault.nomad
Migrating from the legacy (pre-1.7) workflow where workloads use the agent's
Vault token requires configuration on your Vault cluster and your Nomad server
-agents. It does not require updating your running Nomad jobs unless you wish to
-specify a non-default role. To migrate:
+agents.
+
+Once the migration is fully complete, Nomad servers will no longer have access
+to Vault, as was required in the deprecated legacy workflow. This also means
+that they will no longer be able to fulfill some of their responsibilities from
+the legacy workflow, such as generating and revoking Vault ACL tokens.
+
+Before removing Vault connectivity configuration from Nomad servers, you must
+make sure the rest of the cluster is ready to support workload identities for
+Vault. You can run the [`nomad setup vault -check`][nomad_cli_setup_vault]
+command to verify what changes are still necessary.
+
+Before removing Nomad servers' access to Vault, you must:
+
+ * Redeploy the jobs listed in the section `Jobs Without Workload Identity for
+ Vault` with an identity for Vault. You can specify this identity [directly
+ in the job][jobspec_identity_vault] or redeploy the job without changes to
+ use the default value from the server [`vault.default_identity`][]
+ configuration if set.
+ * Upgrade nodes listed in the section `Outdated Nodes` to Nomad 1.7.0 or
+ later.
+
+There is no action required for the Vault ACL tokens listed under `Vault
+Tokens`. Nomad will revoke them as you redeploy jobs to use workload identities
+but there may be some leftovers. You can still proceed with the migration
+process, but Nomad will not revoke them once access to Vault is removed from
+Nomad servers. They will expire once their TTL reaches zero, or you may
+manually revoke them if they are no longer needed by an allocation.
+
+The migration process can happen over time. As long as all servers are upgraded
+to Nomad 1.7+ and still retain access to Vault, jobs can still use either the
+new workload identity flow or the deprecated legacy flow.
+
+To summarize the migration process:
-* Create the Vault auth method, default role, and policies on your Vault
- cluster.
* Enable [`vault.default_identity`][] blocks in your Nomad server agent
configurations, but **do not modify any of the existing Vault
configuration**.
* Upgrade your cluster following the documented [Upgrade
Process][docs_upgrade].
-* Resubmit Nomad jobs that need access to Vault to redeploy them with a new
- workload identity for Vault.
- * (Optionally) Add [`vault.role`][] fields to any Nomad jobs that will not
- use the default role.
- * (Optionally) add [`identity`][] blocks to your jobs if you want to use a
- different identity because of how your auth method and roles are
- configured.
-* Once all jobs have been resubmitted, you may remove parameters no longer used
- by the Nomad server agents from the [`vault`][config] configuration block.
+* Create the Vault auth method, default role, and policies on your Vault
+ cluster.
+* Run the `nomad setup vault -check` command to verify if the cluster is ready
+ to migrate to workload identity access to Vault.
+ * Resubmit Nomad jobs that need access to Vault to redeploy them with a new
+ workload identity for Vault.
+ * (Optionally) Add [`vault.role`][] fields to any Nomad jobs that will not
+ use the default role.
+ * (Optionally) add [`identity`][] blocks to your jobs if you want to use a
+ different identity because of how your auth method and roles are
+ configured.
+ * Upgrade any remaining clients to Nomad 1.7+.
+* Remove parameters no longer used by the Nomad server agents from the
+ [`vault`][config] configuration block.
[Variables]: /nomad/docs/concepts/variables
[Vault Namespaces]: /vault/docs/enterprise/namespaces
diff --git a/website/data/api-docs-nav-data.json b/website/data/api-docs-nav-data.json
index 0ba76dac667..e89bec40ac8 100644
--- a/website/data/api-docs-nav-data.json
+++ b/website/data/api-docs-nav-data.json
@@ -132,6 +132,10 @@
{
"title": "Snapshot",
"path": "operator/snapshot"
+ },
+ {
+ "title": "Upgrade Check",
+ "path": "operator/upgrade-check"
}
]
},
diff --git a/website/redirects.js b/website/redirects.js
index e03c6cdc854..69ec026686f 100644
--- a/website/redirects.js
+++ b/website/redirects.js
@@ -29,6 +29,17 @@ module.exports = [
permanent: true,
},
*/
+
+ /**
+ * /s/* redirects for useful links that need a stable URL but whose
+ * destination we may need to change in the future.
+ */
+ {
+ source: '/nomad/s/vault-workload-identity-migration',
+ destination:
+ 'https://developer.hashicorp.com/nomad/docs/integrations/vault/acl#migrating-to-using-workload-identity-with-vault',
+ permanent: false,
+ },
// Rename and re-arrange Autoscaling Internals section
{
source: '/nomad/tools/autoscaling/internals/:path*',