diff --git a/.changelog/19670.txt b/.changelog/19670.txt new file mode 100644 index 00000000000..da25850c442 --- /dev/null +++ b/.changelog/19670.txt @@ -0,0 +1,3 @@ +```release-note:improvement +drivers: Enable configuring a raw_exec task to not have an upper memory limit +``` diff --git a/drivers/shared/executor/executor_universal_linux.go b/drivers/shared/executor/executor_universal_linux.go index 32c4bd90295..77703848da4 100644 --- a/drivers/shared/executor/executor_universal_linux.go +++ b/drivers/shared/executor/executor_universal_linux.go @@ -1,6 +1,8 @@ // Copyright (c) HashiCorp, Inc. // SPDX-License-Identifier: MPL-2.0 +//go:build linux + package executor import ( @@ -20,6 +22,12 @@ import ( "golang.org/x/sys/unix" ) +const ( + // memoryNoLimit is a sentinel value for memory_max that indicates the + // raw_exec driver should not enforce a maximum memory limit + memoryNoLimit = -1 +) + // setCmdUser takes a user id as a string and looks up the user, and sets the command // to execute as that user. 
func setCmdUser(cmd *exec.Cmd, userid string) error { @@ -226,7 +234,11 @@ func (e *UniversalExecutor) configureCG2(cgroup string, command *ExecCommand) { // write memory cgroup files memHard, memSoft := e.computeMemory(command) ed := cgroupslib.OpenPath(cgroup) - _ = ed.Write("memory.max", strconv.FormatInt(memHard, 10)) + if memHard == memoryNoLimit { + _ = ed.Write("memory.max", "max") + } else { + _ = ed.Write("memory.max", strconv.FormatInt(memHard, 10)) + } if memSoft > 0 { ed = cgroupslib.OpenPath(cgroup) _ = ed.Write("memory.low", strconv.FormatInt(memSoft, 10)) @@ -264,17 +276,29 @@ func (*UniversalExecutor) computeCPU(command *ExecCommand) uint64 { return cpuWeight } +func mbToBytes(n int64) int64 { + return n * 1024 * 1024 +} + // computeMemory returns the hard and soft memory limits for the task func (*UniversalExecutor) computeMemory(command *ExecCommand) (int64, int64) { mem := command.Resources.NomadResources.Memory memHard, memSoft := mem.MemoryMaxMB, mem.MemoryMB - if memHard <= 0 { + + switch memHard { + case 0: + // typical case where 'memory' is the hard limit memHard = mem.MemoryMB - memSoft = 0 + return mbToBytes(memHard), 0 + case memoryNoLimit: + // special oversub case where 'memory' is soft limit and there is no + // hard limit - helping re-create old raw_exec behavior + return memoryNoLimit, mbToBytes(memSoft) + default: + // typical oversub case where 'memory' is soft limit and 'memory_max' + // is hard limit + return mbToBytes(memHard), mbToBytes(memSoft) } - memHardBytes := memHard * 1024 * 1024 - memSoftBytes := memSoft * 1024 * 1024 - return memHardBytes, memSoftBytes } // withNetworkIsolation calls the passed function the network namespace `spec` diff --git a/drivers/shared/executor/executor_universal_linux_test.go b/drivers/shared/executor/executor_universal_linux_test.go new file mode 100644 index 00000000000..739057c2ccb --- /dev/null +++ b/drivers/shared/executor/executor_universal_linux_test.go @@ -0,0 +1,68 @@ +// Copyright 
(c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +//go:build linux + +package executor + +import ( + "fmt" + "testing" + + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/plugins/drivers" + "github.com/shoenig/test/must" +) + +func Test_computeMemory(t *testing.T) { + cases := []struct { + memory int64 + memoryMax int64 + expSoft int64 + expHard int64 + }{ + { + // typical case; only 'memory' is set and that is used as the hard + // memory limit + memory: 100, + memoryMax: 0, + expSoft: 0, + expHard: mbToBytes(100), + }, + { + // oversub case; both 'memory' and 'memory_max' are set and used as + // the soft and hard memory limits + memory: 100, + memoryMax: 200, + expSoft: mbToBytes(100), + expHard: mbToBytes(200), + }, + { + // special oversub case; 'memory' is set and 'memory_max' is set to + // -1; which indicates there should be no hard limit (i.e. -1 / max) + memory: 100, + memoryMax: memoryNoLimit, + expSoft: mbToBytes(100), + expHard: memoryNoLimit, + }, + } + + for _, tc := range cases { + name := fmt.Sprintf("(%d,%d)", tc.memory, tc.memoryMax) + t.Run(name, func(t *testing.T) { + command := &ExecCommand{ + Resources: &drivers.Resources{ + NomadResources: &structs.AllocatedTaskResources{ + Memory: structs.AllocatedMemoryResources{ + MemoryMB: tc.memory, + MemoryMaxMB: tc.memoryMax, + }, + }, + }, + } + hard, soft := (*UniversalExecutor)(nil).computeMemory(command) + must.Eq(t, tc.expSoft, soft) + must.Eq(t, tc.expHard, hard) + }) + } +} diff --git a/e2e/rawexec/input/oversubmax.hcl b/e2e/rawexec/input/oversubmax.hcl new file mode 100644 index 00000000000..011ecd79710 --- /dev/null +++ b/e2e/rawexec/input/oversubmax.hcl @@ -0,0 +1,38 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +job "oversubmax" { + type = "batch" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "group" { + reschedule { + attempts = 0 + unlimited = false + } + + restart { + attempts = 0 + mode = "fail" + } + + task "cat" { + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "cat /sys/fs/cgroup/$(cat /proc/self/cgroup | cut -d':' -f3)/memory.{low,max}"] + } + + resources { + cpu = 100 + memory = 64 + memory_max = -1 # unlimited + } + } + } +} diff --git a/e2e/rawexec/rawexec_test.go b/e2e/rawexec/rawexec_test.go index 8907e797e16..6c52a93d8a3 100644 --- a/e2e/rawexec/rawexec_test.go +++ b/e2e/rawexec/rawexec_test.go @@ -4,6 +4,7 @@ package rawexec import ( + "regexp" "testing" "github.com/hashicorp/nomad/e2e/v3/cluster3" @@ -18,6 +19,8 @@ func TestRawExec(t *testing.T) { ) t.Run("testOomAdj", testOomAdj) + t.Run("testOversubMemory", testOversubMemory) + t.Run("testOversubMemoryUnlimited", testOversubMemoryUnlimited) } func testOomAdj(t *testing.T) { @@ -27,3 +30,21 @@ func testOomAdj(t *testing.T) { logs := job.TaskLogs("group", "cat") must.StrContains(t, logs.Stdout, "0") } + +func testOversubMemory(t *testing.T) { + job, cleanup := jobs3.Submit(t, "./input/oversub.hcl") + t.Cleanup(cleanup) + + logs := job.TaskLogs("group", "cat") + must.StrContains(t, logs.Stdout, "134217728") // 128 mb memory_max +} + +func testOversubMemoryUnlimited(t *testing.T) { + job, cleanup := jobs3.Submit(t, "./input/oversubmax.hcl") + t.Cleanup(cleanup) + + // will print memory.low then memory.max + logs := job.TaskLogs("group", "cat") + logsRe := regexp.MustCompile(`67108864\s+max`) + must.RegexMatch(t, logsRe, logs.Stdout) +} diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 2f83d036a27..44d18789516 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2466,6 +2466,12 @@ func (r *Resources) DiskInBytes() int64 { return int64(r.DiskMB * BytesInMegabyte) } +const 
( + // memoryNoLimit is a sentinel value indicating there is no upper hard + // memory limit + memoryNoLimit = -1 +) + func (r *Resources) Validate() error { var mErr multierror.Error @@ -2488,7 +2494,9 @@ func (r *Resources) Validate() error { } } - if r.MemoryMaxMB != 0 && r.MemoryMaxMB < r.MemoryMB { + // ensure memory_max is not less than memory, unless it is set to 0 or -1, which + // are both sentinel values + if (r.MemoryMaxMB != 0 && r.MemoryMaxMB != memoryNoLimit) && r.MemoryMaxMB < r.MemoryMB { mErr.Errors = append(mErr.Errors, fmt.Errorf("MemoryMaxMB value (%d) should be larger than MemoryMB value (%d)", r.MemoryMaxMB, r.MemoryMB)) } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 0961f845b76..05ff3081de4 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -2139,6 +2139,14 @@ func TestTask_Validate_Resources(t *testing.T) { }, err: "MemoryMaxMB value (10) should be larger than MemoryMB value (200", }, + { + name: "memory max no limit", + res: &Resources{ + CPU: 100, + MemoryMB: 200, + MemoryMaxMB: -1, + }, + }, } for i := range cases { diff --git a/website/content/docs/drivers/raw_exec.mdx b/website/content/docs/drivers/raw_exec.mdx index aacbe10a3b5..951bdca7eb3 100644 --- a/website/content/docs/drivers/raw_exec.mdx +++ b/website/content/docs/drivers/raw_exec.mdx @@ -141,6 +141,18 @@ properly. Nomad will not leak any processes if cgroups are being used to manage the process tree. Cgroups are used on Linux when Nomad is being run with appropriate privileges, and the cgroup system is mounted. +If the cluster is configured with memory oversubscription enabled, a task using +the `raw_exec` driver can be configured to have no maximum memory limit by +setting `memory_max = -1`. + +```hcl +resources { + cpu = 500 + memory = 128 + memory_max = -1 # no limit +} +``` + [hardening]: /nomad/docs/install/production/requirements#user-permissions [plugin-options]: #plugin-options