Skip to content

Commit

Permalink
Add gpus_per_node support for SLURM, Torque and Kubernetes (#266)
Browse files Browse the repository at this point in the history
Fixes #263
  • Loading branch information
treydock authored May 21, 2021
1 parent a0bcd6c commit 7aaad32
Show file tree
Hide file tree
Showing 10 changed files with 55 additions and 1 deletion.
1 change: 1 addition & 0 deletions lib/ood_core/job/adapters/kubernetes/batch.rb
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ def generate_id_yml(script)
spec = OodCore::Job::Adapters::Kubernetes::Resources::PodSpec.new(container, init_containers: init_containers)
all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]
node_selector = native_data[:node_selector].nil? ? {} : native_data[:node_selector]
gpu_type = native_data[:gpu_type].nil? ? "nvidia.com/gpu" : native_data[:gpu_type]

template = ERB.new(File.read(resource_file), nil, '-')

Expand Down
6 changes: 6 additions & 0 deletions lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,15 @@ spec:
limits:
memory: "<%= spec.container.memory %>"
cpu: "<%= spec.container.cpu %>"
<%- unless script.gpus_per_node.nil? -%>
<%= gpu_type %>: <%= script.gpus_per_node %>
<%- end -%>
requests:
memory: "<%= spec.container.memory %>"
cpu: "<%= spec.container.cpu %>"
<%- unless script.gpus_per_node.nil? -%>
<%= gpu_type %>: <%= script.gpus_per_node %>
<%- end -%>
securityContext:
allowPrivilegeEscalation: false
capabilities:
Expand Down
1 change: 1 addition & 0 deletions lib/ood_core/job/adapters/slurm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,7 @@ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
args.concat ["-t", seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
args.concat ['-a', script.job_array_request] unless script.job_array_request.nil?
args.concat ['--qos', script.qos] unless script.qos.nil?
args.concat ['--gpus-per-node', script.gpus_per_node] unless script.gpus_per_node.nil?
# ignore nodes, don't know how to do this for slurm

# Set dependencies
Expand Down
2 changes: 2 additions & 0 deletions lib/ood_core/job/adapters/torque.rb
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
args.concat ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
args.concat ['-t', script.job_array_request] unless script.job_array_request.nil?
args.concat ['-l', "qos=#{script.qos}"] unless script.qos.nil?
args.concat ['-l', "gpus=#{script.gpus_per_node}"] unless script.gpus_per_node.nil?

# Set environment variables
env = script.job_environment.to_h
args.concat ["-v", env.keys.join(",")] unless env.empty?
Expand Down
9 changes: 8 additions & 1 deletion lib/ood_core/job/script.rb
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ class Script
# @return [String, nil] qos
attr_reader :qos

# The GPUs per node for the job
# @return [Integer, nil] gpus per node
attr_reader :gpus_per_node

# Object detailing any native specifications that are implementation specific
# @note Should be avoided at all costs.
# @return [Object, nil] native specifications
Expand Down Expand Up @@ -136,6 +140,7 @@ class Script
# @param accounting_id [#to_s, nil] accounting id
# @param job_array_request [#to_s, nil] job array request
# @param qos [#to_s, nil] qos
# @param gpus_per_node [#to_i, nil] gpus per node
# @param native [Object, nil] native specifications
# @param copy_environment [Boolean, nil] copy the environment
def initialize(content:, args: nil, submit_as_hold: nil, rerunnable: nil,
Expand All @@ -145,7 +150,7 @@ def initialize(content:, args: nil, submit_as_hold: nil, rerunnable: nil,
output_path: nil, error_path: nil, reservation_id: nil,
queue_name: nil, priority: nil, start_time: nil,
wall_time: nil, accounting_id: nil, job_array_request: nil,
qos: nil, native: nil, copy_environment: nil, **_)
qos: nil, gpus_per_node: nil, native: nil, copy_environment: nil, **_)
@content = content.to_s

@submit_as_hold = submit_as_hold
Expand All @@ -170,6 +175,7 @@ def initialize(content:, args: nil, submit_as_hold: nil, rerunnable: nil,
@accounting_id = accounting_id && accounting_id.to_s
@job_array_request = job_array_request && job_array_request.to_s
@qos = qos && qos.to_s
@gpus_per_node = gpus_per_node && gpus_per_node.to_i
@native = native
@copy_environment = (copy_environment.nil?) ? nil : !! copy_environment
end
Expand Down Expand Up @@ -200,6 +206,7 @@ def to_h
accounting_id: accounting_id,
job_array_request: job_array_request,
qos: qos,
gpus_per_node: gpus_per_node,
native: native,
copy_environment: copy_environment
}
Expand Down
2 changes: 2 additions & 0 deletions spec/fixtures/output/k8s/pod_yml_from_all_configs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,11 @@ spec:
limits:
memory: "6Gi"
cpu: "4"
nvidia.com/gpu: 1
requests:
memory: "6Gi"
cpu: "4"
nvidia.com/gpu: 1
securityContext:
allowPrivilegeEscalation: false
capabilities:
Expand Down
1 change: 1 addition & 0 deletions spec/job/adapters/kubernetes/batch_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ def build_script(opts = {})
it "submits with correct yml file given all config options" do
script = build_script(
accounting_id: 'test',
gpus_per_node: 1,
native: {
container: {
name: 'rspec-test',
Expand Down
12 changes: 12 additions & 0 deletions spec/job/adapters/slurm_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,18 @@ def build_script(opts = {})
it { expect(slurm).to have_received(:submit_string).with(content, args: ["-t", "26:15:34", "--export", "NONE"], env: {}) }
end

context "with :qos" do
  # Submit a script whose only extra option is the qos.
  before do
    adapter.submit(build_script(qos: "test"))
  end

  # The qos value must be handed to the batch object as a "--qos" argument.
  it do
    expected_args = ["--qos", "test", "--export", "NONE"]
    expect(slurm).to have_received(:submit_string).with(content, args: expected_args, env: {})
  end
end

context "with :gpus_per_node" do
  # Submit a script whose only extra option is the GPU count per node.
  before do
    adapter.submit(build_script(gpus_per_node: 1))
  end

  # The GPU count must be handed to the batch object as a "--gpus-per-node" argument.
  it do
    expected_args = ["--gpus-per-node", 1, "--export", "NONE"]
    expect(slurm).to have_received(:submit_string).with(content, args: expected_args, env: {})
  end
end

context "with :native" do
before { adapter.submit(build_script(native: ["A", "B", "C"])) }

Expand Down
6 changes: 6 additions & 0 deletions spec/job/adapters/torque_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,12 @@ def build_script(opts = {})
it { expect(pbs).to have_received(:submit).with(content, args: ["-l", "qos=high", "-j", "oe"], env: {}, chdir: nil)}
end

context "with :gpus_per_node" do
  # Submit a script whose only extra option is the GPU count per node.
  before do
    adapter.submit(build_script(gpus_per_node: 1))
  end

  # The GPU count must reach the pbs object as a "-l gpus=N" resource request.
  it do
    expected_args = ["-l", "gpus=1", "-j", "oe"]
    expect(pbs).to have_received(:submit).with(content, args: expected_args, env: {}, chdir: nil)
  end
end

context "with :native" do
before { adapter.submit(build_script(native: ["A", "B", "C"])) }

Expand Down
16 changes: 16 additions & 0 deletions spec/job/script_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def build_script(opts = {})
it { is_expected.to respond_to(:wall_time) }
it { is_expected.to respond_to(:accounting_id) }
it { is_expected.to respond_to(:job_array_request) }
it { is_expected.to respond_to(:qos) }
it { is_expected.to respond_to(:gpus_per_node) }
it { is_expected.to respond_to(:native) }
it { is_expected.to respond_to(:to_h) }
it { is_expected.to respond_to(:copy_environment) }
Expand Down Expand Up @@ -178,6 +180,18 @@ def build_script(opts = {})
it { is_expected.to eq("my_account") }
end

describe "#qos" do
  # Build the script with an object that only answers #to_s, then read back qos.
  subject do
    qos_like = double(to_s: "test")
    build_script(qos: qos_like).qos
  end

  # The reader is expected to return the string form of what was passed in.
  it do
    is_expected.to eq("test")
  end
end

describe "#gpus_per_node" do
  # Build the script with an object that only answers #to_i, then read back gpus_per_node.
  subject do
    gpu_count_like = double(to_i: 1)
    build_script(gpus_per_node: gpu_count_like).gpus_per_node
  end

  # The reader is expected to return the integer form of what was passed in.
  it do
    is_expected.to eq(1)
  end
end

describe "#native" do
subject { build_script(native: "native").native }

Expand Down Expand Up @@ -209,6 +223,8 @@ def build_script(opts = {})
it { is_expected.to have_key(:wall_time) }
it { is_expected.to have_key(:accounting_id) }
it { is_expected.to have_key(:job_array_request) }
it { is_expected.to have_key(:qos) }
it { is_expected.to have_key(:gpus_per_node) }
it { is_expected.to have_key(:native) }
it { is_expected.to have_key(:copy_environment) }
end
Expand Down

0 comments on commit 7aaad32

Please sign in to comment.