# Count the GPUs named in a job's GRES string.
#
# The string is a comma-separated list of entries. Every entry containing
# "gpu:" contributes the count found in its last colon-separated field, so
# both "gres:gpu:2" and "gres:gpu:v100-32g:2" count as 2. Entries for other
# resources ("gres:pfsdir:ess"), as well as nil, "" and "N/A", yield 0.
# parse_job_info passes v[:gres] here to fill Info#gpus.
#
# @param gres [#to_s, nil] GRES field, e.g. "gres:gpu:v100-32g:2,gres:pfsdir:1"
# @return [Integer] sum of the gpu counts over all entries
def gpus_from_gres(gres)
  # Consume the optional type fields explicitly instead of with a greedy
  # [^,]*, which left only the final digit for (\d+): "gpu:16" came back as 6
  # and "gpu:v100:10" as 0.
  gres.to_s.scan(/gpu:(?:[^,:]*:)*(\d+)/).flatten.map(&:to_i).sum
end
# Whether the job has any GPUs allocated.
#
# The constructor stores nil when gpus: nil is passed explicitly, so the
# count is coerced with to_i first; a missing count is treated as zero
# instead of raising NoMethodError on nil.
#
# @return [Boolean] true when the gpu count is greater than zero
def gpu?
  gpus.to_i.positive?
end
"\u001Eoscstaff\u001F5096321\u001Fbr006\u001F1\u001F1\u001F0\u001F1\u001F2019-03-18T10:50:25\u001F\u001F(null)\u001F5096321\u001Foscstaff\u001F15312\u001FOK\u001F*\u001F5096321\u001F*\u001FInteract\u001F*\u001F(null)\u001FN/A\u001F1:00:00\u001F59:55\u001F4400M\u001F0:05\u001F\u001Fr001\u001Fbash\u001F0\u001Frm-interact\u001FRM-small\u001F3985\u001FNone\u001F2019-03-18T10:50:20\u001FCD\u001FCOMPLETED\u001Fefranz\u001F1448\u001F(null)\u001F2019-03-18T10:50:20\u001F(null)\u001F(null)\u001F\u001FN/A\u001F0\u001F(null)\u001F*:*:*\u001F/home/efranz\u001F(null)" -puts "\u001Ect4s8dp\u001F4320602\u001Fn/a\u001F28\u001F1792\u001F0\u001F64\u001FN/A\u001F\u001F(null)\u001F4320602\u001Fct4s8dp\u001F15900\u001FNO\u001F*\u001F4320602\u001F*\u001FLES-data-init\u001F*\u001F(null)\u001FN/A\u001F2-00:00:00\u001F2-00:00:00\u001F123200M\u001F0:00\u001F\u001F\u001F/scratch/ct4s8dp/kyu2/LES-data/run.q\u001F0\u001Frmlrg\u001FRM\u001F11043\u001FResources\u001FN/A\u001FPD\u001FPENDING\u001Fkyu2\u001F66288\u001F(null)\u001F2018-10-30T20:42:56\u001F(null)\u001F(null)\u001F\u001FN/A\u001F0\u001F(null)\u001F*:*:*\u001F/scratch/ct4s8dp/kyu2/LES-data\u001F(null)" +puts "\u001Eoscstaff\u001F5096321\u001Fbr006\u001F1\u001F1\u001F0\u001F1\u001F2019-03-18T10:50:25\u001F\u001F(null)\u001F5096321\u001Foscstaff\u001F15312\u001FOK\u001F*\u001F5096321\u001F*\u001FInteract\u001F*\u001F(null)\u001FN/A\u001F1:00:00\u001F59:55\u001F4400M\u001F0:05\u001F\u001Fr001\u001Fbash\u001F0\u001Frm-interact\u001FRM-small\u001F3985\u001FNone\u001F2019-03-18T10:50:20\u001FCD\u001FCOMPLETED\u001Fefranz\u001F1448\u001F(null)\u001F2019-03-18T10:50:20\u001F(null)\u001F(null)\u001F\u001FN/A\u001F0\u001F(null)\u001F*:*:*\u001F/home/efranz\u001Fgres:gpu:1,gres:gpfs" +puts 
"\u001Ect4s8dp\u001F4320602\u001Fn/a\u001F28\u001F1792\u001F0\u001F64\u001FN/A\u001F\u001F(null)\u001F4320602\u001Fct4s8dp\u001F15900\u001FNO\u001F*\u001F4320602\u001F*\u001FLES-data-init\u001F*\u001F(null)\u001FN/A\u001F2-00:00:00\u001F2-00:00:00\u001F123200M\u001F0:00\u001F\u001F\u001F/scratch/ct4s8dp/kyu2/LES-data/run.q\u001F0\u001Frmlrg\u001FRM\u001F11043\u001FResources\u001FN/A\u001FPD\u001FPENDING\u001Fkyu2\u001F66288\u001F(null)\u001F2018-10-30T20:42:56\u001F(null)\u001F(null)\u001F\u001FN/A\u001F0\u001F(null)\u001F*:*:*\u001F/scratch/ct4s8dp/kyu2/LES-data\u001Fgres:pfsdir:ess" diff --git a/spec/job/adapters/slurm_spec.rb b/spec/job/adapters/slurm_spec.rb index 86e7351a1..c52a1c704 100644 --- a/spec/job/adapters/slurm_spec.rb +++ b/spec/job/adapters/slurm_spec.rb @@ -306,6 +306,8 @@ def build_script(opts = {}) expect(j1.status).to eq("completed") expect(j1.status).to eq(OodCore::Job::Status.new(state: :completed)) expect(j1.status.to_s).to eq("completed") + expect(j1.gpus).to eq(1) + expect(j1.gpu?).to eq(true) j2 = jobs.last expect(j2.id).to eq("4320602") @@ -316,6 +318,8 @@ def build_script(opts = {}) expect(j2.status).to eq("queued") expect(j2.status).to eq(OodCore::Job::Status.new(state: :queued)) expect(j2.status.to_s).to eq("queued") + expect(j2.gpus).to eq(0) + expect(j2.gpu?).to eq(false) end end @@ -1195,4 +1199,30 @@ def job_info(opts = {}) end end end + + describe "#gpus_from_gres" do + batch = OodCore::Job::Adapters::Slurm::Batch.new(cluster: "owens.osc.edu", conf: "/etc/slurm/conf/", bin: nil, bin_overrides: {}, submit_host: "owens.osc.edu", strict_host_checking: false) + adapter = OodCore::Job::Adapters::Slurm.new(slurm: batch) + + context "when called" do + gres_cases = [ + [nil, 0], + ["", 0], + ["N/A", 0], + ["gres:gpu:v100-32g:2", 2], + ["gres:gpu:v100-32g:2,gres:pfsdir:1", 2], + ["gres:third-thing:sub-thing:17,gres:gpu:v100-32g:2,gres:pfsdir:1", 2], + ["gres:third-thing:sub-thing:17,gres:pfsdir:1,gres:gpu:v100-32g:2", 2], + 
describe "#gpus" do
  # Exercise the gpus reader itself. The earlier version built the object
  # with native: "gpus" and asserted on #native, so #gpus was never checked.
  context "when gpus is given" do
    subject { build_info(gpus: 2).gpus }

    it { is_expected.to eq(2) }
  end

  # The initializer declares gpus: 0 as its default.
  context "when gpus is omitted" do
    subject { build_info.gpus }

    it { is_expected.to eq(0) }
  end
end