Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support 'UNLIMITED' and 'NOT_SET' slurm times #3586

Merged
merged 4 commits into from
Nov 28, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,6 @@
aiida/schedulers/plugins/test_lsf.py|
aiida/schedulers/plugins/test_pbspro.py|
aiida/schedulers/plugins/test_sge.py|
aiida/schedulers/plugins/test_slurm.py|
aiida/schedulers/plugins/test_torque.py|
aiida/sphinxext/tests/workchain_source/conf.py|
aiida/tools/data/array/kpoints/legacy.py|
Expand Down
7 changes: 7 additions & 0 deletions aiida/schedulers/plugins/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,13 @@ def _convert_time(self, string):
"""
Convert a string in the format DD-HH:MM:SS to a number of seconds.
"""

if string == 'UNLIMITED':
return 2147483647 # == 2**31 - 1, largest 32-bit signed integer (68 years)

if string == 'NOT_SET':
return None

groups = _TIME_REGEXP.match(string)
if groups is None:
self.logger.warning("Unrecognized format for time string '{}'".format(string))
Expand Down
120 changes: 69 additions & 51 deletions aiida/schedulers/plugins/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################

"""Tests for the SLURM scheduler plugin.
"""
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
Expand All @@ -16,16 +17,25 @@
import uuid
import datetime

from aiida.schedulers.plugins.slurm import *
from aiida.schedulers.plugins.slurm import SlurmScheduler, JobState

# pylint: disable=line-too-long
# job_id, state_raw, annotation, executing_host, username, number_nodes, number_cpus, allocated_machines, partition, time_limit, time_used, dispatch_time, job_name, submission_time
# See SlurmScheduler.fields
TEXT_SQUEUE_TO_TEST = """862540^^^PD^^^Dependency^^^n/a^^^user1^^^20^^^640^^^(Dependency)^^^normal^^^1-00:00:00^^^0:00^^^N/A^^^longsqw_L24_q_10_0^^^2013-05-22T01:41:11
863100^^^PD^^^Resources^^^n/a^^^user2^^^32^^^1024^^^(Resources)^^^normal^^^10:00^^^0:00^^^2013-05-23T14:44:44^^^eq_solve_e4.slm^^^2013-05-22T04:23:59
863546^^^PD^^^Priority^^^n/a^^^user3^^^2^^^64^^^(Priority)^^^normal^^^8:00:00^^^0:00^^^2013-05-23T14:44:44^^^S2-H2O^^^2013-05-22T08:08:41
863313^^^PD^^^JobHeldUser^^^n/a^^^user4^^^1^^^1^^^(JobHeldUser)^^^normal^^^1:00:00^^^0:00^^^N/A^^^test^^^2013-05-23T00:28:12
862538^^^R^^^None^^^rosa10^^^user5^^^20^^^640^^^nid0[0099,0156-0157,0162-0163,0772-0773,0826,0964-0965,1018-1019,1152-1153,1214-1217,1344-1345]^^^normal^^^1-00:00:00^^^32:10^^^2013-05-23T11:41:30^^^longsqw_L24_q_11_0^^^2013-05-23T03:04:21
861352^^^R^^^None^^^rosa11^^^user6^^^4^^^128^^^nid00[192,246,264-265]^^^normal^^^1-00:00:00^^^23:30:20^^^2013-05-22T12:43:20^^^Pressure_PBEsol_0^^^2013-05-23T09:35:23
863553^^^R^^^None^^^rosa1^^^user5^^^1^^^32^^^nid00471^^^normal^^^30:00^^^29:29^^^2013-05-23T11:44:11^^^bash^^^2013-05-23T10:42:11
863554^^^R^^^None^^^rosa1^^^user5^^^1^^^32^^^nid00471^^^normal^^^NOT_SET^^^29:29^^^2013-05-23T11:44:11^^^bash^^^2013-05-23T10:42:11
"""
JOBS_ON_CLUSTER = 8
JOBS_HELD = 2
JOBS_QUEUED = 2
USERS_RUNNING = ['user5', 'user6']
JOBS_RUNNING = ['862538', '861352', '863553', '863554']


class TestParserSqueue(unittest.TestCase):
Expand All @@ -44,45 +54,42 @@ def test_parse_common_joblist_output(self):
stdout = TEXT_SQUEUE_TO_TEST
stderr = ''

job_list = scheduler._parse_joblist_output(retval, stdout, stderr)
job_list = scheduler._parse_joblist_output(retval, stdout, stderr) # pylint: disable=protected-access
job_dict = {j.job_id: j for j in job_list}

# The parameters are hard coded in the text to parse
job_on_cluster = 7
job_parsed = len(job_list)
self.assertEquals(job_parsed, job_on_cluster)
self.assertEqual(job_parsed, JOBS_ON_CLUSTER)

job_running = 3
job_running_parsed = len([j for j in job_list if j.job_state \
and j.job_state == JobState.RUNNING])
self.assertEquals(job_running, job_running_parsed)
self.assertEqual(len(JOBS_RUNNING), job_running_parsed)

job_held = 2
job_held_parsed = len([j for j in job_list if j.job_state and j.job_state == JobState.QUEUED_HELD])
self.assertEquals(job_held, job_held_parsed)
self.assertEqual(JOBS_HELD, job_held_parsed)

job_queued = 2
job_queued_parsed = len([j for j in job_list if j.job_state and j.job_state == JobState.QUEUED])
self.assertEquals(job_queued, job_queued_parsed)
self.assertEqual(JOBS_QUEUED, job_queued_parsed)

running_users = ['user5', 'user6']
parsed_running_users = [j.job_owner for j in job_list if j.job_state and j.job_state == JobState.RUNNING]
self.assertEquals(set(running_users), set(parsed_running_users))
self.assertEqual(set(USERS_RUNNING), set(parsed_running_users))

running_jobs = ['862538', '861352', '863553']
parsed_running_jobs = [j.job_id for j in job_list if j.job_state and j.job_state == JobState.RUNNING]
self.assertEquals(set(running_jobs), set(parsed_running_jobs))

self.assertEquals([j.requested_wallclock_time_seconds for j in job_list if j.job_id == '863553'][0], 30 * 60)
self.assertEquals([j.wallclock_time_seconds for j in job_list if j.job_id == '863553'][0], 29 * 60 + 29)
self.assertEquals([j.dispatch_time for j in job_list if j.job_id == '863553'][0],
datetime.datetime(2013, 5, 23, 11, 44, 11))
self.assertEquals([j.submission_time for j in job_list if j.job_id == '863553'][0],
datetime.datetime(2013, 5, 23, 10, 42, 11))
self.assertEquals([j.annotation for j in job_list if j.job_id == '863100'][0], 'Resources')
self.assertEquals([j.num_machines for j in job_list if j.job_id == '863100'][0], 32)
self.assertEquals([j.num_mpiprocs for j in job_list if j.job_id == '863100'][0], 1024)
self.assertEquals([j.queue_name for j in job_list if j.job_id == '863100'][0], 'normal')
self.assertEquals([j.title for j in job_list if j.job_id == '861352'][0], 'Pressure_PBEsol_0')
self.assertEqual(set(JOBS_RUNNING), set(parsed_running_jobs))

self.assertEqual(job_dict['863553'].requested_wallclock_time_seconds, 30 * 60)
self.assertEqual(job_dict['863553'].wallclock_time_seconds, 29 * 60 + 29)
self.assertEqual(job_dict['863553'].dispatch_time, datetime.datetime(2013, 5, 23, 11, 44, 11))
self.assertEqual(job_dict['863553'].submission_time, datetime.datetime(2013, 5, 23, 10, 42, 11))

self.assertEqual(job_dict['863100'].annotation, 'Resources')
self.assertEqual(job_dict['863100'].num_machines, 32)
self.assertEqual(job_dict['863100'].num_mpiprocs, 1024)
self.assertEqual(job_dict['863100'].queue_name, 'normal')

self.assertEqual(job_dict['861352'].title, 'Pressure_PBEsol_0')

self.assertEqual(job_dict['863554'].requested_wallclock_time_seconds, None) # not set

# allocated_machines is not implemented in this version of the plugin
# for j in job_list:
Expand All @@ -98,6 +105,7 @@ def test_parse_common_joblist_output(self):


class TestTimes(unittest.TestCase):
"""Test time parsing of SLURM scheduler plugin."""

def test_time_conversion(self):
"""
Expand All @@ -107,27 +115,31 @@ def test_time_conversion(self):
"minutes", "minutes:seconds", "hours:minutes:seconds",
"days-hours", "days-hours:minutes" and "days-hours:minutes:seconds".
"""
# pylint: disable=protected-access
scheduler = SlurmScheduler()
self.assertEquals(scheduler._convert_time('2'), 2 * 60)
self.assertEquals(scheduler._convert_time('02'), 2 * 60)
self.assertEqual(scheduler._convert_time('2'), 2 * 60)
self.assertEqual(scheduler._convert_time('02'), 2 * 60)

self.assertEqual(scheduler._convert_time('02:3'), 2 * 60 + 3)
self.assertEqual(scheduler._convert_time('02:03'), 2 * 60 + 3)

self.assertEquals(scheduler._convert_time('02:3'), 2 * 60 + 3)
self.assertEquals(scheduler._convert_time('02:03'), 2 * 60 + 3)
self.assertEqual(scheduler._convert_time('1:02:03'), 3600 + 2 * 60 + 3)
self.assertEqual(scheduler._convert_time('01:02:03'), 3600 + 2 * 60 + 3)

self.assertEquals(scheduler._convert_time('1:02:03'), 3600 + 2 * 60 + 3)
self.assertEquals(scheduler._convert_time('01:02:03'), 3600 + 2 * 60 + 3)
self.assertEqual(scheduler._convert_time('1-3'), 86400 + 3 * 3600)
self.assertEqual(scheduler._convert_time('01-3'), 86400 + 3 * 3600)
self.assertEqual(scheduler._convert_time('01-03'), 86400 + 3 * 3600)

self.assertEquals(scheduler._convert_time('1-3'), 86400 + 3 * 3600)
self.assertEquals(scheduler._convert_time('01-3'), 86400 + 3 * 3600)
self.assertEquals(scheduler._convert_time('01-03'), 86400 + 3 * 3600)
self.assertEqual(scheduler._convert_time('1-3:5'), 86400 + 3 * 3600 + 5 * 60)
self.assertEqual(scheduler._convert_time('01-3:05'), 86400 + 3 * 3600 + 5 * 60)
self.assertEqual(scheduler._convert_time('01-03:05'), 86400 + 3 * 3600 + 5 * 60)

self.assertEquals(scheduler._convert_time('1-3:5'), 86400 + 3 * 3600 + 5 * 60)
self.assertEquals(scheduler._convert_time('01-3:05'), 86400 + 3 * 3600 + 5 * 60)
self.assertEquals(scheduler._convert_time('01-03:05'), 86400 + 3 * 3600 + 5 * 60)
self.assertEqual(scheduler._convert_time('1-3:5:7'), 86400 + 3 * 3600 + 5 * 60 + 7)
self.assertEqual(scheduler._convert_time('01-3:05:7'), 86400 + 3 * 3600 + 5 * 60 + 7)
self.assertEqual(scheduler._convert_time('01-03:05:07'), 86400 + 3 * 3600 + 5 * 60 + 7)

self.assertEquals(scheduler._convert_time('1-3:5:7'), 86400 + 3 * 3600 + 5 * 60 + 7)
self.assertEquals(scheduler._convert_time('01-3:05:7'), 86400 + 3 * 3600 + 5 * 60 + 7)
self.assertEquals(scheduler._convert_time('01-03:05:07'), 86400 + 3 * 3600 + 5 * 60 + 7)
self.assertEqual(scheduler._convert_time('UNLIMITED'), 2**31 - 1)
self.assertEqual(scheduler._convert_time('NOT_SET'), None)

# Disable logging to avoid excessive output during test
logging.disable(logging.ERROR)
Expand All @@ -146,6 +158,7 @@ def test_time_conversion(self):


class TestSubmitScript(unittest.TestCase):
"""Test submit script generation by SLURM scheduler plugin."""

def test_submit_script(self):
"""
Expand Down Expand Up @@ -179,6 +192,7 @@ def test_submit_script(self):
" < 'aiida.in'" in submit_script_text)

def test_submit_script_bad_shebang(self):
"""Test that first line of submit script is as expected."""
from aiida.schedulers.datastructures import JobTemplate
from aiida.common.datastructures import CodeInfo, CodeRunMode

Expand All @@ -200,9 +214,9 @@ def test_submit_script_bad_shebang(self):
submit_script_text = scheduler.get_submit_script(job_tmpl)

# This tests if the implementation correctly chooses the default:
self.assertEquals(submit_script_text.split('\n')[0], expected_first_line)
self.assertEqual(submit_script_text.split('\n')[0], expected_first_line)

def test_submit_script_with_num_cores_per_machine(self):
def test_submit_script_with_num_cores_per_machine(self): # pylint: disable=invalid-name
"""
Test to verify if script works fine if we specify only
num_cores_per_machine value.
Expand All @@ -215,7 +229,8 @@ def test_submit_script_with_num_cores_per_machine(self):
job_tmpl = JobTemplate()
job_tmpl.shebang = '#!/bin/bash'
job_tmpl.job_resource = scheduler.create_job_resource(
num_machines=1, num_mpiprocs_per_machine=2, num_cores_per_machine=24)
num_machines=1, num_mpiprocs_per_machine=2, num_cores_per_machine=24
)
job_tmpl.uuid = str(uuid.uuid4())
job_tmpl.max_wallclock_seconds = 24 * 3600
code_info = CodeInfo()
Expand All @@ -235,7 +250,7 @@ def test_submit_script_with_num_cores_per_machine(self):
self.assertTrue("'mpirun' '-np' '23' 'pw.x' '-npool' '1'" + \
" < 'aiida.in'" in submit_script_text)

def test_submit_script_with_num_cores_per_mpiproc(self):
def test_submit_script_with_num_cores_per_mpiproc(self): # pylint: disable=invalid-name
"""
Test to verify if scripts works fine if we pass only num_cores_per_mpiproc value
"""
Expand All @@ -247,7 +262,8 @@ def test_submit_script_with_num_cores_per_mpiproc(self):
job_tmpl = JobTemplate()
job_tmpl.shebang = '#!/bin/bash'
job_tmpl.job_resource = scheduler.create_job_resource(
num_machines=1, num_mpiprocs_per_machine=1, num_cores_per_mpiproc=24)
num_machines=1, num_mpiprocs_per_machine=1, num_cores_per_mpiproc=24
)
job_tmpl.uuid = str(uuid.uuid4())
job_tmpl.max_wallclock_seconds = 24 * 3600
code_info = CodeInfo()
Expand All @@ -267,7 +283,7 @@ def test_submit_script_with_num_cores_per_mpiproc(self):
self.assertTrue("'mpirun' '-np' '23' 'pw.x' '-npool' '1'" + \
" < 'aiida.in'" in submit_script_text)

def test_submit_script_with_num_cores_per_machine_and_mpiproc1(self):
def test_submit_script_with_num_cores_per_machine_and_mpiproc1(self): # pylint: disable=invalid-name
"""
Test to verify if scripts works fine if we pass both
num_cores_per_machine and num_cores_per_mpiproc correct values.
Expand All @@ -282,7 +298,8 @@ def test_submit_script_with_num_cores_per_machine_and_mpiproc1(self):
job_tmpl = JobTemplate()
job_tmpl.shebang = '#!/bin/bash'
job_tmpl.job_resource = scheduler.create_job_resource(
num_machines=1, num_mpiprocs_per_machine=1, num_cores_per_machine=24, num_cores_per_mpiproc=24)
num_machines=1, num_mpiprocs_per_machine=1, num_cores_per_machine=24, num_cores_per_mpiproc=24
)
job_tmpl.uuid = str(uuid.uuid4())
job_tmpl.max_wallclock_seconds = 24 * 3600
code_info = CodeInfo()
Expand All @@ -302,7 +319,7 @@ def test_submit_script_with_num_cores_per_machine_and_mpiproc1(self):
self.assertTrue("'mpirun' '-np' '23' 'pw.x' '-npool' '1'" + \
" < 'aiida.in'" in submit_script_text)

def test_submit_script_with_num_cores_per_machine_and_mpiproc2(self):
def test_submit_script_with_num_cores_per_machine_and_mpiproc2(self): # pylint: disable=invalid-name
"""
Test to verify if scripts works fine if we pass
num_cores_per_machine and num_cores_per_mpiproc wrong values.
Expand All @@ -317,7 +334,8 @@ def test_submit_script_with_num_cores_per_machine_and_mpiproc2(self):
job_tmpl = JobTemplate()
with self.assertRaises(ValueError):
job_tmpl.job_resource = scheduler.create_job_resource(
num_machines=1, num_mpiprocs_per_machine=1, num_cores_per_machine=24, num_cores_per_mpiproc=23)
num_machines=1, num_mpiprocs_per_machine=1, num_cores_per_machine=24, num_cores_per_mpiproc=23
)


if __name__ == '__main__':
Expand Down