From c42a86b9417cf2f1072df5c87a55d57023bc4fc4 Mon Sep 17 00:00:00 2001
From: Chris Sewell <chrisj_sewell@hotmail.com>
Date: Wed, 11 Nov 2020 16:35:16 +0100
Subject: [PATCH] `SlurmScheduler`: fix bug in validation of job resources
 (#4555)

The `SlurmJobResource` resource class used by the `SlurmScheduler`
plugin contained a bug in the `validate_resources` methods that would
cause a float value to be set for the `num_cores_per_mpiproc` field in
certain cases. This would cause the submit script to fail because SLURM
only accepts integers for the corresponding `--ncpus-per-task` flag.

The reason is that the code was incorrectly using `isinstance(_, int)`
to check that the divison of `num_cores_per_machine` over
`num_mpiprocs_per_machine` is an integer. In addition to the negation
missing in the conditional, this is not the correct way of checking
whether a division is an integer. Instead it should check that the value
is identical after it is cast to `int`.
---
 aiida/schedulers/plugins/slurm.py |   5 +-
 tests/schedulers/test_slurm.py    | 209 ++++++++++++++++--------------
 2 files changed, 117 insertions(+), 97 deletions(-)

diff --git a/aiida/schedulers/plugins/slurm.py b/aiida/schedulers/plugins/slurm.py
index 6263d6512f..51f04305bb 100644
--- a/aiida/schedulers/plugins/slurm.py
+++ b/aiida/schedulers/plugins/slurm.py
@@ -119,6 +119,7 @@ def validate_resources(cls, **kwargs):
         """
         resources = super().validate_resources(**kwargs)
 
+        # In this plugin we never used num_cores_per_machine so if it is not defined it is OK.
         if resources.num_cores_per_machine is not None and resources.num_cores_per_mpiproc is not None:
             if resources.num_cores_per_machine != resources.num_cores_per_mpiproc * resources.num_mpiprocs_per_machine:
                 raise ValueError(
@@ -130,13 +131,13 @@ def validate_resources(cls, **kwargs):
             if resources.num_cores_per_machine < 1:
                 raise ValueError('num_cores_per_machine must be greater than or equal to one.')
 
-            # In this plugin we never used num_cores_per_machine so if it is not defined it is OK.
             resources.num_cores_per_mpiproc = (resources.num_cores_per_machine / resources.num_mpiprocs_per_machine)
-            if isinstance(resources.num_cores_per_mpiproc, int):
+            if int(resources.num_cores_per_mpiproc) != resources.num_cores_per_mpiproc:
                 raise ValueError(
                     '`num_cores_per_machine` must be equal to `num_cores_per_mpiproc * num_mpiprocs_per_machine` and in'
                     ' particular it should be a multiple of `num_cores_per_mpiproc` and/or `num_mpiprocs_per_machine`'
                 )
+            resources.num_cores_per_mpiproc = int(resources.num_cores_per_mpiproc)
 
         return resources
 
diff --git a/tests/schedulers/test_slurm.py b/tests/schedulers/test_slurm.py
index 9c3b0eeebe..b839273667 100644
--- a/tests/schedulers/test_slurm.py
+++ b/tests/schedulers/test_slurm.py
@@ -8,6 +8,7 @@
 # For further information please visit http://www.aiida.net               #
 ###########################################################################
 """Tests for the SLURM scheduler plugin."""
+#pylint: disable=no-self-use,line-too-long
 import unittest
 import logging
 import uuid
@@ -15,10 +16,9 @@
 
 import pytest
 
-from aiida.schedulers.plugins.slurm import SlurmScheduler, JobState
+from aiida.schedulers.plugins.slurm import SlurmJobResource, SlurmScheduler, JobState
 from aiida.schedulers import SchedulerError
 
-# pylint: disable=line-too-long
 # job_id, state_raw, annotation, executing_host, username, number_nodes, number_cpus, allocated_machines, partition, time_limit, time_used, dispatch_time, job_name, submission_time
 # See SlurmScheduler.fields
 TEXT_SQUEUE_TO_TEST = """862540^^^PD^^^Dependency^^^n/a^^^user1^^^20^^^640^^^(Dependency)^^^normal^^^1-00:00:00^^^0:00^^^N/A^^^longsqw_L24_q_10_0^^^2013-05-22T01:41:11
@@ -37,9 +37,41 @@
 JOBS_RUNNING = ['862538', '861352', '863553', '863554']
 
 
+def test_resource_validation():
+    """Tests to verify that resources are correctly validated."""
+    with pytest.raises(
+        ValueError,
+        match='At least two among `num_machines`, `num_mpiprocs_per_machine` or `tot_num_mpiprocs` must be specified.'
+    ):
+        SlurmJobResource()
+
+    output = SlurmJobResource(num_machines=1, num_mpiprocs_per_machine=2)
+    assert output == {
+        'num_machines': 1,
+        'num_mpiprocs_per_machine': 2,
+        'num_cores_per_machine': None,
+        'num_cores_per_mpiproc': None,
+        'tot_num_mpiprocs': 2
+    }
+
+    with pytest.raises(
+        ValueError, match='`tot_num_mpiprocs` is not equal to `num_mpiprocs_per_machine \\* num_machines`.'
+    ):
+        SlurmJobResource(num_cores_per_machine=1, tot_num_mpiprocs=1, num_mpiprocs_per_machine=2)
+
+    with pytest.raises(ValueError, match='num_cores_per_machine must be greater than or equal to one.'):
+        SlurmJobResource(num_machines=1, tot_num_mpiprocs=1, num_cores_per_machine=0)
+
+    with pytest.raises(
+        ValueError,
+        match='num_cores_per_machine` must be equal to `num_cores_per_mpiproc \\* num_mpiprocs_per_machine`'
+    ):
+        SlurmJobResource(num_machines=1, tot_num_mpiprocs=4, num_cores_per_machine=3)
+
+
 class TestParserSqueue(unittest.TestCase):
     """
-    Tests to verify if teh function _parse_joblist_output behave correctly
+    Tests to verify if the function _parse_joblist_output behave correctly
     The tests is done parsing a string defined above, to be used offline
     """
 
@@ -58,37 +90,37 @@ def test_parse_common_joblist_output(self):
 
         # The parameters are hard coded in the text to parse
         job_parsed = len(job_list)
-        self.assertEqual(job_parsed, JOBS_ON_CLUSTER)
+        assert job_parsed == JOBS_ON_CLUSTER
 
         job_running_parsed = len([j for j in job_list if j.job_state \
                                   and j.job_state == JobState.RUNNING])
-        self.assertEqual(len(JOBS_RUNNING), job_running_parsed)
+        assert len(JOBS_RUNNING) == job_running_parsed
 
         job_held_parsed = len([j for j in job_list if j.job_state and j.job_state == JobState.QUEUED_HELD])
-        self.assertEqual(JOBS_HELD, job_held_parsed)
+        assert JOBS_HELD == job_held_parsed
 
         job_queued_parsed = len([j for j in job_list if j.job_state and j.job_state == JobState.QUEUED])
-        self.assertEqual(JOBS_QUEUED, job_queued_parsed)
+        assert JOBS_QUEUED == job_queued_parsed
 
         parsed_running_users = [j.job_owner for j in job_list if j.job_state and j.job_state == JobState.RUNNING]
-        self.assertEqual(set(USERS_RUNNING), set(parsed_running_users))
+        assert set(USERS_RUNNING) == set(parsed_running_users)
 
         parsed_running_jobs = [j.job_id for j in job_list if j.job_state and j.job_state == JobState.RUNNING]
-        self.assertEqual(set(JOBS_RUNNING), set(parsed_running_jobs))
+        assert set(JOBS_RUNNING) == set(parsed_running_jobs)
 
-        self.assertEqual(job_dict['863553'].requested_wallclock_time_seconds, 30 * 60)  # pylint: disable=invalid-name
-        self.assertEqual(job_dict['863553'].wallclock_time_seconds, 29 * 60 + 29)
-        self.assertEqual(job_dict['863553'].dispatch_time, datetime.datetime(2013, 5, 23, 11, 44, 11))
-        self.assertEqual(job_dict['863553'].submission_time, datetime.datetime(2013, 5, 23, 10, 42, 11))
+        assert job_dict['863553'].requested_wallclock_time_seconds, 30 * 60  # pylint: disable=invalid-name
+        assert job_dict['863553'].wallclock_time_seconds, 29 * 60 + 29
+        assert job_dict['863553'].dispatch_time, datetime.datetime(2013, 5, 23, 11, 44, 11)
+        assert job_dict['863553'].submission_time, datetime.datetime(2013, 5, 23, 10, 42, 11)
 
-        self.assertEqual(job_dict['863100'].annotation, 'Resources')
-        self.assertEqual(job_dict['863100'].num_machines, 32)
-        self.assertEqual(job_dict['863100'].num_mpiprocs, 1024)
-        self.assertEqual(job_dict['863100'].queue_name, 'normal')
+        assert job_dict['863100'].annotation == 'Resources'
+        assert job_dict['863100'].num_machines == 32
+        assert job_dict['863100'].num_mpiprocs == 1024
+        assert job_dict['863100'].queue_name == 'normal'
 
-        self.assertEqual(job_dict['861352'].title, 'Pressure_PBEsol_0')
+        assert job_dict['861352'].title == 'Pressure_PBEsol_0'
 
-        self.assertEqual(job_dict['863554'].requested_wallclock_time_seconds, None)  # pylint: disable=invalid-name
+        assert job_dict['863554'].requested_wallclock_time_seconds is None  # pylint: disable=invalid-name
 
         # allocated_machines is not implemented in this version of the plugin
         #        for j in job_list:
@@ -108,68 +140,55 @@ def test_parse_failed_squeue_output(self):
         scheduler = SlurmScheduler()
 
         # non-zero return value should raise
-        with self.assertRaises(SchedulerError):
-            _ = scheduler._parse_joblist_output(1, TEXT_SQUEUE_TO_TEST, '')  # pylint: disable=protected-access
+        with pytest.raises(SchedulerError, match='squeue returned exit code 1'):
+            scheduler._parse_joblist_output(1, TEXT_SQUEUE_TO_TEST, '')  # pylint: disable=protected-access
 
         # non-empty stderr should be logged
-        with self.assertLogs(scheduler.logger, 'WARNING'):
-            _ = scheduler._parse_joblist_output(0, TEXT_SQUEUE_TO_TEST, 'error message')  # pylint: disable=protected-access
-
-
-class TestTimes(unittest.TestCase):
-    """Test time parsing of SLURM scheduler plugin."""
-
-    def test_time_conversion(self):
-        """
-        Test conversion of (relative) times.
-
-        From docs, acceptable time formats include
-        "minutes", "minutes:seconds", "hours:minutes:seconds",
-        "days-hours", "days-hours:minutes" and "days-hours:minutes:seconds".
-        """
-        # pylint: disable=protected-access
-        scheduler = SlurmScheduler()
-        self.assertEqual(scheduler._convert_time('2'), 2 * 60)
-        self.assertEqual(scheduler._convert_time('02'), 2 * 60)
-
-        self.assertEqual(scheduler._convert_time('02:3'), 2 * 60 + 3)
-        self.assertEqual(scheduler._convert_time('02:03'), 2 * 60 + 3)
-
-        self.assertEqual(scheduler._convert_time('1:02:03'), 3600 + 2 * 60 + 3)
-        self.assertEqual(scheduler._convert_time('01:02:03'), 3600 + 2 * 60 + 3)
-
-        self.assertEqual(scheduler._convert_time('1-3'), 86400 + 3 * 3600)
-        self.assertEqual(scheduler._convert_time('01-3'), 86400 + 3 * 3600)
-        self.assertEqual(scheduler._convert_time('01-03'), 86400 + 3 * 3600)
+        with self.assertLogs(scheduler.logger, logging.WARNING):
+            scheduler._parse_joblist_output(0, TEXT_SQUEUE_TO_TEST, 'error message')  # pylint: disable=protected-access
+
+
+@pytest.mark.parametrize(
+    'value,expected', [('2', 2 * 60), ('02', 2 * 60), ('02:3', 2 * 60 + 3), ('02:03', 2 * 60 + 3),
+                       ('1:02:03', 3600 + 2 * 60 + 3), ('01:02:03', 3600 + 2 * 60 + 3), ('1-3', 86400 + 3 * 3600),
+                       ('01-3', 86400 + 3 * 3600), ('01-03', 86400 + 3 * 3600), ('1-3:5', 86400 + 3 * 3600 + 5 * 60),
+                       ('01-3:05', 86400 + 3 * 3600 + 5 * 60), ('01-03:05', 86400 + 3 * 3600 + 5 * 60),
+                       ('1-3:5:7', 86400 + 3 * 3600 + 5 * 60 + 7), ('01-3:05:7', 86400 + 3 * 3600 + 5 * 60 + 7),
+                       ('01-03:05:07', 86400 + 3 * 3600 + 5 * 60 + 7), ('UNLIMITED', 2**31 - 1), ('NOT_SET', None)]
+)
+def test_time_conversion(value, expected):
+    """
+    Test conversion of (relative) times.
 
-        self.assertEqual(scheduler._convert_time('1-3:5'), 86400 + 3 * 3600 + 5 * 60)
-        self.assertEqual(scheduler._convert_time('01-3:05'), 86400 + 3 * 3600 + 5 * 60)
-        self.assertEqual(scheduler._convert_time('01-03:05'), 86400 + 3 * 3600 + 5 * 60)
+    From docs, acceptable time formats include
+    "minutes", "minutes:seconds", "hours:minutes:seconds",
+    "days-hours", "days-hours:minutes" and "days-hours:minutes:seconds".
+    """
+    # pylint: disable=protected-access
+    scheduler = SlurmScheduler()
+    assert scheduler._convert_time(value) == expected
 
-        self.assertEqual(scheduler._convert_time('1-3:5:7'), 86400 + 3 * 3600 + 5 * 60 + 7)
-        self.assertEqual(scheduler._convert_time('01-3:05:7'), 86400 + 3 * 3600 + 5 * 60 + 7)
-        self.assertEqual(scheduler._convert_time('01-03:05:07'), 86400 + 3 * 3600 + 5 * 60 + 7)
 
-        self.assertEqual(scheduler._convert_time('UNLIMITED'), 2**31 - 1)
-        self.assertEqual(scheduler._convert_time('NOT_SET'), None)
+def test_time_conversion_errors(caplog):
+    """Test conversion of (relative) times for bad inputs."""
+    # pylint: disable=protected-access
+    scheduler = SlurmScheduler()
 
-        # Disable logging to avoid excessive output during test
-        logging.disable(logging.ERROR)
-        with self.assertRaises(ValueError):
+    # Disable logging to avoid excessive output during test
+    with caplog.at_level(logging.CRITICAL):
+        with pytest.raises(ValueError, match='Unrecognized format for time string.'):
             # Empty string not valid
             scheduler._convert_time('')
-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError, match='Unrecognized format for time string.'):
             # there should be something after the dash
             scheduler._convert_time('1-')
-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError, match='Unrecognized format for time string.'):
             # there should be something after the dash
             # there cannot be a dash after the colons
             scheduler._convert_time('1:2-3')
-        # Reset logging level
-        logging.disable(logging.NOTSET)
 
 
-class TestSubmitScript(unittest.TestCase):
+class TestSubmitScript:
     """Test submit script generation by SLURM scheduler plugin."""
 
     def test_submit_script(self):
@@ -194,13 +213,13 @@ def test_submit_script(self):
 
         submit_script_text = scheduler.get_submit_script(job_tmpl)
 
-        self.assertTrue(submit_script_text.startswith('#!/bin/bash'))
+        assert submit_script_text.startswith('#!/bin/bash')
 
-        self.assertTrue('#SBATCH --no-requeue' in submit_script_text)
-        self.assertTrue('#SBATCH --time=1-00:00:00' in submit_script_text)
-        self.assertTrue('#SBATCH --nodes=1' in submit_script_text)
+        assert '#SBATCH --no-requeue' in submit_script_text
+        assert '#SBATCH --time=1-00:00:00' in submit_script_text
+        assert '#SBATCH --nodes=1' in submit_script_text
 
-        self.assertTrue("'mpirun' '-np' '23' 'pw.x' '-npool' '1' < 'aiida.in'" in submit_script_text)
+        assert "'mpirun' '-np' '23' 'pw.x' '-npool' '1' < 'aiida.in'" in submit_script_text
 
     def test_submit_script_bad_shebang(self):
         """Test that first line of submit script is as expected."""
@@ -225,7 +244,7 @@ def test_submit_script_bad_shebang(self):
             submit_script_text = scheduler.get_submit_script(job_tmpl)
 
             # This tests if the implementation correctly chooses the default:
-            self.assertEqual(submit_script_text.split('\n')[0], expected_first_line)
+            assert submit_script_text.split('\n')[0] == expected_first_line
 
     def test_submit_script_with_num_cores_per_machine(self):  # pylint: disable=invalid-name
         """
@@ -252,13 +271,13 @@ def test_submit_script_with_num_cores_per_machine(self):  # pylint: disable=inva
 
         submit_script_text = scheduler.get_submit_script(job_tmpl)
 
-        self.assertTrue('#SBATCH --no-requeue' in submit_script_text)
-        self.assertTrue('#SBATCH --time=1-00:00:00' in submit_script_text)
-        self.assertTrue('#SBATCH --nodes=1' in submit_script_text)
-        self.assertTrue('#SBATCH --ntasks-per-node=2' in submit_script_text)
-        self.assertTrue('#SBATCH --cpus-per-task=12' in submit_script_text)
+        assert '#SBATCH --no-requeue' in submit_script_text
+        assert '#SBATCH --time=1-00:00:00' in submit_script_text
+        assert '#SBATCH --nodes=1' in submit_script_text
+        assert '#SBATCH --ntasks-per-node=2' in submit_script_text
+        assert '#SBATCH --cpus-per-task=12' in submit_script_text
 
-        self.assertTrue("'mpirun' '-np' '23' 'pw.x' '-npool' '1' < 'aiida.in'" in submit_script_text)
+        assert "'mpirun' '-np' '23' 'pw.x' '-npool' '1' < 'aiida.in'" in submit_script_text
 
     def test_submit_script_with_num_cores_per_mpiproc(self):  # pylint: disable=invalid-name
         """
@@ -284,13 +303,13 @@ def test_submit_script_with_num_cores_per_mpiproc(self):  # pylint: disable=inva
 
         submit_script_text = scheduler.get_submit_script(job_tmpl)
 
-        self.assertTrue('#SBATCH --no-requeue' in submit_script_text)
-        self.assertTrue('#SBATCH --time=1-00:00:00' in submit_script_text)
-        self.assertTrue('#SBATCH --nodes=1' in submit_script_text)
-        self.assertTrue('#SBATCH --ntasks-per-node=1' in submit_script_text)
-        self.assertTrue('#SBATCH --cpus-per-task=24' in submit_script_text)
+        assert '#SBATCH --no-requeue' in submit_script_text
+        assert '#SBATCH --time=1-00:00:00' in submit_script_text
+        assert '#SBATCH --nodes=1' in submit_script_text
+        assert '#SBATCH --ntasks-per-node=1' in submit_script_text
+        assert '#SBATCH --cpus-per-task=24' in submit_script_text
 
-        self.assertTrue("'mpirun' '-np' '23' 'pw.x' '-npool' '1' < 'aiida.in'" in submit_script_text)
+        assert "'mpirun' '-np' '23' 'pw.x' '-npool' '1' < 'aiida.in'" in submit_script_text
 
     def test_submit_script_with_num_cores_per_machine_and_mpiproc1(self):  # pylint: disable=invalid-name
         """
@@ -319,13 +338,13 @@ def test_submit_script_with_num_cores_per_machine_and_mpiproc1(self):  # pylint:
 
         submit_script_text = scheduler.get_submit_script(job_tmpl)
 
-        self.assertTrue('#SBATCH --no-requeue' in submit_script_text)
-        self.assertTrue('#SBATCH --time=1-00:00:00' in submit_script_text)
-        self.assertTrue('#SBATCH --nodes=1' in submit_script_text)
-        self.assertTrue('#SBATCH --ntasks-per-node=1' in submit_script_text)
-        self.assertTrue('#SBATCH --cpus-per-task=24' in submit_script_text)
+        assert '#SBATCH --no-requeue' in submit_script_text
+        assert '#SBATCH --time=1-00:00:00' in submit_script_text
+        assert '#SBATCH --nodes=1' in submit_script_text
+        assert '#SBATCH --ntasks-per-node=1' in submit_script_text
+        assert '#SBATCH --cpus-per-task=24' in submit_script_text
 
-        self.assertTrue("'mpirun' '-np' '23' 'pw.x' '-npool' '1' < 'aiida.in'" in submit_script_text)
+        assert "'mpirun' '-np' '23' 'pw.x' '-npool' '1' < 'aiida.in'" in submit_script_text
 
     def test_submit_script_with_num_cores_per_machine_and_mpiproc2(self):  # pylint: disable=invalid-name
         """
@@ -340,13 +359,13 @@ def test_submit_script_with_num_cores_per_machine_and_mpiproc2(self):  # pylint:
         scheduler = SlurmScheduler()
 
         job_tmpl = JobTemplate()
-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError, match='`num_cores_per_machine` must be equal to'):
             job_tmpl.job_resource = scheduler.create_job_resource(
                 num_machines=1, num_mpiprocs_per_machine=1, num_cores_per_machine=24, num_cores_per_mpiproc=23
             )
 
 
-class TestJoblistCommand(unittest.TestCase):
+class TestJoblistCommand:
     """
     Tests of the issued squeue command.
     """
@@ -356,15 +375,15 @@ def test_joblist_single(self):
         scheduler = SlurmScheduler()
 
         command = scheduler._get_joblist_command(jobs=['123'])  # pylint: disable=protected-access
-        self.assertIn('123,123', command)
+        assert '123,123' in command
 
     def test_joblist_multi(self):
         """Test that asking for multiple jobs does not result in duplications."""
         scheduler = SlurmScheduler()
 
         command = scheduler._get_joblist_command(jobs=['123', '456'])  # pylint: disable=protected-access
-        self.assertIn('123,456', command)
-        self.assertNotIn('456,456', command)
+        assert '123,456' in command
+        assert '456,456' not in command
 
 
 def test_parse_out_of_memory():