Log warnings for likely errors in provenance record

bouweandela · web-flow · commit bb9dcc7b1d34 · 2020-05-18T20:55:05.000+02:00
diff --git a/.circleci/install_triggers b/.circleci/install_triggers
@@ -1,5 +1,6 @@
 ^\.circleci/
 ^environment\.yml$
 ^[A-Za-z_]*meta\.yaml$
+^package/
 ^setup\.py$
 ^setup\.cfg$
diff --git a/esmvalcore/_task.py b/esmvalcore/_task.py
@@ -203,6 +203,7 @@ def _ncl_type(value):
 
 class BaseTask:
     """Base class for defining task classes."""
+
     def __init__(self, ancestors=None, name='', products=None):
         """Initialize task."""
         self.ancestors = [] if ancestors is None else ancestors
@@ -265,6 +266,7 @@ class DiagnosticError(Exception):
 
 class DiagnosticTask(BaseTask):
     """Task for running a diagnostic."""
+
     def __init__(self, script, settings, output_dir, ancestors=None, name=''):
         """Create a diagnostic task."""
         super().__init__(ancestors=ancestors, name=name)
@@ -523,8 +525,10 @@ def _collect_provenance(self):
         provenance_file = Path(
             self.settings['run_dir']) / 'diagnostic_provenance.yml'
         if not provenance_file.is_file():
-            logger.warning("No provenance information was written to %s",
-                           provenance_file)
+            logger.warning(
+                "No provenance information was written to %s. Unable to "
+                "record provenance for files created by diagnostic script %s "
+                "in task %s", provenance_file, self.script, self.name)
             return
 
         logger.debug("Collecting provenance from %s", provenance_file)
@@ -554,16 +558,33 @@ def _collect_provenance(self):
             if key not in ignore:
                 attrs[key] = self.settings[key]
 
-        ancestor_products = {p for a in self.ancestors for p in a.products}
+        ancestor_products = {
+            p.filename: p
+            for a in self.ancestors for p in a.products
+        }
 
+        valid = True
         for filename, attributes in table.items():
             # copy to avoid updating other entries if file contains anchors
             attributes = deepcopy(attributes)
             ancestor_files = attributes.pop('ancestors', [])
-            ancestors = {
-                p
-                for p in ancestor_products if p.filename in ancestor_files
-            }
+            if not ancestor_files:
+                logger.warning(
+                    "No ancestor files specified for recording provenance of "
+                    "%s, created by diagnostic script %s in task %s", filename,
+                    self.script, self.name)
+                valid = False
+            ancestors = set()
+            for ancestor_file in ancestor_files:
+                if ancestor_file in ancestor_products:
+                    ancestors.add(ancestor_products[ancestor_file])
+                else:
+                    valid = False
+                    logger.warning(
+                        "Invalid ancestor file %s specified for recording "
+                        "provenance of %s, created by diagnostic script %s "
+                        "in task %s", ancestor_file, filename, self.script,
+                        self.name)
 
             attributes.update(deepcopy(attrs))
             for key in attributes:
@@ -575,9 +596,14 @@ def _collect_provenance(self):
             product.save_provenance()
             _write_citation_files(product.filename, product.provenance)
             self.products.add(product)
+
+        if not valid:
+            logger.warning(
+                "Valid ancestor files for diagnostic script %s in task %s "
+                "are:\n%s", self.script, self.name,
+                '\n'.join(ancestor_products))
         logger.debug("Collecting provenance of task %s took %.1f seconds",
-                     self.name,
-                     time.time() - start)
+                     self.name, time.time() - start)
 
     def __str__(self):
         """Get human readable description."""
diff --git a/setup.py b/setup.py
@@ -54,6 +54,7 @@
         'pytest-flake8',
         'pytest-html!=2.1.0',
         'pytest-metadata>=1.5.1',
+        'pytest-mock',
     ],
     # Development dependencies
     # Use pip install -e .[develop] to install in development mode
diff --git a/tests/unit/task/__init__.py b/tests/unit/task/__init__.py
diff --git a/tests/unit/task/test_diagnostic_task.py b/tests/unit/task/test_diagnostic_task.py
@@ -0,0 +1,254 @@
+import stat
+from pathlib import Path
+
+import pytest
+import yaml
+
+import esmvalcore._task
+
+
+@pytest.mark.parametrize("ext", ['.jl', '.py', '.ncl', '.R'])
+def test_initialize_env(ext, tmp_path, monkeypatch):
+    """Test that the environment variables are set correctly."""
+    monkeypatch.setattr(esmvalcore._task.DiagnosticTask, '_initialize_cmd',
+                        lambda self: None)
+
+    esmvaltool_path = tmp_path / 'esmvaltool'
+    monkeypatch.setattr(esmvalcore._task, 'DIAGNOSTICS_PATH', esmvaltool_path)
+
+    diagnostics_path = esmvaltool_path / 'diag_scripts'
+    diagnostics_path.mkdir(parents=True)
+    script = diagnostics_path / ('test' + ext)
+    script.touch()
+
+    settings = {
+        'run_dir': str(tmp_path / 'run_dir'),
+        'profile_diagnostic': False,
+    }
+    task = esmvalcore._task.DiagnosticTask(
+        script,
+        settings,
+        output_dir=str(tmp_path),
+    )
+
+    # Build the environment expected for this script extension
+    env = {}
+    if ext in ('.jl', '.py'):
+        env['MPLBACKEND'] = 'Agg'
+    if ext == '.jl':
+        env['JULIA_LOAD_PATH'] = f"{esmvaltool_path / 'install' / 'Julia'}:"
+    if ext in ('.ncl', '.R'):
+        env['diag_scripts'] = str(diagnostics_path)
+
+    assert task.env == env
+
+
+CMD = {
+    # (ext, profile_diagnostic): expected command, before file arguments
+    ('.py', False): ['python'],
+    ('.py', True): ['python', '-m', 'vmprof', '--lines', '-o'],
+    ('.ncl', False): ['ncl', '-n', '-p'],
+    ('.ncl', True): ['ncl', '-n', '-p'],
+    ('.R', False): ['Rscript'],
+    ('.R', True): ['Rscript'],
+    ('.jl', False): ['julia'],
+    ('.jl', True): ['julia'],
+    ('', False): [],
+    ('', True): [],
+}
+
+
+@pytest.mark.parametrize("ext_profile,cmd", CMD.items())
+def test_initialize_cmd(ext_profile, cmd, tmp_path, monkeypatch):
+    """Test creating the command to run the diagnostic script."""
+    monkeypatch.setattr(esmvalcore._task.DiagnosticTask, '_initialize_env',
+                        lambda self: None)
+
+    ext, profile = ext_profile
+    script = tmp_path / ('test' + ext)
+    script.touch()
+    if ext == '':
+        # test case where file is executable
+        script.chmod(stat.S_IEXEC)
+
+    run_dir = tmp_path / 'run_dir'
+    settings = {
+        'run_dir': str(run_dir),
+        'profile_diagnostic': profile,
+    }
+
+    monkeypatch.setattr(esmvalcore._task, 'which', lambda x: x)
+
+    task = esmvalcore._task.DiagnosticTask(script,
+                                           settings,
+                                           output_dir=str(tmp_path))
+
+    # Extend a copy: `cmd` is the list object stored in the shared CMD dict.
+    expected = list(cmd)
+    if ext == '.py' and profile:
+        expected.append(str(run_dir / 'profile.bin'))
+    expected.append(str(script))
+    assert task.cmd == expected
+
+
+@pytest.fixture
+def diagnostic_task(mocker, tmp_path):
+    """Create a DiagnosticTask with its collaborators mocked.
+
+    TrackedFile, TAGS, _write_citation_files and the command/environment
+    initialization of DiagnosticTask are patched through ``mocker``.
+    """
+    class TrackedFile(esmvalcore._task.TrackedFile):
+        provenance = None
+
+    task_module = esmvalcore._task
+    mocker.patch.object(task_module, 'TrackedFile', autospec=TrackedFile)
+    mocker.patch.dict(task_module.TAGS, {'plot_type': {'tag': 'tag_value'}})
+    mocker.patch.object(task_module, '_write_citation_files', autospec=True)
+
+    for method in ('_initialize_cmd', '_initialize_env'):
+        mocker.patch.object(task_module.DiagnosticTask, method)
+
+    return task_module.DiagnosticTask(
+        'test.py',
+        {
+            'run_dir': str(tmp_path / 'run_dir'),
+            'profile_diagnostic': False,
+            'some_diagnostic_setting': True,
+        },
+        output_dir=str(tmp_path),
+        name='some-diagnostic-task',
+    )
+
+
+def write_mock_provenance(diagnostic_task, record):
+    """Dump record as YAML into the task's diagnostic_provenance.yml."""
+    out_dir = Path(diagnostic_task.settings['run_dir'])
+    out_dir.mkdir(parents=True)
+    (out_dir / 'diagnostic_provenance.yml').write_text(yaml.safe_dump(record))
+
+
+def test_collect_provenance(mocker, diagnostic_task):
+    """Test that a valid provenance record is turned into a product."""
+    tracked_file_instance = mocker.Mock()
+    tracked_file_class = mocker.patch.object(
+        esmvalcore._task, 'TrackedFile', return_value=tracked_file_instance)
+    write_citation = mocker.patch.object(esmvalcore._task,
+                                         '_write_citation_files')
+
+    record = {
+        "test.png": {
+            "caption": "Some figure",
+            "ancestors": ["xyz.nc"],
+            "plot_type": ["tag"],
+        },
+    }
+
+    write_mock_provenance(diagnostic_task, record)
+
+    ancestor_product = mocker.Mock()
+    ancestor_product.filename = "xyz.nc"  # same name as in the record
+
+    ancestor_task = mocker.Mock()
+    ancestor_task.products = {ancestor_product}
+    diagnostic_task.ancestors = [ancestor_task]
+
+    diagnostic_task.products = mocker.Mock(autospec=set)  # to check add()
+    diagnostic_task._collect_provenance()
+
+    tracked_file_class.assert_called_once_with(
+        "test.png",
+        {
+            "caption": "Some figure",
+            "plot_type": ("tag_value", ),  # via patched TAGS (presumably)
+            "script_file": "test.py",
+            "some_diagnostic_setting": True,
+        },
+        {ancestor_product},  # ancestor products matched by filename
+    )
+    tracked_file_instance.initialize_provenance.assert_called_once_with(
+        diagnostic_task.activity)
+    tracked_file_instance.save_provenance.assert_called_once()
+    write_citation.assert_called_once_with(tracked_file_instance.filename,
+                                           tracked_file_instance.provenance)
+    diagnostic_task.products.add.assert_called_once_with(tracked_file_instance)
+
+
+def assert_warned(log, msgs):
+    """Assert each log record contains all snippets of its expected msg."""
+    records = log.records
+    assert len(records) == len(msgs)
+    for expected, record in zip(msgs, records):
+        assert all(snippet in record.message for snippet in expected)
+
+
+def test_collect_no_provenance(caplog, diagnostic_task):
+    """Test that a missing provenance file is reported with a warning."""
+    diagnostic_task._collect_provenance()
+    assert_warned(caplog, [["No provenance information was written"]])
+
+
+def test_collect_provenance_no_ancestors(caplog, diagnostic_task):
+    """Test the warnings logged when a record lists no ancestors."""
+    record = {
+        "test.png": {
+            "caption": "Some figure",
+        },
+    }
+
+    write_mock_provenance(diagnostic_task, record)
+
+    diagnostic_task._collect_provenance()
+
+    assert_warned(caplog, [
+        ["No ancestor files specified", "test.png"],
+        ["Valid ancestor files"],
+    ])
+
+
+def test_collect_provenance_invalid_ancestors(caplog, diagnostic_task):
+    """Test the warnings logged for an ancestor file unknown to the task."""
+    record = {
+        "test.png": {
+            "caption": "Some figure",
+            "ancestors": ["xyz.nc"],
+        },
+    }
+
+    write_mock_provenance(diagnostic_task, record)
+
+    diagnostic_task._collect_provenance()
+
+    assert_warned(caplog, [
+        ["Invalid ancestor file", "test.png"],
+        ["Valid ancestor files"],
+    ])
+
+
+def test_collect_provenance_ancestor_hint(mocker, caplog, diagnostic_task):
+    """Test that the hint lists valid ancestors after an invalid one."""
+    record = {
+        "test.png": {
+            "caption": "Some figure",
+            "ancestors": ["xyz.nc"],
+        },
+        "test.nc": {
+            "ancestors": ["abc.nc"],
+        },
+    }
+
+    write_mock_provenance(diagnostic_task, record)
+
+    ancestor_product = mocker.Mock()
+    ancestor_product.filename = "xyz.nc"
+
+    ancestor_task = mocker.Mock()
+    ancestor_task.products = {ancestor_product}
+
+    diagnostic_task.ancestors = [ancestor_task]
+    diagnostic_task._collect_provenance()
+
+    assert_warned(caplog, [
+        ["Invalid ancestor file", "abc.nc", "test.nc"],
+        ["Valid ancestor files", "xyz.nc"],
+    ])
diff --git a/tests/unit/test_task.py b/tests/unit/test_task.py