Additional unittests for InstrumentUtils()

biocore · Feb 14, 2024 · 1bb127c · 1bb127c
1 parent c62b917
commit 1bb127c
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 47 deletions.
diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py
@@ -83,46 +83,14 @@ def get_date(run_directory):
         for format in formats:
             try:
                 date = datetime.strptime(date_string, format)
-                return date.date()
+                return str(date.date())
             except ValueError:
                 # assume ValueErrors are due to incorrect format, rather than
                 # incorrect value from the XML file.
                 pass
 
         raise ValueError(f"'{date_string}' could not be parsed")
 
-    @staticmethod
-    def get_flow_cell_mode(run_dir):
-        # TODO: What is currently returned is the most descriptive information
-        #  found in runParameters.xml or elsewhere regarding flowcell mode.
-        # In some cases, this information may need to be mapped according to
-        # a dictionary of known flowcells used by IGM and their types to get
-        # the actual mode of flowcell.
-        type = InstrumentUtils.get_instrument_type(run_dir)
-
-        run_params_path = join(run_dir, 'RunParameters.xml')
-
-        if not exists(run_params_path):
-            raise ValueError(f"'{run_params_path}' doesn't exist")
-
-        # adjust the search path through RunParameters.xml based on the
-        # schemas observed w/each instrument type.
-        search_strings = {'MiSeq': 'FlowcellRFIDTag',
-                          'HiSeq 4000': 'Setup/Flowcell',
-                          'iSeq': 'FlowcellEEPROMTag',
-                          'NovaSeq X Plus': 'FlowCellType',
-                          'HiSeq 2500': 'Setup/Flowcell',
-                          'NovaSeq 6000': 'RfidsInfo/FlowCellMode',
-                          'RapidRun': 'Setup/Flowcell'}
-
-        with open(run_params_path) as f:
-            value = ET.fromstring(f.read()).find(search_strings[type])
-            if value is not None:
-                return value.text
-
-        raise ValueError("Flowcell information could not be found in "
-                         f"'{run_params_path}'")
-
 
 class Pipeline:
     sif_header = ['sample_name', 'collection_timestamp', 'elevation', 'empo_1',
@@ -273,7 +241,7 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path,
     def _configure_profile(self):
         # extract the instrument type from self.run_dir and the assay type
         # from self.sample_sheet (or self.mapping_file).
-        instr_type = InstrumentUtils.get_instrument_type(self.run_id)
+        instr_type = InstrumentUtils.get_instrument_type(self.run_dir)
 
         if isinstance(self.sample_sheet, str):
             # if self.sample_sheet is a file instead of a KLSampleSheet()
@@ -375,13 +343,13 @@ def _configure_profile(self):
         if base_profile is None:
             raise ValueError("a 'default' profile was not found")
 
-        # overwrite the configuration values in the base-profile with those
-        # in the matching profile as appropriate.
-        for attribute in selected_profile['profile']['configuration']:
-            value = selected_profile['profile']['configuration'][attribute]
-            base_profile['profile']['configuration'][attribute] = value
-
         if selected_profile:
+            # overwrite the configuration values in the base-profile with those
+            # in the matching profile as appropriate.
+            for attribute in selected_profile['profile']['configuration']:
+                value = selected_profile['profile']['configuration'][attribute]
+                base_profile['profile']['configuration'][attribute] = value
+
             # overwrite default info w/selected profile (if one was found)
             # so that complete profile can be written to working directory
             # as a log.

diff --git a/sequence_processing_pipeline/tests/data/configuration_profiles/miseq_metagenomic.json b/sequence_processing_pipeline/tests/data/configuration_profiles/miseq_metagenomic.json
@@ -0,0 +1,60 @@
+{
+  "profile": {
+    "instrument_type": "MiSeq",
+    "assay_type": "Metagenomic",
+    "configuration": {
+      "bcl2fastq": {
+        "nodes": 2,
+        "nprocs": 62,
+        "queue": "qiita",
+        "wallclock_time_in_minutes": 1022,
+        "modules_to_load": [
+          "bcl2fastq_2.20.0.222"
+        ],
+        "executable_path": "bcl2fastq",
+        "per_process_memory_limit": "100gb"
+      },
+      "nu-qc": {
+        "nodes": 2,
+        "cpus_per_task": 32,
+        "queue": "qiita",
+        "wallclock_time_in_minutes": 2028,
+        "minimap2_databases": "/scratch/databases/minimap2",
+        "modules_to_load": [
+          "fastp_0.20.1",
+          "samtools_1.12",
+          "minimap2_2.18"
+        ],
+        "fastp_executable_path": "fastp",
+        "minimap2_executable_path": "minimap2",
+        "samtools_executable_path": "samtools",
+        "job_total_memory_limit": "20",
+        "job_max_array_length": 1000,
+        "known_adapters_path": "fastp_known_adapters_formatted.fna",
+        "bucket_size": 8,
+        "length_limit": 100,
+        "cores_per_task": 2
+      },
+      "seqpro": {
+        "seqpro_path": "seqpro",
+        "modules_to_load": []
+      },
+      "fastqc": {
+        "nodes": 2,
+        "nprocs": 62,
+        "queue": "qiita",
+        "nthreads": 62,
+        "wallclock_time_in_minutes": 220,
+        "modules_to_load": [
+          "fastqc_0.11.5"
+        ],
+        "fastqc_executable_path": "fastqc",
+        "multiqc_executable_path": "multiqc",
+        "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml",
+        "job_total_memory_limit": "20gb",
+        "job_pool_size": 120,
+        "job_max_array_length": 2000
+      }
+    }
+  }
+}
diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py
@@ -1,9 +1,9 @@
 import json
 import os
 from sequence_processing_pipeline.PipelineError import PipelineError
-from sequence_processing_pipeline.Pipeline import Pipeline
+from sequence_processing_pipeline.Pipeline import Pipeline, InstrumentUtils
 import unittest
-from os import makedirs
+from os import makedirs, walk
 from os.path import abspath, basename, join
 from functools import partial
 import re
@@ -1436,7 +1436,7 @@ def test_configuration_profiles(self):
         obs = pipeline.config_profile['profile']
 
         # assert a profile matching self.good_sample_sheet_path was found.
-        self.assertEqual(obs['instrument_type'], "NovaSeq 6000")
+        self.assertEqual(obs['instrument_type'], "MiSeq")
         self.assertEqual(obs['assay_type'], "Metagenomic")
 
         obs = obs['configuration']
@@ -1450,10 +1450,10 @@ def test_configuration_profiles(self):
 
         # assert the values set by the miseq/metagenomic profile are found
         # in the final configuration as well.
-        self.assertEqual(obs['bcl2fastq']['nodes'], 4)
-        self.assertEqual(obs['bcl2fastq']['nprocs'], 64)
-        self.assertEqual(obs['nu-qc']['nodes'], 4)
-        self.assertEqual(obs['nu-qc']['wallclock_time_in_minutes'], 2048)
+        self.assertEqual(obs['bcl2fastq']['nodes'], 2)
+        self.assertEqual(obs['bcl2fastq']['nprocs'], 62)
+        self.assertEqual(obs['nu-qc']['nodes'], 2)
+        self.assertEqual(obs['nu-qc']['wallclock_time_in_minutes'], 2028)
         self.assertEqual(obs['nu-qc']['cpus_per_task'], 32)
 
     def test_parse_project_name(self):
@@ -2198,6 +2198,57 @@ def test_process_run_info_file(self):
         # called by Pipeline's constructor.
 
 
+class TestInstrumentUtils(unittest.TestCase):
+    """Check InstrumentUtils against the sample run-directory fixtures."""
+
+    def setUp(self):
+        # fixture paths are resolved relative to the working directory.
+        package_root = abspath('./sequence_processing_pipeline')
+        self.path = partial(join, package_root, 'tests', 'data')
+
+    def test_instrument_utils(self):
+        # compare the id, type and date reported for each run-directory.
+        iutils = InstrumentUtils()
+
+        # run-directory name -> (instrument id, instrument type, run date)
+        exp = {
+            '231108_M04586_0992_000000000-L7342':
+                ('M04586', 'MiSeq', '2023-11-08'),
+            '200320_K00180_0957_AHCYKKBBXY_PE150_Knight':
+                ('K00180', 'HiSeq 4000', '2020-03-20'),
+            '20220912_FS10001773_27_BSE39218-1017':
+                ('FS10001773', 'iSeq', '2022-09-12'),
+            # NOTE(review): date is one day after the run-id prefix; confirm.
+            '231215_LH00444_0031_B222WHFLT4':
+                ('LH00444', 'NovaSeq X Plus', '2023-12-16'),
+            '190809_D00611_0709_AH3CKJBCX3_RKL0040_Feist_36-39_2':
+                ('D00611', 'HiSeq 2500', '2019-08-09'),
+            '231215_A01535_0435_BH23F5DSXC':
+                ('A01535', 'NovaSeq 6000', '2023-12-15'),
+            '150629_SN1001_0511_AH5L7GBCXX':
+                ('SN1001', 'RapidRun', '2015-06-29')}
+
+        # only the first level of the fixture directory is needed; walk()
+        # yields nothing at all if the directory is missing.
+        root, run_ids, _ = next(walk(self.path('sample_run_directories')),
+                                (None, [], []))
+
+        # without this check, a missing or empty fixture directory would
+        # let the test pass without asserting anything.
+        self.assertTrue(run_ids, 'no sample run-directories were found')
+
+        for run_id in run_ids:
+            with self.subTest(run_id=run_id):
+                # report an unexpected directory as a failure, not KeyError.
+                self.assertIn(run_id, exp)
+                run_dir = join(root, run_id)
+                exp_id, exp_type, exp_date = exp[run_id]
+                self.assertEqual(iutils.get_instrument_id(run_dir), exp_id)
+                self.assertEqual(iutils.get_instrument_type(run_dir),
+                                 exp_type)
+                self.assertEqual(iutils.get_date(run_dir), exp_date)
+
+
 good_dummy_sheet1 = [
     "[Header],,,,,,", "IEMFileVersion,4,,,,,", "Date,10/27/22,,,,,",
     "Workflow,GenerateFASTQ,,,,,", "Application,FASTQ Only,,,,,",