diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 039e55f1..5f474bb8 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -83,7 +83,7 @@ def get_date(run_directory): for format in formats: try: date = datetime.strptime(date_string, format) - return date.date() + return str(date.date()) except ValueError: # assume ValueErrors are due to incorrect format, rather than # incorrect value from the XML file. @@ -91,38 +91,6 @@ def get_date(run_directory): raise ValueError(f"'{date_string}' could not be parsed") - @staticmethod - def get_flow_cell_mode(run_dir): - # TODO: What is currently returned is the most descriptive information - # found in runParameters.xml or elsewhere regarding flowcell mode. - # In some cases, this information may need to be mapped according to - # a dictionary of known flowcells used by IGM and their types to get - # the actual mode of flowcell. - type = InstrumentUtils.get_instrument_type(run_dir) - - run_params_path = join(run_dir, 'RunParameters.xml') - - if not exists(run_params_path): - raise ValueError(f"'{run_params_path}' doesn't exist") - - # adjust the search path through RunParameters.xml based on the - # schemas observed w/each instrument type. - search_strings = {'MiSeq': 'FlowcellRFIDTag', - 'HiSeq 4000': 'Setup/Flowcell', - 'iSeq': 'FlowcellEEPROMTag', - 'NovaSeq X Plus': 'FlowCellType', - 'HiSeq 2500': 'Setup/Flowcell', - 'NovaSeq 6000': 'RfidsInfo/FlowCellMode', - 'RapidRun': 'Setup/Flowcell'} - - with open(run_params_path) as f: - value = ET.fromstring(f.read()).find(search_strings[type]) - if value is not None: - return value.text - - raise ValueError("Flowcell information could not be found in " - f"'{run_params_path}'") - class Pipeline: sif_header = ['sample_name', 'collection_timestamp', 'elevation', 'empo_1', @@ -273,7 +241,7 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, def _configure_profile(self): # extract the instrument type from self.run_dir and the assay type # from self.sample_sheet (or self.mapping_file). - instr_type = InstrumentUtils.get_instrument_type(self.run_id) + instr_type = InstrumentUtils.get_instrument_type(self.run_dir) if isinstance(self.sample_sheet, str): # if self.sample_sheet is a file instead of a KLSampleSheet() @@ -375,13 +343,13 @@ def _configure_profile(self): if base_profile is None: raise ValueError("a 'default' profile was not found") - # overwrite the configuration values in the base-profile with those - # in the matching profile as appropriate. - for attribute in selected_profile['profile']['configuration']: - value = selected_profile['profile']['configuration'][attribute] - base_profile['profile']['configuration'][attribute] = value - if selected_profile: + # overwrite the configuration values in the base-profile with those + # in the matching profile as appropriate. + for attribute in selected_profile['profile']['configuration']: + value = selected_profile['profile']['configuration'][attribute] + base_profile['profile']['configuration'][attribute] = value + # overwrite default info w/selected profile (if one was found) # so that complete profile can be written to working directory # as a log. diff --git a/sequence_processing_pipeline/tests/data/configuration_profiles/miseq_metagenomic.json b/sequence_processing_pipeline/tests/data/configuration_profiles/miseq_metagenomic.json new file mode 100644 index 00000000..76073508 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/configuration_profiles/miseq_metagenomic.json @@ -0,0 +1,60 @@ +{ + "profile": { + "instrument_type": "MiSeq", + "assay_type": "Metagenomic", + "configuration": { + "bcl2fastq": { + "nodes": 2, + "nprocs": 62, + "queue": "qiita", + "wallclock_time_in_minutes": 1022, + "modules_to_load": [ + "bcl2fastq_2.20.0.222" + ], + "executable_path": "bcl2fastq", + "per_process_memory_limit": "100gb" + }, + "nu-qc": { + "nodes": 2, + "cpus_per_task": 32, + "queue": "qiita", + "wallclock_time_in_minutes": 2028, + "minimap2_databases": "/scratch/databases/minimap2", + "modules_to_load": [ + "fastp_0.20.1", + "samtools_1.12", + "minimap2_2.18" + ], + "fastp_executable_path": "fastp", + "minimap2_executable_path": "minimap2", + "samtools_executable_path": "samtools", + "job_total_memory_limit": "20", + "job_max_array_length": 1000, + "known_adapters_path": "fastp_known_adapters_formatted.fna", + "bucket_size": 8, + "length_limit": 100, + "cores_per_task": 2 + }, + "seqpro": { + "seqpro_path": "seqpro", + "modules_to_load": [] + }, + "fastqc": { + "nodes": 2, + "nprocs": 62, + "queue": "qiita", + "nthreads": 62, + "wallclock_time_in_minutes": 220, + "modules_to_load": [ + "fastqc_0.11.5" + ], + "fastqc_executable_path": "fastqc", + "multiqc_executable_path": "multiqc", + "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml", + "job_total_memory_limit": "20gb", + "job_pool_size": 120, + "job_max_array_length": 2000 + } + } + } +} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index 90be761f..44b622bd 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -1,9 +1,9 @@ import json import os from sequence_processing_pipeline.PipelineError import PipelineError -from sequence_processing_pipeline.Pipeline import Pipeline +from sequence_processing_pipeline.Pipeline import Pipeline, InstrumentUtils import unittest -from os import makedirs +from os import makedirs, walk from os.path import abspath, basename, join from functools import partial import re @@ -1436,7 +1436,7 @@ def test_configuration_profiles(self): obs = pipeline.config_profile['profile'] # assert a profile matching self.good_sample_sheet_path was found. - self.assertEqual(obs['instrument_type'], "NovaSeq 6000") + self.assertEqual(obs['instrument_type'], "MiSeq") self.assertEqual(obs['assay_type'], "Metagenomic") obs = obs['configuration'] @@ -1450,10 +1450,10 @@ def test_configuration_profiles(self): # assert increased values over default found in novaseq 6000/ # metagenomic profile are found in the final configuration as well. - self.assertEqual(obs['bcl2fastq']['nodes'], 4) - self.assertEqual(obs['bcl2fastq']['nprocs'], 64) - self.assertEqual(obs['nu-qc']['nodes'], 4) - self.assertEqual(obs['nu-qc']['wallclock_time_in_minutes'], 2048) + self.assertEqual(obs['bcl2fastq']['nodes'], 2) + self.assertEqual(obs['bcl2fastq']['nprocs'], 62) + self.assertEqual(obs['nu-qc']['nodes'], 2) + self.assertEqual(obs['nu-qc']['wallclock_time_in_minutes'], 2028) self.assertEqual(obs['nu-qc']['cpus_per_task'], 32) def test_parse_project_name(self): @@ -2198,6 +2198,57 @@ def test_process_run_info_file(self): # called by Pipeline's constructor. +class TestInstrumentUtils(unittest.TestCase): + def setUp(self): + package_root = abspath('./sequence_processing_pipeline') + self.path = partial(join, package_root, 'tests', 'data') + + def test_instrument_utils(self): + iutils = InstrumentUtils() + + exp = {'231108_M04586_0992_000000000-L7342': {'id': 'M04586', + 'type': 'MiSeq', + 'date': '2023-11-08'}, + '200320_K00180_0957_AHCYKKBBXY_PE150_Knight': {'id': 'K00180', + 'type': ('HiSeq ' + '4000'), + 'date': ('2020-0' + '3-20') + }, + '20220912_FS10001773_27_BSE39218-1017': {'id': 'FS10001773', + 'type': 'iSeq', + 'date': '2022-09-12'}, + '231215_LH00444_0031_B222WHFLT4': {'id': 'LH00444', + 'type': 'NovaSeq X Plus', + 'date': '2023-12-16'}, + '190809_D00611_0709_AH3CKJBCX3_RKL0040_Feist_36-39_2': { + 'id': 'D00611', + 'type': 'HiSeq 2500', + 'date': '2019-08-09'}, + '231215_A01535_0435_BH23F5DSXC': {'id': 'A01535', + 'type': 'NovaSeq 6000', + 'date': '2023-12-15'}, + '150629_SN1001_0511_AH5L7GBCXX': {'id': 'SN1001', + 'type': 'RapidRun', + 'date': '2015-06-29'}} + + run_directories = [] + for root, dirs, files in walk(self.path('sample_run_directories')): + for run_id in dirs: + run_directories.append((run_id, join(root, run_id))) + + # don't walk recursively. stop after first level. + break + + for run_id, run_dir in run_directories: + self.assertEqual(iutils.get_instrument_id(run_dir), + exp[run_id]['id']) + self.assertEqual(iutils.get_instrument_type(run_dir), + exp[run_id]['type']) + self.assertEqual(iutils.get_date(run_dir), + exp[run_id]['date']) + + good_dummy_sheet1 = [ "[Header],,,,,,", "IEMFileVersion,4,,,,,", "Date,10/27/22,,,,,", "Workflow,GenerateFASTQ,,,,,", "Application,FASTQ Only,,,,,",