Skip to content

Commit

Permalink
Additional unittests for InstrumentUtils()
Browse files Browse the repository at this point in the history
  • Loading branch information
charles-cowart committed Feb 14, 2024
1 parent c62b917 commit 1bb127c
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 47 deletions.
48 changes: 8 additions & 40 deletions sequence_processing_pipeline/Pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,46 +83,14 @@ def get_date(run_directory):
for format in formats:
try:
date = datetime.strptime(date_string, format)
return date.date()
return str(date.date())
except ValueError:
# assume ValueErrors are due to incorrect format, rather than
# incorrect value from the XML file.
pass

raise ValueError(f"'{date_string}' could not be parsed")

@staticmethod
def get_flow_cell_mode(run_dir):
    """Return the flowcell description found in RunParameters.xml.

    :param run_dir: Path to a run-directory containing RunParameters.xml.
    :return: The text of the flowcell element for the instrument type.
    :raises ValueError: If RunParameters.xml does not exist in run_dir,
        or if the expected flowcell element is absent from it.
    :raises KeyError: If the instrument type has no known search path.
    """
    # TODO: What is currently returned is the most descriptive information
    #  found in runParameters.xml or elsewhere regarding flowcell mode.
    #  In some cases, this information may need to be mapped according to
    #  a dictionary of known flowcells used by IGM and their types to get
    #  the actual mode of flowcell.
    instrument_type = InstrumentUtils.get_instrument_type(run_dir)

    run_params_path = join(run_dir, 'RunParameters.xml')
    if not exists(run_params_path):
        raise ValueError(f"'{run_params_path}' doesn't exist")

    # the location of the flowcell element within RunParameters.xml
    # differs according to the schema observed for each instrument type.
    element_path_by_instrument = {
        'MiSeq': 'FlowcellRFIDTag',
        'HiSeq 4000': 'Setup/Flowcell',
        'iSeq': 'FlowcellEEPROMTag',
        'NovaSeq X Plus': 'FlowCellType',
        'HiSeq 2500': 'Setup/Flowcell',
        'NovaSeq 6000': 'RfidsInfo/FlowCellMode',
        'RapidRun': 'Setup/Flowcell',
    }

    with open(run_params_path) as f:
        document_root = ET.fromstring(f.read())

    # the lookup is intentionally performed after parsing so that a
    # malformed file is reported before an unknown instrument type.
    flowcell = document_root.find(element_path_by_instrument[instrument_type])

    if flowcell is None:
        raise ValueError("Flowcell information could not be found in "
                         f"'{run_params_path}'")

    return flowcell.text


class Pipeline:
sif_header = ['sample_name', 'collection_timestamp', 'elevation', 'empo_1',
Expand Down Expand Up @@ -273,7 +241,7 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path,
def _configure_profile(self):
# extract the instrument type from self.run_dir and the assay type
# from self.sample_sheet (or self.mapping_file).
instr_type = InstrumentUtils.get_instrument_type(self.run_id)
instr_type = InstrumentUtils.get_instrument_type(self.run_dir)

if isinstance(self.sample_sheet, str):
# if self.sample_sheet is a file instead of a KLSampleSheet()
Expand Down Expand Up @@ -375,13 +343,13 @@ def _configure_profile(self):
if base_profile is None:
raise ValueError("a 'default' profile was not found")

# overwrite the configuration values in the base-profile with those
# in the matching profile as appropriate.
for attribute in selected_profile['profile']['configuration']:
value = selected_profile['profile']['configuration'][attribute]
base_profile['profile']['configuration'][attribute] = value

if selected_profile:
# overwrite the configuration values in the base-profile with those
# in the matching profile as appropriate.
for attribute in selected_profile['profile']['configuration']:
value = selected_profile['profile']['configuration'][attribute]
base_profile['profile']['configuration'][attribute] = value

# overwrite default info w/selected profile (if one was found)
# so that complete profile can be written to working directory
# as a log.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
{
"profile": {
"instrument_type": "MiSeq",
"assay_type": "Metagenomic",
"configuration": {
"bcl2fastq": {
"nodes": 2,
"nprocs": 62,
"queue": "qiita",
"wallclock_time_in_minutes": 1022,
"modules_to_load": [
"bcl2fastq_2.20.0.222"
],
"executable_path": "bcl2fastq",
"per_process_memory_limit": "100gb"
},
"nu-qc": {
"nodes": 2,
"cpus_per_task": 32,
"queue": "qiita",
"wallclock_time_in_minutes": 2028,
"minimap2_databases": "/scratch/databases/minimap2",
"modules_to_load": [
"fastp_0.20.1",
"samtools_1.12",
"minimap2_2.18"
],
"fastp_executable_path": "fastp",
"minimap2_executable_path": "minimap2",
"samtools_executable_path": "samtools",
"job_total_memory_limit": "20",
"job_max_array_length": 1000,
"known_adapters_path": "fastp_known_adapters_formatted.fna",
"bucket_size": 8,
"length_limit": 100,
"cores_per_task": 2
},
"seqpro": {
"seqpro_path": "seqpro",
"modules_to_load": []
},
"fastqc": {
"nodes": 2,
"nprocs": 62,
"queue": "qiita",
"nthreads": 62,
"wallclock_time_in_minutes": 220,
"modules_to_load": [
"fastqc_0.11.5"
],
"fastqc_executable_path": "fastqc",
"multiqc_executable_path": "multiqc",
"multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml",
"job_total_memory_limit": "20gb",
"job_pool_size": 120,
"job_max_array_length": 2000
}
}
}
}
65 changes: 58 additions & 7 deletions sequence_processing_pipeline/tests/test_Pipeline.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import json
import os
from sequence_processing_pipeline.PipelineError import PipelineError
from sequence_processing_pipeline.Pipeline import Pipeline
from sequence_processing_pipeline.Pipeline import Pipeline, InstrumentUtils
import unittest
from os import makedirs
from os import makedirs, walk
from os.path import abspath, basename, join
from functools import partial
import re
Expand Down Expand Up @@ -1436,7 +1436,7 @@ def test_configuration_profiles(self):
obs = pipeline.config_profile['profile']

# assert a profile matching self.good_sample_sheet_path was found.
self.assertEqual(obs['instrument_type'], "NovaSeq 6000")
self.assertEqual(obs['instrument_type'], "MiSeq")
self.assertEqual(obs['assay_type'], "Metagenomic")

obs = obs['configuration']
Expand All @@ -1450,10 +1450,10 @@ def test_configuration_profiles(self):

# assert the values from the MiSeq/Metagenomic profile override the
# defaults and are found in the final configuration as well.
self.assertEqual(obs['bcl2fastq']['nodes'], 4)
self.assertEqual(obs['bcl2fastq']['nprocs'], 64)
self.assertEqual(obs['nu-qc']['nodes'], 4)
self.assertEqual(obs['nu-qc']['wallclock_time_in_minutes'], 2048)
self.assertEqual(obs['bcl2fastq']['nodes'], 2)
self.assertEqual(obs['bcl2fastq']['nprocs'], 62)
self.assertEqual(obs['nu-qc']['nodes'], 2)
self.assertEqual(obs['nu-qc']['wallclock_time_in_minutes'], 2028)
self.assertEqual(obs['nu-qc']['cpus_per_task'], 32)

def test_parse_project_name(self):
Expand Down Expand Up @@ -2198,6 +2198,57 @@ def test_process_run_info_file(self):
# called by Pipeline's constructor.


class TestInstrumentUtils(unittest.TestCase):
    """Exercise InstrumentUtils against the sample run-directories."""

    def setUp(self):
        # NB: the path is resolved relative to the current working
        # directory, so tests are expected to run from the repository root.
        package_root = abspath('./sequence_processing_pipeline')
        self.path = partial(join, package_root, 'tests', 'data')

    def test_instrument_utils(self):
        """Verify the id, type and date reported for each run-directory."""
        iutils = InstrumentUtils()

        # expected results, keyed by the name of each sample run-directory.
        exp = {
            '231108_M04586_0992_000000000-L7342': {
                'id': 'M04586',
                'type': 'MiSeq',
                'date': '2023-11-08'},
            '200320_K00180_0957_AHCYKKBBXY_PE150_Knight': {
                'id': 'K00180',
                'type': 'HiSeq 4000',
                'date': '2020-03-20'},
            '20220912_FS10001773_27_BSE39218-1017': {
                'id': 'FS10001773',
                'type': 'iSeq',
                'date': '2022-09-12'},
            # NOTE(review): this date differs from the '231215' prefix of
            # the directory name; presumably it reflects the run's XML
            # metadata rather than the name - confirm against the fixture.
            '231215_LH00444_0031_B222WHFLT4': {
                'id': 'LH00444',
                'type': 'NovaSeq X Plus',
                'date': '2023-12-16'},
            '190809_D00611_0709_AH3CKJBCX3_RKL0040_Feist_36-39_2': {
                'id': 'D00611',
                'type': 'HiSeq 2500',
                'date': '2019-08-09'},
            '231215_A01535_0435_BH23F5DSXC': {
                'id': 'A01535',
                'type': 'NovaSeq 6000',
                'date': '2023-12-15'},
            '150629_SN1001_0511_AH5L7GBCXX': {
                'id': 'SN1001',
                'type': 'RapidRun',
                'date': '2015-06-29'},
        }

        # don't walk recursively. only the first level of sub-directories
        # are run-directories. walk() yields nothing if the path is missing.
        sample_root = self.path('sample_run_directories')
        _, run_ids, _ = next(walk(sample_root), (sample_root, [], []))

        # without this check a missing or empty fixture directory would
        # cause the loop below to assert nothing and the test to pass.
        self.assertTrue(run_ids,
                        f"no run-directories found in '{sample_root}'")

        for run_id in run_ids:
            run_dir = join(sample_root, run_id)

            # subTest() identifies the offending run-directory on failure
            # and allows the remaining run-directories to still be checked.
            with self.subTest(run_id=run_id):
                self.assertEqual(iutils.get_instrument_id(run_dir),
                                 exp[run_id]['id'])
                self.assertEqual(iutils.get_instrument_type(run_dir),
                                 exp[run_id]['type'])
                self.assertEqual(iutils.get_date(run_dir),
                                 exp[run_id]['date'])


good_dummy_sheet1 = [
"[Header],,,,,,", "IEMFileVersion,4,,,,,", "Date,10/27/22,,,,,",
"Workflow,GenerateFASTQ,,,,,", "Application,FASTQ Only,,,,,",
Expand Down

0 comments on commit 1bb127c

Please sign in to comment.