From 2b15b07ca494aefdc51959752105fea74c11f237 Mon Sep 17 00:00:00 2001
From: Charles Cowart <42684307+charles-cowart@users.noreply.github.com>
Date: Fri, 16 Feb 2024 15:04:15 -0800
Subject: [PATCH] For runs w/multiple projects, manage adapter-trimmed files
 (#126)

* For runs w/multiple projects, manage adapter-trimmed files

* comment removed
---
 sequence_processing_pipeline/FastQCJob.py     |  4 ++
 sequence_processing_pipeline/NuQCJob.py       | 29 +++++++++
 .../tests/test_NuQCJob.py                     | 63 ++++++++++++++++++-
 3 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py
index 65dd9bfd..819f2e99 100644
--- a/sequence_processing_pipeline/FastQCJob.py
+++ b/sequence_processing_pipeline/FastQCJob.py
@@ -92,6 +92,10 @@ def _find_projects(self, path_to_run_id_data_fastq_dir, is_raw_input):
             files = [x for x in files if x.endswith('.fastq.gz') and
                      'zero_files' not in x]
 
+            # remove fastq files in the only-adapter-filtered
+            # folder from consideration if they are present.
+            files = [x for x in files if 'only-adapter-filtered' not in x]
+
             # break files up into R1, R2, I1, I2
             # assume _R1_ does not occur in the path as well.
             r1_only = [x for x in files if '_R1_' in x]
diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py
index d824b8a7..7e567c93 100644
--- a/sequence_processing_pipeline/NuQCJob.py
+++ b/sequence_processing_pipeline/NuQCJob.py
@@ -202,6 +202,26 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst):
         for fp in files_to_move:
             move(fp, dst)
 
+    @staticmethod
+    def _move_trimmed_files(project_name, output_path):
+        '''
+        Given output_path, move all fastqs to a new subdir named project_name.
+        :param project_name: The name of the new folder to be created.
+        :param output_path: The path to scan for fastq files.
+        :return: None
+        '''
+
+        if exists(output_path):
+            pattern = f"{output_path}/*.fastq.gz"
+
+            # this directory shouldn't already exist.
+            makedirs(join(output_path, project_name), exist_ok=False)
+
+            for trimmed_file in list(glob.glob(pattern)):
+                move(trimmed_file, join(output_path, project_name))
+        else:
+            raise ValueError(f"'{output_path}' does not exist")
+
     def run(self, callback=None):
         # now a single job-script will be created to process all projects at
         # the same time, and intelligently handle adapter-trimming as needed
@@ -244,6 +264,15 @@ def run(self, callback=None):
             pattern = f"{source_dir}/*.fastq.gz"
             completed_files = list(glob.glob(pattern))
 
+            # if the 'only-adapter-filtered' directory exists, move the files
+            # into a unique location so that files from multiple projects
+            # don't overwrite each other.
+            trimmed_only_path = join(self.output_path,
+                                     'only-adapter-filtered')
+
+            if exists(trimmed_only_path):
+                NuQCJob._move_trimmed_files(project_name, trimmed_only_path)
+
             if needs_human_filtering is True:
                 filtered_directory = join(source_dir, 'filtered_sequences')
             else:
diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py
index 7c801c87..0f4bcd9f 100644
--- a/sequence_processing_pipeline/tests/test_NuQCJob.py
+++ b/sequence_processing_pipeline/tests/test_NuQCJob.py
@@ -1,11 +1,12 @@
 import shutil
 import unittest
-from os.path import join, abspath, exists
+from os.path import join, abspath, exists, dirname
 from functools import partial
 from sequence_processing_pipeline.NuQCJob import NuQCJob
 from sequence_processing_pipeline.PipelineError import PipelineError
 from os import makedirs, remove
 from metapool import load_sample_sheet
+import glob
 
 
 class TestNuQCJob(unittest.TestCase):
@@ -546,6 +547,10 @@ def tearDown(self):
         if exists(self.tmp_file_path):
             remove(self.tmp_file_path)
 
+        # for test_move_trimmed_files()
+        if exists(self.path('NuQCJob')):
+            shutil.rmtree(self.path('NuQCJob'))
+
     def test_nuqcjob_creation(self):
         # use good-sample-sheet as the basis for a sample Metatranscriptomic
         with self.assertRaises(PipelineError) as e:
@@ -1204,6 +1209,41 @@ def test_regular_expressions(self):
 
         self._helper(job.json_regex, good_names, bad_names)
 
+    def test_move_trimmed(self):
+        # Note: this test does not make use of the output_dir that other
+        # tests use.
+
+        for dummy_fp in SAMPLE_DIR:
+            dummy_fp = self.path(dummy_fp)
+            dummy_path = dirname(dummy_fp)
+            makedirs(dummy_path, exist_ok=True)
+            with open(dummy_fp, 'w') as f:
+                f.write("This is a dummy file.\n")
+
+        trimmed_only_path = self.path('NuQCJob', 'only-adapter-filtered')
+
+        NuQCJob._move_trimmed_files('NPH_15288', trimmed_only_path)
+
+        new_path = join(trimmed_only_path, 'NPH_15288')
+        pattern = f"{new_path}/*.fastq.gz"
+
+        exp = [
+            ('only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001.'
+             'fastq.gz'),
+            ('only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001.'
+             'fastq.gz'),
+            ('only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001.'
+             'fastq.gz'),
+            ('only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001.'
+             'fastq.gz'),
+            ('only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001.'
+             'fastq.gz')]
+
+        for trimmed_file in list(glob.glob(pattern)):
+            trimmed_file = trimmed_file.split('NuQCJob/')[-1]
+            if trimmed_file not in exp:
+                self.assertIn(trimmed_file, exp)
+
     def _helper(self, regex, good_names, bad_names):
         for good_name in good_names:
             substr = regex.search(good_name)
@@ -1214,5 +1254,26 @@ def _helper(self, regex, good_names, bad_names):
             self.assertIsNone(substr, msg=f'Regex failed on {bad_name}')
 
 
+SAMPLE_DIR = [
+    'NuQCJob/only-adapter-filtered/359180345_S58_L001_R1_001.fastq.gz',
+    'NuQCJob/only-adapter-filtered/359180337_S27_L001_R1_001.fastq.gz',
+    'NuQCJob/only-adapter-filtered/359180338_S51_L001_R2_001.fastq.gz',
+    'NuQCJob/only-adapter-filtered/359180338_S51_L001_R1_001.fastq.gz',
+    'NuQCJob/only-adapter-filtered/359180337_S27_L001_R2_001.fastq.gz',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180354_S22_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180338_S51_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180345_S58_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180337_S27_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/html/359180353_S17_L001_R1_001.html',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180353_S17_L001_R1_001.json',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180337_S27_L001_R1_001.json',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180345_S58_L001_R1_001.json',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180338_S51_L001_R1_001.json',
+    'NuQCJob/NPH_15288/fastp_reports_dir/json/359180354_S22_L001_R1_001.json',
+    'NuQCJob/process_all_fastq_files.sh',
+    'NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981.completed',
+    'NuQCJob/logs/slurm-1897981_1.out',
+    'NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1']
+
 if __name__ == '__main__':
     unittest.main()