Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Added option to NuQCJob to annotate filtered fastq. #155

Merged
merged 4 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions sequence_processing_pipeline/NuQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
wall_time_limit, jmem, fastp_path, minimap2_path,
samtools_path, modules_to_load, qiita_job_id,
max_array_length, known_adapters_path, movi_path, gres_value,
pmls_path, bucket_size=8, length_limit=100, cores_per_task=4):
pmls_path, additional_fastq_tags, bucket_size=8,
length_limit=100, cores_per_task=4):
"""
Submit a slurm job where the contents of fastq_root_dir are processed
using fastp, minimap2, and samtools. Human-genome sequences will be
Expand All @@ -69,6 +70,8 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
:param bucket_size: the size in GB of each bucket to process
:param length_limit: reads shorter than this will be discarded.
:param cores_per_task: Number of CPU cores per node to request.
:param additional_fastq_tags: A list of fastq tags to preserve during
filtering.
"""
super().__init__(fastq_root_dir,
output_path,
Expand Down Expand Up @@ -96,6 +99,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
self.movi_path = movi_path
self.gres_value = gres_value
self.pmls_path = pmls_path
self.additional_fastq_tags = additional_fastq_tags

# for projects that use sequence_processing_pipeline as a dependency,
# jinja_env must be set to sequence_processing_pipeline's root path,
Expand Down Expand Up @@ -401,6 +405,14 @@ def _generate_mmi_filter_cmds(self, working_dir):

cores_to_allocate = int(self.cores_per_task / 2)

if len(self.additional_fastq_tags) > 0:
# add tags for known metadata types that fastq files may have
# been annotated with. Samtools will safely ignore tags that
# are not present.
tags = " -T %s" % ','.join(self.additional_fastq_tags)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
tags = " -T %s" % ','.join(self.additional_fastq_tags)
tags = "-T %s" % ','.join(self.additional_fastq_tags)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The odd spacing here is intentional. It prevents an extra space from being present when there are no tags. This makes the expected results for tests uniform.

else:
tags = ""

for count, mmi_db_path in enumerate(self.mmi_file_paths):
if count == 0:
# prime initial state with unfiltered file and create first of
Expand All @@ -416,9 +428,10 @@ def _generate_mmi_filter_cmds(self, working_dir):
input = tmp_file1
output = tmp_file2

cmds.append(f"minimap2 -2 -ax sr -t {cores_to_allocate} "
cmds.append(f"minimap2 -2 -ax sr -y -t {cores_to_allocate} "
f"{mmi_db_path} {input} -a | samtools fastq -@ "
f"{cores_to_allocate} -f 12 -F 256 > {output}")
f"{cores_to_allocate} -f 12 -F 256{tags} > "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
f"{cores_to_allocate} -f 12 -F 256{tags} > "
f"{cores_to_allocate} -f 12 -F 256 {tags} > "

f"{output}")

# rename the latest tmp file to the final output filename.
cmds.append(f"mv {output} {final_output}")
Expand Down
38 changes: 38 additions & 0 deletions sequence_processing_pipeline/tests/data/seqs.interleaved.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:3900:1600/1 BX:Z:TAGACACGAAGGTTGGAC
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:3900:1600/2 BX:Z:TAGACACGAAGGTTGGAC
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FF,FFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:7740:1600/1 BX:Z:AAAGATGAGGGCAGTTAA
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:7740:1600/2 BX:Z:AAAGATGAGGGCAGTTAA
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:12790:1600/1 BX:Z:TGGGGGTCGTAACACGAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:12790:1600/2 BX:Z:TGGGGGTCGTAACACGAA
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFF::FFFFF,FFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13250:1600/1 BX:Z:CGAGGCAGACTTGAATGC
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFF:FFFFF:FFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13250:1600/2 BX:Z:CGAGGCAGACTTGAATGC
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFF:FFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13520:1600/1 BX:Z:CAGACACGTAGGTGGGAC
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13520:1600/2 BX:Z:CAGACACGTAGGTGGGAC
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:3900:1600/1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:3900:1600/2
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FF,FFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:7740:1600/1
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:7740:1600/2
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:12790:1600/1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:12790:1600/2
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFF::FFFFF,FFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13250:1600/1
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFF:FFFFF:FFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13250:1600/2
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFF:FFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13520:1600/1
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
@1::MUX::FS10001773:68:BTR67708-1611:1:1101:13520:1600/2
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
65 changes: 59 additions & 6 deletions sequence_processing_pipeline/tests/test_NuQCJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,7 @@ def test_nuqcjob_creation(self):
self.movi_path,
self.gres_value,
self.pmls_path,
[],
bucket_size=8,
length_limit=100,
cores_per_task=4,
Expand Down Expand Up @@ -925,6 +926,7 @@ def test_nuqcjob_creation(self):
self.qiita_job_id,
1000,
"",
[],
self.movi_path,
self.gres_value,
self.pmls_path,
Expand Down Expand Up @@ -961,6 +963,7 @@ def test_nuqcjob_creation(self):
self.qiita_job_id,
1000,
"",
[],
self.movi_path,
self.gres_value,
self.pmls_path,
Expand All @@ -983,6 +986,7 @@ def test_error_msg_from_logs(self):
self.qiita_job_id,
1000,
"",
[],
self.movi_path,
self.gres_value,
self.pmls_path,
Expand Down Expand Up @@ -1032,9 +1036,11 @@ def test_assay_value(self):
self.qiita_job_id,
1000,
"",
[],
self.movi_path,
self.gres_value,
self.pmls_path,
[]
)

def test_audit(self):
Expand All @@ -1054,9 +1060,11 @@ def test_audit(self):
self.qiita_job_id,
1000,
"",
[],
self.movi_path,
self.gres_value,
self.pmls_path,
[]
)

obs = job.audit(self.sample_ids)
Expand Down Expand Up @@ -1835,9 +1843,11 @@ def test_completed_file_generation(self):
self.qiita_job_id,
1000,
"",
[],
self.movi_path,
self.gres_value,
self.pmls_path,
[]
)

my_path = (
Expand Down Expand Up @@ -1874,9 +1884,11 @@ def test_completed_file_generation_some_failures(self):
self.qiita_job_id,
1000,
"",
[],
self.movi_path,
self.gres_value,
self.pmls_path,
[]
)

# test _confirm_job_completed() fails when a .completed file isn't
Expand Down Expand Up @@ -1904,6 +1916,7 @@ def test_generate_job_script(self):
self.movi_path,
self.gres_value,
self.pmls_path,
[]
)

# 2k as a parameter will promote the default value.
Expand Down Expand Up @@ -1932,6 +1945,7 @@ def test_regular_expressions(self):
self.movi_path,
self.gres_value,
self.pmls_path,
[]
)

# a sample of known valid fastq file-names plus a few edge-cases.
Expand Down Expand Up @@ -2088,16 +2102,17 @@ def test_generate_mmi_filter_cmds(self):
self.movi_path,
self.gres_value,
self.pmls_path,
[]
)

obs = job._generate_mmi_filter_cmds("/my_work_dir")

exp = [
"minimap2 -2 -ax sr -t 2 db_path/mmi_1.db /my_work_dir/seqs."
"minimap2 -2 -ax sr -y -t 2 db_path/mmi_1.db /my_work_dir/seqs."
"interleaved.fastq -a | samtools fastq -@ 2 -f 12 -F 256 > "
"/my_work_dir/foo",
"minimap2 -2 -ax sr -t 2 db_path/mmi_2.db /my_work_dir/foo -a | "
"samtools fastq -@ 2 -f 12 -F 256 > /my_work_dir/bar",
"minimap2 -2 -ax sr -y -t 2 db_path/mmi_2.db /my_work_dir/foo -a"
" | samtools fastq -@ 2 -f 12 -F 256 > /my_work_dir/bar",
"mv /my_work_dir/bar /my_work_dir/seqs.interleaved.filter_"
"alignment.fastq",
"[ -e /my_work_dir/foo ] && rm /my_work_dir/foo",
Expand All @@ -2106,9 +2121,47 @@ def test_generate_mmi_filter_cmds(self):

exp = "\n".join(exp)

print(obs)
print("###")
print(exp)
self.assertEqual(obs, exp)

def test_generate_mmi_filter_cmds_w_annotate_fastq(self):
    """Check the filter commands generated when fastq tags are requested.

    The job is built with two minimap2 databases and ['BX'] as the
    additional fastq tags; every generated 'samtools fastq' command is
    expected to carry ' -T BX' immediately before the output redirection.
    """
    mmi_db_paths = ["db_path/mmi_1.db", "db_path/mmi_2.db"]

    # Arguments are passed positionally; the trailing ['BX'] is the list
    # of additional fastq tags under test.
    nuqc_job = NuQCJob(
        self.fastq_root_path,
        self.output_path,
        self.good_sample_sheet_path,
        mmi_db_paths,
        "queue_name",
        1,
        1440,
        "8",
        "fastp",
        "minimap2",
        "samtools",
        [],
        self.qiita_job_id,
        1000,
        "",
        self.movi_path,
        self.gres_value,
        self.pmls_path,
        ['BX']
    )

    # One filtering command per database, then the rename of the final
    # temporary file, then the clean-up of both temporary files.
    expected_cmds = (
        "minimap2 -2 -ax sr -y -t 2 db_path/mmi_1.db /my_work_dir/seqs."
        "interleaved.fastq -a | samtools fastq -@ 2 -f 12 -F 256 -T BX > "
        "/my_work_dir/foo",
        "minimap2 -2 -ax sr -y -t 2 db_path/mmi_2.db /my_work_dir/foo -a"
        " | samtools fastq -@ 2 -f 12 -F 256 -T BX > /my_work_dir/bar",
        "mv /my_work_dir/bar /my_work_dir/seqs.interleaved.filter_"
        "alignment.fastq",
        "[ -e /my_work_dir/foo ] && rm /my_work_dir/foo",
        "[ -e /my_work_dir/bar ] && rm /my_work_dir/bar",
    )
    expected = "\n".join(expected_cmds)

    observed = nuqc_job._generate_mmi_filter_cmds("/my_work_dir")

    self.assertEqual(observed, expected)

Expand Down
2 changes: 1 addition & 1 deletion sequence_processing_pipeline/tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from tempfile import TemporaryDirectory
import gzip
import os
from os.path import join
from sequence_processing_pipeline.Commands import (split_similar_size_bins,
demux)
import io
from os.path import join


class CommandTests(unittest.TestCase):
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,5 @@
],
entry_points={
'console_scripts': ['demux=sequence_processing_pipeline.scripts.cli'
':demux', ], })
':demux'],
})
Loading