-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Single-end Support #50
Changes from 17 commits
b650a28
03a9da1
de5a56f
361de80
6eb00c4
44d0ab7
760ea9c
2df4e6e
9e92be3
0a7456c
c5bae5c
ad3a067
d83abc2
52e6012
75caa50
3a86a92
7571dff
bdea828
3f44322
1a8e914
d79acf0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,8 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | |
### Added | ||
- Input fastq read support: | ||
- Pacbio raw and corrected reads (#47) | ||
- Oxford Nanopore raw, corrected, and HAC (#47) | ||
- Support for Nanopore-reads assembly with Flye and Canu (#47) | ||
- Oxford Nanopore raw, corrected, and HAC reads (#47) | ||
- Single-end Illumina reads (#50) | ||
- Support for Nanopore-reads assembly with Flye and Canu (#47, #50) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this PR, I fixed a few Nanopore bugs that were introduced in #47. Bugs:
|
||
- New assemblers: | ||
- Hifiasm and Hifiasm-meta (#49) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,3 +26,4 @@ dependencies: | |
- nanoplot>=1.20 | ||
- hifiasm>=0.19 | ||
- hifiasm_meta>=hamtv0.3 | ||
- gfatools>=0.5 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,3 +4,5 @@ markers = | |
long: long running tests | ||
hifi: pacbio hifi-reads tests | ||
nano: oxford nanopore-reads tests | ||
filterwarnings = | ||
ignore::DeprecationWarning:ratelimiter.* | ||
Comment on lines
+7
to
+8
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,11 +13,12 @@ | |
|
||
|
||
PAIRED = ("spades", "megahit", "unicycler") | ||
SINGLE = ("spades", "megahit", "unicycler") | ||
PACBIO = ("canu", "flye", "hifiasm", "hifiasm-meta") | ||
OXFORD = ("canu", "flye") | ||
ALGORITHMS = set(PAIRED + PACBIO + OXFORD) | ||
ALGORITHMS = set(PAIRED + SINGLE + PACBIO + OXFORD) | ||
|
||
ILLUMINA_READS = ("paired",) | ||
ILLUMINA_READS = ("paired", "single") | ||
PACBIO_READS = ("pacbio-raw", "pacbio-corr", "pacbio-hifi") | ||
OXFORD_READS = ("nano-raw", "nano-corr", "nano-hq") | ||
LONG_READS = PACBIO_READS + OXFORD_READS | ||
|
@@ -84,44 +85,53 @@ def create_sample_and_assembler_objects(self): | |
] | ||
|
||
def batch(self): | ||
self.paired_assemblers = [] | ||
self.pacbio_assemblers = [] | ||
self.oxford_assemblers = [] | ||
self.paired_sample_labels = set() | ||
self.paired_assemblers = set() | ||
self.single_sample_labels = set() | ||
self.single_assemblers = set() | ||
self.pacbio_sample_labels = set() | ||
self.pacbio_assemblers = set() | ||
self.oxford_sample_labels = set() | ||
self.oxford_assemblers = set() | ||
for assembler in self.assemblers: | ||
self.determine_assembler_workflow(assembler) | ||
self.batch = { | ||
"paired": { | ||
"samples": self.get_batch_samples(self.paired_assemblers), | ||
"samples": self.get_samples(self.paired_sample_labels), | ||
"assemblers": self.paired_assemblers, | ||
}, | ||
"single": { | ||
"samples": self.get_samples(self.single_sample_labels), | ||
"assemblers": self.single_assemblers, | ||
}, | ||
"pacbio": { | ||
"samples": self.get_batch_samples(self.pacbio_assemblers), | ||
"samples": self.get_samples(self.pacbio_sample_labels), | ||
"assemblers": self.pacbio_assemblers, | ||
}, | ||
"oxford": { | ||
"samples": self.get_batch_samples(self.oxford_assemblers), | ||
"samples": self.get_samples(self.oxford_sample_labels), | ||
"assemblers": self.oxford_assemblers, | ||
}, | ||
} | ||
Comment on lines
87
to
115
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the batch function. In this function, samples and assemblers are sorted by "read type". Originally, a list of assemblers was kept and then saved in the |
||
|
||
def determine_assembler_workflow(self, assembler): | ||
readtypes = set() | ||
for sample in assembler.samples: | ||
readtypes.update({self.samples[sample].readtype}) | ||
if assembler.algorithm in PAIRED and readtypes.intersection(ILLUMINA_READS): | ||
self.paired_assemblers.append(assembler) | ||
elif assembler.algorithm in PACBIO and readtypes.intersection(PACBIO_READS): | ||
self.pacbio_assemblers.append(assembler) | ||
elif assembler.algorithm in OXFORD and readtypes.intersection(OXFORD_READS): | ||
self.oxford_assemblers.append(assembler) | ||
|
||
def get_batch_samples(self, assemblers): | ||
samples = {} | ||
for assembler in assemblers: | ||
samples = samples | dict( | ||
(key, self.samples[key]) for key in assembler.samples if key in self.samples | ||
) | ||
return samples | ||
readtype = self.samples[sample].readtype | ||
if readtype == "paired": | ||
self.paired_sample_labels.add(sample) | ||
self.paired_assemblers.add(assembler) | ||
elif readtype == "single": | ||
self.single_sample_labels.add(sample) | ||
self.single_assemblers.add(assembler) | ||
elif readtype in PACBIO_READS: | ||
self.pacbio_sample_labels.add(sample) | ||
self.pacbio_assemblers.add(assembler) | ||
elif readtype in OXFORD_READS: | ||
self.oxford_sample_labels.add(sample) | ||
self.oxford_assemblers.add(assembler) | ||
Comment on lines
117
to
+131
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of determining which workflow an assembler object belongs to with a bunch of complex conditional statements, we now go by "read type". |
||
|
||
def get_samples(self, labels):
    """Return a dict mapping each label in `labels` to its entry in `self.samples`.

    Raises `KeyError` if a label is not present in `self.samples`.
    """
    selected = {}
    for label in labels:
        selected[label] = self.samples[label]
    return selected
|
||
def to_dict(self, args, readtype="all"): | ||
if readtype == "all": | ||
|
@@ -130,18 +140,24 @@ def to_dict(self, args, readtype="all"): | |
else: | ||
samples = self.batch[readtype]["samples"] | ||
assemblers = self.batch[readtype]["assemblers"] | ||
label_to_samples = {} | ||
for assembler in assemblers: | ||
label_to_samples[assembler.label] = [ | ||
sample for sample in assembler.samples if sample in samples | ||
] | ||
return dict( | ||
samples={label: sample.to_string() for label, sample in samples.items()}, | ||
labels=[assembler.label for assembler in assemblers], | ||
assemblers={assembler.label: assembler.algorithm for assembler in assemblers}, | ||
extra_args={assembler.label: assembler.extra_args for assembler in assemblers}, | ||
label_to_samples={assembler.label: assembler.samples for assembler in assemblers}, | ||
label_to_samples=label_to_samples, | ||
Comment on lines
+143
to
+153
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Originally, This parameter had to be changed because, not all For example, a basic |
||
sample_readtype={label: sample.readtype for label, sample in samples.items()}, | ||
threads=args.threads, | ||
downsample=args.downsample, | ||
coverage=args.coverage, | ||
genomesize=args.genome_size, | ||
seed=args.seed, | ||
length_required=args.length_required, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
) | ||
|
||
|
||
|
@@ -184,3 +200,15 @@ def check_canu_required_params(self): | |
raise ValueError( | ||
"Canu requires at least 4 avaliable cores; increase `--threads` to 4 or more" | ||
) | ||
|
||
def __members(self):
    """Return the tuple of attributes that defines this object's identity.

    Both `__eq__` and `__hash__` are derived from this tuple so that the two
    always agree; currently only `label` participates.
    """
    return (self.label,)

def __eq__(self, other):
    """Compare two objects of the same type by their identity tuple.

    Returns `NotImplemented` (rather than `False`) for objects of a different
    type so that Python can try the reflected comparison; when neither side
    handles it, `==` still evaluates to `False`.
    """
    if type(other) is not type(self):
        return NotImplemented
    return self.__members() == other.__members()

def __hash__(self):
    """Hash the identity tuple so equal objects collapse to one set/dict entry."""
    return hash(self.__members())
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Roger that. It looks like you based this code on an example incorporating multiple values into the hash function. Since you're only hashing a single value, you could probably simplify with something like this. def __eq__(self, other):
return hash(self) == hash(other)
def __hash__(self):
return hash(self.label) The type check is subsumed in the astronomically small possibility that an object of a different data type will hash to the same numerical value. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,18 @@ | |
from argparse import ArgumentTypeError | ||
|
||
|
||
def fastp_options(parser):
    """Add the "fastp arguments" group and its `-l/--length-required` option to `parser`.

    The option is parsed as an int and defaults to 50.
    """
    group = parser.add_argument_group("fastp arguments")
    length_required_kwargs = {
        "type": int,
        "metavar": "L",
        "default": 50,
        "help": "discard reads shorter than the required L length during fastp; by default L=50",
    }
    group.add_argument("-l", "--length-required", **length_required_kwargs)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Users can now adjust the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest replacing "during fastp" with "after pre-processing" or something like that. |
||
|
||
|
||
def check_positive(value): | ||
try: | ||
value = int(value) | ||
|
@@ -20,33 +32,33 @@ def check_positive(value): | |
return value | ||
|
||
|
||
def options(parser): | ||
short = parser.add_argument_group("downsample arguments") | ||
short.add_argument( | ||
def downsample_options(parser): | ||
illumina = parser.add_argument_group("downsample arguments") | ||
illumina.add_argument( | ||
"-c", | ||
"--coverage", | ||
type=check_positive, | ||
metavar="C", | ||
default=150, | ||
help="target an average depth of coverage Cx when auto-downsampling; by default, C=150", | ||
) | ||
short.add_argument( | ||
illumina.add_argument( | ||
"-d", | ||
"--downsample", | ||
type=int, | ||
metavar="D", | ||
default=0, | ||
help="randomly sample D reads from the input rather than assembling the full set; set D=0 to perform auto-downsampling to a desired level of coverage (see --coverage); set D=-1 to disable downsampling; by default D=0", | ||
) | ||
short.add_argument( | ||
illumina.add_argument( | ||
"-g", | ||
"--genome-size", | ||
type=int, | ||
metavar="G", | ||
default=0, | ||
help="provide known genome size in base pairs (bp); by default, G=0", | ||
) | ||
short.add_argument( | ||
illumina.add_argument( | ||
"--seed", | ||
type=int, | ||
metavar="S", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,8 +7,10 @@ | |
# Development Center. | ||
# ------------------------------------------------------------------------------------------------- | ||
|
||
import json | ||
import multiprocessing | ||
import os | ||
from pathlib import Path | ||
from pkg_resources import resource_filename | ||
|
||
|
||
|
@@ -20,3 +22,23 @@ def data_file(path): | |
|
||
def get_core_count():
    """Return the CPU count reported by `multiprocessing.cpu_count()`."""
    return multiprocessing.cpu_count()
|
||
|
||
def write_config(labels, wd, cfg):
    """Write a copy of a bundled config restricted to the selected assemblers.

    Loads the JSON file located by `data_file(f"configs/{cfg}")`, keeps only
    the entries of its "assemblers" list whose "label" is in `labels`, and
    writes the result to `<wd>/<cfg>`.

    Returns the list of retained assembler entries.
    """
    with open(data_file(f"configs/{cfg}")) as infile:
        data = json.load(infile)
    assemblers = [entry for entry in data["assemblers"] if entry["label"] in labels]
    data["assemblers"] = assemblers
    # Close the output explicitly so the config is fully flushed to disk
    # before the caller hands the path to anything that reads it.
    with open(Path(wd) / cfg, "w") as outfile:
        json.dump(data, outfile)
    return assemblers
|
||
|
||
def files_exists(wd, assemblers, expected):
    """Assert that `expected` exists for every sample of every assembler.

    For each entry in `assemblers` (a dict with "label", "algorithm" and
    "samples" keys) and each of its samples, checks the path
    `<wd>/analysis/<sample>/<label>/<algorithm>/<expected>`.

    Raises `AssertionError` naming the first missing path.
    """
    analysis_dir = Path(wd).resolve() / "analysis"
    for assembler in assemblers:
        # These do not vary per sample, so look them up once per assembler.
        label = assembler["label"]
        algorithm = assembler["algorithm"]
        for sample in assembler["samples"]:
            path = analysis_dir / sample / label / algorithm / expected
            assert path.exists(), f"expected output file not found: {path}"
Comment on lines
+27
to
+44
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The purpose of these two functions (
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed the coverage warning that alerts the user when Snakemake files are unparsable:
CoverageWarning: Couldn't parse Python file
.