Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add resolutions to rsv #68

Merged
merged 7 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ auspice_dir = 'auspice'

rule all:
input:
expand("auspice/rsv_{subtype}_{build}.json",
expand("auspice/rsv_{subtype}_{build}_{resolution}.json",
subtype = config.get("subtypes",['a']),
build = config.get("buildstorun", ['genome'])),
build = config.get("builds_to_run", ['genome']),
resolution = config.get("resolutions_to_run", ["all-time"])),

include: "workflow/snakemake_rules/chores.smk"

Expand Down
20 changes: 18 additions & 2 deletions config/configfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ conda_environment: "workflow/envs/nextstrain.yaml"

genesforglycosylation: ["G", "F"]

buildstorun: ["genome", "G", "F"]
builds_to_run: ["genome", "G", "F"]

resolutions_to_run: ["all-time", "6y", "3y"]

exclude: "config/outliers.txt"

Expand All @@ -20,8 +22,22 @@ filter:
G: 0.3
F: 0.3

min_length:
genome: 10000
G: 600
F: 1200
resolutions:
all-time:
min_date: 100Y
6y:
min_date: 6Y
background_min_date: 100Y
3y:
min_date: 3Y
background_min_date: 100Y

subsample_max_sequences:
genome: 2000
genome: 3000
G: 3000
F: 3000

Expand Down
7 changes: 7 additions & 0 deletions config/outliers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@

MW455133 # over diverged/misdated
OK649675 # over diverged/misdated
KP317955
JX661581
OR140548


# B
JX489420 # over diverged/misdated
KF246645
KF246646
MF426028
KJ672473
OR326815
150 changes: 90 additions & 60 deletions example_data/a/metadata.tsv

Large diffs are not rendered by default.

25,290 changes: 21,970 additions & 3,320 deletions example_data/a/sequences.fasta

Large diffs are not rendered by default.

143 changes: 87 additions & 56 deletions example_data/b/metadata.tsv

Large diffs are not rendered by default.

24,749 changes: 21,484 additions & 3,265 deletions example_data/b/sequences.fasta

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion nextclade/config/pathogen.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,10 @@
},
"stopCodons": {
"enabled": true,
"ignoredStopCodons": []
"ignoredStopCodons": [{
"codon":320,
"cdsName":"G"
}]
}
},
"cdsOrderPreference": [
Expand Down
12 changes: 6 additions & 6 deletions workflow/snakemake_rules/clades.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ rule clades_genome:
nuc_muts = rules.ancestral.output.node_data,
clades = "config/clades_genome_{a_or_b}.tsv"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/clades_genome.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_genome.json"
log:
"logs/{a_or_b}/clades_genome_{build_name}.txt"
"logs/{a_or_b}/clades_genome_{build_name}_{resolution}.txt"
shell:
"""
augur clades --tree {input.tree} \
Expand All @@ -29,9 +29,9 @@ rule clades_Goya:
nuc_muts = rules.ancestral.output.node_data,
clades = "config/clades_G_{a_or_b}.tsv"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/clades_G.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_G.json"
log:
"logs/{a_or_b}/clades_{build_name}.txt"
"logs/{a_or_b}/clades_{build_name}_{resolution}.txt"
shell:
"""
augur clades --tree {input.tree} \
Expand All @@ -50,9 +50,9 @@ rule clades_consortium:
nuc_muts = rules.ancestral.output.node_data,
clades = "config/clades_consortium_{a_or_b}.tsv"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/clades_consortium.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_consortium.json"
log:
"logs/{a_or_b}/clades_{build_name}.txt"
"logs/{a_or_b}/clades_{build_name}_{resolution}.txt"
shell:
"""
augur clades --tree {input.tree} \
Expand Down
125 changes: 96 additions & 29 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ rule index_sequences:
input:
sequences = "data/{a_or_b}/sequences.fasta"
output:
sequence_index = build_dir + "/{a_or_b}/{build_name}/sequence_index.tsv"
sequence_index = build_dir + "/{a_or_b}/{build_name}/{resolution}/sequence_index.tsv"
shell:
"""
augur index \
Expand All @@ -28,8 +28,8 @@ rule newreference:
input:
oldreference = "config/{a_or_b}reference.gbk"
output:
newreferencegbk = build_dir + "/{a_or_b}/{build_name}/{gene}_reference.gbk",
newreferencefasta = build_dir + "/{a_or_b}/{build_name}/{gene}_reference.fasta",
newreferencegbk = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_reference.gbk",
newreferencefasta = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_reference.fasta",
params:
gene = lambda w: w.gene,
shell:
Expand All @@ -41,8 +41,7 @@ rule newreference:
--gene {params.gene}
"""


rule filter:
rule filter_recent:
message:
"""
filtering sequences
Expand All @@ -54,12 +53,14 @@ rule filter:
sequence_index = rules.index_sequences.output,
exclude = config['exclude']
output:
sequences = build_dir + "/{a_or_b}/{build_name}/filtered.fasta"
sequences = build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_recent.fasta"
params:
group_by = config["filter"]["group_by"],
min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
min_length = lambda w: config["filter"]["min_length"].get(w.build_name, 10000),
subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000),
strain_id=config["strain_id_field"],
min_date=lambda w: config['filter']['resolutions'][w.resolution]["min_date"]
shell:
"""
augur filter \
Expand All @@ -69,27 +70,92 @@ rule filter:
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--exclude-where 'qc.overallStatus=bad' \
--min-date {params.min_date} \
--min-length {params.min_length} \
--output {output.sequences} \
--group-by {params.group_by} \
--subsample-max-sequences {params.subsample_max_sequences} \
--query '{params.min_coverage}'
"""

rule filter_background:
message:
"""
filtering sequences
"""
input:
sequences = "data/{a_or_b}/sequences.fasta",
reference = "config/{a_or_b}reference.gbk",
metadata = "data/{a_or_b}/metadata.tsv",
sequence_index = rules.index_sequences.output,
exclude = config['exclude']
output:
sequences = build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_background.fasta"
params:
group_by = config["filter"]["group_by"],
min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
min_length = lambda w: config["filter"]["min_length"].get(w.build_name, 10000),
subsample_max_sequences = lambda w: int(config["filter"]["subsample_max_sequences"].get(w.build_name, 1000))//10,
strain_id=config["strain_id_field"],
max_date=lambda w: config['filter']['resolutions'][w.resolution]["min_date"],
min_date=lambda w: config['filter']['resolutions'][w.resolution]["background_min_date"]
shell:
"""
augur filter \
--sequences {input.sequences} \
--sequence-index {input.sequence_index} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--exclude-where 'qc.overallStatus=bad' 'qc.overallStatus=mediocre' \
--min-date {params.min_date} \
--max-date {params.max_date} \
--min-length {params.min_length} \
--output {output.sequences} \
--group-by {params.group_by} \
--subsample-max-sequences {params.subsample_max_sequences} \
--query '{params.min_coverage}'
"""

rule combine_samples:
input:
subsamples = lambda w: [rules.filter_recent.output.sequences, rules.filter_background.output.sequences] if 'background_min_date' in config['filter']['resolutions'][w.resolution] else [rules.filter_recent.output.sequences]
output:
sequences = build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered.fasta"
shell:
"""
cat {input.subsamples} | seqkit rmdup > {output}
"""

rule get_nextclade_dataset:
message:
"""
fetching nextclade dataset
"""
output:
dataset="nextclade_rsv-{a_or_b}.zip"
params:
ds_name = lambda w: "nextstrain/rsv/a/EPI_ISL_412866" if w.a_or_b=='a' else "nextstrain/rsv/b/EPI_ISL_1653999"
shell:
"""
nextclade3 dataset get -n {params.ds_name} --output-zip {output.dataset}
"""

rule genome_align:
message:
"""
Aligning sequences to {input.reference}
Aligning sequences to the reference
"""
input:
sequences = rules.filter.output.sequences,
reference = build_dir + "/{a_or_b}/{build_name}/genome_reference.fasta"
sequences = rules.combine_samples.output.sequences,
dataset = rules.get_nextclade_dataset.output.dataset
output:
alignment = build_dir + "/{a_or_b}/{build_name}/sequences.aligned.fasta"
alignment = build_dir + "/{a_or_b}/{build_name}/{resolution}/sequences.aligned.fasta"
threads: 4
shell:
"""
nextclade3 run -j {threads}\
--input-ref {input.reference} \
-D {input.dataset} \
--output-fasta {output.alignment} \
{input.sequences}
"""
Expand All @@ -100,7 +166,7 @@ rule cut:
oldalignment = rules.genome_align.output.alignment,
reference = "config/{a_or_b}reference.gbk"
output:
slicedalignment = build_dir + "/{a_or_b}/{build_name}/{gene}_slicedalignment.fasta"
slicedalignment = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_slicedalignment.fasta"
params:
gene = lambda w: w.gene
shell:
Expand All @@ -116,9 +182,9 @@ rule cut:
rule realign:
input:
slicedalignment = rules.cut.output.slicedalignment,
reference = build_dir + "/{a_or_b}/{build_name}/{gene}_reference.fasta"
reference = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_reference.fasta"
output:
realigned = build_dir + "/{a_or_b}/{build_name}/{gene}_aligned.fasta"
realigned = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_aligned.fasta"
threads: 4
shell:
"""
Expand All @@ -132,10 +198,10 @@ rule realign:
rule hybrid_align:
input:
original = rules.genome_align.output.alignment,
G_alignment = build_dir + "/{a_or_b}/{build_name}/G_aligned.fasta",
G_alignment = build_dir + "/{a_or_b}/{build_name}/{resolution}/G_aligned.fasta",
reference = "config/{a_or_b}reference.gbk"
output:
hybrid_alignment = build_dir + "/{a_or_b}/{build_name}/hybrid_alignment.fasta"
hybrid_alignment = build_dir + "/{a_or_b}/{build_name}/{resolution}/hybrid_alignment.fasta"
params:
gene = lambda w: w.build_name
shell:
Expand All @@ -152,20 +218,21 @@ def get_alignment(w):
if w.build_name == "genome":
return rules.hybrid_align.output.hybrid_alignment
else:
return build_dir + f"/{w.a_or_b}/{w.build_name}/{w.build_name}_aligned.fasta"
return build_dir + f"/{w.a_or_b}/{w.build_name}/{w.resolution}/{w.build_name}_aligned.fasta"

rule tree:
message: "Building tree"
input:
alignment = get_alignment
output:
tree = build_dir + "/{a_or_b}/{build_name}/tree_raw.nwk"
tree = build_dir + "/{a_or_b}/{build_name}/{resolution}/tree_raw.nwk"
threads: 4
shell:
"""
augur tree \
--alignment {input.alignment} \
--output {output.tree} \
--tree-builder-args '-ninit 10 -n 4 -czb' \
--nthreads {threads}
"""

Expand All @@ -180,10 +247,10 @@ rule refine:
input:
tree = rules.tree.output.tree,
alignment =get_alignment,
metadata = rules.filter.input.metadata
metadata = "data/{a_or_b}/metadata.tsv"
output:
tree = build_dir + "/{a_or_b}/{build_name}/tree.nwk",
node_data = build_dir + "/{a_or_b}/{build_name}/branch_lengths.json"
tree = build_dir + "/{a_or_b}/{build_name}/{resolution}/tree.nwk",
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/branch_lengths.json"
params:
coalescent = config["refine"]["coalescent"],
clock_filter_iqd = config["refine"]["clock_filter_iqd"],
Expand Down Expand Up @@ -215,9 +282,9 @@ rule ancestral:
input:
tree = rules.refine.output.tree,
alignment = get_alignment,
root_sequence = build_dir + "/{a_or_b}/{build_name}/{build_name}_reference.gbk"
root_sequence = build_dir + "/{a_or_b}/{build_name}/{resolution}/{build_name}_reference.gbk"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/nt_muts.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/nt_muts.json"
params:
inference = config["ancestral"]["inference"]
shell:
Expand All @@ -235,11 +302,11 @@ rule translate:
input:
tree = rules.refine.output.tree,
node_data = rules.ancestral.output.node_data,
reference = build_dir + "/{a_or_b}/{build_name}/{build_name}_reference.gbk",
reference = build_dir + "/{a_or_b}/{build_name}/{resolution}/{build_name}_reference.gbk",
output:
node_data = build_dir + "/{a_or_b}/{build_name}/aa_muts.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/aa_muts.json"
params:
alignment_file_mask = build_dir + "/{a_or_b}/{build_name}/aligned_%GENE.fasta"
alignment_file_mask = build_dir + "/{a_or_b}/{build_name}/{resolution}/aligned_%GENE.fasta"
shell:
"""
augur translate \
Expand All @@ -253,11 +320,11 @@ rule translate:
rule traits:
input:
tree = rules.refine.output.tree,
metadata = rules.filter.input.metadata
metadata = "data/{a_or_b}/metadata.tsv"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/traits.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/traits.json"
log:
"logs/{a_or_b}/traits_{build_name}_rsv.txt"
"logs/{a_or_b}/traits_{build_name}_{resolution}_rsv.txt"
params:
columns = config["traits"]["columns"],
strain_id=config["strain_id_field"],
Expand Down
Loading