Skip to content

Commit

Permalink
Merge pull request #245 from nextstrain/nextclade3
Browse files Browse the repository at this point in the history
Use nextclade3 for phylogenetic workflow
  • Loading branch information
corneliusroemer authored Apr 30, 2024
2 parents 22835a9 + 27fdc7b commit b24c521
Show file tree
Hide file tree
Showing 8 changed files with 422 additions and 31 deletions.
3 changes: 1 addition & 2 deletions phylogenetic/build-configs/ci/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ custom_rules:
- build-configs/ci/copy_example_data.smk

reference: "defaults/reference.fasta"
genemap: "defaults/genemap.gff"
genome_annotation: "defaults/genome_annotation.gff3"
genbank_reference: "defaults/reference.gb"
include: "defaults/hmpxv1/include.txt"
clades: "defaults/clades.tsv"
Expand All @@ -20,7 +20,6 @@ build_name: "hmpxv1"
auspice_name: "mpox_clade-IIb"

filter:
exclude: "defaults/exclude_accessions.txt"
min_date: 2017
min_length: 100000

Expand Down
3 changes: 0 additions & 3 deletions phylogenetic/defaults/exclude_accessions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,4 @@ PP098578

HM172544 # cidofovir-resistant lab strain that is derived from DQ011155 (h/t Andrew Rambaut)

TMP0003 # Overdiverged 23MPX1786C
TMP0045 # Overdiverged RDC-NKV-GOM-MPOX-004

NC_003310 # Overdiverged RefSeq NC_003310
391 changes: 391 additions & 0 deletions phylogenetic/defaults/genome_annotation.gff3

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions phylogenetic/defaults/hmpxv1/config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
reference: "defaults/reference.fasta"
genemap: "defaults/genemap.gff"
genome_annotation: "defaults/genome_annotation.gff3"
genbank_reference: "defaults/reference.gb"
include: "defaults/hmpxv1/include.txt"
clades: "defaults/clades.tsv"
Expand All @@ -17,7 +17,6 @@ build_name: "hmpxv1"
auspice_name: "mpox_clade-IIb"

filter:
exclude: "defaults/exclude_accessions.txt"
min_date: 2017
min_length: 100000

Expand Down
3 changes: 1 addition & 2 deletions phylogenetic/defaults/hmpxv1_big/config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
reference: "defaults/reference.fasta"
genemap: "defaults/genemap.gff"
genome_annotation: "defaults/genome_annotation.gff3"
genbank_reference: "defaults/reference.gb"
include: "defaults/hmpxv1_big/include.txt"
clades: "defaults/clades.tsv"
Expand All @@ -17,7 +17,6 @@ build_name: "hmpxv1_big"
auspice_name: "mpox_lineage-B.1"

filter:
exclude: "defaults/exclude_accessions.txt"
min_date: 2022
min_length: 180000

Expand Down
3 changes: 1 addition & 2 deletions phylogenetic/defaults/mpxv/config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
auspice_config: "defaults/mpxv/auspice_config.json"
include: "defaults/mpxv/include.txt"
reference: "defaults/reference.fasta"
genemap: "defaults/genemap.gff"
genome_annotation: "defaults/genome_annotation.gff3"
genbank_reference: "defaults/reference.gb"
lat_longs: "defaults/lat_longs.tsv"
description: "defaults/description.md"
Expand All @@ -17,7 +17,6 @@ build_name: "mpxv"
auspice_name: "mpox_all-clades"

filter:
exclude: "defaults/exclude_accessions.txt"
min_date: 1950
min_length: 100000

Expand Down
4 changes: 2 additions & 2 deletions phylogenetic/rules/annotate_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ rule translate:
input:
tree=build_dir + "/{build_name}/tree.nwk",
node_data=build_dir + "/{build_name}/nt_muts.json",
genemap=config["genemap"],
genome_annotation=config["genome_annotation"],
output:
node_data=build_dir + "/{build_name}/aa_muts.json",
shell:
"""
augur translate \
--tree {input.tree} \
--ancestral-sequences {input.node_data} \
--reference-sequence {input.genemap} \
--reference-sequence {input.genome_annotation} \
--output {output.node_data}
"""

Expand Down
43 changes: 25 additions & 18 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ REQUIRED INPUTS:
include = path to file of sequences to force-include
reference = path to reference sequence FASTA for Nextclade alignment
genemap = path to genemap GFF for Nextclade alignment
genome_annotation = path to genome_annotation GFF for Nextclade alignment
maskfile = path to maskfile of sites to be masked
OUTPUTS:
Expand Down Expand Up @@ -56,12 +56,12 @@ rule filter:
input:
sequences="data/sequences.fasta",
metadata="data/metadata.tsv",
exclude="defaults/exclude_accessions.txt",
output:
sequences=build_dir + "/{build_name}/good_sequences.fasta",
metadata=build_dir + "/{build_name}/good_metadata.tsv",
log=build_dir + "/{build_name}/good_filter.log",
params:
exclude=config["filter"]["exclude"],
min_date=config["filter"]["min_date"],
min_length=config["filter"]["min_length"],
strain_id=config["strain_id_field"],
Expand All @@ -73,7 +73,7 @@ rule filter:
--metadata-id-columns {params.strain_id} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
--exclude {params.exclude} \
--exclude {input.exclude} \
--min-date {params.min_date} \
--min-length {params.min_length} \
--query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \
Expand All @@ -93,9 +93,11 @@ rule subsample:
"sequences_per_group"
],
other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""),
exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}"
if "exclude_lineages" in config["subsample"][w.sample]
else "",
exclude=lambda w: (
f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}"
if "exclude_lineages" in config["subsample"][w.sample]
else ""
),
strain_id=config["strain_id_field"],
shell:
"""
Expand Down Expand Up @@ -156,30 +158,35 @@ rule reverse_reverse_complements:
rule align:
"""
Aligning sequences to {input.reference}
- filling gaps with N
"""
input:
sequences=build_dir + "/{build_name}/reversed.fasta",
reference=config["reference"],
genemap=config["genemap"],
genome_annotation=config["genome_annotation"],
output:
alignment=build_dir + "/{build_name}/aligned.fasta",
insertions=build_dir + "/{build_name}/insertions.fasta",
params:
max_indel=config["max_indel"],
seed_spacing=config["seed_spacing"],
# Alignment params from all-clades nextclade dataset
excess_bandwidth=100,
terminal_bandwidth=300,
window_size=40,
min_seed_cover=0.1,
allowed_mismatches=8,
gap_alignment_side="left",
threads: workflow.cores
shell:
"""
nextalign run \
nextclade3 run \
--jobs {threads} \
--reference {input.reference} \
--genemap {input.genemap} \
--max-indel {params.max_indel} \
--seed-spacing {params.seed_spacing} \
--retry-reverse-complement \
--input-ref {input.reference} \
--input-annotation {input.genome_annotation} \
--excess-bandwidth {params.excess_bandwidth} \
--terminal-bandwidth {params.terminal_bandwidth} \
--window-size {params.window_size} \
--min-seed-cover {params.min_seed_cover} \
--allowed-mismatches {params.allowed_mismatches} \
--gap-alignment-side {params.gap_alignment_side} \
--output-fasta - \
--output-insertions {output.insertions} \
{input.sequences} | seqkit seq -i > {output.alignment}
"""

Expand Down

0 comments on commit b24c521

Please sign in to comment.