Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pangolin flag #593

Merged
merged 14 commits into from
Apr 12, 2021
32 changes: 32 additions & 0 deletions scripts/make_pangolin_node_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Translate pangolin lineage assignments from CSV to JSON for node_data.

Note: this should arguably live as part of `combine_metadata` instead,
but that gets particularly complex given the new multiple-inputs logic.
So, for now, this follows the initial suggestion in the issue.
"""

import argparse
import pandas as pd
import csv
import json
from augur.utils import write_json

def build_node_data(pangolineages: pd.DataFrame, attribute_name: str) -> dict:
    """Build the node-data dictionary from a table of pangolin lineage calls.

    Parameters
    ----------
    pangolineages
        Table with a ``taxon`` column and a ``lineage`` column.
    attribute_name
        Key under which each taxon's lineage is stored.

    Returns
    -------
    dict
        ``{"nodes": {taxon: {attribute_name: lineage}, ...}}``. When a taxon
        appears on several rows, the last row wins.

    Raises
    ------
    KeyError
        If the ``taxon`` or ``lineage`` column is missing.
    """
    # Zip the two columns directly instead of using iterrows(), which builds
    # a full Series for every row just to read two values from it.
    return {
        "nodes": {
            taxon: {attribute_name: lineage}
            for taxon, lineage in zip(pangolineages["taxon"], pangolineages["lineage"])
        }
    }


def main():
    """Parse the command line, read the lineage CSV and write the node-data JSON."""
    parser = argparse.ArgumentParser(
        description="Create node data for assigned pangolin lineages",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--pangolineages", type=str, required=True, help="pangolineages.csv")
    # The output path is always used below, so reject a missing value during
    # argument parsing rather than passing None on to write_json.
    parser.add_argument("--node_data_outfile", type=str, required=True, help="pangolineages.json")
    parser.add_argument("--attribute_name", default="pango_lineage_local", help="attribute name for pangolin lineage annotations in the output JSON")
    args = parser.parse_args()

    pangolineages = pd.read_csv(args.pangolineages)
    node_data = build_node_data(pangolineages, args.attribute_name)

    write_json(node_data, args.node_data_outfile)


if __name__ == '__main__':
    main()
3 changes: 3 additions & 0 deletions workflow/envs/nextstrain.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
name: nextstrain
channels:
- conda-forge
- bioconda
- defaults
dependencies:
Expand All @@ -10,6 +11,8 @@ dependencies:
- mafft=7.471
- nextalign=0.1.6
- pandas
- pangolin=2.3.8
- pangolearn=2021.04.01
- psutil
- python=3.7*
- nodejs=10
Expand Down
52 changes: 52 additions & 0 deletions workflow/snakemake_rules/main_workflow.smk
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,55 @@ else:
{input.reference} > {output} 2> {log}
"""

# Both rules below are only defined when the config enables pangolin.
if "run_pangolin" in config and config["run_pangolin"]:
    # Assign a pangolin lineage to every sequence in the build's alignment.
    rule run_pangolin:
        message:
            """
            Running pangolin to assign lineage labels to samples. Includes putative lineage definitions by default.
            Please remember to update your installation of pangolin regularly to ensure the most up-to-date classifications.
            """
        input:
            alignment = rules.build_align.output.alignment,
        output:
            lineages = "results/{build_name}/pangolineages.csv",
        params:
            # The output directory and file name are passed to pangolin as
            # separate options; joined, they must equal `output.lineages`.
            outdir = "results/{build_name}",
            csv_outfile = "pangolineages.csv",
            # Not referenced by this rule's shell command.
            node_data_outfile = "pangolineages.json"
        log:
            "logs/pangolin_{build_name}.txt"
        conda: config["conda_environment"]
        threads: 1
        resources:
            mem_mb=3000
        benchmark:
            "benchmarks/pangolineages_{build_name}.txt"
        shell: ## once pangolin fully supports threads, add `--threads {threads}` to the below (existing pango cli param)
            """
            pangolin {input.alignment} \
                --outdir {params.outdir} \
                --outfile {params.csv_outfile} 2>&1 | tee {log}
            """

    # Convert the pangolin CSV into a node-data JSON file.
    rule make_pangolin_node_data:
        message: "Converting pangolin lineage assignments to node data"
        input:
            lineages = rules.run_pangolin.output.lineages
        output:
            node_data = "results/{build_name}/pangolineages.json"
        log:
            "logs/pangolin_export_{build_name}.txt"
        conda: config["conda_environment"]
        resources:
            mem_mb=3000
        benchmark:
            "benchmarks/make_pangolin_node_data_{build_name}.txt"
        shell:
            """
            python3 scripts/make_pangolin_node_data.py \
                --pangolineages {input.lineages} \
                --node_data_outfile {output.node_data} 2>&1 | tee {log}
            """

# TODO: This will probably not work for build names like "country_usa" where we need to know the country is "USA".
rule adjust_metadata_regions:
message:
Expand Down Expand Up @@ -1116,9 +1165,12 @@ def _get_node_data_by_wildcards(wildcards):
if "use_nextalign" in config and config["use_nextalign"]:
inputs.append(rules.aa_muts_explicit.output.node_data)
inputs.append(rules.distances.output.node_data)
if "run_pangolin" in config and config["run_pangolin"]:
inputs.append(rules.make_pangolin_node_data.output.node_data)

# Convert input files from wildcard strings to real file names.
huddlej marked this conversation as resolved.
Show resolved Hide resolved
inputs = [input_file.format(**wildcards_dict) for input_file in inputs]

return inputs

rule export:
Expand Down