Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pangolin flag #593

Merged
merged 14 commits into from
Apr 12, 2021
34 changes: 34 additions & 0 deletions scripts/make_pangolin_node_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
Translate pangolineages from CSV -> JSON for node_data
Note: this should arguably live instead as part of `combine_metadata`,
but this gets particularly complex given the new multiple-inputs logic.
So, for now, following the initial suggestion in the issue.
"""

import argparse
import pandas as pd
import csv
import json

def build_node_data(pangolineages, attribute_name="pangolin_lineage"):
    """Convert a pangolin lineage table into an Augur-style node data dict.

    Parameters
    ----------
    pangolineages : pandas.DataFrame
        Table with (at least) a ``taxon`` column holding the sample name and a
        ``lineage`` column holding the assigned lineage.
    attribute_name : str
        Name of the per-node attribute under which the lineage is stored.

    Returns
    -------
    dict
        ``{"nodes": {taxon: {attribute_name: lineage}, ...}}``. Each node maps
        to a dict of attributes (not a bare string) so the result can be merged
        with other node data files. Rows whose lineage is missing are skipped,
        because NaN cannot be represented in valid JSON. If a taxon appears
        more than once, the last row wins.
    """
    return {
        "nodes": {
            taxon: {attribute_name: lineage}
            for taxon, lineage in zip(pangolineages["taxon"], pangolineages["lineage"])
            if pd.notna(lineage)
        }
    }


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Create node data for assigned pangolin lineages",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--pangolineages", type=str, required=True, help="pangolineages.csv")
    # The output path is passed straight to open(), so it must be provided.
    parser.add_argument("--node_data_outfile", type=str, required=True, help="pangolineages.json")
    parser.add_argument(
        "--attribute_name",
        type=str,
        default="pangolin_lineage",
        help="name of the node attribute that stores the lineage",
    )
    args = parser.parse_args()

    pangolineages = pd.read_csv(args.pangolineages)
    node_data = build_node_data(pangolineages, args.attribute_name)

    # NOTE(review): a matching categorical coloring for this attribute still
    # has to be declared in the Auspice config; this script only writes the
    # per-node values.
    with open(args.node_data_outfile, 'w') as fh:
        json.dump(node_data, fh, indent=2)
huddlej marked this conversation as resolved.
Show resolved Hide resolved
52 changes: 51 additions & 1 deletion workflow/snakemake_rules/main_workflow.smk
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,53 @@ else:
{input.reference} > {output} 2> {log}
"""

if "run_pangolin" in config and config["run_pangolin"]:
rule run_pangolin:
message:
"""
Running pangolin to assign lineage labels to samples. Includes putative lineage definitions by default.
Please remember to update your installation of pangolin regularly to ensure the most up-to-date classifications.
"""
input:
alignment = rules.build_align.output.alignment,
output:
lineages = "results/{build_name}/pangolineages.csv",
params:
huddlej marked this conversation as resolved.
Show resolved Hide resolved
outdir = "results/{build_name}",
csv_outfile = "pangolineages.csv",
node_data_outfile = "pangolineages.json"
log:
"logs/pangolin_{build_name}.txt"
conda: config["conda_environment"] ## not sure what this arg does -- could it be used to separate the requirements for `pangolin` rather than adding them to the ncov env?
huddlej marked this conversation as resolved.
Show resolved Hide resolved
threads: 8
huddlej marked this conversation as resolved.
Show resolved Hide resolved
resources:
mem_mb=3000 ## should update these
shell:
"""
pangolin {input.alignment}\
--threads {threads} \
--outdir {params.outdir} \
--outfile {params.csv_outfile} \
huddlej marked this conversation as resolved.
Show resolved Hide resolved
"""

rule make_pangolin_node_data:
input:
lineages = rules.run_pangolin.output.lineages
output:
node_data = "results/{build_name}/pangolineages.json"
log:
huddlej marked this conversation as resolved.
Show resolved Hide resolved
"logs/pangolin_export_{build_name}.txt"
conda: config["conda_environment"]
threads: 8
huddlej marked this conversation as resolved.
Show resolved Hide resolved
resources:
mem_mb=3000 ## should update these
shell:
"""
python3 scripts/make_pangolin_node_data.py \
--pangolineages {input.lineages} \
--node_data_outfile {output.node_data} \
huddlej marked this conversation as resolved.
Show resolved Hide resolved
"""

# TODO: This will probably not work for build names like "country_usa" where we need to know the country is "USA".
rule adjust_metadata_regions:
message:
Expand Down Expand Up @@ -1113,12 +1160,15 @@ def _get_node_data_by_wildcards(wildcards):
rules.traits.output.node_data
]

# Convert input files from wildcard strings to real file names.
if "use_nextalign" in config and config["use_nextalign"]:
inputs.append(rules.aa_muts_explicit.output.node_data)
inputs.append(rules.distances.output.node_data)
if "run_pangolin" in config and config["run_pangolin"]:
inputs.append(rules.make_pangolin_node_data.output.node_data)

# Convert input files from wildcard strings to real file names.
huddlej marked this conversation as resolved.
Show resolved Hide resolved
inputs = [input_file.format(**wildcards_dict) for input_file in inputs]

return inputs

rule export:
Expand Down