nextstrain · sidneymbell · Apr 12, 2021 · Mar 24, 2021 · Mar 25, 2021 · Mar 26, 2021
diff --git a/scripts/make_pangolin_node_data.py b/scripts/make_pangolin_node_data.py
@@ -0,0 +1,32 @@
+"""
+Translate pangolineages from CSV -> JSON for node_data
+Note: this should arguably live instead as part of `combine_metadata`,
+but this gets particularly complex given the new multiple-inputs logic.
+So, for now, following the initial suggestion in the issue.
+"""
+
+import argparse
+import pandas as pd
+import csv
+import json
+from augur.utils import write_json
+
+if __name__ == '__main__':
+    # Convert a pangolin lineage CSV into node-data JSON: {"nodes": {taxon: {attribute_name: lineage}}}.
+    parser = argparse.ArgumentParser(
+        description="Create node data for assigned pangolin lineages",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--pangolineages", type=str, required=True, help="pangolineages.csv")
+    # Required so a missing output path fails at argument parsing instead of passing None to write_json.
+    parser.add_argument("--node_data_outfile", type=str, required=True, help="pangolineages.json")
+    parser.add_argument("--attribute_name", default="pango_lineage_local", help="attribute name for pangolin lineage annotations in the output JSON")
+    args = parser.parse_args()
+
+    # Read every column as text so strain names that look numeric (e.g. "001") are not coerced to numbers.
+    pangolineages = pd.read_csv(args.pangolineages, dtype=str)
+
+    # Pair the two columns directly; iterrows() builds a Series per row only to read two fields from it.
+    lineage_by_taxon = zip(pangolineages["taxon"], pangolineages["lineage"])
+    node_data = {"nodes": {taxon: {args.attribute_name: lineage} for taxon, lineage in lineage_by_taxon}}
+    write_json(node_data, args.node_data_outfile)
diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
@@ -1,5 +1,6 @@
 name: nextstrain
 channels:
+  - conda-forge
   - bioconda
   - defaults
 dependencies:
@@ -10,6 +11,8 @@ dependencies:
   - mafft=7.471
   - nextalign=0.1.6
   - pandas
+  - pangolin=2.3.8
+  - pangolearn=2021.04.01
   - psutil
   - python=3.7*
   - nodejs=10

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -565,6 +565,55 @@ else:
                 {input.reference} > {output} 2> {log}
             """
 
+if "run_pangolin" in config and config["run_pangolin"]:
+    rule run_pangolin:
+        message:
+            """
+            Running pangolin to assign lineage labels to samples. Includes putative lineage definitions by default.
+            Please remember to update your installation of pangolin regularly to ensure the most up-to-date classifications.
+            """
+        input:
+            alignment = rules.build_align.output.alignment,
+        output:
+            lineages = "results/{build_name}/pangolineages.csv",
+        params:
+            # Passed to pangolin as --outdir / --outfile; joined, they must equal output.lineages.
+            outdir = "results/{build_name}",
+            csv_outfile = "pangolineages.csv",
+        log:
+            "logs/pangolin_{build_name}.txt"
+        conda: config["conda_environment"]
+        threads: 1
+        resources:
+            mem_mb=3000
+        benchmark:
+            "benchmarks/pangolineages_{build_name}.txt"
+        shell: ## once pangolin fully supports threads, add `--threads {threads}` to the below (existing pango cli param)
+            """
+            pangolin {input.alignment} \
+                --outdir {params.outdir} \
+                --outfile {params.csv_outfile} 2>&1 | tee {log}
+            """
+
+    rule make_pangolin_node_data:  # runs scripts/make_pangolin_node_data.py on the pangolin CSV
+        input:
+            lineages = rules.run_pangolin.output.lineages
+        output:
+            node_data = "results/{build_name}/pangolineages.json"  # appended to the node-data inputs when run_pangolin is set
+        log:
+            "logs/pangolin_export_{build_name}.txt"
+        conda: config["conda_environment"]
+        resources:
+            mem_mb=3000
+        benchmark:
+            "benchmarks/make_pangolin_node_data_{build_name}.txt"
+        shell:
+            """
+            python3 scripts/make_pangolin_node_data.py \
+                --pangolineages {input.lineages} \
+                --node_data_outfile {output.node_data} 2>&1 | tee {log}
+            """
+
 # TODO: This will probably not work for build names like "country_usa" where we need to know the country is "USA".
 rule adjust_metadata_regions:
     message:
@@ -1116,9 +1165,12 @@ def _get_node_data_by_wildcards(wildcards):
     if "use_nextalign" in config and config["use_nextalign"]:
         inputs.append(rules.aa_muts_explicit.output.node_data)
         inputs.append(rules.distances.output.node_data)
+    if "run_pangolin" in config and config["run_pangolin"]:
+        inputs.append(rules.make_pangolin_node_data.output.node_data)
 
     # Convert input files from wildcard strings to real file names.
     inputs = [input_file.format(**wildcards_dict) for input_file in inputs]
+
     return inputs
 
 rule export: