nextstrain · sidneymbell · Apr 12, 2021 · Mar 24, 2021 · Mar 25, 2021 · Mar 26, 2021
diff --git a/scripts/make_pangolin_node_data.py b/scripts/make_pangolin_node_data.py
@@ -0,0 +1,34 @@
+"""
+Translate pangolineages from CSV -> JSON for node_data
+Note: this should arguably live instead as part of `combine_metadata`,
+but this gets particularly complex given the new multiple-inputs logic.
+So, for now, following the initial suggestion in the issue.
+"""
+
+import argparse
+import pandas as pd
+import csv
+import json
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Create node data for assigned pangolin lineages",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--pangolineages", type=str, required=True, help="pangolineages.csv")
+    parser.add_argument("--node_data_outfile", type=str, help="pangolineages.json")
+    args = parser.parse_args()
+    print('INPUT TO MAKE NODE DATA', '\n\n', args.pangolineages, '\n\n', args.node_data_outfile)
+
+    pangolineages = pd.read_csv(args.pangolineages)
+    print(pangolineages.head())
+    node_data = {
+    "nodes": {
+    row['taxon']: row['lineage'] for idx, row in pangolineages.iterrows()
+        }
+    }
+
+    # input_json['colorings'].append({'key': 'pangolin lineage', 'type': 'categorical'})
+
+    with open(args.node_data_outfile, 'w') as fh:
+        json.dump(node_data, fh, indent=2)
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -565,6 +565,53 @@ else:
                 {input.reference} > {output} 2> {log}
             """
 
+if "run_pangolin" in config and config["run_pangolin"]:
+    rule run_pangolin:
+        message:
+            """
+            Running pangolin to assign lineage labels to samples. Includes putative lineage definitions by default.
+            Please remember to update your installation of pangolin regularly to ensure the most up-to-date classifications.
+            """
+        input:
+            alignment = rules.build_align.output.alignment,
+        output:
+            lineages = "results/{build_name}/pangolineages.csv",
+        params:
+            # outdir + csv_outfile must together resolve to output.lineages.
+            outdir = "results/{build_name}",
+            csv_outfile = "pangolineages.csv"
+        log:
+            "logs/pangolin_{build_name}.txt"
+        conda: config["conda_environment"]  # env activated for this rule under --use-conda; a pangolin-only env file could be given here instead
+        threads: 8
+        resources:
+            mem_mb=3000  # placeholder estimate; not yet tuned
+        shell:
+            """
+            pangolin {input.alignment} \
+                --threads {threads} \
+                --outdir {params.outdir} \
+                --outfile {params.csv_outfile} > {log} 2>&1
+            """
+
+    rule make_pangolin_node_data:
+        # Single-threaded CSV -> node-data JSON conversion, so no `threads:` reservation.
+        input:
+            lineages = rules.run_pangolin.output.lineages
+        output:
+            node_data = "results/{build_name}/pangolineages.json"
+        log:
+            "logs/pangolin_export_{build_name}.txt"
+        conda: config["conda_environment"]
+        resources:
+            mem_mb=3000  # placeholder estimate; not yet tuned
+        shell:
+            """
+            python3 scripts/make_pangolin_node_data.py \
+                --pangolineages {input.lineages} \
+                --node_data_outfile {output.node_data} > {log} 2>&1
+            """
+
 # TODO: This will probably not work for build names like "country_usa" where we need to know the country is "USA".
 rule adjust_metadata_regions:
     message:
@@ -1113,12 +1160,15 @@ def _get_node_data_by_wildcards(wildcards):
         rules.traits.output.node_data
     ]
 
+    # Append node data from optional, config-gated rules (paths are still wildcard templates here).
     if "use_nextalign" in config and config["use_nextalign"]:
         inputs.append(rules.aa_muts_explicit.output.node_data)
         inputs.append(rules.distances.output.node_data)
+    if "run_pangolin" in config and config["run_pangolin"]:
+        inputs.append(rules.make_pangolin_node_data.output.node_data)
 
-    # Convert input files from wildcard strings to real file names.
     inputs = [input_file.format(**wildcards_dict) for input_file in inputs]
+
     return inputs
 
 rule export: