Update nextclade workflow
corneliusroemer committed Aug 27, 2024
1 parent 5e52962 commit 92f12e4
Showing 19 changed files with 234 additions and 197 deletions.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -1,23 +1,23 @@
exclude: '\.(tsv|fasta|gb)$|^ingest/vendored/'
repos:
- repo: https://github.com/snakemake/snakefmt
rev: v0.10.1
rev: v0.10.2
hooks:
- id: snakefmt
language_version: python3.11
- repo: https://github.com/rhysd/actionlint
rev: v1.6.27
rev: v1.7.1
hooks:
- id: actionlint
entry: env SHELLCHECK_OPTS='--exclude=SC2027' actionlint
- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
rev: v2.3.0
hooks:
- id: codespell
additional_dependencies:
- tomli
- repo: https://github.com/google/yamlfmt
rev: v0.12.1
rev: v0.13.0
hooks:
- id: yamlfmt
- repo: https://github.com/pappasam/toml-sort
7 changes: 7 additions & 0 deletions nextclade/.ruff.toml
@@ -0,0 +1,7 @@
line-length = 110
indent-width = 4

[lint]
ignore = [
"E712" # Messes with pandas
]
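The `E712` exemption above exists because ruff's suggested fix (`is True` / bare truthiness) breaks pandas boolean masks, which this workflow's scripts build with elementwise `== True` comparisons. A minimal sketch with made-up data:

```python
# Why E712 ("comparison to True") is ignored for this repo: with pandas,
# `== True` is an elementwise comparison producing a boolean mask, so
# rewriting it as `is True` (ruff's autofix) would change the behaviour.
import pandas as pd

df = pd.DataFrame({"name": ["a", "b", "c"], "focal": [True, False, True]})

# Elementwise mask -- the pattern flagged by E712:
mask = df["focal"] == True  # noqa: E712
focal_names = df.loc[mask, "name"].tolist()
print(focal_names)  # ['a', 'c']
```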
4 changes: 2 additions & 2 deletions nextclade/Snakefile
@@ -3,9 +3,9 @@ if not config:
configfile: "config/config.yaml"


# build_names = ["lineage-b.1", "clade-iib", "all-clades", "clade-i"]
build_names = ["lineage-b.1", "clade-iib", "all-clades", "clade-i"]
# build_names = ["lineage-b.1", "clade-iib", "all-clades"]
build_names = ["clade-i"]
# build_names = ["all-clades"]

PREALIGN_REFERENCE = "resources/lineage-b.1/reference.fasta"
PREMASK_BED = "resources/lineage-b.1/mask.bed"
12 changes: 0 additions & 12 deletions nextclade/resources/all-clades/CHANGELOG.md
@@ -1,13 +1 @@
## Unreleased

Initial release of this dataset. This dataset is similar to the v2 dataset [`MPXV/ancestral`](https://github.com/nextstrain/nextclade_data/tree/2023-08-17--15-51-24--UTC/data/datasets/MPXV/references/ancestral/versions/2023-08-01T12%3A00%3A00Z/files) with some differences.

### New and changed gene names

Some genes have been renamed and one has been added. The new annotation is based on NCBI refseq annotations that were released in November 2022. The v2 dataset predates this refseq:

- The 4 genes in the inverted terminal repeat segment (ITR) on both ends of the genome (OPG001, OPG002, OPG003, OPG015) are now all included. The genes on the 3' end (~positions 190000-197000) now have a `_dup` suffix appended to distinguish them.
- The gene previously named `NBT03_gp052` is now called `OPG073`
- The gene previously named `NBT03_gp174` is now called `OPG016`
- The gene previously named `NBT03_gp175` is now called `OPG015_dup`
- Gene `OPG166` has been added
22 changes: 11 additions & 11 deletions nextclade/resources/all-clades/README.md
@@ -1,24 +1,24 @@
# Nextclade dataset for "Mpox virus (All Clades)"

| Key | Value |
| ---------------------- | --------------------------------------------------------------------------------------------------------------------- |
| authors | [Cornelius Roemer](https://neherlab.org), [Richard Neher](https://neherlab.org), [Nextstrain](https://nextstrain.org) |
| data source | Genbank |
| workflow | [github.com/nextstrain/mpox/nextclade](https://github.com/nextstrain/mpox/nextclade) |
| nextclade dataset path | nextstrain/mpox/all-clades |
| annotation | [NC_063383.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_063383) |
| clade definitions | [github.com/mpxv-lineages/lineage-designation](https://github.com/mpxv-lineages/lineage-designation) |
| related datasets | Mpox virus (Clade IIb): `nextstrain/mpox/clade-iib`<br> Mpox virus (Lineage B.1) `nextstrain/mpox/lineage-b.1` |
| Key | Value |
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| authors | [Cornelius Roemer](https://neherlab.org), [Richard Neher](https://neherlab.org), [Nextstrain](https://nextstrain.org) |
| data source | Genbank |
| workflow | [github.com/nextstrain/mpox/nextclade](https://github.com/nextstrain/mpox/nextclade) |
| nextclade dataset path | nextstrain/mpox/all-clades |
| annotation | [NC_063383.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_063383) |
| clade definitions | [github.com/mpxv-lineages/lineage-designation](https://github.com/mpxv-lineages/lineage-designation) |
| related datasets | Mpox virus (Clade IIb): `nextstrain/mpox/clade-iib`<br>Mpox virus (Lineage B.1) `nextstrain/mpox/lineage-b.1`<br>Mpox virus (Clade I): `nextstrain/mpox/clade-i` |

## Scope of this dataset

This dataset is for Mpox viruses of all clades (I, IIa and IIb). For a focused analysis of sequences from clade IIb, you may want to use the more specific dataset: "Clade IIb" (`nextstrain/mpox/clade-iib`). For an even more focused analysis of 2022-2023 outbreak sequences (lineage B.1 and sublineages), you may want to use the even more specific dataset: "Lineage B.1" (`nextstrain/mpox/lineage-b.1`).
This dataset is for Mpox viruses of all clades (Ia, Ib, IIa and IIb). For a focused analysis of sequences from clade IIb, you may want to use the more specific dataset: "Clade IIb" (`nextstrain/mpox/clade-iib`). For an even more focused analysis of 2022-2023 outbreak sequences (lineage B.1 and sublineages), you may want to use the even more specific dataset: "Lineage B.1" (`nextstrain/mpox/lineage-b.1`). For clade I sequences, you may want to use the dataset "Clade I" (`nextstrain/mpox/clade-i`).

## Reference sequence and reference tree

The reference used in this dataset is the clade IIb NCBI refseq `NC_063383.1` (Isolate `MPXV-M5312_HM12_Rivers`).

Sequences for the reference tree come from NCBI/Genbank and are downsampled to around 500 sequences from the diversity of clades, lineages, countries and collection dates.
Sequences for the reference tree come from NCBI/Genbank and are downsampled to around 900 sequences from the diversity of clades, lineages, countries and collection dates.

## Further reading

4 changes: 4 additions & 0 deletions nextclade/resources/all-clades/sequences.fasta

Large diffs are not rendered by default.

63 changes: 35 additions & 28 deletions nextclade/scripts/assign-colors.py
@@ -1,21 +1,24 @@
import argparse
import pdb

import pandas as pd

# Forced colours MUST NOT appear in the ordering TSV
forced_colors = {
}
forced_colors = {}

if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Assign colors based on ordering",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

parser.add_argument('--ordering', type=str, required=True, help="input ordering file")
parser.add_argument('--color-schemes', type=str, required=True, help="input color schemes file")
parser.add_argument('--metadata', type=str, help="if provided, restrict colors to only those found in metadata")
parser.add_argument('--output', type=str, required=True, help="output colors tsv")
parser.add_argument("--ordering", type=str, required=True, help="input ordering file")
parser.add_argument("--color-schemes", type=str, required=True, help="input color schemes file")
parser.add_argument(
"--metadata",
type=str,
help="if provided, restrict colors to only those found in metadata",
)
parser.add_argument("--output", type=str, required=True, help="output colors tsv")
args = parser.parse_args()

assignment = {}
@@ -34,14 +37,18 @@
# 1. remove assignments that don't exist in metadata
# 2. remove assignments that have 'focal' set to 'False' in metadata
if args.metadata:
metadata = pd.read_csv(args.metadata, delimiter='\t')
metadata = pd.read_csv(args.metadata, delimiter="\t")
for name, trait in assignment.items():
# Items not to exclude if not (yet) present in metadata to solve bootstrapping issue
if name in metadata and name not in ['clade_membership', 'outbreak', 'lineage']:
if name in metadata and name not in [
"clade_membership",
"outbreak",
"lineage",
]:
subset_present = [x for x in assignment[name] if x in metadata[name].unique()]
assignment[name] = subset_present
if name in metadata and 'focal' in metadata:
focal_list = metadata.loc[metadata['focal'] == True, name].unique()
if name in metadata and "focal" in metadata:
focal_list = metadata.loc[metadata["focal"] == True, name].unique()
subset_focal = [x for x in assignment[name] if x in focal_list]
assignment[name] = subset_focal

@@ -53,28 +60,28 @@
array = line.lstrip().rstrip().split("\t")
schemes[counter] = array

with open(args.output, 'w') as f:
with open(args.output, "w") as f:
for trait_name, trait_array in assignment.items():
if len(trait_array)==0:
if len(trait_array) == 0:
print(f"No traits found for {trait_name}")
continue
if len(schemes)<len(trait_array):
print(f"WARNING: insufficient colours available for trait {trait_name} - reusing colours!")
remain = len(trait_array)
color_array = []
while(remain>0):
if (remain>len(schemes)):
color_array = [*color_array, *schemes[len(schemes)]]
remain -= len(schemes)
else:
color_array = [*color_array, *schemes[remain]]
remain = 0
if len(schemes) < len(trait_array):
print(f"WARNING: insufficient colours available for trait {trait_name} - reusing colours!")
remain = len(trait_array)
color_array = []
while remain > 0:
if remain > len(schemes):
color_array = [*color_array, *schemes[len(schemes)]]
remain -= len(schemes)
else:
color_array = [*color_array, *schemes[remain]]
remain = 0
else:
color_array = schemes[len(trait_array)]
color_array = schemes[len(trait_array)]
extra_trait_values = list(forced_colors.get(trait_name, {}).keys())
extra_color_values = list(forced_colors.get(trait_name, {}).values())

zipped = list(zip(trait_array+extra_trait_values, color_array+extra_color_values))
zipped = list(zip(trait_array + extra_trait_values, color_array + extra_color_values))
for trait_value, color in zipped:
f.write(trait_name + "\t" + trait_value + "\t" + color + "\n")
f.write("\n")
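The reindented `while remain > 0` loop above is the script's colour-recycling fallback: `schemes[n]` maps a count `n` to a palette of `n` colours, and when a trait has more values than the largest palette, palettes are concatenated until every value gets a colour. A standalone sketch of just that logic (the toy palettes are assumptions, not the workflow's real colour schemes):

```python
# Standalone sketch of the colour-recycling loop from assign-colors.py.
# Toy palettes: key n -> a palette of n colours.
schemes = {
    1: ["#111111"],
    2: ["#111111", "#222222"],
    3: ["#111111", "#222222", "#333333"],
}

def colors_for(n_values: int) -> list:
    """Return n_values colours, reusing the largest palette if needed."""
    if len(schemes) >= n_values:
        return schemes[n_values]
    remain, color_array = n_values, []
    while remain > 0:
        if remain > len(schemes):
            # Take the whole largest palette and keep going.
            color_array += schemes[len(schemes)]
            remain -= len(schemes)
        else:
            # Finish with the palette that exactly fits the remainder.
            color_array += schemes[remain]
            remain = 0
    return color_array

print(len(colors_for(7)))  # 7 colours for 7 trait values, palettes reused
```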
8 changes: 5 additions & 3 deletions nextclade/scripts/clades_renaming.py
@@ -6,9 +6,7 @@
description="remove time info",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--input-node-data", type=str, required=True, help="input data"
)
parser.add_argument("--input-node-data", type=str, required=True, help="input data")
parser.add_argument(
"--output-node-data",
type=str,
@@ -40,6 +38,10 @@
clade_name = "IIb"
outbreak_name = "hMPXV-1"
lineage_name = old_clade_name
elif old_clade_name[0] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
clade_name = "IIb"
outbreak_name = "hMPXV-1"
lineage_name = old_clade_name
else:
raise ValueError(f"Unknown clade name: {old_clade_name}")

10 changes: 2 additions & 8 deletions nextclade/scripts/collapse-zero-branches.py
@@ -6,11 +6,7 @@


def get_branch_length_distribution(tree) -> Counter[float, int]:
return Counter(
node.branch_length
for node in tree.find_clades()
if node.branch_length is not None
)
return Counter(node.branch_length for node in tree.find_clades() if node.branch_length is not None)


def collapse_near_zero_branches(tree, threshold=0.001, verbose=False):
@@ -35,9 +31,7 @@ def collapse_near_zero_branches(tree, threshold=0.001, verbose=False):
difference = branch_length_counts_before - branch_length_counts_after

if verbose:
print(
f"Collapsed {difference.total()} internal branches with lengths below {threshold}"
)
print(f"Collapsed {difference.total()} internal branches with lengths below {threshold}")
print("Collapsed branches:")
for length, count in difference.items():
print(f"Branch length {length}: {count} branches")
19 changes: 12 additions & 7 deletions nextclade/scripts/deduplicate.py
@@ -9,10 +9,11 @@
"""

import typer
from Bio import SeqIO
import itertools

import llist
import typer
from Bio import SeqIO


def informative_sites(sequence: str) -> int:
@@ -21,6 +22,7 @@ def informative_sites(sequence: str) -> int:
"""
return sum([1 for c in sequence if c in "ACGT"])


def identical(seq1: str, seq2: str, info_sites: list) -> bool:
"""
Check if two sequences are identical (up to Ns)
@@ -33,6 +35,7 @@ def identical(seq1: str, seq2: str, info_sites: list) -> bool:
return False
return True


def composition_per_site(sequences) -> list:
"""
Compute the composition of each site in a list of sequences
@@ -47,6 +50,7 @@ def composition_per_site(sequences) -> list:
composition[i][c] += 1
return composition


def mismatch_prob_per_site(composition: list) -> list:
"""
Compute the mismatch probability of each site in a list of sequences
@@ -55,7 +59,7 @@ def mismatch_prob_per_site(composition: list) -> list:
result = []
seqs = sum(composition[0].values())
for site in composition:
values = [val/seqs for char, val in site.items() if char in "ACGT"]
values = [val / seqs for char, val in site.items() if char in "ACGT"]
prob = 0
for i, value in enumerate(values):
for j, other_value in enumerate(values):
@@ -64,17 +68,17 @@ def informative_indexes_sorted_by_entropy(composition: list) -> list:
result.append(prob)
return result


def informative_indexes_sorted_by_entropy(composition: list) -> list:
"""
List of indexes of informative sites sorted by entropy
Uninformative sites are not included
"""
site_information = { i: info for i, info in enumerate(mismatch_prob_per_site(composition)) if info > 0 }
site_information = {i: info for i, info in enumerate(mismatch_prob_per_site(composition)) if info > 0}
site_information = sorted(site_information.items(), key=lambda x: x[1], reverse=True)
return [x[0] for x in site_information]



def deduplicate(input: str, output: str):
"""
Deduplicate sequences in a file
@@ -85,7 +89,8 @@
"""
with open(input, "r") as f:
sequences = [
{ "id": record.id,
{
"id": record.id,
"seq": str(record.seq),
"number_informative_sites": informative_sites(str(record.seq)),
}
@@ -106,7 +111,7 @@
while ying is not None:
yang = ying.next
while yang is not None:
if identical(str(ying.value["seq"]),str(yang.value["seq"]), info_sites):
if identical(str(ying.value["seq"]), str(yang.value["seq"]), info_sites):
print(f"Removing {yang.value['id']} as identical to {ying.value['id']}")
to_remove = yang
dup_list.append(yang.value["id"])
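The core test in `deduplicate.py` treats two sequences as duplicates if they agree at every informative site, with anything outside `ACGT` (e.g. `N`) matching everything. A self-contained sketch of that idea, on toy sequences rather than the workflow's real data:

```python
# Sketch of the "identical up to Ns" check behind deduplicate.py.
def informative_sites(sequence: str) -> int:
    """Count unambiguous (ACGT) characters in a sequence."""
    return sum(1 for c in sequence if c in "ACGT")

def identical(seq1: str, seq2: str, info_sites: list) -> bool:
    """Two sequences match if no informative site has a real ACGT mismatch."""
    for i in info_sites:
        a, b = seq1[i], seq2[i]
        if a in "ACGT" and b in "ACGT" and a != b:
            return False
    return True

seqs = ["ACGTACGT", "ACGTNCGT", "ACGAACGT"]
info = list(range(8))  # toy assumption: every site is informative

print(identical(seqs[0], seqs[1], info))  # True: N matches anything
print(identical(seqs[0], seqs[2], info))  # False: real mismatch at site 3
```

The real script additionally sorts sequences by `informative_sites` so that, of each duplicate pair, the more complete sequence is kept.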