nanopore multiple genomes update

linsalrob · Oct 1, 2024 · 1ec4e67 · 1ec4e67
1 parent 02d250f
commit 1ec4e67
Show file tree

Hide file tree

Showing 7 changed files with 288 additions and 141 deletions.
diff --git a/Changes.md b/Changes.md
@@ -1,7 +1,9 @@
 # Changes being added
 
-## dev branch 
-- working on adding a map module
+## v1.4.5
+- catches the specific case of having multiple circular phages from assembly
+- updating the code to count the number of hypothetical proteins to catch other genes that don't have a biological function assigned
+- Addressing issue #36: Phrogs-annotated toxin not recognised in sphae summary output
 
 ## v1.4.4
 - adding the option to run pharokka with --pyrodigal-gv to test for alternate coding genes in config file

diff --git a/misc/num_cds.py b/misc/num_cds.py
@@ -3,34 +3,92 @@
 import os
 from collections import defaultdict
 from Bio import SeqIO
+import re
+import csv
+
+
+#Code from https://github.com/linsalrob/EdwardsLab/blob/e49085a1b0c97735f93bc1a1261514b4829e0ef3/roblib/functions.py#L10-L59
+def is_hypothetical(func):
+    """
+    Returns True if the function is hypothetical. Otherwise returns false
+    :param func: string
+    :return: boolean
+    """
+
+    if not func: return True
+    if func.lower() == 'hypothetical protein': return True
+    if re.search(r'lmo\d+ protein', func, re.IGNORECASE): return True
+    if re.search(r'hypoth', func, re.IGNORECASE): return True
+    if re.search(r'conserved protein', func, re.IGNORECASE): return True
+    if re.search(r'gene product', func, re.IGNORECASE): return True
+    if re.search(r'interpro', func, re.IGNORECASE): return True
+    if re.search(r'B[sl][lr]\d', func, re.IGNORECASE): return True
+    if re.search(r'^U\d', func, re.IGNORECASE): return True
+    if re.search(r'^orf[^_]', func, re.IGNORECASE): return True
+    if re.search(r'uncharacterized', func, re.IGNORECASE): return True
+    if re.search(r'pseudogene', func, re.IGNORECASE): return True
+    if re.search(r'^predicted', func, re.IGNORECASE): return True
+    if re.search(r'AGR_', func, re.IGNORECASE): return True
+    if re.search(r'similar to', func, re.IGNORECASE): return True
+    if re.search(r'similarity', func, re.IGNORECASE): return True
+    if re.search(r'glimmer', func, re.IGNORECASE): return True
+    if re.search(r'unknown', func, re.IGNORECASE): return True
+    if re.search(r'domain', func, re.IGNORECASE): return True
+    if re.search(r'^y[a-z]{2,4}\b', func, re.IGNORECASE): return True
+    if re.search(r'complete', func, re.IGNORECASE): return True
+    if re.search(r'ensang', func, re.IGNORECASE): return True
+    if re.search(r'unnamed', func, re.IGNORECASE): return True
+    if re.search(r'EG:', func, re.IGNORECASE): return True
+    if re.search(r'orf\d+', func, re.IGNORECASE): return True
+    if re.search(r'RIKEN', func, re.IGNORECASE): return True
+    if re.search(r'Expressed', func, re.IGNORECASE): return True
+    if re.search(r'[a-zA-Z]{2,3}\|', func, re.IGNORECASE): return True
+    if re.search(r'predicted by Psort', func, re.IGNORECASE): return True
+    if re.search(r'^bh\d+', func, re.IGNORECASE): return True
+    if re.search(r'cds_', func, re.IGNORECASE): return True
+    if re.search(r'^[a-z]{2,3}\d+[^:\+\-0-9]', func, re.IGNORECASE): return True
+    if re.search(r'similar to', func, re.IGNORECASE): return True
+    if re.search(r' identi', func, re.IGNORECASE): return True
+    if re.search(r'ortholog of', func, re.IGNORECASE): return True
+    if re.search(r'ortholog of', func, re.IGNORECASE): return True
+    if re.search(r'structural feature', func, re.IGNORECASE): return True
+    if re.search(r'Phage protein', func, re.IGNORECASE): return True
+    if re.search(r'mobile element', func, re.IGNORECASE): return True
+
+    return False
+
+def count_hypothetical_proteins(gbk_file):
+    count = 0
+    for record in SeqIO.parse(gbk_file, "genbank"):
+        for feature in record.features:
+            if feature.type == "CDS":
+                if "product" in feature.qualifiers:
+                    # Take the first entry of the 'product' list
+                    fn = feature.qualifiers["product"][0]
+                    if is_hypothetical(fn):
+                        count += 1
+    return count
+
+def iterate_genbank_files(genbank_directory, output_csv):
+    hypothetical_counts_dict = {}
 
-def extract_genes(genbank_directory):
-    genes_dict = defaultdict(lambda: defaultdict(int))
-    file_names = []
     for file_name in os.listdir(genbank_directory):
         if file_name.endswith(".gbk") or file_name.endswith(".gb"):
-            file_names.append(file_name)
             file_path = os.path.join(genbank_directory, file_name)
-            for record in SeqIO.parse(file_path, "genbank"):
-                for feature in record.features:
-                    if feature.type == "CDS":
-                        gene_name = feature.qualifiers.get("product", ["Unknown_gene"])[0]
-                        genes_dict[gene_name][file_name] += 1
-    return genes_dict, file_names
-
-def write_to_csv(genes_dict, file_names, output_file):
-    with open(output_file, "w") as f:
-        # Write header
-        f.write("Gene," + ",".join(file_names) + "\n")
-        # Write gene names and the number of CDS in each file
-        for gene, counts_per_file in genes_dict.items():
-            counts = [str(counts_per_file.get(file_name, 0)) for file_name in file_names]
-            f.write(f"{gene},{','.join(counts)}\n")
+            hypothetical_count = count_hypothetical_proteins(file_path)
+            hypothetical_counts_dict[file_name] = hypothetical_count
+
+    # Write to CSV
+    with open(output_csv, mode='w', newline='') as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(['Filename', 'Hypothetical_Protein_Count'])
+
+        for file_name, hypothetical_count in hypothetical_counts_dict.items():
+            writer.writerow([file_name, hypothetical_count])
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="A script that takes a directory containing genbank files and writes gene presence absence table")
     parser.add_argument('-d', '--directory', dest='directory', help='Enter the directory containing the genbank files')
     parser.add_argument('-o', dest='output', help='Enter the output tabular format')
     results = parser.parse_args()
-    genes_dict, file_names = extract_genes(results.directory)
-    write_to_csv(genes_dict, file_names, results.output)
+    iterate_genbank_files(results.directory, results.output)
diff --git a/sphae.sh b/sphae.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-#SBATCH --job-name=sphae-medaka
+#SBATCH --job-name=sphae
 #SBATCH --mail-type=ALL
 #SBATCH --output=%x-%j.out.txt
 #SBATCH --error=%x-%j.err.txt
@@ -12,8 +12,8 @@
 #SBATCH --qos=hc-concurrent-jobs
 
 #sphae install
-#sphae run --input tests/data/illumina-subset --threads 64 -k 
-#sphae run --input tests/data/nanopore-subset --sequencing longread --threads 64 -k
-#sphae run --input tests/data/nanopore-subset --sequencing longread --threads 64 -k --no_medaka 
+sphae run --input tests/data/illumina-subset --threads 64 -k 
+sphae run --input tests/data/nanopore-subset --sequencing longread --threads 64 -k
+sphae run --input tests/data/nanopore-subset --sequencing longread --threads 64 -k --no_medaka 
 #sphae annotate --genome tests/data/genome --threads 64
 
diff --git a/sphae/sphae.VERSION b/sphae/sphae.VERSION
@@ -1 +1 @@
-1.4.4
+1.4.5
diff --git a/sphae/workflow/rules/10.final-reporting.smk b/sphae/workflow/rules/10.final-reporting.smk
@@ -97,6 +97,7 @@ rule summarize_paired:
         plots=os.path.join(dir_final, "{sample}-pr", "{sample}_phynteny_plot.png"),
         outdir=os.path.join(dir_final,"{sample}-pr"),
         ID="{sample}",
+        seq= "pr"
     localrule: True
     script:
         os.path.join(dir_script, 'summary.py')
@@ -196,6 +197,7 @@ rule summarize_longread:
         ID="{sample}",
         plots=os.path.join(dir_final, "{sample}-sr", "{sample}_phynteny_plot.png"),
         outdir=os.path.join(dir_final, "{sample}-sr"),
+        seq= "sr"
     localrule: True
     script:
         os.path.join(dir_script, 'summary.py')

diff --git a/sphae/workflow/scripts/pick_phage_contigs.py b/sphae/workflow/scripts/pick_phage_contigs.py
@@ -17,7 +17,7 @@ def picking_contigs(file,out):
         datav = data[data["Length_x"] > 1000]
         datav = datav[datav["Prediction"] == "Virus"]
         datac = datav[datav["completeness"]> 70.00]
-        #print (len(data))
+        #print (len(datac))
         #print (datac)
     else:
         open(out, 'a').close()
@@ -33,6 +33,9 @@ def picking_contigs(file,out):
             print ("The genome is fragmented")
             datac.to_csv(out, index=False)
         #return None
+        else:
+            print ("Multiple genomes")
+            datac.to_csv(out, index=False)
 
     elif (len(datac))==1:
         #print ("entering this if statement")