Skip to content

Commit

Permalink
nanopore multiple genomes update
Browse files Browse the repository at this point in the history
  • Loading branch information
npbhavya committed Oct 1, 2024
1 parent 02d250f commit 1ec4e67
Show file tree
Hide file tree
Showing 7 changed files with 288 additions and 141 deletions.
6 changes: 4 additions & 2 deletions Changes.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# Changes being added

## dev branch
- working on adding a map module
## v1.4.5
- catches the specific case of having multiple circular phages from assembly
- updating the code to count the number of hypothetical proteins, to catch other genes that don't have a biological function assigned
- Addressing issue #36 - PHROGs-annotated toxin not recognised in sphae summary output

## v1.4.4
- adding the option to run pharokka with --pyrodigal-gv to test for alternate coding genes in config file
Expand Down
100 changes: 79 additions & 21 deletions misc/num_cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,92 @@
import os
from collections import defaultdict
from Bio import SeqIO
import re
import csv


#Code from https://github.com/linsalrob/EdwardsLab/blob/e49085a1b0c97735f93bc1a1261514b4829e0ef3/roblib/functions.py#L10-L59
# Regular expressions that mark a functional annotation as uninformative.
# Every pattern is applied with ``re.search`` and ``re.IGNORECASE``, so only
# the explicit ``^`` anchors restrict where a match may occur.  The order of
# the entries does not affect the result (any single match is enough); the
# table is compiled once at import time instead of on every call.
_HYPOTHETICAL_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'lmo\d+ protein',
        r'hypoth',
        r'conserved protein',
        r'gene product',
        r'interpro',
        r'B[sl][lr]\d',
        r'^U\d',
        r'^orf[^_]',
        r'uncharacterized',
        r'pseudogene',
        r'^predicted',
        r'AGR_',
        r'similar to',
        r'similarity',
        r'glimmer',
        r'unknown',
        r'domain',
        r'^y[a-z]{2,4}\b',
        r'complete',
        r'ensang',
        r'unnamed',
        r'EG:',
        r'orf\d+',
        r'RIKEN',
        r'Expressed',
        r'[a-zA-Z]{2,3}\|',
        r'predicted by Psort',
        r'^bh\d+',
        r'cds_',
        r'^[a-z]{2,3}\d+[^:\+\-0-9]',
        r' identi',
        r'ortholog of',
        r'structural feature',
        r'Phage protein',
        r'mobile element',
    )
)


def is_hypothetical(func):
    """
    Returns True if the function is hypothetical. Otherwise returns false

    An empty or missing annotation (``None`` or ``""``) counts as
    hypothetical.  Any other value is hypothetical when at least one of the
    case-insensitive patterns in ``_HYPOTHETICAL_PATTERNS`` is found in it.

    :param func: string
    :return: boolean
    """
    # No annotation at all is treated the same as an uninformative one.
    if not func:
        return True

    # The literal "hypothetical protein" is covered by the 'hypoth' pattern,
    # so no separate exact-match test is needed.
    return any(pattern.search(func) for pattern in _HYPOTHETICAL_PATTERNS)

def count_hypothetical_proteins(gbk_file):
    """
    Count the CDS features in a GenBank file whose product is hypothetical.

    Every record in the file is scanned.  A feature contributes to the total
    only when its type is "CDS", it carries a "product" qualifier, and the
    first value of that qualifier is judged hypothetical by
    ``is_hypothetical``.  CDS features without a "product" qualifier are
    not counted.

    :param gbk_file: path or handle accepted by ``SeqIO.parse``
    :return: int, number of hypothetical CDS features
    """
    return sum(
        1
        for record in SeqIO.parse(gbk_file, "genbank")
        for feat in record.features
        # Only the first 'product' entry is inspected.
        if feat.type == "CDS"
        and "product" in feat.qualifiers
        and is_hypothetical(feat.qualifiers["product"][0])
    )

def iterate_genbank_files(genbank_directory, output_csv):
hypothetical_counts_dict = {}

def extract_genes(genbank_directory):
genes_dict = defaultdict(lambda: defaultdict(int))
file_names = []
for file_name in os.listdir(genbank_directory):
if file_name.endswith(".gbk") or file_name.endswith(".gb"):
file_names.append(file_name)
file_path = os.path.join(genbank_directory, file_name)
for record in SeqIO.parse(file_path, "genbank"):
for feature in record.features:
if feature.type == "CDS":
gene_name = feature.qualifiers.get("product", ["Unknown_gene"])[0]
genes_dict[gene_name][file_name] += 1
return genes_dict, file_names

def write_to_csv(genes_dict, file_names, output_file):
with open(output_file, "w") as f:
# Write header
f.write("Gene," + ",".join(file_names) + "\n")
# Write gene names and the number of CDS in each file
for gene, counts_per_file in genes_dict.items():
counts = [str(counts_per_file.get(file_name, 0)) for file_name in file_names]
f.write(f"{gene},{','.join(counts)}\n")
hypothetical_count = count_hypothetical_proteins(file_path)
hypothetical_counts_dict[file_name] = hypothetical_count

# Write to CSV
with open(output_csv, mode='w', newline='') as csv_file:
writer = csv.writer(csv_file)
writer.writerow(['Filename', 'Hypothetical_Protein_Count'])

for file_name, hypothetical_count in hypothetical_counts_dict.items():
writer.writerow([file_name, hypothetical_count])

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="A script that takes a directory containing genbank files and writes gene presence absence table")
parser.add_argument('-d', '--directory', dest='directory', help='Enter the directory containing the genbank files')
parser.add_argument('-o', dest='output', help='Enter the output tabular format')
results = parser.parse_args()
genes_dict, file_names = extract_genes(results.directory)
write_to_csv(genes_dict, file_names, results.output)
iterate_genbank_files(results.directory, results.output)
8 changes: 4 additions & 4 deletions sphae.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

#SBATCH --job-name=sphae-medaka
#SBATCH --job-name=sphae
#SBATCH --mail-type=ALL
#SBATCH --output=%x-%j.out.txt
#SBATCH --error=%x-%j.err.txt
Expand All @@ -12,8 +12,8 @@
#SBATCH --qos=hc-concurrent-jobs

#sphae install
#sphae run --input tests/data/illumina-subset --threads 64 -k
#sphae run --input tests/data/nanopore-subset --sequencing longread --threads 64 -k
#sphae run --input tests/data/nanopore-subset --sequencing longread --threads 64 -k --no_medaka
sphae run --input tests/data/illumina-subset --threads 64 -k
sphae run --input tests/data/nanopore-subset --sequencing longread --threads 64 -k
sphae run --input tests/data/nanopore-subset --sequencing longread --threads 64 -k --no_medaka
#sphae annotate --genome tests/data/genome --threads 64

2 changes: 1 addition & 1 deletion sphae/sphae.VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.4.4
1.4.5
2 changes: 2 additions & 0 deletions sphae/workflow/rules/10.final-reporting.smk
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ rule summarize_paired:
plots=os.path.join(dir_final, "{sample}-pr", "{sample}_phynteny_plot.png"),
outdir=os.path.join(dir_final,"{sample}-pr"),
ID="{sample}",
seq= "pr"
localrule: True
script:
os.path.join(dir_script, 'summary.py')
Expand Down Expand Up @@ -196,6 +197,7 @@ rule summarize_longread:
ID="{sample}",
plots=os.path.join(dir_final, "{sample}-sr", "{sample}_phynteny_plot.png"),
outdir=os.path.join(dir_final, "{sample}-sr"),
seq= "sr"
localrule: True
script:
os.path.join(dir_script, 'summary.py')
Expand Down
5 changes: 4 additions & 1 deletion sphae/workflow/scripts/pick_phage_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def picking_contigs(file,out):
datav = data[data["Length_x"] > 1000]
datav = datav[datav["Prediction"] == "Virus"]
datac = datav[datav["completeness"]> 70.00]
#print (len(data))
#print (len(datac))
#print (datac)
else:
open(out, 'a').close()
Expand All @@ -33,6 +33,9 @@ def picking_contigs(file,out):
print ("The genome is fragmented")
datac.to_csv(out, index=False)
#return None
else:
print ("Multiple genomes")
datac.to_csv(out, index=False)

elif (len(datac))==1:
#print ("entering this if statement")
Expand Down
Loading

0 comments on commit 1ec4e67

Please sign in to comment.