Skip to content

Commit

Permalink
adding upgrades including database replacement, eukaryote fasta dumpi…
Browse files Browse the repository at this point in the history
…ng, awkward space removal, etc.
  • Loading branch information
akrinos committed Sep 2, 2024
1 parent 9ecd10b commit 5464bfe
Show file tree
Hide file tree
Showing 12 changed files with 208 additions and 92 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.7
2.0.9
1 change: 0 additions & 1 deletion bin/EUKulele
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/local/bin/python
##!/usr/bin/env python3

import sys

Expand Down
5 changes: 4 additions & 1 deletion docs/source/parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ A full list of parameters can be found in the table at the bottom of this page.
* - ``--use_salmon_counts``
- use_salmon_counts
- If included in a command line argument or set to 1 in a configuration file, this argument causes classifications to be made based both on the number of classified transcripts and on ``salmon`` counts.
* - ``--create_euk_fasta``
- create_euk_fasta
- If ``--create_euk_fasta`` is included in a command line argument or set to 1 in a configuration file, FASTA files will be created containing the nucleotide or protein sequences predicted to be eukaryotic by ``EUKulele``.
* - ``--salmon_dir``
- salmon_dir
- If ``--use_salmon_counts`` is true, this must be specified, which is the directory location of the ``salmon`` output/quantification files.
Expand All @@ -65,7 +68,7 @@ A full list of parameters can be found in the table at the bottom of this page.
- A choice of aligner to use, currently ``BLAST`` or ``DIAMOND``.
* - ``--cutoff_file``
- cutoff_file
- A ``YAML`` file, provided in ``src/EUKulele/static/``, that contains the percent identity cutoffs for various taxonomic classifications. Any path may be provided here to a user-specified file.
- A ``YAML`` file, with a default provided in ``src/EUKulele/static/``, that contains the percent identity cutoffs for various taxonomic classifications. A path to a user-specified file may be provided here; if no absolute path is specified, the default file will be used instead.
* - ``--filter_metric``
- filter_metric
- Either evalue, pid, or bitscore (default evalue) - the metric to be used to filter hits based on their quality prior to taxonomic estimation.
Expand Down
87 changes: 51 additions & 36 deletions scripts/create_protein_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,13 @@
import os
import argparse
import json
import gzip
from Bio import SeqIO
import pandas as pd

def iz_gz(path):
    '''
    Report whether a file path names a gzip-compressed file.

    The check is purely name-based: it is True when the path ends with
    the literal suffix ".gz" (case-sensitive); the file is never opened.

    :param path: file path as a ``str`` or ``os.PathLike`` object.
    :return: True if the path ends in ".gz", otherwise False.
    '''
    # os.fspath leaves str untouched and unwraps path-like objects,
    # so pathlib.Path inputs work as well as plain strings.
    return os.fspath(path).endswith(".gz")

def createProteinTable(args=None):
'''
Main function; intended to parse and create required files
Expand Down Expand Up @@ -85,41 +89,52 @@ def createProteinTable(args=None):
odict = {}
for curr_pepfile in list(args.infile_peptide):
pepfile = "".join(curr_pepfile)
for record in SeqIO.parse(pepfile, "fasta"):
header = record.description
rid = record.id.replace(".","N") #record.id.split(".")[0] #record.id.replace(".","N")
counter = 2
while rid in odict:
if "_" in rid:
rid = "_".join(rid.split("_")[0:-1]) + "_" + str(counter)

open_fn = gzip.open if iz_gz(pepfile) else open
with open_fn(pepfile, "rt") as handle:
for record in SeqIO.parse(handle, "fasta"):
header = record.description
rid = record.id.replace(".","N") #record.id.split(".")[0] #record.id.replace(".","N")
counter = 2
while rid in odict:
if "_" in rid:
rid = "_".join(rid.split("_")[0:-1]) + "_" + str(counter)
else:
rid = rid + "_" + str(counter)
counter = counter + 1
if 't' in args.delim: # why is this tab thing not working otherwise?? even the equality
tester = "".join(list(str(header))).replace('\t', ' ')
hlist = tester.split(" ")
else:
rid = rid + "_" + str(counter)
counter = counter + 1
if 't' in args.delim: # why is this tab thing not working otherwise?? even the equality
tester = "".join(list(str(header))).replace('\t', ' ')
hlist = tester.split(" ")
else:
header = str(header).replace(args.delim, "hello")
hlist = header.split("hello")
if len(args.infile_peptide) > 1:
# if there is a list of files, use the filename as the ID
sid = pepfile.split("/")[-1].split("_")[0]
odict[rid] = sid
elif args.column.isdigit():
sid = hlist[int(args.column)]
odict[rid] = sid
else:
for h_curr in hlist:
if args.column in h_curr: #h.startswith(args.column):
sid = h_curr.split('=')[1].strip()
odict[rid] = sid
break

print("Modifying...",pepfile,flush=True)
os.system("cut -f 1 " + str(pepfile) + " > " + str(pepfile) + ".tester.pep.fa")
os.system("perl -i -pe 's/$/_$seen{$_}/ if ++$seen{$_}>1 and /^>/; ' " + \
str(pepfile) + ".tester.pep.fa")
os.system("mv " + str(pepfile) + ".tester.pep.fa " + str(pepfile))
header = str(header).replace(args.delim, "hello")
hlist = header.split("hello")
if len(args.infile_peptide) > 1:
# if there is a list of files, use the filename as the ID
sid = pepfile.split("/")[-1].split("_")[0]
odict[rid] = sid
elif args.column.isdigit():
sid = hlist[int(args.column)]
odict[rid] = sid
else:
for h_curr in hlist:
if args.column in h_curr: #h.startswith(args.column):
sid = h_curr.split('=')[1].strip()
odict[rid] = sid
break

if not iz_gz(pepfile):
print("Modifying...",pepfile,flush=True)
os.system("cut -f 1 " + str(pepfile) + " > " + str(pepfile) + ".tester.pep.fa")
os.system("perl -i -pe 's/$/_$seen{$_}/ if ++$seen{$_}>1 and /^>/; ' " + \
str(pepfile) + ".tester.pep.fa")
os.system("mv " + str(pepfile) + ".tester.pep.fa " + str(pepfile))
else:
print("Modifying zipped pepfile...",pepfile,flush=True)
#os.system("zcat " + str(pepfile) + " | cut -f 1 " + " > " + str(pepfile) + ".tester.pep.fa")
#os.system("perl -i -pe 's/$/_$seen{$_}/ if ++$seen{$_}>1 and /^>/; ' " + \
# str(pepfile) + ".tester.pep.fa")
#os.system("gzip -c "+str(pepfile) + ".tester.pep.fa > "+str(pepfile) + ".tester.pep.fa.gz")
#os.system("mv " + str(pepfile) + ".tester.pep.fa.gz " + str(pepfile))

tax_file = pd.read_csv(args.infile_taxonomy, sep = "\t", encoding='latin-1')

Expand All @@ -136,7 +151,7 @@ def createProteinTable(args=None):
elif len(curr_row) > (len(colnames_tax)):
curr_row = curr_row[0:7] + [curr_row[8]]
add_series = pd.Series(curr_row, index = colnames_tax)
tax_out = tax_out.append(add_series, ignore_index=True)
tax_out = pd.concat([tax_out,add_series], ignore_index=True)
else:
curr_row = [tax_file[args.col_source_id][i]] + [""] * 7
full_taxonomy = tax_file[args.taxonomy_col_id][i].split(";")
Expand All @@ -150,7 +165,7 @@ def createProteinTable(args=None):
curr_row[6] = tax_file["Genus_UniEuk"][i]
curr_row[7] = genus_and_species
add_series = pd.Series(curr_row, index = colnames_tax)
tax_out = tax_out.append(add_series, ignore_index=True)
tax_out = pd.concat([tax_out,add_series], ignore_index=True)

tax_file = tax_out
tax_file.to_csv(args.output,sep="\t")
Expand Down
52 changes: 33 additions & 19 deletions scripts/download_database.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
#!/bin/bash

ALLEXITS=0

DATABASE=$1
REF_FASTA="reference.pep.fa"
REF_TABLE="taxonomy-table.txt"
REF_FASTA_URL=$2
REF_TABLE_URL=$3
REFERENCE_DIR=$4

if echo $REF_FASTA_URL | grep -q ".gz"; then
REF_FASTA="reference.pep.fa.gz"
fi
#mkdir -p ${PWD}/$REFERENCE_DIR/$DATABASE
if [[ "$REFERENCE_DIR" == "" ]]; then
REFERENCE_DIR="."
fi

mkdir -p $REFERENCE_DIR/$DATABASE

if [[ $DATABASE == "marmmetsp" ]]; then
Expand Down Expand Up @@ -44,41 +50,41 @@ elif [[ $DATABASE == "gtdb" ]]; then
echo "All reference files for GTDB downloaded to $REFERENCE_DIR/$DATABASE"
elif [[ $DATABASE == "eukprot" ]]; then
# Download tar of all EukProt files
wget -O $REFERENCE_DIR/$DATABASE/$DATABASE.tgz "$REF_FASTA_URL"
wget -O $REFERENCE_DIR/$DATABASE/$REF_FASTA "$REF_FASTA_URL"
ALLEXITS=$(($ALLEXITS + $?))

# Unzip to proteins folder
tar zxvf $REFERENCE_DIR/$DATABASE/$DATABASE.tgz -C $REFERENCE_DIR/$DATABASE
ALLEXITS=$(($ALLEXITS + $?))
# Unzip to proteins folder - defunct
#tar zxvf $REFERENCE_DIR/$DATABASE/$DATABASE.tgz -C $REFERENCE_DIR/$DATABASE
#ALLEXITS=$(($ALLEXITS + $?))

# Download EukProt taxonomy file
wget -O $REFERENCE_DIR/$DATABASE/$REF_TABLE "$REF_TABLE_URL"
ALLEXITS=$(($ALLEXITS + $?))

ALLFILES=""
for entry in "$REFERENCE_DIR/$DATABASE/proteins"/*
do
ALLFILES=$ALLFILES" "$entry
done
#ALLFILES=""
#for entry in "$REFERENCE_DIR/$DATABASE/proteins"/*
#do
# ALLFILES=$ALLFILES" "$entry
#done

for currfile in $ALLFILES
do
((cat $currfile | sed 's/\./N/g'); echo; echo) >> ${PWD}/$DATABASE/$REF_FASTA
done
ALLEXITS=$(($ALLEXITS + $?))
#for currfile in $ALLFILES
#do
# ((cat $currfile | sed 's/\./N/g'); echo; echo) >> ${PWD}/$DATABASE/$REF_FASTA
#done
#ALLEXITS=$(($ALLEXITS + $?))

echo "All reference files for EukProt downloaded to ${PWD}/$DATABASE"
elif [[ $DATABASE == "phylodb" ]]; then
# Download PhyloDB reference FASTA
wget -O $REFERENCE_DIR/$DATABASE/$REF_FASTA.gz "$REF_FASTA_URL"
gunzip -f $REFERENCE_DIR/$DATABASE/$REF_FASTA.gz
wget -O $REFERENCE_DIR/$DATABASE/$REF_FASTA "$REF_FASTA_URL"
#gunzip -f $REFERENCE_DIR/$DATABASE/$REF_FASTA
#sed -i -e 's/>* .*$//' $REFERENCE_DIR/$DATABASE/$REF_FASTA
#sed -i $'s/\t/ /g' $REFERENCE_DIR/$DATABASE/$REF_FASTA
ALLEXITS=$(($ALLEXITS + $?))

# Download PhyloDB reference taxonomy table
wget -O $REFERENCE_DIR/$DATABASE/$REF_TABLE.gz "$REF_TABLE_URL"
gunzip -f $REFERENCE_DIR/$DATABASE/$REF_TABLE.gz
wget -O $REFERENCE_DIR/$DATABASE/$REF_TABLE "$REF_TABLE_URL"
#gunzip -f $REFERENCE_DIR/$DATABASE/$REF_TABLE
ALLEXITS=$(($ALLEXITS + $?))

# Download PhyloDB files from Google Drive
Expand All @@ -90,6 +96,7 @@ elif [[ $DATABASE == "phylodb" ]]; then
#gunzip -c ${PWD}/$DATABASE/$DATABASE.table.tgz > ${PWD}/$DATABASE/$REF_TABLE_URL

echo "All reference files for PhyloDB downloaded to ${PWD}/$DATABASE"

elif [[ $DATABASE == "eukzoo" ]]; then
zenodo_get 1476236
mv EukZoo_taxonomy_table_v_0.2.tsv $REFERENCE_DIR/$DATABASE/$REF_TABLE
Expand All @@ -100,3 +107,10 @@ else
echo "Specified database not found."
exit 1
fi

touch $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
rm $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt

echo "Database $DATABASE downloaded on $(date)" > $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
echo "Download URL for protein file was $REF_FASTA_URL" >> $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
echo "Download URL for taxonomy table was $REF_TABLE_URL" >> $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
3 changes: 3 additions & 0 deletions scripts/remove_newlines.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Collapse wrapped FASTA sequences so that every record is written as one
# header line followed by a single sequence line.
#
# Usage: remove_newlines.sh <fasta-file>    (result is written to stdout)
#
# A line counts as a header when it contains ">" (the same test the script
# has always used). Sequence lines are accumulated and flushed exactly once,
# when the next header or the end of input is reached, so no line of
# sequence data is ever emitted twice.
#
# When no file argument is given, "-" is passed so awk reads standard input.
awk '
index($0, ">") > 0 {
    # Header: flush the sequence collected for the previous record first.
    if (have_seq) print seq
    seq = ""
    have_seq = 0
    print
    next
}
{
    # Sequence line: append to the current record without a newline.
    seq = seq $0
    have_seq = 1
}
END {
    if (have_seq) print seq
}
' "${1:--}"
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
version=version,
description = ("A package to make the process of taxonomically classifying "
"microbial eukaryotes easier."),
keywords = "eukaryote taxonomy classification",
keywords = "eukaryotic taxonomy classification",
conda_buildnum=1,
url="https://github.com/AlexanderLabWHOI/EUKulele",
author="Arianna Krinos",
Expand Down
48 changes: 35 additions & 13 deletions src/EUKulele/EUKulele_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from scripts.names_to_reads import namesToReads

__author__ = "Harriet Alexander, Arianna Krinos"
__author__ = "Arianna Krinos, Harriet Alexander"
__copyright__ = "EUKulele"
__license__ = "MIT"
__email__ = "akrinos@mit.edu"
Expand All @@ -26,7 +26,7 @@ def main(args_in):
'''
Main function for calling subfunctions and running EUKulele.
'''

parser = argparse.ArgumentParser(
description='Thanks for using EUKulele! EUKulele is a standalone taxonomic '+\
'annotation software.\n EUKulele is designed primarily for marine '+\
Expand All @@ -35,14 +35,14 @@ def main(args_in):
'[sample_directory] --reference_dir [reference_database_location] ' +\
'[all other options]')

parser.add_argument('-v', '--version', dest = "version", default=False,
action='store_true')
parser.add_argument('subroutine', metavar="subroutine", nargs='?', type=str,
default="all",
choices = ["","all","download","setup","alignment",
"busco","coregenes"],
help='Choice of subroutine to run.')

parser.add_argument('-v', '--version', dest = "version", default=False,
action='store_true')

parser.add_argument('-m', '--mets_or_mags', dest = "mets_or_mags", required = False,
default = "")
parser.add_argument('--n_ext', '--nucleotide_extension', dest = "nucleotide_extension",
Expand Down Expand Up @@ -108,7 +108,7 @@ def main(args_in):
help = "Taxonomic level of organisms specified in organisms tag.")

## OTHER USER CHOICES ##
cutoff_file = "tax-cutoffs.yaml"
cutoff_file = "default_in_static" # this will cause cutoff file to be read from folder
parser.add_argument('--cutoff_file', default = cutoff_file)
parser.add_argument('--consensus_proportion', default = 1, type = float)
parser.add_argument('--filter_metric', default = "evalue",
Expand All @@ -120,6 +120,10 @@ def main(args_in):
parser.add_argument('--busco_threshold', default=50)
parser.add_argument('--no_busco', action='store_true', default=False,
help = "When true, BUSCO steps are not run.")

parser.add_argument('--create_euk_fasta', action='store_true', default=False,
help = "Whether to create FASTA files containing sequences identified "+\
"to be eukaryotic.")
parser.add_argument('--create_fasta', action='store_true', default=False,
help = "Whether to create FASTA files containing ID'd transcripts "+\
"during BUSCO analysis.")
Expand All @@ -133,6 +137,15 @@ def main(args_in):
help = "Whether we're just running a test and should not execute downloads.")

args = parser.parse_args(list(filter(None, args_in.split(" "))))

if args.version:
test_var = True
filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"static", "VERSION")
file_read = open(filename, "r")
print("The current EUKulele version is",file_read.read())
sys.exit(0)

if (args.mets_or_mags == "") & (args.subroutine != "download") & (not args.version):
print("METs or MAGs argument (-m/--mets_or_mags) is required with one of 'mets' or 'mags'.")
sys.exit(1)
Expand Down Expand Up @@ -175,6 +188,7 @@ def main(args_in):
busco_file = args.busco_file
rerun_rules = args.force_rerun
run_transdecoder = args.run_transdecoder
create_euk_fasta = args.create_euk_fasta

organisms, organisms_taxonomy = readBuscoFile(individual_or_summary, busco_file,
organisms, organisms_taxonomy)
Expand All @@ -185,10 +199,9 @@ def main(args_in):
busco_choice = False
core_genes = False

if args.version:
test_var = True
filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"static", "VERSION")
filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"static", "VERSION")
if os.path.isfile(filename):
file_read = open(filename, "r")
print("The current EUKulele version is",file_read.read())

Expand Down Expand Up @@ -242,17 +255,21 @@ def main(args_in):
f.write("The version of EUKulele was "+str(file_read.read())+".\n")
f.write("Time finished was " + str(e) + " for database " + \
str(args.database.lower())+"\n")


if os.path.isfile(os.path.join(reference_dir, ref_fasta+".gz")) & \
(not os.path.isfile(os.path.join(reference_dir, ref_fasta))):
ref_fasta = ref_fasta + ".gz"

## Next, see whether there is a subdirectory of reference
## directory containing folder for our DB name
if (not os.path.isfile(os.path.join(reference_dir, ref_fasta))) | \
if (not (os.path.isfile(os.path.join(reference_dir, ref_fasta)))) | \
(not os.path.isfile(tax_tab)) | \
(not os.path.isfile(prot_tab)):
ref_fasta, tax_tab, prot_tab = downloadDatabase(args.database.lower(),
alignment_choice, output_dir,
"/".join(reference_dir.
split("/")[0:-1]))
if (not os.path.isfile(os.path.join(reference_dir, ref_fasta))) | \
if (not (os.path.isfile(os.path.join(reference_dir, ref_fasta)))) | \
(not os.path.isfile(tax_tab)) | \
(not os.path.isfile(prot_tab)):
print("Download and formatting did not complete successfully. " +\
Expand Down Expand Up @@ -327,6 +344,11 @@ def main(args_in):
output_dir = output_dir,
level_hierarchy = levels_file)

if create_euk_fasta:
manageEukulele(piece = "dump_euks", samples = samples, mets_or_mags = mets_or_mags,
sample_dir = sample_dir, pep_ext = pep_ext,
output_dir = output_dir,
level_hierarchy = levels_file)
busco_matched = True
if busco_choice:
print("Performing BUSCO steps...", flush=True)
Expand Down
Loading

0 comments on commit 5464bfe

Please sign in to comment.