adding upgrades including database replacement, eukaryote fasta dumping, awkward space removal, etc.
AlexanderLabWHOI · Sep 2, 2024 · 5464bfe · 5464bfe
1 parent 9ecd10b
commit 5464bfe
Show file tree

Hide file tree

Showing 12 changed files with 208 additions and 92 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.0.7
+2.0.9
diff --git a/bin/EUKulele b/bin/EUKulele
@@ -1,5 +1,4 @@
 #!/usr/local/bin/python
-##!/usr/bin/env python3
 
 import sys
 

diff --git a/docs/source/parameters.rst b/docs/source/parameters.rst
@@ -51,6 +51,9 @@ A full list of parameters can be found in the table at the bottom of this page.
    * - ``--use_salmon_counts`` 
      - use_salmon_counts 
      - If included in a command line argument or set to 1 in a configuration file, this argument causes classifications to be made based both on number of classified transcripts and by counts.
+   * - ``--create_euk_fasta`` 
+     - create_euk_fasta 
+     - If ``--create_euk_fasta`` is included in a command line argument or set to 1 in a configuration file, FASTA files will be created containing the nucleotide or protein sequences predicted to be eukaryotic by ``EUKulele``.
    * - ``--salmon_dir`` 
      - salmon_dir 
      - If ``--use_salmon_counts`` is true, this must be specified, which is the directory location of the ``salmon`` output/quantification files.
@@ -65,7 +68,7 @@ A full list of parameters can be found in the table at the bottom of this page.
      - A choice of aligner to use, currently ``BLAST`` or ``DIAMOND``.
    * - ``--cutoff_file`` 
      - cutoff_file 
-     - A ``YAML`` file, provided in ``src/EUKulele/static/``, that contains the percent identity cutoffs for various taxonomic classifications. Any path may be provided here to a user-specified file.
+     - A ``YAML`` file, with a default provided in ``src/EUKulele/static/``, that contains the percent identity cutoffs for various taxonomic classifications. Any path may be provided here to a user-specified file - if no absolute path is specified, the default file will be used instead.
    * - ``--filter_metric`` 
      - filter_metric 
      - Either evalue, pid, or bitscore (default evalue) - the metric to be used to filter hits based on their quality prior to taxonomic estimation. 

diff --git a/scripts/create_protein_table.py b/scripts/create_protein_table.py
@@ -43,9 +43,13 @@
 import os
 import argparse
 import json
+import gzip
 from Bio import SeqIO
 import pandas as pd
 
+def iz_gz(path):
+    return path.endswith(".gz")
+
 def createProteinTable(args=None):
     '''
     Main function; intended to parse and create required files
@@ -85,41 +89,52 @@ def createProteinTable(args=None):
     odict = {}
     for curr_pepfile in list(args.infile_peptide):
         pepfile = "".join(curr_pepfile)
-        for record in SeqIO.parse(pepfile, "fasta"):
-            header = record.description
-            rid = record.id.replace(".","N") #record.id.split(".")[0] #record.id.replace(".","N")
-            counter = 2
-            while rid in odict:
-                if "_" in rid:
-                    rid = "_".join(rid.split("_")[0:-1]) + "_" + str(counter)
+
+        open_fn = gzip.open if iz_gz(pepfile) else open
+        with open_fn(pepfile, "rt") as handle:
+            for record in SeqIO.parse(handle, "fasta"):
+                header = record.description
+                rid = record.id.replace(".","N") #record.id.split(".")[0] #record.id.replace(".","N")
+                counter = 2
+                while rid in odict:
+                    if "_" in rid:
+                        rid = "_".join(rid.split("_")[0:-1]) + "_" + str(counter)
+                    else:
+                        rid = rid + "_" + str(counter)
+                    counter = counter + 1
+                if 't' in args.delim: # why is this tab thing not working otherwise?? even the equality
+                    tester = "".join(list(str(header))).replace('\t', '    ')
+                    hlist = tester.split("    ")
                 else:
-                    rid = rid + "_" + str(counter)
-                counter = counter + 1
-            if 't' in args.delim: # why is this tab thing not working otherwise?? even the equality
-                tester = "".join(list(str(header))).replace('\t', '    ')
-                hlist = tester.split("    ")
-            else:
-                header = str(header).replace(args.delim, "hello")
-                hlist = header.split("hello")
-            if len(args.infile_peptide) > 1:
-                # if there is a list of files, use the filename as the ID
-                sid = pepfile.split("/")[-1].split("_")[0]
-                odict[rid] = sid
-            elif args.column.isdigit():
-                sid = hlist[int(args.column)]
-                odict[rid] = sid
-            else:
-                for h_curr in hlist:
-                    if args.column in h_curr: #h.startswith(args.column):
-                        sid = h_curr.split('=')[1].strip()
-                        odict[rid] = sid
-                        break
-
-        print("Modifying...",pepfile,flush=True)
-        os.system("cut -f 1 " + str(pepfile) + " > " + str(pepfile) + ".tester.pep.fa")
-        os.system("perl -i -pe 's/$/_$seen{$_}/ if ++$seen{$_}>1 and /^>/; ' " + \
-                  str(pepfile) + ".tester.pep.fa")
-        os.system("mv " + str(pepfile) + ".tester.pep.fa " + str(pepfile))
+                    header = str(header).replace(args.delim, "hello")
+                    hlist = header.split("hello")
+                if len(args.infile_peptide) > 1:
+                    # if there is a list of files, use the filename as the ID
+                    sid = pepfile.split("/")[-1].split("_")[0]
+                    odict[rid] = sid
+                elif args.column.isdigit():
+                    sid = hlist[int(args.column)]
+                    odict[rid] = sid
+                else:
+                    for h_curr in hlist:
+                        if args.column in h_curr: #h.startswith(args.column):
+                            sid = h_curr.split('=')[1].strip()
+                            odict[rid] = sid
+                            break
+
+        if not iz_gz(pepfile):
+            print("Modifying...",pepfile,flush=True)
+            os.system("cut -f 1 " + str(pepfile) + " > " + str(pepfile) + ".tester.pep.fa")
+            os.system("perl -i -pe 's/$/_$seen{$_}/ if ++$seen{$_}>1 and /^>/; ' " + \
+                      str(pepfile) + ".tester.pep.fa")
+            os.system("mv " + str(pepfile) + ".tester.pep.fa " + str(pepfile))
+        else:
+            print("Modifying zipped pepfile...",pepfile,flush=True)
+            #os.system("zcat " + str(pepfile) + " | cut -f 1 " + " > " + str(pepfile) + ".tester.pep.fa")
+            #os.system("perl -i -pe 's/$/_$seen{$_}/ if ++$seen{$_}>1 and /^>/; ' " + \
+            #          str(pepfile) + ".tester.pep.fa")
+            #os.system("gzip -c "+str(pepfile) + ".tester.pep.fa > "+str(pepfile) + ".tester.pep.fa.gz") 
+            #os.system("mv " + str(pepfile) + ".tester.pep.fa.gz " + str(pepfile))
 
     tax_file = pd.read_csv(args.infile_taxonomy, sep = "\t", encoding='latin-1')
 
@@ -136,7 +151,7 @@ def createProteinTable(args=None):
                 elif len(curr_row) > (len(colnames_tax)):
                     curr_row = curr_row[0:7] + [curr_row[8]]
                 add_series = pd.Series(curr_row, index = colnames_tax)
-                tax_out = tax_out.append(add_series, ignore_index=True)
+                tax_out = pd.concat([tax_out,add_series], ignore_index=True)
             else:
                 curr_row = [tax_file[args.col_source_id][i]] + [""] * 7
                 full_taxonomy = tax_file[args.taxonomy_col_id][i].split(";")
@@ -150,7 +165,7 @@ def createProteinTable(args=None):
                 curr_row[6] = tax_file["Genus_UniEuk"][i]
                 curr_row[7] = genus_and_species
                 add_series = pd.Series(curr_row, index = colnames_tax)
-                tax_out = tax_out.append(add_series, ignore_index=True)
+                tax_out = pd.concat([tax_out,add_series], ignore_index=True)
 
         tax_file = tax_out
     tax_file.to_csv(args.output,sep="\t")

diff --git a/scripts/download_database.sh b/scripts/download_database.sh
@@ -1,15 +1,21 @@
 #!/bin/bash
 
 ALLEXITS=0
-
 DATABASE=$1
 REF_FASTA="reference.pep.fa"
 REF_TABLE="taxonomy-table.txt"
 REF_FASTA_URL=$2
 REF_TABLE_URL=$3
 REFERENCE_DIR=$4
 
+if echo $REF_FASTA_URL | grep -q ".gz"; then
+    REF_FASTA="reference.pep.fa.gz"
+fi
 #mkdir -p ${PWD}/$REFERENCE_DIR/$DATABASE
+if [[ "$REFERENCE_DIR" == "" ]]; then
+    REFERENCE_DIR="."
+fi
+
 mkdir -p $REFERENCE_DIR/$DATABASE
 
 if [[ $DATABASE == "marmmetsp" ]]; then
@@ -44,41 +50,41 @@ elif [[ $DATABASE == "gtdb" ]]; then
     echo "All reference files for GTDB downloaded to $REFERENCE_DIR/$DATABASE"
 elif [[ $DATABASE == "eukprot" ]]; then
     # Download tar of all EukProt files 
-    wget -O $REFERENCE_DIR/$DATABASE/$DATABASE.tgz "$REF_FASTA_URL"
+    wget -O $REFERENCE_DIR/$DATABASE/$REF_FASTA "$REF_FASTA_URL"
     ALLEXITS=$(($ALLEXITS + $?))
 
-    # Unzip to proteins folder
-    tar zxvf $REFERENCE_DIR/$DATABASE/$DATABASE.tgz -C $REFERENCE_DIR/$DATABASE
-    ALLEXITS=$(($ALLEXITS + $?))
+    # Unzip to proteins folder - defunct
+    #tar zxvf $REFERENCE_DIR/$DATABASE/$DATABASE.tgz -C $REFERENCE_DIR/$DATABASE
+    #ALLEXITS=$(($ALLEXITS + $?))
 
     # Download EukProt taxonomy file
     wget -O $REFERENCE_DIR/$DATABASE/$REF_TABLE "$REF_TABLE_URL"
     ALLEXITS=$(($ALLEXITS + $?))
 
-    ALLFILES=""
-    for entry in "$REFERENCE_DIR/$DATABASE/proteins"/*
-    do
-      ALLFILES=$ALLFILES" "$entry
-    done
+    #ALLFILES=""
+    #for entry in "$REFERENCE_DIR/$DATABASE/proteins"/*
+    #do
+    #  ALLFILES=$ALLFILES" "$entry
+    #done
 
-    for currfile in $ALLFILES
-    do 
-        ((cat $currfile | sed 's/\./N/g'); echo; echo) >> ${PWD}/$DATABASE/$REF_FASTA
-    done
-    ALLEXITS=$(($ALLEXITS + $?))
+    #for currfile in $ALLFILES
+    #do 
+    #    ((cat $currfile | sed 's/\./N/g'); echo; echo) >> ${PWD}/$DATABASE/$REF_FASTA
+    #done
+    #ALLEXITS=$(($ALLEXITS + $?))
 
     echo "All reference files for EukProt downloaded to ${PWD}/$DATABASE"
 elif [[ $DATABASE == "phylodb" ]]; then
     # Download PhyloDB reference FASTA
-    wget -O $REFERENCE_DIR/$DATABASE/$REF_FASTA.gz "$REF_FASTA_URL"
-    gunzip -f $REFERENCE_DIR/$DATABASE/$REF_FASTA.gz
+    wget -O $REFERENCE_DIR/$DATABASE/$REF_FASTA "$REF_FASTA_URL"
+    #gunzip -f $REFERENCE_DIR/$DATABASE/$REF_FASTA
     #sed -i -e 's/>* .*$//' $REFERENCE_DIR/$DATABASE/$REF_FASTA
     #sed -i $'s/\t/    /g' $REFERENCE_DIR/$DATABASE/$REF_FASTA
     ALLEXITS=$(($ALLEXITS + $?))
 
     # Download PhyloDB reference taxonomy table
-    wget -O $REFERENCE_DIR/$DATABASE/$REF_TABLE.gz "$REF_TABLE_URL"
-    gunzip -f $REFERENCE_DIR/$DATABASE/$REF_TABLE.gz
+    wget -O $REFERENCE_DIR/$DATABASE/$REF_TABLE "$REF_TABLE_URL"
+    #gunzip -f $REFERENCE_DIR/$DATABASE/$REF_TABLE
     ALLEXITS=$(($ALLEXITS + $?))
 
     # Download PhyloDB files from Google Drive
@@ -90,6 +96,7 @@ elif [[ $DATABASE == "phylodb" ]]; then
     #gunzip -c ${PWD}/$DATABASE/$DATABASE.table.tgz > ${PWD}/$DATABASE/$REF_TABLE_URL
 
     echo "All reference files for PhyloDB downloaded to ${PWD}/$DATABASE"
+
 elif [[ $DATABASE == "eukzoo" ]]; then
     zenodo_get 1476236
     mv EukZoo_taxonomy_table_v_0.2.tsv $REFERENCE_DIR/$DATABASE/$REF_TABLE
@@ -100,3 +107,10 @@ else
     echo "Specified database not found."
     exit 1
 fi
+
+touch $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
+rm $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
+
+echo "Database $DATABASE downloaded on $(date)" > $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
+echo "Download URL for protein file was $REF_FASTA_URL" >> $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
+echo "Download URL for taxonomy table was $REF_TABLE_URL" >> $REFERENCE_DIR/$DATABASE/DATABASE_INFO.txt
diff --git a/scripts/remove_newlines.sh b/scripts/remove_newlines.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+awk '{ if (NR > 1 && index(prev, ">") == 0 && index($0, ">") == 0) printf "%s%s", prev, $0; else if (NR > 1) print prev; prev = $0 } END { print prev }' $1
diff --git a/setup.py b/setup.py
@@ -17,7 +17,7 @@
     version=version,
     description = ("A package to make the process of taxonomically classifying "
                    "microbial eukaryotes easier."),
-    keywords = "eukaryote taxonomy classification",
+    keywords = "eukaryotic taxonomy classification",
     conda_buildnum=1,
     url="https://github.com/AlexanderLabWHOI/EUKulele",
     author="Arianna Krinos",

diff --git a/src/EUKulele/EUKulele_main.py b/src/EUKulele/EUKulele_main.py
@@ -17,7 +17,7 @@
 
 from scripts.names_to_reads import namesToReads
 
-__author__ = "Harriet Alexander, Arianna Krinos"
+__author__ = "Arianna Krinos, Harriet Alexander"
 __copyright__ = "EUKulele"
 __license__ = "MIT"
 __email__ = "akrinos@mit.edu"
@@ -26,7 +26,7 @@ def main(args_in):
     '''
     Main function for calling subfunctions and running EUKulele.
     '''
-
+    
     parser = argparse.ArgumentParser(
         description='Thanks for using EUKulele! EUKulele is a standalone taxonomic '+\
                     'annotation software.\n EUKulele is designed primarily for marine '+\
@@ -35,14 +35,14 @@ def main(args_in):
               '[sample_directory] --reference_dir [reference_database_location] ' +\
               '[all other options]')
 
+    parser.add_argument('-v', '--version', dest = "version", default=False,
+                        action='store_true')
     parser.add_argument('subroutine', metavar="subroutine", nargs='?', type=str,
                         default="all",
                         choices = ["","all","download","setup","alignment",
                                    "busco","coregenes"],
                         help='Choice of subroutine to run.')
-
-    parser.add_argument('-v', '--version', dest = "version", default=False,
-                        action='store_true')
+
     parser.add_argument('-m', '--mets_or_mags', dest = "mets_or_mags", required = False,
                         default = "")
     parser.add_argument('--n_ext', '--nucleotide_extension', dest = "nucleotide_extension",
@@ -108,7 +108,7 @@ def main(args_in):
                         help = "Taxonomic level of organisms specified in organisms tag.")
 
     ## OTHER USER CHOICES ##
-    cutoff_file = "tax-cutoffs.yaml"
+    cutoff_file = "default_in_static" # this will cause cutoff file to be read from folder
     parser.add_argument('--cutoff_file', default = cutoff_file)
     parser.add_argument('--consensus_proportion', default = 1, type = float)
     parser.add_argument('--filter_metric', default = "evalue",
@@ -120,6 +120,10 @@ def main(args_in):
     parser.add_argument('--busco_threshold', default=50)
     parser.add_argument('--no_busco', action='store_true', default=False,
                        help = "When true, BUSCO steps are not run.")
+
+    parser.add_argument('--create_euk_fasta', action='store_true', default=False,
+                       help = "Whether to create FASTA files containing sequences identified "+\
+                              "to be eukaryotic.")
     parser.add_argument('--create_fasta', action='store_true', default=False,
                        help = "Whether to create FASTA files containing ID'd transcripts "+\
                               "during BUSCO analysis.")
@@ -133,6 +137,15 @@ def main(args_in):
                        help = "Whether we're just running a test and should not execute downloads.")
 
     args = parser.parse_args(list(filter(None, args_in.split(" "))))
+
+    if args.version:
+        test_var = True
+        filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                "static", "VERSION")
+        file_read = open(filename, "r")
+        print("The current EUKulele version is",file_read.read())
+        sys.exit(0)
+
     if (args.mets_or_mags == "") & (args.subroutine != "download") & (not args.version):
         print("METs or MAGs argument (-m/--mets_or_mags) is required with one of 'mets' or 'mags'.")
         sys.exit(1)
@@ -175,6 +188,7 @@ def main(args_in):
     busco_file = args.busco_file
     rerun_rules = args.force_rerun
     run_transdecoder = args.run_transdecoder
+    create_euk_fasta = args.create_euk_fasta
 
     organisms, organisms_taxonomy = readBuscoFile(individual_or_summary, busco_file,
                                                   organisms, organisms_taxonomy)
@@ -185,10 +199,9 @@ def main(args_in):
     busco_choice = False
     core_genes = False
 
-    if args.version:
-        test_var = True
-        filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                                "static", "VERSION")
+    filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                            "static", "VERSION")
+    if os.path.isfile(filename):
         file_read = open(filename, "r")
         print("The current EUKulele version is",file_read.read())
 
@@ -242,17 +255,21 @@ def main(args_in):
         f.write("The version of EUKulele was "+str(file_read.read())+".\n")
         f.write("Time finished was " + str(e) + " for database " + \
                 str(args.database.lower())+"\n")
-
+
+        if os.path.isfile(os.path.join(reference_dir, ref_fasta+".gz")) & \
+           (not os.path.isfile(os.path.join(reference_dir, ref_fasta))):
+            ref_fasta = ref_fasta + ".gz"
+
         ## Next, see whether there is a subdirectory of reference
         ## directory containing folder for our DB name
-        if (not os.path.isfile(os.path.join(reference_dir, ref_fasta))) | \
+        if (not (os.path.isfile(os.path.join(reference_dir, ref_fasta)))) | \
            (not os.path.isfile(tax_tab)) | \
            (not os.path.isfile(prot_tab)):
             ref_fasta, tax_tab, prot_tab = downloadDatabase(args.database.lower(),
                                                             alignment_choice, output_dir,
                                                             "/".join(reference_dir.
                                                                      split("/")[0:-1]))
-            if (not os.path.isfile(os.path.join(reference_dir, ref_fasta))) | \
+            if (not (os.path.isfile(os.path.join(reference_dir, ref_fasta)))) | \
                (not os.path.isfile(tax_tab)) | \
                (not os.path.isfile(prot_tab)):
                 print("Download and formatting did not complete successfully. " +\
@@ -327,6 +344,11 @@ def main(args_in):
                        output_dir = output_dir,
                        level_hierarchy = levels_file)
 
+        if create_euk_fasta:
+            manageEukulele(piece = "dump_euks", samples = samples, mets_or_mags = mets_or_mags,
+                       sample_dir = sample_dir, pep_ext = pep_ext,
+                       output_dir = output_dir,
+                       level_hierarchy = levels_file)
     busco_matched = True
     if busco_choice:
         print("Performing BUSCO steps...", flush=True)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#!/bin/bash
		#
		# Unwrap a multi-line FASTA file so that each record's sequence sits on a
		# single line. The result is written to stdout; the input is not modified.
		#
		# Usage: remove_newlines.sh <fasta_file>
		#
		# A line is treated as a header when it contains ">" (same test as the
		# previous version of this script); every other line is sequence text and
		# is appended to the current output line.
		#
		# Fix: the previous awk program printed `prev` together with `$0` and then
		# printed that same line again as the next `prev`, so every continuation
		# line appeared twice in the output (e.g. AAA/BBB became AAABBBBBB).
		# Sequence text is now streamed exactly once. The file argument is also
		# quoted so that paths containing spaces work.

		awk '
		    # Header line: close any sequence line in progress, then emit the header.
		    index($0, ">") > 0 {
		        if (in_seq) printf "\n"
		        print
		        in_seq = 0
		        next
		    }
		    # Sequence line: append without a newline so wrapped lines are joined.
		    {
		        printf "%s", $0
		        in_seq = 1
		    }
		    # Terminate the final sequence line, if there is one.
		    END {
		        if (in_seq) printf "\n"
		    }
		' "$1"