update, please don't break

carden24 · Oct 26, 2015 · 2546909 · 2546909
1 parent 8109335
commit 2546909
Show file tree

Hide file tree

Showing 48 changed files with 979 additions and 408 deletions.
diff --git a/GET_LINEAGES_NCBI.EC.py b/GET_LINEAGES_NCBI.EC.py
@@ -1,3 +1,5 @@
+# Author Roli Wilhelm
+
 #!/usr/bin/python
 import sys, os, re, getopt, glob, subprocess, os.path, numpy as np, time
 import timeit

diff --git a/GET_LINEAGES_NCBI.py b/GET_LINEAGES_NCBI.py
@@ -1,3 +1,5 @@
+#Author Roli Wilhelm
+
 #!/usr/bin/python
 import sys, os, re, getopt, glob, subprocess, os.path, numpy as np, time
 import timeit

diff --git a/HMM_search_and_parse_and_extract.py b/HMM_search_and_parse_and_extract.py
@@ -21,7 +21,7 @@
 
 
 #config = load_config()
-script_info={}
+script_info = {}
 script_info['brief_description'] = """Filters sequence according to a minimum 
             size parameter"""
 script_info['script_description'] = """HMMER parser. Runs hmmscan, filters
@@ -64,12 +64,12 @@
                         '\n(d) \'all\' -- Extract hits, contigs, and all\
                          proteins from hits\n')
 
-#Compiling frequently used regular expression patterns
+# Compiling frequently used regular expression patterns
 hmm_pattern = re.compile('[.](hmm)')
 query_pattern = re.compile('[.](fasta$|fas$|faa$|fsa$|fa$)')
 
 
-# checks if the supplied arguments are adequate
+# Checks if the supplied arguments are adequate
 def valid_arguments(opts, args):
     if (opts.input_model == None or opts.input_fp == None ):
         return True
@@ -98,13 +98,12 @@ def update_progress(progress):
     sys.stderr.flush()
 
 
-#Get HMM length function
+# Get HMM length function
 def get_hmm_len(input_model):
-#    hmmshortname = re.sub('[.](hmm)','',input_model, re.I)
-    hmmshortname = re.sub(hmm_pattern,'',input_model, re.I)  
-    hmm_leng_file = hmmshortname+".length.txt"
-    hmm_fileout = open(hmm_leng_file,'w')
-    hmm_filein = open(input_model,'r')
+    hmmshortname = re.sub(hmm_pattern, '', input_model, re.I)  
+    hmm_leng_file = hmmshortname + ".length.txt"
+    hmm_fileout = open(hmm_leng_file, 'w')
+    hmm_filein = open(input_model, 'r')
     for line in hmm_filein:
         if line.startswith('NAME'):
             line = line.strip('\n')
@@ -121,32 +120,28 @@ def get_hmm_len(input_model):
                 continue
     hmm_fileout.close()
     hmm_filein.close()
-    os.system(' '.join(['cp',hmm_leng_file,'all.hmm.ps.len']))
+    os.system(' '.join(['cp', hmm_leng_file, 'all.hmm.ps.len']))
 
 
-#Function to run hmmscan and parse
+# Function to run hmmscan and parse
 def run_hmm_scan (model,query,output):
-    #removes extension, case insensitive search
-#   hmmshortname = re.sub('[.](hmm)','',model, re.I)  
-    hmmshortname = re.sub(hmm_pattern,'',model, re.I)  
-    #finds file format removes extension, case insensitive search
-#   shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I)
-    shortname = re.sub(query_pattern,'',query, re.I)
+    """Run hmmscan for ``query`` against ``model`` and parse its output.
+
+    Writes the raw hmmscan output to
+    ``<output>/<query base>_<model base>.hmm.out`` and the output of
+    hmmscan-parser.sh to ``<output>/<query base>_<model base>.txt``.
+    Both commands are run through the shell by os.system with unquoted
+    arguments, so the paths must not contain spaces.
+    """
+    # Removes the model extension.  NOTE: re.I is received by re.sub as its
+    # ``count`` argument, not as a flag, so this match is case sensitive.
+    hmmshortname = re.sub(hmm_pattern, '', model, re.I)
+    # Removes the query extension.  The replacement must be the empty string
+    # so that the file names match the ones the filtering and extraction
+    # functions rebuild from the same arguments.
+    shortname = re.sub(query_pattern, '', query, re.I)
     output_file = output + "/" + shortname + "_" + hmmshortname + '.hmm.out'
     output_file2 = output +"/" + shortname + "_" + hmmshortname + '.txt'
     print 'Running hmmscan...'
-    os.system(' '.join(['hmmscan',model,query,">",output_file]))
+    os.system(' '.join(['hmmscan', model, query, ">", output_file]))
     print 'Parsing results...'
-    os.system(' '.join(['sh','hmmscan-parser.sh',output_file,'>',output_file2]))
+    os.system(' '.join(['sh', 'hmmscan-parser.sh', output_file, '>', output_file2]))
 
-#Filtering by evalue and coverage
-def filtering_by_evalue_and_coverage(model,query,output,evalue,coverage):
-    #removes extension, case insensitive search
-#   hmmshortname = re.sub('[.](hmm)','',model, re.I)
+# Filtering by evalue and coverage
+def filtering_by_evalue_and_coverage(model, query, output, evalue, coverage):
+    # Removes extension, case insensitive search
     hmmshortname = re.sub(hmm_pattern,'',model, re.I)  
-    #finds file format removes extension, case insensitive search
-#   shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I)
-    shortname = re.sub(query_pattern,'',query, re.I)
+    # Finds file format removes extension, case insensitive search
+    shortname = re.sub(query_pattern, '', query, re.I)
     output_file2 = output+"/" + shortname + "_" + hmmshortname + '.txt'
     hmm_table = open(output_file2, 'r')
     output_file3 = output + "/" + shortname + "_" + hmmshortname+'.filtered.txt'
@@ -157,67 +152,64 @@ def filtering_by_evalue_and_coverage(model,query,output,evalue,coverage):
         line2 = line.strip('\n').split('\t')
         result_evalue = float(line2[2])
         result_model_coverage = float(line2[7])
-        if (result_evalue <=evalue) and (result_model_coverage*100 >= coverage):
+        if (result_evalue <= evalue) and (result_model_coverage * 100 >= coverage):
             hmm_filtered_table.write('%s' %line)
         else:
             continue
     hmm_table.close()
     hmm_filtered_table.close()
 
 
-#Function to extract hits from filtered results
+# Function to extract hits from filtered results
 def extract_protein_hits(query,model,output):
-    #removes extension, case insensitive search
-    hmmshortname = re.sub(hmm_pattern,'',model, re.I)  
-#   hmmshortname = re.sub('[.](hmm)', '', model, re.I)
-    #finds file format removes extension, case insensitive search
-#   shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I)
-    shortname = re.sub(query_pattern,'',query, re.I)
-    input_file4 = output+"/"+shortname+"_"+hmmshortname+'.filtered.txt'
+    # Removes extension, case insensitive search
+    hmmshortname = re.sub(hmm_pattern, '', model, re.I)  
+    # Finds file format removes extension, case insensitive search
+    shortname = re.sub(query_pattern, '', query, re.I)
+    input_file4 = output + "/" + shortname + "_" + hmmshortname + '.filtered.txt'
     hmm_filtered_table2 = open(input_file4, 'r')
 
-    print '   Extracting proteins for %s and HMM database=%s' %(query,model)
-    #Create dictionary with protein:[list of model it hits]
+    print '   Extracting proteins for %s and HMM database=%s' %(query, model)
+    # Create dictionary with protein:[list of model it hits]
     protein_hit_dictionary = {}
     all_models_hits = []
     for line3 in hmm_filtered_table2:
         line4 = line3.strip('\n').split('\t')
         protein_hit = line4[0]
         model_of_protein_hit = line4[1].rstrip(' ')
-
-        #update list of proteins
+        # Update list of proteins
         all_models_hits.append(model_of_protein_hit)
 
-        #Get list of proteins hits, if non existent create empty list
+        # Get list of proteins hits, if non existent create empty list
         models = protein_hit_dictionary.get(protein_hit, [])
-        #Append current model hit to list
+        # Append current model hit to list
         models.append(model_of_protein_hit)   
-        #Update dictionary entry   
+        # Update dictionary entry   
         protein_hit_dictionary[protein_hit] = models
 
-    #Print message
-    count_of_models=list(set(all_models_hits))
-    count_of_proteins=len(protein_hit_dictionary.keys())
+    # Print message
+    count_of_models = list(set(all_models_hits))
+    count_of_proteins = len(protein_hit_dictionary.keys())
 
     print '   Extracting %s unique proteins corresponding to %s HMM models' \
           %(count_of_proteins,len(count_of_models))      
 
-    #open one output file per model
-    #Generate list of output files 
-    #for item in all_models_hits:
+    # Open one output file per model
+    # Generate list of output files 
+    # For item in all_models_hits:
     files = [open(output + '/' + shortname + '_' + hmmshortname + '_' + item + '.fasta', 'w') \
            for item in set(all_models_hits)]
 
-    #Open original file, find if name is in hit list,
-    #Then get models hits and write to model result files
+    # Open original file, find if name is in hit list,
+    # Then get models hits and write to model result files
     filein = open(query, 'r')
-    for record in SeqIO.parse(filein,"fasta"):
+    for record in SeqIO.parse(filein, "fasta"):
         name = record.name
         if name in protein_hit_dictionary.keys():
             what_models_list = protein_hit_dictionary.get(name)
-            #Iterate this list
+            # Iterate this list
             for what_model in what_models_list:
-                #Find index
+                # Find index
                 index = count_of_models.index(what_model)
                 files[index].write('>%s\n%s\n' % (name, record.seq))
     #Close files
@@ -226,46 +218,44 @@ def extract_protein_hits(query,model,output):
 
 
 #Function to extract contigs
-def extract_contigs(query,model,output,assembly_file):
-    # removes extension, case insensitive search
-#   hmmshortname = re.sub('[.](hmm)', '', model, re.I)
-    hmmshortname = re.sub(hmm_pattern,'',model, re.I)  
-    #finds file format removes extension, case insensitive search
-#   shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','', query, re.I)
-    shortname = re.sub(query_pattern,'',query, re.I)
+def extract_contigs(query, model, output, assembly_file):
+    # Removes extension, case insensitive search
+    hmmshortname = re.sub(hmm_pattern, '', model, re.I)  
+    # Finds file format removes extension, case insensitive search
+    shortname = re.sub(query_pattern, '', query, re.I)
     input_file4 = output + "/" + shortname + "_" + hmmshortname +'.filtered.txt'
     hmm_filtered_table2 = open(input_file4, 'r')
 
-    print '   Extracting contigs for file=%s and HMM database=%s' %(query,model)
+    print '   Extracting contigs for file=%s and HMM database=%s' %(query, model)
     #Create dictionary with protein:[list of model it hits]
     protein_model_dictionary = {}
     for line3 in hmm_filtered_table2:
         line4 = line3.strip('\n').split('\t')
         protein_hit = line4[0]
         model_of_protein_hit = line4[1].rstrip(' ')
-        #Get list of proteins hits, if non existent create empty list
+        # Get list of proteins hits, if non existent create empty list
         models = protein_model_dictionary.get(protein_hit, [])
-        #Append current model hit to list
+        # Append current model hit to list
         models.append(model_of_protein_hit)   
-        #Update dictionary entry   
+        # Update dictionary entry   
         protein_model_dictionary[protein_hit] = models
 
-    #Create protein-contig dictionary
+    # Create protein-contig dictionary
     contigs_list = []
-    #parse through list and add to contigs_list
+    # Parse through list and add to contigs_list
     for protein in protein_model_dictionary.keys():
         contig = protein.rsplit('_', 1)
         contigs_list.append(contig[0])
     contigs_list = list(set(contigs_list))
 
-    #Open original file, find if name is in hit list,
-    #Then get models hits and write to model result files
-    assembly_in = open(assembly_file,'r')
-    contigs_file = output+"/" + shortname + "_" + hmmshortname +'_contigs.fasta'
+    # Open original file, find if name is in hit list,
+    # Then get models hits and write to model result files
+    assembly_in = open(assembly_file, 'r')
+    contigs_file = output + "/" + shortname + "_" + hmmshortname + '_contigs.fasta'
     contigs_out = open(contigs_file, 'w')
     print '   Looking for %s contigs' %len(contigs_list)
     progress_counter = 0
-    for record in SeqIO.parse(assembly_in,"fasta"):
+    for record in SeqIO.parse(assembly_in, "fasta"):
         name = record.name
         if name in contigs_list:
             progress_counter = progress_counter + 1
@@ -278,13 +268,11 @@ def extract_contigs(query,model,output,assembly_file):
         print 'Some contigs were not found'
 
 
-#Function to extract all proteins from contig
+# Function to extract all proteins from contig
 def extract_all_proteins_from_contigs(query, model, output):
     # Removes extension, case insensitive search
-    hmmshortname = re.sub(hmm_pattern,'',model, re.I)   
-#   hmmshortname = re.sub('[.](hmm)','',model, re.I)
+    hmmshortname = re.sub(hmm_pattern, '', model, re.I)   
     # Finds file format removes extension, case insensitive search
-#   shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I)
     shortname = re.sub(query_pattern, '', query, re.I)
     input_file4 = output + "/" + shortname + "_" + hmmshortname + '.filtered.txt'
     hmm_filtered_table2 = open(input_file4, 'r')
@@ -311,7 +299,7 @@ def extract_all_proteins_from_contigs(query, model, output):
     # Open one output file per model
     # Generate list of output files 
     files = [open(output + '/' + shortname + '_' + hmmshortname + '_' \
-             + contigs + '.fasta','w') for contigs in (contigs_list)]
+             + contigs + '.fasta', 'w') for contigs in (contigs_list)]
 
     # Open original file, find if name is in hit list,
     # Then get models hits and write to model result files
@@ -346,7 +334,7 @@ def main(argv):
         raise IOError,\
         "Cannot open hmmscan-parser.sh. Please copy it to the local directory"
 
-    # initialize the input file and model, loading parameters
+    # Initialize the input file and model, loading parameters
     input_model = opts.input_model 
     input_fp = opts.input_fp 
     output_dir = opts.output_dir 
@@ -357,8 +345,7 @@ def main(argv):
 
     # Creates a model length dictionary
     print 'Checking model length...'
-#   hmmshortname = re.sub('[.](hmm)', '', input_model, re.I)
-    hmmshortname = re.sub(hmm_pattern,'',input_model, re.I)   
+    hmmshortname = re.sub(hmm_pattern, '', input_model, re.I)   
     hmm_leng_file = hmmshortname + ".length.txt"
     print '   Created %s file' % hmm_leng_file
     get_hmm_len(input_model)
@@ -380,7 +367,6 @@ def main(argv):
         extract_protein_hits(input_fp, input_model, output_dir)
     elif extract_mode == 'contigs':
         extract_contigs(input_fp, input_model, output_dir, assembly_file)
-
     elif extract_mode == 'all':
         extract_protein_hits(input_fp, input_model, output_dir)
         extract_contigs(input_fp, input_model, output_dir, assembly_file)

diff --git a/add_coverage_to_fasta_contigs_2.py b/add_coverage_to_fasta_contigs_2.py
@@ -0,0 +1,84 @@
+#!/usr/bin/python
+# File created on 31 Jan 2014.
+
+__author__ = "Erick Cardenas Poire"
+__copyright__ = "Copyright 2014"
+__credits__ = [""]
+__version__ = "1.0"
+__maintainer__ = "Erick Cardenas Poire"
+__status__ = "Release"
+
+from Bio import SeqIO
+import sys
+from os import makedirs, sys, listdir, environ, path
+import re 
+import inspect
+from commands import getstatusoutput
+from optparse import OptionParser
+import shutil 
+
+# Script metadata; defined here but not read anywhere else in this file.
+script_info = {
+    'brief_description': """Adds coverage information from one file and modifies fasta header""",
+    'script_description': """Adds coverage information from one file and modifies fasta header
+             REQUIRED: You must have a fasta and coverage file with same base name""",
+    'script_usage': [],
+}
+
+# Usage text: handed to the option parser and printed by main() when the
+# required option is missing.
+usage = """
+Need to run it like this:
+./add.coverage.to.fasta.py  -i input_file
+For more options:  ./add.coverage.to.fasta.py -h"""
+
+# Command-line parser with the single -i/--input_file option.
+parser = OptionParser(usage)
+parser.add_option(
+    "-i",
+    "--input_file",
+    dest="input_fp",
+    help='the input fasta file/input dir [REQUIRED]',
+)
+
+
+# Creates the input/coverage/output file name triple for one input file
+def create_an_inputs_and_output(input_file):
+   """Derive the coverage and output file names that go with a fasta file.
+
+   A trailing fasta extension (fasta, fas, fna, faa, fsa or fa) is removed
+   from ``input_file``, ignoring case, to obtain the base name; a name
+   without one of these extensions is used whole as the base name.
+
+   Returns the list ``[input_file, base + ".cov", base + ".new.fasta"]``.
+   """
+   # ``flags`` must be passed by keyword: the fourth positional argument of
+   # re.sub is ``count``, so a positional re.I was read as count=2 and the
+   # match stayed case sensitive.
+   shortname = re.sub(r'[.](fasta$|fas$|fna$|faa$|fsa$|fa$)', '', input_file,
+                      flags=re.I)
+   coverage_input_file = shortname + ".cov"
+   output_file = shortname + ".new.fasta"
+   return [input_file, coverage_input_file, output_file]
+
+# Reports whether the required input option is missing
+def valid_arguments(opts, args):
+   """Return True when ``opts.input_fp`` is None, False otherwise.
+
+   Despite the name, a True result means the arguments are NOT adequate:
+   main() prints the usage text and exits in that case.  ``args`` is unused.
+   """
+   return opts.input_fp is None
+
+def main(argv):
+   """Write a copy of the input fasta whose headers carry coverage values.
+
+   ``argv`` is the list of command-line arguments without the program name.
+   The coverage file is read as tab-separated text: the first column is the
+   sequence ID and the second its coverage.  Every record of the fasta file
+   is written to the output file as ``>ID coverage=VALUE`` followed by the
+   sequence; records whose ID is absent from the coverage file get
+   ``coverage=0``.  When -i is missing the usage text is printed and the
+   script exits with status 0.
+   """
+   (opts, args) = parser.parse_args(argv)
+   if valid_arguments(opts, args):
+      print(usage)
+      sys.exit(0)
+
+   # Derive the coverage and output file names from the input file name
+   input_fp = opts.input_fp
+   fasta_file, coverage_file, output_file = create_an_inputs_and_output(input_fp)
+
+   # Creates coverage dictionary (sequence ID -> coverage, kept as text)
+   coverage_dictionary = {}
+   with open(coverage_file, 'r') as coverage_file_in:
+      for line in coverage_file_in:
+         # Drop the line ending first; otherwise a coverage value in the
+         # last column drags a newline into the fasta header.
+         fields = line.rstrip('\r\n').split('\t')
+         if fields == ['']:
+            continue   # blank line
+         coverage_dictionary[fields[0]] = fields[1]
+
+   with open(output_file, 'w') as fileout:
+      for seq_record in SeqIO.parse(fasta_file, format = "fasta"):
+         # The default is the string '0': an int here would make the
+         # concatenation below raise TypeError for unlisted sequences.
+         coverage = coverage_dictionary.get(seq_record.id, '0')
+         description = "coverage=" + coverage
+         fileout.write('>%s %s\n%s\n' % (seq_record.id, description, seq_record.seq))
+
+# the main function 
+main(sys.argv[1:])