Updating

carden24 · Mar 7, 2018 · bad23a0 · bad23a0
1 parent 2e9f752
commit bad23a0
Show file tree

Hide file tree

Showing 3 changed files with 156 additions and 26 deletions.
diff --git a/filter_mothur_biome.py b/filter_mothur_biome.py
@@ -0,0 +1,48 @@
+# Remove unwanted samples from a mothur shared file.
+#
+# Usage: python filter_mothur_biome.py <shared file> <sample list>
+#
+# <sample list> holds one sample name per line. Every row of the shared
+# file whose second tab-separated column matches one of those names is
+# dropped; all other rows are copied unchanged to
+# <shared file without .shared>_filtered.shared.
+#
+# Written for Python 2 (print statements).
+
+import sys
+import re
+
+# Inputs
+if len(sys.argv) < 3:
+    sys.exit('Usage: python filter_mothur_biome.py <shared file> '
+             '<sample list>')
+shared_path = sys.argv[1]
+list_path = sys.argv[2]
+
+# Strip only a trailing ".shared". The flags must be passed by keyword:
+# the fourth positional argument of re.sub is count, not flags.
+shared_shortname = re.sub(r'[.]shared$', '', shared_path, flags=re.I)
+fileout_handle = shared_shortname + "_filtered.shared"
+
+# Names of the samples to remove, one per line of the list file
+with open(list_path, 'r') as filelist:
+    requested_samples = [sample.strip('\r\n') for sample in filelist]
+
+# Print number of requested samples
+print "%s records requested" % len(requested_samples)
+
+# Set membership is O(1); the list above is kept only for the count
+unwanted = set(requested_samples)
+found_counter = 0
+with open(shared_path, 'r') as filein:
+    with open(fileout_handle, 'w') as fileout:
+        for line in filein:
+            fields = line.strip('\r\n').split('\t')
+            # The second column holds the sample name. Rows without one
+            # (e.g. a trailing blank line) are copied through untouched.
+            if len(fields) > 1 and fields[1] in unwanted:
+                found_counter += 1
+                continue
+            fileout.write(line)
+
+print "%s samples removed" % found_counter
diff --git a/score_blast_results_from_cazy.py b/score_blast_results_from_cazy.py
@@ -11,27 +11,27 @@
 
 #config = load_config()
 script_info = {}
-script_info['brief_description'] = """This script converts a tabular BLAST output
-             into a table with count for each CAZy family.
+script_info['brief_description'] = """This script converts a tabular BLAST
+             output into a table with count for each CAZy family.
              It requires a pregenerated dictionary (pkl) from the CAZy file used
              as BLAST database """
 
 script_info['script_usage'] = []
 
 usage = '''
-Usage:  
-python score_blast_results_from_cazy.py -i <blast tabular input> -d <cazy dictionary>
+Usage:
+python score_blast_results_from_cazy.py -i <blast tabular input>
+-d <cazy dictionary>
  -o <output text file>
 '''
 
 parser = OptionParser(usage)
-parser.add_option("-i", "--input_blast_result", dest = "input_file",
-                  help = 'The blast tabular input [REQUIRED]')
-parser.add_option("-d", "--dictionary_file", dest = "dictionary_file",
-                  help = 'The dictionary file [REQUIRED]')
-parser.add_option("-o", "--output_table", dest = "output_file", default = 'None',
-                  help = 'The output file [OPTIONAL]')
-
+parser.add_option("-i", "--input_blast_result", dest="input_file",
+                  help='The blast tabular input [REQUIRED]')
+parser.add_option("-d", "--dictionary_file", dest="dictionary_file",
+                  help='The dictionary file [REQUIRED]')
+parser.add_option("-o", "--output_table", dest="output_file", default='None',
+                  help='The output file [OPTIONAL]')
 
 
 # checks if the supplied arguments are adequate
@@ -41,7 +41,7 @@ def valid_arguments(opts, args):
     else:
         return False
 
-   
+
 def main(argv):
     (opts, args) = parser.parse_args()
     print ''
@@ -52,21 +52,21 @@ def main(argv):
         sys.exit(0)
 
     # initialize the input file, dictionary, and output file
-    input_file = opts.input_file 
+    input_file = opts.input_file
     dictionary_file = opts.dictionary_file
     output_file = opts.output_file
-    if output_file == 'None' :
+    if output_file == 'None':
         print 'No output file was specified'
         output_file = input_file + '_cazy_family'
-        print 'Output will be directed to file: %s' %output_file
+        print 'Output will be directed to file: %s' % output_file
 #    else:
 #        continue
 
     # Open input and outputs
     filedict = open(dictionary_file, 'rb')
     filein = open(input_file, 'r')
     fileout = open(output_file, 'w')
-    print 'Loading dictionary %s' %dictionary_file
+    print 'Loading dictionary %s' % dictionary_file
     cazy_dictionary = pickle.load(filedict)
     print 'Loading complete'
     family_output_dictionary = {}
@@ -76,33 +76,33 @@ def main(argv):
     for line in filein:
         entry_counter = entry_counter + 1
         # Split line from blast output
-        line = line.split('\t')			
+        line = line.split('\t')
         # Get subject from blast output
         subject = line[1]
 #        print subject
         # Get info for that subject in dictionary
         new_dictionary_entry = cazy_dictionary.get(subject)
         if new_dictionary_entry == None:
-            print 'Could not find data for %s' %subject
+            print 'Could not find data for %s' % subject
             print 'Please check you are using the correct dictionary file'
             sys.exit(0)
          # Obtain CAZy family
         family = new_dictionary_entry[1]
         # Entry family name in results dictionary
         # If there is no result use zero as the initial count
-        initial_count = family_output_dictionary.get(family,0)
+        initial_count = family_output_dictionary.get(family, 0)
         # Update count
-        updated_count = initial_count + 1		
-        # Update entry in dictionary			
-        family_output_dictionary[family] = updated_count			
+        updated_count = initial_count + 1
+        # Update entry in dictionary
+        family_output_dictionary[family] = updated_count
 
-    print 'Found a total of %s entries' %entry_counter
-    print 'And a total of %d families' %len(family_output_dictionary.keys())
-    print 'Writing to %s' %output_file
+    print 'Found a total of %s entries' % entry_counter
+    print 'And a total of %d families' % len(family_output_dictionary.keys())
+    print 'Writing to %s' % output_file
 
     # write family_dictionary
     for key, value in family_output_dictionary.iteritems():
-        fileout.write("%s\t%d\n" %(key, value))
+        fileout.write("%s\t%d\n" % (key, value))
     fileout.close()
     filein.close()
     filedict.close()

diff --git a/score_butyrate.py b/score_butyrate.py
@@ -0,0 +1,82 @@
+#! /usr/bin/python
+
+# author__ = "Erick Cardenas Poire"
+# Usage python ./score_butyrate.py -i <input file> -d <dictionary file>
+
+import sys
+import re
+from optparse import OptionParser
+
+
+# Description and usage text for this script
+script_info = {}
+script_info['brief_description'] = """Converts a tabular BLAST output
+             into a table with count for butyrate synthesis genes.
+             It requires a table that links the img ids with the gene names"""
+
+script_info['script_usage'] = []
+
+usage = '''
+Usage:
+python score_butyrate.py -i <blast tabular input> -d <dictionary file>
+The summary is written to <input without .txt>_summary.txt
+'''
+
+parser = OptionParser(usage)
+parser.add_option("-i", "--input_blast_result", dest="input_file",
+                  help='The blast tabular input [REQUIRED]')
+parser.add_option("-d", "--dictionary_file", dest="dictionary_file",
+                  help='The dictionary file [REQUIRED]')
+
+
+def main(argv):
+    """Sum per-subject hit counts into per-gene totals.
+
+    Reads the -d table (tab-separated: id, gene name) and the -i file
+    (whitespace-separated: count, subject id), prints the gene matched
+    by each input row, and writes one tab-separated line per gene
+    (input file name, gene, total) to <input without .txt>_summary.txt.
+    Subjects missing from the table are pooled under 'no_match'.
+    argv is accepted but not used; parse_args() reads sys.argv itself.
+    """
+    (opts, args) = parser.parse_args()
+    if not opts.input_file or not opts.dictionary_file:
+        parser.error('both -i and -d are required')
+    print ''
+    print 'Initializing...'
+
+    input_file = opts.input_file
+    # Strip only a trailing ".txt". The flags must be passed by keyword:
+    # the fourth positional argument of re.sub is count, not flags.
+    shortname = re.sub(r'[.]txt$', '', input_file, flags=re.I)
+    output_file = shortname + "_summary.txt"
+
+    gene_id_dictionary = {}
+    with open(opts.dictionary_file, 'r') as filedict:
+        for line in filedict:
+            # Strip in one call so "name  \r\n" is cleaned as well
+            line = line.rstrip(' \r\n')
+            if not line:
+                continue
+            fields = line.split('\t')
+            gene_id_dictionary[fields[0]] = fields[1]
+
+    gene_count_dictionary = {}
+    with open(input_file, 'r') as filein:
+        for line2 in filein:
+            # split() tolerates padding, repeated blanks and "\r\n"
+            fields = line2.split()
+            if not fields:
+                continue
+            the_gene = gene_id_dictionary.get(fields[1], 'no_match')
+            print the_gene
+            gene_count_dictionary[the_gene] = (
+                gene_count_dictionary.get(the_gene, 0) + int(fields[0]))
+
+    with open(output_file, 'w') as fileout:
+        for key, value in gene_count_dictionary.iteritems():
+            fileout.write("%s\t%s\t%s\n" % (input_file, key, value))
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])