adding scripts to normalize and score kegg data from IMG

carden24 · Mar 18, 2016 · 41f88ce · 41f88ce
1 parent 1da7905
commit 41f88ce
Show file tree

Hide file tree

Showing 2 changed files with 210 additions and 0 deletions.
diff --git a/normalize_kegg.py b/normalize_kegg.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+
+import sys
+from optparse import OptionParser
+
+# Free-form description of the script; nothing else in this script reads it.
+script_info = {
+    'brief_description': """This script normalizes the KO count according to the number of read in the library.""",
+    'script_usage': [],
+}
+
+# Usage text: handed to OptionParser below and printed by main() when a
+# required option is missing.
+usage = '''
+Usage:  
+
+python normalize_kegg.py -i <kegg table input> -d <list_of_files.txt>
+ -o <output text file>
+'''
+
+# Command-line interface shared with main().
+parser = OptionParser(usage)
+parser.add_option(
+    "-i", "--input_kegg_count_file",
+    dest="input_file",
+    help='The tabular input [REQUIRED]',
+)
+# NOTE(review): "ciybt" looks like a typo for "count". The spelling is part of
+# the public command line, so it is left untouched here.
+parser.add_option(
+    "-d", "--library_read_ciybt",
+    dest="dictionary_file",
+    help='The dictionary file [REQUIRED]',
+)
+# The default is the string 'None', not the None object: main() compares the
+# option value against that exact string to detect a missing -o.
+parser.add_option(
+    "-o", "--output_table",
+    dest="output_file",
+    default='None',
+    help='The output file [OPTIONAL]',
+)
+parser.add_option("-v", dest="verbose", action="store_true")
+
+
+# Reports whether a required option is missing.  Despite the name, a True
+# result means the arguments are NOT adequate; main() then prints the usage
+# text and exits.
+def valid_arguments(opts, args):
+    """Return True when -i or -d was not supplied, False otherwise.
+
+    opts is the values object produced by the option parser; args is accepted
+    but not used.
+    """
+    missing_input = opts.input_file is None
+    missing_dictionary = opts.dictionary_file is None
+    return missing_input or missing_dictionary
+
+def _load_read_counts(dictionary_file):
+    """Read the tab-separated dictionary file into a dict.
+
+    Column 1 of each line becomes the key and column 2 the value, kept as the
+    raw string.  Lines with fewer than two columns (for example a trailing
+    blank line) are skipped instead of raising IndexError.
+    """
+    read_counts = {}
+    with open(dictionary_file, 'r') as filedict:
+        for line in filedict:
+            fields = line.strip('\n\r').split('\t')
+            if len(fields) < 2:
+                continue
+            read_counts[fields[0]] = fields[1]
+    return read_counts
+
+
+def main(argv):
+    """Normalize the counts of the -i table by the read count of its library.
+
+    argv is the list of command-line arguments without the program name.
+
+    The read count is looked up in the -d dictionary under the exact string
+    passed to -i.  Each input line is split on tabs into file, KEGG number and
+    count; the count is divided by the read count and multiplied by 1000, and
+    the three columns are written to the output file (default:
+    <input>_normalized).
+
+    Calls sys.exit() when a required option is missing or when no usable read
+    count exists for the input file.
+    """
+    # Parse the arguments we were handed rather than silently re-reading
+    # sys.argv, so main() honours its own parameter.
+    (opts, args) = parser.parse_args(argv)
+    verbose = opts.verbose
+    if verbose:
+        print('')
+        print("Verbosity is on")
+        print('Initializing...')
+
+    # valid_arguments() returns True when a required option is MISSING.
+    if valid_arguments(opts, args):
+        print(usage)
+        sys.exit(0)
+
+    # initialize the input file, dictionary, and output file
+    input_file = opts.input_file
+    dictionary_file = opts.dictionary_file
+    output_file = opts.output_file
+
+    # The parser uses the string 'None' as its "not given" marker for -o.
+    if output_file == 'None':
+        print('No output file was specified')
+        output_file = input_file + '_normalized'
+        print('Output will be directed to file: %s' % output_file)
+
+    if verbose:
+        print('Loading dictionary: %s' % dictionary_file)
+    file_read_count_dict = _load_read_counts(dictionary_file)
+    if verbose:
+        print('Loading complete')
+
+    # Normalizing the output
+    # This is how the summary file is formatted
+    # TXA-OM3C0-O3-3300001461.u.ko_summary.txt
+    # This is how the summary file "file" column looks like
+    # TXA/TXA-OM3C0/TXA-OM3C0-O3-3300001461.u.ko.txt
+    # or
+    # TXA-OM3C0-O3-3300001461.u.ko.txt
+    # NOTE(review): the lookup key is the -i argument exactly as typed, so the
+    # dictionary presumably has to list the input under that same string --
+    # confirm against the files actually used.
+    if verbose:
+        print('Starting normalization')
+    raw_read_count = file_read_count_dict.get(input_file)
+    if raw_read_count is None:
+        print("No read count was found for %s" % input_file)
+        sys.exit(0)
+
+    # The dictionary stores strings, so the count has to be converted before
+    # it can be tested: comparing the raw string with 0 never matched, and a
+    # "0" entry went on to raise ZeroDivisionError inside the loop.
+    try:
+        the_read_count = float(raw_read_count)
+    except ValueError:
+        the_read_count = 0.0
+    if the_read_count <= 0:
+        print("Read count for %s is not a positive number: %s"
+              % (input_file, raw_read_count))
+        sys.exit(1)
+
+    # The output is opened only after every check has passed, so a failed run
+    # no longer leaves behind (or truncates) an output file.
+    with open(input_file, 'r') as filein:
+        with open(output_file, 'w') as fileout:
+            for line2 in filein:
+                fields = line2.strip('\n\r').split('\t')  # file, KEGG number, count
+                if len(fields) < 3:
+                    # blank or truncated line: nothing to normalize
+                    continue
+                # This takes care of the wd path if it appears
+                the_file = fields[0].split('/')[-1]
+                the_KEGG_number = fields[1]
+                the_KEGG_count = int(fields[2])
+                the_new_KEGG_count = (the_KEGG_count / the_read_count) * 1000
+                fileout.write('%s\t%s\t%f\n'
+                              % (the_file, the_KEGG_number, the_new_KEGG_count))
+    if verbose:
+        print('Normalization complete')
+
+
+# the main function; guarded so importing this file does not run the script
+if __name__ == '__main__':
+    main(sys.argv[1:])
+
+
+
+
+
diff --git a/score_keggs.py b/score_keggs.py
@@ -0,0 +1,94 @@
+#!/usr/bin/python
+
+import sys
+import subprocess
+import re
+from optparse import OptionParser
+
+# Free-form description of the script; nothing else in this script reads it.
+script_info = {}
+script_info['brief_description'] = """This script converts a tabular KO summary 
+into a table with count for each KEGG Module """
+
+script_info['script_usage'] = []
+
+# Usage text: handed to OptionParser below and printed by main() when a
+# required option is missing.  It names score_keggs.py, the actual file name
+# of this script (it used to advertise score_keggs_to_module.py).
+usage = '''
+Usage:  
+
+python score_keggs.py -i <IMG kegg tabular input> -d <kegg dictionary>
+ -o <output text file>
+'''
+
+# Command-line interface shared with main().
+parser = OptionParser(usage)
+parser.add_option("-i", "--input_kegg_input", dest = "input_file",
+                  help = 'The tabular input [REQUIRED]')
+parser.add_option("-d", "--kegg_dictionary", dest = "dictionary_file",
+                  help = 'The dictionary file [REQUIRED]')
+# The default is the string 'None', not the None object: main() compares the
+# option value against that exact string to detect a missing -o.
+parser.add_option("-o", "--output_table", dest = "output_file", default = 'None',
+                  help = 'The output file [OPTIONAL]')
+# An explicit default makes opts.verbose a real boolean even when -v is absent.
+parser.add_option("-v", action="store_true", dest="verbose", default=False)
+
+
+
+
+# Reports whether a required option is missing.  Despite the name, a True
+# result means the arguments are NOT adequate; main() then prints the usage
+# text and exits.
+def valid_arguments(opts, args):
+    """Return True when -i or -d was not supplied, False otherwise.
+
+    opts is the values object produced by the option parser; args is accepted
+    but not used.
+    """
+    missing_input = opts.input_file is None
+    missing_dictionary = opts.dictionary_file is None
+    return missing_input or missing_dictionary
+
+
+def _sum_matching_counts(ko_id, input_lines):
+    """Add up column 3 of every input line that contains ko_id.
+
+    Matching is a plain substring test on the whole line, which is what the
+    former `grep <ko_id> <input>` call did for alphanumeric ids.  Lines with
+    fewer than three tab-separated columns are ignored.  Returns None when no
+    line contributes, so the caller can tell "absent" from a zero count.
+    """
+    total = None
+    for line in input_lines:
+        if ko_id not in line:
+            continue
+        fields = line.split('\t')
+        if len(fields) < 3:
+            continue
+        total = (total or 0.0) + float(fields[2])
+    return total
+
+
+def main(argv):
+    """Sum the counts of the -i table per key of the -d dictionary.
+
+    argv is the list of command-line arguments without the program name.
+
+    Each dictionary line is tab-separated: a key followed by the ids that
+    belong to it.  For every id, the third column of the input lines that
+    contain it is added to the key's total.  One line per key that had at
+    least one match is written to the output file as
+    <input file>, <key>, <total>, separated by tabs.
+
+    Calls sys.exit() when a required option is missing.
+    """
+    import os  # only needed to build the default output name
+
+    # Parse the arguments we were handed rather than silently re-reading
+    # sys.argv, so main() honours its own parameter.
+    (opts, args) = parser.parse_args(argv)
+    verbose = opts.verbose
+
+    if verbose:
+        print('')
+        print("Verbosity is on")
+        print('Initializing...')
+
+    # valid_arguments() returns True when a required option is MISSING.
+    if valid_arguments(opts, args):
+        print(usage)
+        sys.exit(0)
+
+    # initialize the input file, dictionary, and output file
+    input_file = opts.input_file
+    dictionary_file = opts.dictionary_file
+    output_file = opts.output_file
+    # The parser uses the string 'None' as its "not given" marker for -o.
+    if output_file == 'None':
+        # Only the dictionary's base name goes into the default name; gluing
+        # on a full path produced a file name with directory separators in it.
+        output_file = input_file + '_' + os.path.basename(dictionary_file)
+        if verbose:
+            print('No output file was specified')
+            print('Output will be directed to file: %s' % output_file)
+
+    # The input is read once and searched in memory.  The previous version ran
+    # `grep` through a shell for every id, which broke on file names or ids
+    # containing shell-special characters, hung on an empty id (grep then
+    # waits on stdin) and failed in float() when more than one line matched.
+    with open(input_file, 'r') as filein:
+        input_lines = [line.strip('\r\n') for line in filein]
+
+    ko_count = {}
+    key_order = []    # keys in order of first match, for a stable output order
+    match_cache = {}  # id -> summed count (or None); ids recur across keys
+    # Ko6506	K06478	K06458	K06459	K01488	K06713	K06450	K06451	K06454
+    with open(dictionary_file, 'r') as filedict:
+        for line in filedict:
+            fields = line.strip('\n\r').split("\t")
+            my_key = fields[0]
+            for result in fields[1:]:
+                if not result:
+                    # empty column, e.g. from a trailing tab
+                    continue
+                if result not in match_cache:
+                    match_cache[result] = _sum_matching_counts(result, input_lines)
+                count = match_cache[result]
+                if count is None:
+                    continue
+                if my_key not in ko_count:
+                    ko_count[my_key] = 0.0
+                    key_order.append(my_key)
+                ko_count[my_key] += count
+
+    # write family_dictionary
+    with open(output_file, 'w') as fileout:
+        for key in key_order:
+            fileout.write("%s\t%s\t%f\n" % (input_file, key, ko_count[key]))
+
+
+# the main function; guarded so importing this file does not run the script
+if __name__ == '__main__':
+    main(sys.argv[1:])