diff --git a/GET_LINEAGES_NCBI.EC.py b/GET_LINEAGES_NCBI.EC.py
index 1dfe710..0ed9869 100644
--- a/GET_LINEAGES_NCBI.EC.py
+++ b/GET_LINEAGES_NCBI.EC.py
@@ -1,3 +1,5 @@
 #!/usr/bin/python
+# Author: Roli Wilhelm
+
 import sys, os, re, getopt, glob, subprocess, os.path, numpy as np, time
 import timeit
diff --git a/GET_LINEAGES_NCBI.py b/GET_LINEAGES_NCBI.py
index e38d400..5ebd273 100755
--- a/GET_LINEAGES_NCBI.py
+++ b/GET_LINEAGES_NCBI.py
@@ -1,3 +1,5 @@
 #!/usr/bin/python
+# Author: Roli Wilhelm
+
 import sys, os, re, getopt, glob, subprocess, os.path, numpy as np, time
 import timeit
diff --git a/HMM_search_and_parse_and_extract.py b/HMM_search_and_parse_and_extract.py
index e6da306..14f279e 100644
--- a/HMM_search_and_parse_and_extract.py
+++ b/HMM_search_and_parse_and_extract.py
@@ -21,7 +21,7 @@
 #config = load_config()
 
-script_info={}
+script_info = {}
 script_info['brief_description'] = """Filters sequence according to a minimum size parameter"""
 script_info['script_description'] = """HMMER parser. Runs hmmscan, filters
@@ -64,12 +64,12 @@
                   '\n(d) \'all\' -- Extract hits, contigs, and all\
 proteins from hits\n')
 
-#Compiling frequently used regular expression patterns
+# Compiling frequently used regular expression patterns
 hmm_pattern = re.compile('[.](hmm)')
 query_pattern = re.compile('[.](fasta$|fas$|faa$|fsa$|fa$)')
 
-# checks if the supplied arguments are adequate
+# Checks if the supplied arguments are adequate
 def valid_arguments(opts, args):
     if (opts.input_model == None or opts.input_fp == None ):
         return True
@@ -98,13 +98,12 @@ def update_progress(progress):
     sys.stderr.flush()
 
-#Get HMM length function
+# Get HMM length function
 def get_hmm_len(input_model):
-#    hmmshortname = re.sub('[.](hmm)','',input_model, re.I)
-    hmmshortname = re.sub(hmm_pattern,'',input_model, re.I)
-    hmm_leng_file = hmmshortname+".length.txt"
-    hmm_fileout = open(hmm_leng_file,'w')
-    hmm_filein = open(input_model,'r')
+    hmmshortname = re.sub(hmm_pattern, '', input_model, re.I)
+    hmm_leng_file = hmmshortname + ".length.txt"
+    hmm_fileout = open(hmm_leng_file, 'w')
+    hmm_filein = open(input_model, 'r')
     for line in hmm_filein:
         if line.startswith('NAME'):
             line = line.strip('\n')
@@ -121,32 +120,28 @@ def get_hmm_len(input_model):
             continue
     hmm_fileout.close()
     hmm_filein.close()
-    os.system(' '.join(['cp',hmm_leng_file,'all.hmm.ps.len']))
+    os.system(' '.join(['cp', hmm_leng_file, 'all.hmm.ps.len']))
 
-#Function to run hmmscan and parse
+# Function to run hmmscan and parse
 def run_hmm_scan (model,query,output):
-    #removes extension, case insensitive search
-#    hmmshortname = re.sub('[.](hmm)','',model, re.I)
-    hmmshortname = re.sub(hmm_pattern,'',model, re.I)
-    #finds file format removes extension, case insensitive search
-#    shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I)
-    shortname = re.sub(query_pattern,'',query, re.I)
+    # Removes the extension (case-insensitive search)
+    hmmshortname = re.sub(hmm_pattern, '', model, re.I)
+    # Finds the file format and removes the extension (case-insensitive search)
+    shortname = re.sub(query_pattern, '', query, re.I)
     output_file = output + "/" + shortname + "_" + hmmshortname + '.hmm.out'
     output_file2 = output +"/" + shortname + "_" + hmmshortname + '.txt'
     print 'Running hmmscan...'
-    os.system(' '.join(['hmmscan',model,query,">",output_file]))
+    os.system(' '.join(['hmmscan', model, query, ">", output_file]))
    print 'Parsing results...'
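# Illustrative sketch, not part of the patch: joining shell commands with
# os.system, as above, breaks on paths containing spaces and hides non-zero
# exit codes. A subprocess-based equivalent of the hmmscan call could look
# like this; the function name is hypothetical.
import subprocess

def run_hmmscan_safely(model, query, output_file):
    # Write hmmscan's stdout straight to the output file; raises
    # CalledProcessError if hmmscan exits non-zero.
    with open(output_file, 'w') as out:
        subprocess.check_call(['hmmscan', model, query], stdout=out)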
- os.system(' '.join(['sh','hmmscan-parser.sh',output_file,'>',output_file2])) + os.system(' '.join(['sh', 'hmmscan-parser.sh', output_file, '>', output_file2])) -#Filtering by evalue and coverage -def filtering_by_evalue_and_coverage(model,query,output,evalue,coverage): - #removes extension, case insensitive search -# hmmshortname = re.sub('[.](hmm)','',model, re.I) +# Filtering by evalue and coverage +def filtering_by_evalue_and_coverage(model, query, output, evalue, coverage): + # Removes extension, case insensitive search hmmshortname = re.sub(hmm_pattern,'',model, re.I) - #finds file format removes extension, case insensitive search -# shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I) - shortname = re.sub(query_pattern,'',query, re.I) + # Finds file format removes extension, case insensitive search + shortname = re.sub(query_pattern, '', query, re.I) output_file2 = output+"/" + shortname + "_" + hmmshortname + '.txt' hmm_table = open(output_file2, 'r') output_file3 = output + "/" + shortname + "_" + hmmshortname+'.filtered.txt' @@ -157,7 +152,7 @@ def filtering_by_evalue_and_coverage(model,query,output,evalue,coverage): line2 = line.strip('\n').split('\t') result_evalue = float(line2[2]) result_model_coverage = float(line2[7]) - if (result_evalue <=evalue) and (result_model_coverage*100 >= coverage): + if (result_evalue <= evalue) and (result_model_coverage * 100 >= coverage): hmm_filtered_table.write('%s' %line) else: continue @@ -165,59 +160,56 @@ def filtering_by_evalue_and_coverage(model,query,output,evalue,coverage): hmm_filtered_table.close() -#Function to extract hits from filtered results +# Function to extract hits from filtered results def extract_protein_hits(query,model,output): - #removes extension, case insensitive search - hmmshortname = re.sub(hmm_pattern,'',model, re.I) -# hmmshortname = re.sub('[.](hmm)', '', model, re.I) - #finds file format removes extension, case insensitive search -# shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I) - shortname = re.sub(query_pattern,'',query, re.I) - input_file4 = output+"/"+shortname+"_"+hmmshortname+'.filtered.txt' + # Removes extension, case insensitive search + hmmshortname = re.sub(hmm_pattern, '', model, re.I) + # Finds file format removes extension, case insensitive search + shortname = re.sub(query_pattern, '', query, re.I) + input_file4 = output + "/" + shortname + "_" + hmmshortname + '.filtered.txt' hmm_filtered_table2 = open(input_file4, 'r') - print ' Extracting proteins for %s and HMM database=%s' %(query,model) - #Create dictionary with protein:[list of model it hits] + print ' Extracting proteins for %s and HMM database=%s' %(query, model) + # Create dictionary with protein:[list of model it hits] protein_hit_dictionary = {} all_models_hits = [] for line3 in hmm_filtered_table2: line4 = line3.strip('\n').split('\t') protein_hit = line4[0] model_of_protein_hit = line4[1].rstrip(' ') - - #update list of proteins + # Update list of proteins all_models_hits.append(model_of_protein_hit) - #Get list of proteins hits, if non existent create empty list + # Get list of proteins hits, if non existent create empty list models = protein_hit_dictionary.get(protein_hit, []) - #Append current model hit to list + # Append current model hit to list models.append(model_of_protein_hit) - #Update dictionary entry + # Update dictionary entry protein_hit_dictionary[protein_hit] = models - #Print message - count_of_models=list(set(all_models_hits)) - 
count_of_proteins=len(protein_hit_dictionary.keys()) + # Print message + count_of_models = list(set(all_models_hits)) + count_of_proteins = len(protein_hit_dictionary.keys()) print ' Extracting %s unique proteins corresponding to %s HMM models' \ %(count_of_proteins,len(count_of_models)) - #open one output file per model - #Generate list of output files - #for item in all_models_hits: + # Open one output file per model + # Generate list of output files + # For item in all_models_hits: files = [open(output + '/' + shortname + '_' + hmmshortname + '_' + item + '.fasta', 'w') \ for item in set(all_models_hits)] - #Open original file, find if name is in hit list, - #Then get models hits and write to model result files + # Open original file, find if name is in hit list, + # Then get models hits and write to model result files filein = open(query, 'r') - for record in SeqIO.parse(filein,"fasta"): + for record in SeqIO.parse(filein, "fasta"): name = record.name if name in protein_hit_dictionary.keys(): what_models_list = protein_hit_dictionary.get(name) - #Iterate this list + # Iterate this list for what_model in what_models_list: - #Find index + # Find index index = count_of_models.index(what_model) files[index].write('>%s\n%s\n' % (name, record.seq)) #Close files @@ -226,46 +218,44 @@ def extract_protein_hits(query,model,output): #Function to extract contigs -def extract_contigs(query,model,output,assembly_file): - # removes extension, case insensitive search -# hmmshortname = re.sub('[.](hmm)', '', model, re.I) - hmmshortname = re.sub(hmm_pattern,'',model, re.I) - #finds file format removes extension, case insensitive search -# shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','', query, re.I) - shortname = re.sub(query_pattern,'',query, re.I) +def extract_contigs(query, model, output, assembly_file): + # Removes extension, case insensitive search + hmmshortname = re.sub(hmm_pattern, '', model, re.I) + # Finds file format removes extension, case insensitive search + shortname = re.sub(query_pattern, '', query, re.I) input_file4 = output + "/" + shortname + "_" + hmmshortname +'.filtered.txt' hmm_filtered_table2 = open(input_file4, 'r') - print ' Extracting contigs for file=%s and HMM database=%s' %(query,model) + print ' Extracting contigs for file=%s and HMM database=%s' %(query, model) #Create dictionary with protein:[list of model it hits] protein_model_dictionary = {} for line3 in hmm_filtered_table2: line4 = line3.strip('\n').split('\t') protein_hit = line4[0] model_of_protein_hit = line4[1].rstrip(' ') - #Get list of proteins hits, if non existent create empty list + # Get list of proteins hits, if non existent create empty list models = protein_model_dictionary.get(protein_hit, []) - #Append current model hit to list + # Append current model hit to list models.append(model_of_protein_hit) - #Update dictionary entry + # Update dictionary entry protein_model_dictionary[protein_hit] = models - #Create protein-contig dictionary + # Create protein-contig dictionary contigs_list = [] - #parse through list and add to contigs_list + # Parse through list and add to contigs_list for protein in protein_model_dictionary.keys(): contig = protein.rsplit('_', 1) contigs_list.append(contig[0]) contigs_list = list(set(contigs_list)) - #Open original file, find if name is in hit list, - #Then get models hits and write to model result files - assembly_in = open(assembly_file,'r') - contigs_file = output+"/" + shortname + "_" + hmmshortname +'_contigs.fasta' + # Open original file, find if name is in 
hit list, + # Then get models hits and write to model result files + assembly_in = open(assembly_file, 'r') + contigs_file = output + "/" + shortname + "_" + hmmshortname + '_contigs.fasta' contigs_out = open(contigs_file, 'w') print ' Looking for %s contigs' %len(contigs_list) progress_counter = 0 - for record in SeqIO.parse(assembly_in,"fasta"): + for record in SeqIO.parse(assembly_in, "fasta"): name = record.name if name in contigs_list: progress_counter = progress_counter + 1 @@ -278,13 +268,11 @@ def extract_contigs(query,model,output,assembly_file): print 'Some contigs were not found' -#Function to extract all proteins from contig +# Function to extract all proteins from contig def extract_all_proteins_from_contigs(query, model, output): # Removes extension, case insensitive search - hmmshortname = re.sub(hmm_pattern,'',model, re.I) -# hmmshortname = re.sub('[.](hmm)','',model, re.I) + hmmshortname = re.sub(hmm_pattern, '', model, re.I) # Finds file format removes extension, case insensitive search -# shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I) shortname = re.sub(query_pattern, '', query, re.I) input_file4 = output + "/" + shortname + "_" + hmmshortname + '.filtered.txt' hmm_filtered_table2 = open(input_file4, 'r') @@ -311,7 +299,7 @@ def extract_all_proteins_from_contigs(query, model, output): # Open one output file per model # Generate list of output files files = [open(output + '/' + shortname + '_' + hmmshortname + '_' \ - + contigs + '.fasta','w') for contigs in (contigs_list)] + + contigs + '.fasta', 'w') for contigs in (contigs_list)] # Open original file, find if name is in hit list, # Then get models hits and write to model result files @@ -346,7 +334,7 @@ def main(argv): raise IOError,\ "Cannot open hmmscan-parser.sh. Please copy it to the local directory" - # initialize the input file and model, loading parameters + # Initialize the input file and model, loading parameters input_model = opts.input_model input_fp = opts.input_fp output_dir = opts.output_dir @@ -357,8 +345,7 @@ def main(argv): # Creates a model length dictionary print 'Checking model length...' -# hmmshortname = re.sub('[.](hmm)', '', input_model, re.I) - hmmshortname = re.sub(hmm_pattern,'',input_model, re.I) + hmmshortname = re.sub(hmm_pattern, '', input_model, re.I) hmm_leng_file = hmmshortname + ".length.txt" print ' Created %s file' % hmm_leng_file get_hmm_len(input_model) @@ -380,7 +367,6 @@ def main(argv): extract_protein_hits(input_fp, input_model, output_dir) elif extract_mode == 'contigs': extract_contigs(input_fp, input_model, output_dir, assembly_file) - elif extract_mode == 'all': extract_protein_hits(input_fp, input_model, output_dir) extract_contigs(input_fp, input_model, output_dir, assembly_file) diff --git a/add_coverage_to_fasta_contigs_2.py b/add_coverage_to_fasta_contigs_2.py new file mode 100644 index 0000000..7aa658c --- /dev/null +++ b/add_coverage_to_fasta_contigs_2.py @@ -0,0 +1,84 @@ +#!/usr/bin/python +# File created on 31 Jan 2014. 
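# Illustrative sketch, not part of the patch: the coverage table read below
# is assumed to be two tab-separated columns, sequence ID and coverage.
# Stripping the newline and keeping the default as a string avoids
# concatenating str + int later in "coverage=" + coverage; the file name
# here is hypothetical.
coverage_dictionary = {}
for line in open('example.cov'):
    fields = line.rstrip('\n').split('\t')
    if len(fields) >= 2:
        coverage_dictionary[fields[0]] = fields[1]
coverage = coverage_dictionary.get('contig_1', '0')  # default stays a string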
+ +__author__ = "Erick Cardenas Poire" +__copyright__ = "Copyright 2014" +__credits__ = [""] +__version__ = "1.0" +__maintainer__ = "Erick Cardenas Poire" +__status__ = "Release" + +from Bio import SeqIO +import sys +from os import makedirs, sys, listdir, environ, path +import re +import inspect +from commands import getstatusoutput +from optparse import OptionParser +import shutil + +#config = load_config() +script_info = {} +script_info['brief_description'] = """Adds coverage information from one file and modifies fasta header""" +script_info['script_description'] = """Adds coverage information from one file and modifies fasta header + REQUIRED: You must have a fasta and coverage file with same base name""" +script_info['script_usage'] = [] + +usage= """ +Need to run it like this: +./add.coverage.to.fasta.py -i input_file +For more options: ./add.coverage.to.fasta.py -h""" + +parser = OptionParser(usage) +parser.add_option("-i", "--input_file", dest = "input_fp", + help = 'the input fasta file/input dir [REQUIRED]') + + +#creates an input output pair if input is just an input file +def create_an_inputs_and_output(input_file): + input_output = [] + shortname = re.sub('[.](fasta$|fas$|fna$|faa$|fsa$|fa$)','',input_file, re.I) #finds file format removes extension, case insensitive search + coverage_input_file = shortname+".cov" + output_file = shortname + ".new.fasta" + input_output.append(input_file) + input_output.append(coverage_input_file) + input_output.append(output_file) + return input_output + +# checks if the supplied arguments are adequate +def valid_arguments(opts, args): + if opts.input_fp == None: + return True + else: + return False + +def main(argv): + (opts, args) = parser.parse_args() + if valid_arguments(opts, args): + print usage + sys.exit(0) + + # initialize the input directory or file + input_fp = opts.input_fp + list_of_files = create_an_inputs_and_output(input_fp) + + # Creates coverage dictionary + coverage_dictionary = {} + coverage_file_in = open(list_of_files[1],'r') + for line in coverage_file_in: + line = line.split('\t') + seq_ID = line[0] + seq_coverage = line[1] + coverage_dictionary[seq_ID] = seq_coverage + coverage_file_in.close() + + fileout = open(list_of_files[2], 'w') + for seq_record in SeqIO.parse(list_of_files[0], format = "fasta"): + seq_name = seq_record.id + coverage = coverage_dictionary.get(seq_name,0) + description = "coverage=" + coverage + fileout.write('>%s %s\n%s\n' %(seq_record.id, description, seq_record.seq)) + fileout.close() + +# the main function +main(sys.argv[1:]) \ No newline at end of file diff --git a/add_lineage_to_dictionary_2.py b/add_lineage_to_dictionary_2.py new file mode 100644 index 0000000..a37af93 --- /dev/null +++ b/add_lineage_to_dictionary_2.py @@ -0,0 +1,138 @@ +#!/usr/bin/python +import sys, os, re, glob, subprocess, numpy as np, pickle +from cogent.parse.ncbi_taxonomy import NcbiTaxonomyFromFiles +from optparse import OptionParser + +#config = load_config() +script_info={} +script_info['brief_description'] = """Adds lineage to dictionary""" +script_info['script_description'] = """Adds lineage to dictionary""" +script_info['script_usage'] = [] + +usage= """ +Need to run it like this: +./add_lineage_to_dictionary -i input_file""" + +parser = OptionParser(usage) +parser.add_option("-i", "--input_dictionary", dest="input_fp", + help='the input dictionary file [REQUIRED]') +parser.add_option("-o", "--destination_dictionary", dest="output_fp", + help='the output dictionary file [REQUIRED]') +parser.add_option("-t", 
"--tax_level", dest="tax_level", + help='the desired taxonomic levels [REQUIRED]') +parser.add_option("-d", "--ncbi_database", dest="ncbi_db", + help='ncbi database [REQUIRED]') + +## Define function for pulling lineage info from NCBI nodes and names files +def get_lineage(node, my_ranks): + ranks_lookup = dict([(r,idx) for idx, r in enumerate(my_ranks)]) + lineage = [None] * len(my_ranks) + curr = node + while curr.Parent is not None: + if curr.Rank in ranks_lookup: + lineage[ranks_lookup[curr.Rank]] = curr.Name + curr = curr.Parent + return lineage + + +#def get_lineage_from_taxid(gi): +# try: +# # Superkingdom search +# node = tree.ById[taxid] + +# tax_superkingdom = get_lineage(node, 'superkingdom') +# tax_superkingdom = str(tax_superkingdom[0]).lower() + +# tax_phylum = get_lineage(node, 'phylum') +# tax_phylum = str(tax_phylum[0]).lower() + +# tax_class = get_lineage(node, 'class') +# tax_class = str(tax_class[0]).lower() + +# tax_order = get_lineage(node, 'order') +# tax_order = str(tax_order[0]).lower() + + # tax_family = get_lineage(node, 'family') + # tax_family = str(tax_family[0]).lower() + + # tax_genus = get_lineage(node, 'genus') + # tax_genus = str(tax_genus[0]).lower() + + # tax_species = get_lineage(node, 'species') + # tax_species = str(tax_species[0]).lower() + + # tax = [tax_superkingdom, tax_phylum, tax_class, tax_order, tax_family, tax_genus, tax_species] + #except KeyError: + # tax = ['NA','NA','NA','NA','NA','NA','NA'] + #print tax + + + +def main(argv): + (opts, args) = parser.parse_args() + + #Initialize files + input_fp = opts.input_fp + input_dictionary_file = open(input_fp, "rb") + input_dictionary = pickle.load(input_dictionary_file) + + output_fp = opts.output_fp + output_file = open(output_fp, "w") +# output = {} + ncbi_db = opts.ncbi_db + + # Print loading dictionary +# test_dictionary = {'gi_63300aaa':['a','b','c'], 'gi|163862923|gb_ABY43982.1_': ['d','e','f']} + + # Print loading tree +# tree = NcbiTaxonomyFromFiles(open('nodes.dmp'), open('names.dmp')) +# root = tree.Root +# all_taxids = [] + + for key in input_dictionary.keys(): +# for key in test_dictionary.keys(): + #print key + if key.startswith('gi|'): + gi_location = key.split('|') + gi = gi_location[1] + else: + gi_location = key.split('_') + gi = gi_location[1] +# subprocess.call('grep --max-count=1 \"'+gi+'\" \"'+ncbi_db+'\" | tee -a blast_taxid.txt', shell = True) + grep = subprocess.Popen('grep --max-count=1 \"'+gi+'\" \"'+ncbi_db+'\"', shell = True, stdout = subprocess.PIPE) + node0 = grep.communicate()[0] + node1 = node0.strip('\n').split('\t') + try: + taxid = int(node1[1]) + except IndexError: + taxid = 'nope' + #print taxid + output_file.write ('%s\t%s\t%s\n' %(key, gi, taxid)) + #print 'end' + +# if taxid == None : +# print 'No taxid found for gi %s' %gi +# taxid = 'nope' +# else: +# continue + # all_taxids.append(taxid) + +# value = test_dictionary.get(key) +# print value +# print 'gi is %s' %gi +# value2 = value.append(str(gi)) +# print value2 +# value3 = value2.append(str(taxid)) +# print value + +# output[key] = value2 +# print len(set(all_taxids)) +# print all_taxids.count('none') +# input_dictionary_file.close() +# pickle.dump(output,output_file) + output_file.close() + +# Run main function +main(sys.argv[1:]) + + diff --git a/all_hmm_ps_2.len b/all_hmm_ps_2.len new file mode 100644 index 0000000..d78b143 --- /dev/null +++ b/all_hmm_ps_2.len @@ -0,0 +1,333 @@ +CBM10.hmm 28 +CBM11.hmm 163 +CBM12.hmm 34 +CBM13.hmm 188 +CBM14.hmm 54 +CBM15.hmm 146 +CBM16.hmm 116 +CBM17.hmm 203 
+CBM18.hmm 38 +CBM19.hmm 45 +CBM1.hmm 29 +CBM20.hmm 90 +CBM21.hmm 107 +CBM22.hmm 131 +CBM23.hmm 162 +CBM24.hmm 76 +CBM25.hmm 78 +CBM26.hmm 75 +CBM27.hmm 168 +CBM28.hmm 208 +CBM29.hmm 144 +CBM2.hmm 101 +CBM30.hmm 91 +CBM31.hmm 92 +CBM32.hmm 124 +CBM34.hmm 120 +CBM35.hmm 123 +CBM36.hmm 115 +CBM37.hmm 62 +CBM38.hmm 129 +CBM39.hmm 94 +CBM3.hmm 88 +CBM40.hmm 179 +CBM41.hmm 102 +CBM42.hmm 136 +CBM43.hmm 83 +CBM44.hmm 64 +CBM45.hmm 81 +CBM46.hmm 87 +CBM47.hmm 128 +CBM48.hmm 76 +CBM49.hmm 78 +CBM4.hmm 126 +CBM50.hmm 40 +CBM51.hmm 134 +CBM52.hmm 52 +CBM53.hmm 87 +CBM54.hmm 114 +CBM55.hmm 46 +CBM56.hmm 159 +CBM57.hmm 147 +CBM58.hmm 117 +CBM59.hmm 145 +CBM5.hmm 40 +CBM60.hmm 108 +CBM61.hmm 141 +CBM62.hmm 131 +CBM6.hmm 138 +CBM8.hmm 143 +CBM9.hmm 182 +CE10.hmm 341 +CE11.hmm 271 +CE12.hmm 210 +CE13.hmm 355 +CE14.hmm 124 +CE15.hmm 269 +CE16.hmm 267 +CE1.hmm 227 +CE2.hmm 209 +CE3.hmm 194 +CE4.hmm 130 +CE5.hmm 189 +CE6.hmm 99 +CE7.hmm 313 +CE8.hmm 288 +CE9.hmm 373 +cohesin.hmm 134 +dockerin.hmm 21 +GH100.hmm 458 +GH102.hmm 157 +GH103.hmm 295 +GH104.hmm 145 +GH105.hmm 332 +GH106.hmm 824 +GH107.hmm 329 +GH108.hmm 86 +GH109.hmm 126 +GH10.hmm 303 +GH110.hmm 548 +GH111.hmm 1032 +GH112.hmm 715 +GH113.hmm 306 +GH114.hmm 190 +GH115.hmm 697 +GH116.hmm 363 +GH117.hmm 211 +GH118.hmm 477 +GH119.hmm 1070 +GH11.hmm 177 +GH120.hmm 91 +GH121.hmm 1392 +GH122.hmm 337 +GH123.hmm 538 +GH124.hmm 332 +GH125.hmm 402 +GH12.hmm 156 +GH13.hmm 299 +GH14.hmm 412 +GH15.hmm 361 +GH16.hmm 189 +GH17.hmm 311 +GH18.hmm 296 +GH19.hmm 231 +GH1.hmm 429 +GH20.hmm 337 +GH22.hmm 122 +GH23.hmm 135 +GH24.hmm 137 +GH25.hmm 177 +GH26.hmm 303 +GH27.hmm 375 +GH28.hmm 325 +GH29.hmm 346 +GH2.hmm 752 +GH30.hmm 417 +GH31.hmm 427 +GH32.hmm 293 +GH33.hmm 342 +GH34.hmm 461 +GH35.hmm 307 +GH36.hmm 688 +GH37.hmm 491 +GH38.hmm 269 +GH39.hmm 431 +GH3.hmm 216 +GH42.hmm 371 +GH43.hmm 248 +GH44.hmm 514 +GH45.hmm 198 +GH46.hmm 222 +GH47.hmm 446 +GH48.hmm 617 +GH49.hmm 549 +GH4.hmm 179 +GH50.hmm 653 +GH51.hmm 630 +GH52.hmm 415 +GH53.hmm 342 +GH54.hmm 316 +GH55.hmm 740 +GH56.hmm 333 +GH57.hmm 383 +GH58.hmm 449 +GH59.hmm 631 +GH5.hmm 275 +GH62.hmm 278 +GH63.hmm 570 +GH64.hmm 367 +GH65.hmm 372 +GH66.hmm 556 +GH67.hmm 669 +GH68.hmm 417 +GH6.hmm 294 +GH70.hmm 803 +GH71.hmm 375 +GH72.hmm 312 +GH73.hmm 128 +GH74.hmm 233 +GH75.hmm 220 +GH76.hmm 358 +GH77.hmm 494 +GH78.hmm 504 +GH79.hmm 455 +GH7.hmm 415 +GH80.hmm 63 +GH81.hmm 622 +GH82.hmm 185 +GH83.hmm 542 +GH84.hmm 295 +GH85.hmm 315 +GH86.hmm 591 +GH87.hmm 597 +GH88.hmm 329 +GH89.hmm 663 +GH8.hmm 320 +GH90.hmm 551 +GH91.hmm 395 +GH92.hmm 491 +GH93.hmm 307 +GH94.hmm 1036 +GH95.hmm 722 +GH96.hmm 614 +GH97.hmm 631 +GH98.hmm 327 +GH99.hmm 334 +GH9.hmm 418 +GT10.hmm 347 +GT11.hmm 276 +GT12.hmm 134 +GT13.hmm 395 +GT14.hmm 250 +GT15.hmm 273 +GT16.hmm 350 +GT17.hmm 284 +GT18.hmm 686 +GT19.hmm 354 +GT1.hmm 382 +GT20.hmm 475 +GT21.hmm 233 +GT22.hmm 389 +GT23.hmm 318 +GT24.hmm 248 +GT25.hmm 181 +GT26.hmm 171 +GT27.hmm 295 +GT28.hmm 157 +GT29.hmm 247 +GT2.hmm 168 +GT30.hmm 177 +GT31.hmm 192 +GT32.hmm 90 +GT33.hmm 425 +GT34.hmm 246 +GT35.hmm 674 +GT37.hmm 459 +GT38.hmm 467 +GT39.hmm 223 +GT3.hmm 637 +GT40.hmm 211 +GT41.hmm 705 +GT42.hmm 288 +GT43.hmm 212 +GT44.hmm 100 +GT45.hmm 115 +GT46.hmm 356 +GT47.hmm 296 +GT48.hmm 739 +GT49.hmm 337 +GT4.hmm 160 +GT50.hmm 262 +GT51.hmm 177 +GT52.hmm 263 +GT53.hmm 1049 +GT54.hmm 290 +GT55.hmm 383 +GT56.hmm 357 +GT57.hmm 481 +GT58.hmm 364 +GT59.hmm 404 +GT5.hmm 472 +GT60.hmm 330 +GT61.hmm 242 +GT62.hmm 268 +GT63.hmm 340 +GT64.hmm 248 +GT65.hmm 340 +GT66.hmm 693 +GT67.hmm 315 +GT68.hmm 350 +GT69.hmm 
239
+GT6.hmm	280
+GT70.hmm	368
+GT71.hmm	264
+GT72.hmm	355
+GT73.hmm	245
+GT74.hmm	280
+GT75.hmm	343
+GT76.hmm	407
+GT77.hmm	216
+GT78.hmm	134
+GT79.hmm	879
+GT7.hmm	250
+GT80.hmm	379
+GT81.hmm	293
+GT82.hmm	311
+GT83.hmm	540
+GT84.hmm	215
+GT85.hmm	427
+GT87.hmm	231
+GT88.hmm	523
+GT89.hmm	556
+GT8.hmm	257
+GT90.hmm	250
+GT91.hmm	451
+GT92.hmm	279
+GT9.hmm	225
+PL10.hmm	287
+PL11.hmm	606
+PL12.hmm	138
+PL13.hmm	363
+PL14.hmm	200
+PL15.hmm	134
+PL16.hmm	278
+PL17.hmm	139
+PL18.hmm	188
+PL1.hmm	202
+PL20.hmm	229
+PL21.hmm	72
+PL22.hmm	265
+PL2.hmm	530
+PL3.hmm	197
+PL4.hmm	567
+PL5.hmm	317
+PL6.hmm	372
+PL7.hmm	231
+PL8.hmm	259
+PL9.hmm	374
+SLH.hmm	42
+CBM63.hmm	78
+CBM64.hmm	85
+GH101.hmm	707
+GH126.hmm	321
+GH127.hmm	524
+GH128.hmm	224
+GH129.hmm	618
+GH130.hmm	296
+GT93.hmm	309
+GT94.hmm	283
+AA10.hmm	178
+AA1.hmm	943
+AA2.hmm	255
+AA3.hmm	618
+AA4.hmm	522
+AA5.hmm	1281
+AA6.hmm	195
+AA7.hmm	458
+AA8.hmm	815
+AA9.hmm	220
+CBM65.hmm	114
+CBM66.hmm	155
+CBM67.hmm	176
+GH131.hmm	255
+GH132.hmm	303
diff --git a/correct_cazy_dictionary.py b/correct_cazy_dictionary.py
index 33ad558..6a99b28 100755
--- a/correct_cazy_dictionary.py
+++ b/correct_cazy_dictionary.py
@@ -88,7 +88,7 @@
 print len(good_dictionary.keys())
 
 #print bad_dictionary
-pickle.dump(good_dictionary,file_good_dictionary)
+pickle.dump(good_dictionary, file_good_dictionary)
 
 file_bad_dictionary.close()
diff --git a/count_hmmer_hits.py b/count_hmmer_hits.py
new file mode 100644
index 0000000..fae6149
--- /dev/null
+++ b/count_hmmer_hits.py
@@ -0,0 +1,19 @@
+#!/usr/bin/python
+
+# Usage:
+# python count_hmmer_hits.py <hmmer output file>
+
+import sys
+
+filein = open(sys.argv[1], 'r')
+counter = 0
+
+for line in filein:
+    if line.rstrip().endswith('[number of targets reported over threshold]'):
+        # The reported count is the last field before the bracketed label
+        counter = counter + int(line.split('[')[0].split()[-1])
+    else:
+        continue
+
+print '%s\t%s' %(sys.argv[1], str(counter))
+
diff --git a/create_newcazy_dictionary.py b/create_newcazy_dictionary.py
index 699ffe5..bc22dcb 100755
--- a/create_newcazy_dictionary.py
+++ b/create_newcazy_dictionary.py
@@ -1,12 +1,7 @@
 #!/usr/bin/python
 # File created on 13 Feb 2014.
 
-__author__ = "Erick Cardenas Poire"
-__copyright__ = "Copyright 2014"
-__credits__ = [""]
-__version__ = "1.0"
-__maintainer__ = "Erick Cardenas Poire"
-__status__ = "Release"
+# Author: Erick Cardenas Poire
 
 import pickle
diff --git a/create_newcazy_dictionary_with_subfamilies.py b/create_newcazy_dictionary_with_subfamilies.py
index f8d092e..a248ba9 100755
--- a/create_newcazy_dictionary_with_subfamilies.py
+++ b/create_newcazy_dictionary_with_subfamilies.py
@@ -1,13 +1,7 @@
 #!/usr/bin/python
 # File created on 13 Feb 2014.
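# Illustrative sketch, not part of the patch: the create_newcazy_dictionary*
# scripts build a subject -> annotation mapping and persist it with pickle,
# which score_blast2.py later loads and indexes as dict_entry[1] (subfamily),
# dict_entry[2] (family) and dict_entry[3] (self-hit score). The exact value
# layout and file name shown here are assumptions.
import pickle
example = {'gi|49642693|emb|CAH00655.1|': ['CAH00655.1', 'GH13_28', 'GH13', 612.0]}
with open('cazy_dict.pkl', 'wb') as fh:      # hypothetical file name
    pickle.dump(example, fh)
entry = pickle.load(open('cazy_dict.pkl', 'rb'))['gi|49642693|emb|CAH00655.1|']
assert entry[2] == 'GH13'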
-__author__ = "Erick Cardenas Poire" -__copyright__ = "Copyright 2014" -__credits__ = [""] -__version__ = "1.0" -__maintainer__ = "Erick Cardenas Poire" -__status__ = "Release" - +# Author = "Erick Cardenas Poire" import pickle import sys diff --git a/dereplicate_fasta.py b/dereplicate_fasta.py index 24977f1..725eddf 100644 --- a/dereplicate_fasta.py +++ b/dereplicate_fasta.py @@ -1,12 +1,7 @@ #!/usr/bin/python from __future__ import division -__author__ = "Erick Cardenas Poire" -__copyright__ = "Copyright 2014" -__credits__ = [""] -__version__ = "1.0" -__maintainer__ = "Erick Cardenas Poire" -__status__ = "Release" +# Author = "Erick Cardenas Poire" try: from Bio import SeqIO @@ -22,7 +17,7 @@ # config = load_config() -script_info={} +script_info = {} script_info['brief_description'] = """Dereplicates sequences based on name""""" script_info['script_description'] = """ REQUIRED: Fasta file diff --git a/fasta_removal.py b/fasta_removal.py index 56f474f..a423e00 100644 --- a/fasta_removal.py +++ b/fasta_removal.py @@ -1,6 +1,7 @@ # Removes sequences that are in a list provided # Requires fasta file and list of sequences to be removed (one name per line) # Requires screed module + #usage #python fasta.removal.py # 0 1 2 @@ -8,8 +9,8 @@ import sys, screed # Inputs -filein = open(sys.argv[1],'r') -filelist = open(sys.argv[2],'r') +filein = open(sys.argv[1], 'r') +filelist = open(sys.argv[2], 'r') # Outputs outy = sys.argv[1] @@ -52,4 +53,4 @@ fileout.close() fileout2.close() -filein.close() \ No newline at end of file +filein.close() diff --git a/fasta_to_stockholm.py b/fasta_to_stockholm.py index 6d29834..6abc356 100755 --- a/fasta_to_stockholm.py +++ b/fasta_to_stockholm.py @@ -6,11 +6,11 @@ from Bio import AlignIO #input -filein=open(sys.argv[1],"r") +filein = open(sys.argv[1], "r") #outputs -fileout=open(sys.argv[2],'w') +fileout = open(sys.argv[2], 'w') -AlignIO.convert(filein,"fasta",fileout,"stockholm") +AlignIO.convert(filein, "fasta", fileout, "stockholm") diff --git a/fastaselection_nucleotide.py b/fastaselection_nucleotide.py index 4f86345..77b5dde 100755 --- a/fastaselection_nucleotide.py +++ b/fastaselection_nucleotide.py @@ -5,46 +5,46 @@ import sys, screed #inputs -filein=open(sys.argv[1],'r') -filelist=open(sys.argv[2],'r') +filein = open(sys.argv[1], 'r') +filelist = open(sys.argv[2], 'r') #outputs -outy=sys.argv[1] -out1=outy+'.cleaned.sequences' -fileout1=open(out1,'w') +outy = sys.argv[1] +out1 = outy + '.cleaned.sequences' +fileout1 = open(out1, 'w') -outy=sys.argv[1] -out2=outy+'.removed.sequences' -fileout2=open(out2,'w') +outy = sys.argv[1] +out2 = outy + '.removed.sequences' +fileout2 = open(out2, 'w') #create a list with the names of the sequences requested -requestedsequences=[] +requestedsequences = [] for line in filelist: - line=line.strip('\n').strip('\r') + line = line.strip('\n').strip('\r') requestedsequences.append(line) #print requestedsequences -number_records=len(requestedsequences) +number_records = len(requestedsequences) print "%s records requested" % number_records #read file, read each record, if name is in list write it, otherwise continue -counter=1 +counter = 1 for record in screed.open(sys.argv[1]): - sequence_name=record.name #get sequence name + sequence_name = record.name #get sequence name if sequence_name in requestedsequences: print "%s of %s records found" %(counter, number_records) - sequence=record.sequence - sequence=sequence.strip('*') - description=record.description + sequence = record.sequence + sequence = sequence.strip('*') + 
description = record.description fileout2.write(">%s %s\n%s\n" %(sequence_name, description, sequence)) - counter=counter+1 + counter = counter + 1 else: - sequence=record.sequence - sequence=sequence.strip('*') - description=record.description + sequence = record.sequence + sequence = sequence.strip('*') + description = record.description fileout1.write(">%s %s\n%s\n" %(sequence_name, description, sequence)) fileout.close() fileout2.close() diff --git a/fastaselection_protein_v2.py b/fastaselection_protein_v2.py index fce0662..1cde036 100644 --- a/fastaselection_protein_v2.py +++ b/fastaselection_protein_v2.py @@ -5,22 +5,22 @@ import sys, screed #inputs -filein=open(sys.argv[1],'r') -name=sys.argv[2] +filein = open(sys.argv[1], 'r') +name = sys.argv[2] #outputs -outy=sys.argv[1] -out1=outy+'.'+name+'.fasta' +outy = sys.argv[1] +out1 = outy + '.' + name + '.fasta' -fileout=open(out1,'w') +fileout = open(out1, 'w') #read file, read each record, if name is in list write it, otherwise continue for record in screed.open(sys.argv[1]): - sequence_name=record.name #get sequence name + sequence_name = record.name #get sequence name if sequence_name == name: - description=record.description - sequence=record.sequence + description = record.description + sequence = record.sequence print "Records found" fileout.write(">%s %s\n%s\n" %(sequence_name, description, sequence)) else: diff --git a/fastq-to-fasta.py b/fastq-to-fasta.py index 4757691..e972b0a 100755 --- a/fastq-to-fasta.py +++ b/fastq-to-fasta.py @@ -27,14 +27,14 @@ def update_progress(progress): sys.stderr.write(text) sys.stderr.flush() -counter=0 +counter = 0 for n, record in enumerate(fastq_iter(open(sys.argv[1]))): - counter=counter+1 + counter = counter + 1 print ('%s reads found' %counter) for n, record in enumerate(fastq_iter(open(sys.argv[1]))): if n % 1 == 0: - progress=n/float(counter) + progress=n / float(counter) update_progress(progress) #print>>sys.stderr, '...', n sequence = record['sequence'] diff --git a/fastq_to_fasta.py b/fastq_to_fasta.py index 320f734..9b8adc9 100644 --- a/fastq_to_fasta.py +++ b/fastq_to_fasta.py @@ -19,7 +19,7 @@ from optparse import OptionParser #config = load_config() -script_info={} +script_info = {} script_info['brief_description'] = """Converts fastq to fasta""" script_info['script_description'] = """Read fastq with Biopython, writes fasta REQUIRED: You must have a fasta file""" @@ -30,8 +30,8 @@ ./fastq.to.fasta.py -i input_file""" parser = OptionParser(usage) -parser.add_option("-i", "--input_file", dest="input_fp", - help='the input fastq file [REQUIRED]') +parser.add_option("-i", "--input_file", dest = "input_fp", + help = 'the input fastq file [REQUIRED]') # Creates an input output pair if input is just an input file @@ -39,7 +39,7 @@ def create_an_inputs_and_output(input_file): input_output = [] # finds file format removes extension, case insensitive search shortname = re.sub('[.](fastq$|fq$)','',input_file, re.I) - output_file = shortname+".fasta" + output_file = shortname + ".fasta" input_output.append(input_file) input_output.append(output_file) return input_output diff --git a/filter_blast_by_query_coverage.py b/filter_blast_by_query_coverage.py index 3648578..3b96f6a 100755 --- a/filter_blast_by_query_coverage.py +++ b/filter_blast_by_query_coverage.py @@ -5,10 +5,10 @@ import sys -filein=open(sys.argv[1],'r') -out1=sys.argv[2] +filein = open(sys.argv[1], 'r') +out1 = sys.argv[2] -fileout=open(out1,'w') +fileout = open(out1, 'w') #blast output #HS6_179:1:1101:10145:166587/1 
gi|49642693|emb|CAH00655.1| 58.33 24 4e-04 35.0 79 @@ -17,15 +17,15 @@ for line in filein: - output=line - line=line.split('\t') #split line from blast output - query_len=float(line[3]) #get query lenght - alignment_len=float(line[5]) #get alignment length - query_coverage=alignment_len/query_len #get query coverage + output = line + line = line.split('\t') #split line from blast output + query_len = float(line[3]) #get query lenght + alignment_len = float(line[5]) #get alignment length + query_coverage = alignment_len / query_len #get query coverage print query_len print alignment_len print query_coverage - if query_coverage>=0.7: + if query_coverage >= 0.7: fileout.write ('%s' %output) else: continue diff --git a/filter_by_size.py b/filter_by_size.py index e264f5b..f7650a1 100755 --- a/filter_by_size.py +++ b/filter_by_size.py @@ -30,24 +30,24 @@ #config = load_config() -script_info={} +script_info = {} script_info['brief_description'] = """Filters sequence according to a minimum size parameter""" script_info['script_description'] = """Reads sequences, calculates size and writes to output if length is more or equal to size parameter REQUIRED: You must have a fasta and size parameter""" script_info['script_usage'] = [] -usage= """ +usage = """ Sorry to bother you, but you need to run it like this: python filter.by.size.py -i > -s """ parser = OptionParser(usage) -parser.add_option("-i", "--input_file", dest="input_fp", - help='the input fasta file [REQUIRED]') -parser.add_option("-s", "--input_size", dest="threshold_size", - help='the input threshold [REQUIRED]') +parser.add_option("-i", "--input_file", dest = "input_fp", + help = 'the input fasta file [REQUIRED]') +parser.add_option("-s", "--input_size", dest = "threshold_size", + help = 'the input threshold [REQUIRED]') @@ -56,7 +56,7 @@ def create_inputs_and_output(input_file): input_output = [] shortname = re.sub('[.](fasta$|fas$|fna$|faa$|fsa$|fa$)','',input_file, re.I) #finds file format removes extension, case insensitive search - output_file=shortname+".filtered.fasta" + output_file = shortname + ".filtered.fasta" input_output.append(input_file) input_output.append(output_file) return input_output @@ -79,12 +79,12 @@ def main(argv): input_fp = opts.input_fp list_of_files=create_inputs_and_output(input_fp) size = opts.threshold_size - fileout=open(list_of_files[1], 'w') + fileout = open(list_of_files[1], 'w') print ("Filtering out sequences smaller than %s bases" %size) #Read sequences and filter - for seq_record in SeqIO.parse(list_of_files[0], format="fasta"): - seq_size=(len(seq_record.seq)) + for seq_record in SeqIO.parse(list_of_files[0], format = "fasta"): + seq_size = (len(seq_record.seq)) if float(seq_size) >= float(size): fileout.write('>%s %s\n%s\n' %(seq_record.id, seq_record.description, seq_record.seq)) else: diff --git a/filter_fasta.py b/filter_fasta.py index 2c8da8e..b2963af 100644 --- a/filter_fasta.py +++ b/filter_fasta.py @@ -7,19 +7,19 @@ import Bio from Bio import SeqIO -filein=open(sys.argv[1],'rb') -fileout=open(sys.argv[2], 'w') +filein = open(sys.argv[1], 'rb') +fileout = open(sys.argv[2], 'w') -for seq_record in SeqIO.parse(filein, format="fasta"): - line=seq_record.description +for seq_record in SeqIO.parse(filein, format = "fasta"): + line = seq_record.description # print line - line=line.split('#') + line = line.split('#') # print line - partial_info=line[4] + partial_info = line[4] # print partial_info - partial=partial_info.split(';') + partial = partial_info.split(';') # print partial - if 
partial[1]=='partial=00': + if partial[1] == 'partial=00': fileout.write('>%s %s\n%s\n' %(seq_record.id, seq_record.description, seq_record.seq)) # print 'complete' else: diff --git a/find_unique_contigs.hit.py b/find_unique_contigs.hit.py index db7c6fe..002b227 100755 --- a/find_unique_contigs.hit.py +++ b/find_unique_contigs.hit.py @@ -3,18 +3,18 @@ import sys -out1=sys.argv[2] -fileout=open(out1,'w') +out1 = sys.argv[2] +fileout = open(out1,'w') -contiglist=[] +contiglist = [] for line in open (sys.argv[1]): - line=line.split('\t') - contig0=line[0] - contig1=contig0.split('_') - contig2=contig1[0] + line = line.split('\t') + contig0 = line[0] + contig1 = contig0.split('_') + contig2 = contig1[0] contiglist.append(contig2) -lista=set(contiglist) +lista = set(contiglist) #print lista contigdict={} @@ -22,8 +22,8 @@ contigdict[member]=contiglist.count(member) for key in contigdict: - firstcol=key - secondcol=contigdict.get(key) + firstcol = key + secondcol = contigdict.get(key) fileout.write ('%s\t%s\n' %(firstcol , secondcol)) #fileout.write('%s' %contiglist) diff --git a/fq_gz-first_100000.fa.py b/fq_gz-first_100000.fa.py index 8562363..1c96726 100644 --- a/fq_gz-first_100000.fa.py +++ b/fq_gz-first_100000.fa.py @@ -12,7 +12,7 @@ from screed.fastq import fastq_iter for n, record in enumerate(fastq_iter(gzip.open(filein,'rb'))): - if n <=100000: + if n <= 100000: sequence = record['sequence'] name = record['name'] fw.write('>%s\n%s\n' % (name, sequence)) diff --git a/genbank_to_fasta.py b/genbank_to_fasta.py index 16ac13d..99ef92f 100644 --- a/genbank_to_fasta.py +++ b/genbank_to_fasta.py @@ -41,7 +41,7 @@ """ parser = OptionParser(usage) -parser.add_option("-i", "--input_file", dest="input_fp", +parser.add_option("-i", "--input_file", dest = "input_fp", help='the input fasta file [REQUIRED]') @@ -50,7 +50,7 @@ def create_inputs_and_output(input_file): input_output = [] shortname = re.sub('[.](gbk$|.gen$|gb$)','',input_file, re.I) #finds file format removes extension, case insensitive search - output_file=shortname+".fasta" + output_file = shortname+".fasta" input_output.append(input_file) input_output.append(output_file) return input_output @@ -71,12 +71,12 @@ def main(argv): # initialize the inputs and outputs input_fp = opts.input_fp - list_of_files=create_inputs_and_output(input_fp) - fileout=open(list_of_files[1], 'w') + list_of_files = create_inputs_and_output(input_fp) + fileout = open(list_of_files[1], 'w') print ("Converting Genbank to Fasta") #Read sequences and filter - for seq_record in SeqIO.parse(list_of_files[0], format="genbank"): + for seq_record in SeqIO.parse(list_of_files[0], format = "genbank"): fileout.write('>%s %s\n%s\n' %(seq_record.id, seq_record.description, seq_record.seq)) fileout.close() diff --git a/get_full_name.py b/get_full_name.py index 3d7ad53..0da8539 100755 --- a/get_full_name.py +++ b/get_full_name.py @@ -2,5 +2,5 @@ import screed for record in screed.open(sys.argv[1]): - print '%s\t%s' %(record.name,record.description) + print '%s\t%s' %(record.name, record.description) diff --git a/get_hmm_len.py b/get_hmm_len.py index 1d6e71b..a60397b 100644 --- a/get_hmm_len.py +++ b/get_hmm_len.py @@ -15,21 +15,21 @@ #if line starts with LENG get name write name.hmm #read next line -filein=open(sys.argv[1],'r') -fileout=open(sys.argv[2],'w') +filein = open(sys.argv[1], 'r') +fileout = open(sys.argv[2], 'w') for line in filein: if line.startswith('NAME'): - line=line.strip('\n') - line=line.split(' ') - name=line[2] + line = line.strip('\n') + line = 
line.split(' ') + name = line[2] fileout.write('%s\t' %name) #print line[2] else: if line.startswith('LENG'): - line=line.strip('\n') - line=line.split(' ') - len=line[2] + line = line.strip('\n') + line = line.split(' ') + len = line[2] fileout.write('%s\n' %len) # print line[2] else: diff --git a/get_name_and_description.py b/get_name_and_description.py index efa8318..718a45f 100755 --- a/get_name_and_description.py +++ b/get_name_and_description.py @@ -1,13 +1,13 @@ import sys -filein=open(sys.argv[1]) -fp=open(sys.argv[2], 'w') +filein = open(sys.argv[1]) +fp = open(sys.argv[2], 'w') for line in filein: if line.startswith('>'): - line=line.split(" ",1) - name=line[0] - desc=line[1] - fp.write('%s\t%s' %(name,desc)) + line = line.split(" ", 1) + name = line[0] + desc = line[1] + fp.write('%s\t%s' %(name, desc)) else: continue fp.close() diff --git a/get_protein_from_nucleotide_accession.py b/get_protein_from_nucleotide_accession.py index 02c3733..b8dbd0d 100644 --- a/get_protein_from_nucleotide_accession.py +++ b/get_protein_from_nucleotide_accession.py @@ -1,14 +1,7 @@ - - #!/usr/bin/python # File created on 28 Feb 2014. -__author__ = "Erick Cardenas Poire" -__copyright__ = "Copyright 2014" -__credits__ = [""] -__version__ = "1.0" -__maintainer__ = "Erick Cardenas Poire" -__status__ = "Release" +# Author__ = "Erick Cardenas Poire" import sys from Bio import SeqIO @@ -28,13 +21,13 @@ """ script_info['script_usage'] = [] -usage= """ +usage = """ Need to run it like this: ./get_protein_from_nucleotide_accession.py -i input_file""" parser = OptionParser(usage) -parser.add_option("-i", "--input_file", dest="input_fp", - help='the input fastq file [REQUIRED]') +parser.add_option("-i", "--input_file", dest = "input_fp", + help = 'the input fastq file [REQUIRED]') # Creates an input output pair if input is just an input file diff --git a/get_protein_from_nucleotide_accession_v2.py b/get_protein_from_nucleotide_accession_v2.py index 36be1b7..89821d2 100644 --- a/get_protein_from_nucleotide_accession_v2.py +++ b/get_protein_from_nucleotide_accession_v2.py @@ -34,8 +34,8 @@ ./get_protein_from_nucleotide_accession.py -i input_file""" parser = OptionParser(usage) -parser.add_option("-i", "--input_file", dest="input_fp", - help='the input fastq file [REQUIRED]') +parser.add_option("-i", "--input_file", dest = "input_fp", + help = 'the input fastq file [REQUIRED]') # Creates an input output pair if input is just an input file @@ -79,7 +79,7 @@ def main(argv): dict_entry = dict_entry0[0] #print record_name #print dict_entry - handle = Entrez.efetch(db="nucleotide", id=record_name, rettype="gb", retmode="text") + handle = Entrez.efetch(db = "nucleotide", id = record_name, rettype = "gb", retmode = "text") records = SeqIO.parse(handle, "genbank") time.sleep(0.5) for record in records: diff --git a/get_protein_with_gi_number.py b/get_protein_with_gi_number.py index 8c15551..1099719 100644 --- a/get_protein_with_gi_number.py +++ b/get_protein_with_gi_number.py @@ -24,7 +24,7 @@ #handle = Entrez.efetch(db="protein", id=requestedsequences, rettype="gb", retmode="text") -handle = Entrez.efetch(db="protein", id=requestedsequences, rettype="fasta", retmode="text") +handle = Entrez.efetch(db = "protein", id = requestedsequences, rettype = "fasta", retmode = "text") records = SeqIO.parse(handle, "fasta") for record in records: diff --git a/len_stats_from.fasta.py b/len_stats_from.fasta.py index 7cedc59..ecacfc2 100755 --- a/len_stats_from.fasta.py +++ b/len_stats_from.fasta.py @@ -1,24 +1,24 @@ import sys 
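# Illustrative sketch, not part of the patch: the get_protein_* and
# retrieve_* scripts in this patch all follow the same Entrez pattern.
# NCBI asks for a contact email and rate-limited requests, which is why
# some of them call time.sleep(); the email and ID below are placeholders.
import time
from Bio import Entrez, SeqIO
Entrez.email = "you@example.org"                       # placeholder address
handle = Entrez.efetch(db="protein", id="CAH00655.1",  # hypothetical ID
                       rettype="fasta", retmode="text")
for record in SeqIO.parse(handle, "fasta"):
    print '%s\t%s' % (record.id, len(record.seq))      # Python 2 print
time.sleep(0.5)                                        # respect NCBI rate limits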
-filein=open(sys.argv[1],'r') +filein = open(sys.argv[1],'r') print 'Reading ', sys.argv[1] -out=sys.argv[1] +out = sys.argv[1] -fileout=out+'.stats.txt' +fileout = out + '.stats.txt' print fileout -fp=open(fileout, 'w') +fp = open(fileout, 'w') -big_table=[] +big_table = [] for line in filein: if line.startswith('>'): continue else: - length=int(len(line)) + length = int(len(line)) big_table.append(length) -count=len(big_table) -average=sum(big_table)/float(count) -mini=min(big_table) -maxi=max(big_table) +count = len(big_table) +average = sum(big_table)/ float(count) +mini = min(big_table) +maxi = max(big_table) print count , ' reads detected' print 'With an average read lenght of ' , average , 'bases' diff --git a/modify_dereplicated.py b/modify_dereplicated.py index 5627fe8..e912ff5 100644 --- a/modify_dereplicated.py +++ b/modify_dereplicated.py @@ -11,37 +11,36 @@ #Create dereplication dictionary -derep_dictionary={} +derep_dictionary = {} - -derep_table=open(sys.argv[2],'r') +derep_table = open(sys.argv[2],'r') for line in derep_table: if line.startswith('Representative'): continue else: - line=line.split("\t") - seq=line[0] #extract sequence name - seq_count=line[1] #extract sequence count - seq_count=seq_count.rstrip('\n') - derep_dictionary[seq]=seq_count + line = line.split("\t") + seq = line[0] #extract sequence name + seq_count = line[1] #extract sequence count + seq_count = seq_count.rstrip('\n') + derep_dictionary[seq] = seq_count derep_table.close() print derep_dictionary -filein=open(sys.argv[1],'r') +filein = open(sys.argv[1],'r') -out0=str(sys.argv[1]) -out=out0.rsplit( ".", 1 )[ 0 ] -out1=out+'.modified.fasta' -fileout1=open(out1,'w') +out0 = str(sys.argv[1]) +out = out0.rsplit( ".", 1 )[ 0 ] +out1 = out + '.modified.fasta' +fileout1 = open(out1,'w') -for seq_record in SeqIO.parse(filein, format="fasta"): - name=seq_record.id - name_count=derep_dictionary.get(name) #get info for read in dictionary - new_name=name+'size='+name_count+';' - sequence=seq_record.seq - fileout1.write('>%s\n%s\n' %(new_name,sequence)) +for seq_record in SeqIO.parse(filein, format = "fasta"): + name = seq_record.id + name_count = derep_dictionary.get(name) #get info for read in dictionary + new_name = name + 'size=' + name_count + ';' + sequence = seq_record.seq + fileout1.write('>%s\n%s\n' %(new_name, sequence)) fileout1.close() diff --git a/multiline_fasta_to_fasta.py b/multiline_fasta_to_fasta.py new file mode 100644 index 0000000..6fbd474 --- /dev/null +++ b/multiline_fasta_to_fasta.py @@ -0,0 +1,17 @@ +#usage +#python multiline_fasta_to_fasta.py + +import sys +import Bio +from Bio import SeqIO + +filein = open(sys.argv[1], 'r') +fileout_name = sys.argv[1] + '_new.fa' +fileout = open(fileout_name, 'w') + +for seq_record in SeqIO.parse(filein, format = "fasta"): + fileout.write('>%s %s\n%s\n' %(seq_record.id, seq_record.description, seq_record.seq)) + +filein.close() +fileout.close() + diff --git a/parse_taxonomy.py b/parse_taxonomy.py index a43608a..8799fad 100644 --- a/parse_taxonomy.py +++ b/parse_taxonomy.py @@ -18,8 +18,6 @@ # input_dictionary[0]['LineageEx'] if d['Rank'] in ['family', 'order']} - - for organism in input_list: # print organism lineage = {d['Rank']:d['ScientificName'] for d in organism['LineageEx'] if d['Rank'] in ['phylum', 'class', 'order', 'family', 'genus' ]} diff --git a/parse_xml_result.py b/parse_xml_result.py index 5c0f12e..b2c27de 100755 --- a/parse_xml_result.py +++ b/parse_xml_result.py @@ -10,19 +10,19 @@ from Bio.Blast import NCBIXML #input 
-result_handle=open(sys.argv[1],"r")
+result_handle = open(sys.argv[1], "r")
 #result_handle = open("test.xml")
 
-print ("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %("Query","Alignment","Identity(%)","Similarity(%)","Alignment length","Expected value","Score","Length Query","Length Subject"))
+print ("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %("Query", "Alignment", "Identity(%)", "Similarity(%)", "Alignment length", "Expected value", "Score", "Length Query", "Length Subject"))
 
 blast_records = NCBIXML.parse(result_handle)
 for blast_record in blast_records:
     for alignment in blast_record.alignments:
         for hsp in alignment.hsps:
-            pident=(hsp.identities)/float(hsp.align_length)*100
-            psimil=(hsp.positives)/float(hsp.align_length)*100
-            mygaps=str(hsp.gaps)
-            print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(blast_record.query, alignment.title, pident, psimil, hsp.align_length, hsp.expect, hsp.score,len(hsp.query),len(hsp.sbjct),mygaps)
+            pident = (hsp.identities) / float(hsp.align_length) * 100
+            psimil = (hsp.positives) / float(hsp.align_length) * 100
+            mygaps = str(hsp.gaps)
+            print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(blast_record.query, alignment.title, pident, psimil, hsp.align_length, hsp.expect, hsp.score, len(hsp.query), len(hsp.sbjct), mygaps)
 
 result_handle.close()
diff --git a/quality-trim.py b/quality-trim.py
index 2c2ac71..86cc176 100755
--- a/quality-trim.py
+++ b/quality-trim.py
@@ -1,3 +1,6 @@
 #!/usr/bin/env python
+# This script looks for 'B' as an indicator of bad quality in the qual
+# section of a FASTQ record. It does not check the quality encoding system;
+# use it carefully.
 
 import sys
diff --git a/quality-trim_to_gz.py b/quality-trim_to_gz.py
index 673b62c..14a226c 100755
--- a/quality-trim_to_gz.py
+++ b/quality-trim_to_gz.py
@@ -1,3 +1,6 @@
 #!/usr/bin/env python
+# Does not check the quality encoding, but looks for a 'B' as the
+# indicator of bad quality.
+# Use it with caution.
 
 import sys
diff --git a/remoteblastp.py b/remoteblastp.py
index 7d8336a..62ef9ff 100755
--- a/remoteblastp.py
+++ b/remoteblastp.py
@@ -1,12 +1,16 @@
-#usage
-#python remoteblastp.py
-# 0 1 2
+# This script will do a BLAST search against the nr database.
+# The BLAST program as well as the search parameters
+# can be specified in the lines below.
+# Usage
+# python remoteblastp.py
+# 0 1 2
 
-my_perc_ident='none'
-my_blast_program='blastp'
-my_evalue_treshold=0.00001
-my_hitlist_size=10
+
+my_perc_ident = 'none'
+my_blast_program = 'blastp'
+my_evalue_treshold = 0.00001
+my_hitlist_size = 10
 
 import sys
 import Bio
@@ -15,17 +19,17 @@
 from Bio.Blast import NCBIXML
 
 #input
-filein=open(sys.argv[1],"r")
+filein = open(sys.argv[1], "r")
 
 #outputs
-myout=sys.argv[2]
-fileout=open(myout,'w')
+myout = sys.argv[2]
+fileout = open(myout, 'w')
 
 for seq_record in SeqIO.parse(filein, format="fasta"):
 #    print seq_record
 #    print seq_record.format("fasta")
-    result_handle = NCBIWWW.qblast(my_blast_program, "nr", seq_record.format("fasta"), hitlist_size=my_hitlist_size,expect=my_evalue_treshold,perc_ident=my_perc_ident)
+    result_handle = NCBIWWW.qblast(my_blast_program, "nr", seq_record.format("fasta"), hitlist_size = my_hitlist_size, expect = my_evalue_treshold, perc_ident = my_perc_ident)
     fileout.write(result_handle.read())
 
 filein.close()
 fileout.close()
diff --git a/remoteblastp_vs_refsq.py b/remoteblastp_vs_refsq.py
index c490c9e..8abaa02 100644
--- a/remoteblastp_vs_refsq.py
+++ b/remoteblastp_vs_refsq.py
@@ -1,12 +1,17 @@
+# This script will do a BLAST search against the refseq database.
+# The BLAST program as well as the search parameters
+# can be specified in the lines below.
+
+
 #usage
 #python remoteblastp.py
 # 0 1 2
 
-my_perc_ident='none'
-my_blast_program='blastp'
-my_evalue_treshold=0.00001
-my_hitlist_size=100
+my_perc_ident = 'none'
+my_blast_program = 'blastp'
+my_evalue_treshold = 0.00001
+my_hitlist_size = 100
 
 import sys
 import Bio
@@ -15,14 +20,14 @@
 from Bio.Blast import NCBIXML
 
 #input
-filein=open(sys.argv[1],"r")
+filein = open(sys.argv[1], "r")
 
 #outputs
-myout=sys.argv[2]
-fileout=open(myout,'w')
+myout = sys.argv[2]
+fileout = open(myout, 'w')
 
-for seq_record in SeqIO.parse(filein, format="fasta"):
+for seq_record in SeqIO.parse(filein, format = "fasta"):
 #    print seq_record
 #    print seq_record.format("fasta")
     result_handle = NCBIWWW.qblast(my_blast_program, "refseq", seq_record.format("fasta"), hitlist_size=my_hitlist_size,expect=my_evalue_treshold,perc_ident=my_perc_ident)
diff --git a/rename_fasta-to-fasta.py b/rename_fasta-to-fasta.py
index e633969..4834d08 100644
--- a/rename_fasta-to-fasta.py
+++ b/rename_fasta-to-fasta.py
@@ -7,14 +7,14 @@
 import Bio
 from Bio import SeqIO
 
-filein=open(sys.argv[1],'rb')
+filein = open(sys.argv[1], 'rb')
 
-for seq_record in SeqIO.parse(filein, format="fasta"):
-    line=seq_record.id
+for seq_record in SeqIO.parse(filein, format = "fasta"):
+    line = seq_record.id
 #    print line
-    line=line.split('|')
+    line = line.split('|')
 #    print line
-    name=line[2]
+    name = line[2]
 #    print name
     print '>%s\n%s' % (name, seq_record.seq)
diff --git a/retrieve_genbank_annotation.py b/retrieve_genbank_annotation.py
index 068e4bf..95e9f8e 100755
--- a/retrieve_genbank_annotation.py
+++ b/retrieve_genbank_annotation.py
@@ -10,40 +10,40 @@
 Entrez.email = "carden24@mail.ubc.ca"
 
 #inputs
-filelist=open(sys.argv[1],'r')
+filelist = open(sys.argv[1], 'r')
 
 #output
-fileout=open(sys.argv[2],'w')
+fileout = open(sys.argv[2], 'w')
 
 #create a list with
the names of the sequences requested -requestedsequences=[] +requestedsequences = [] for line in filelist: - line=line.strip('\n') + line = line.strip('\n') requestedsequences.append(line) print "%d Sequences requested" % len(requestedsequences) print requestedsequences -handle = Entrez.efetch(db="protein", id=requestedsequences, rettype="gb", retmode="text") -records=SeqIO.parse(handle,"genbank") +handle = Entrez.efetch(db = "protein", id = requestedsequences, rettype = "gb", retmode = "text") +records = SeqIO.parse(handle, "genbank") for record in records: - feat=record.features + feat = record.features for f in feat: - if f.type=="CDS": - quali=f.qualifiers - gene=str(quali.get('gene','no_gene_name')) - gene=gene.strip('\'[]') - locus=str(quali.get('locus_tag','no_locus_tag')) - locus=locus.strip('\'[]') - old_locus=str(quali.get('old_locus_tag','no_old_locus_tag')) - old_locus=old_locus.strip('\'[]') - product=str(quali.get('product','no_product_name')) - product=product.strip('\'[]') - protein_id=str(quali.get('protein_id','no_protein_id')) - protein_id=protein_id.strip('\'[]') - fileout.write("%s\t%s\t%s\t%s\t%s\n" %(gene,locus,old_locus,product,protein_id)) + if f.type == "CDS": + quali = f.qualifiers + gene = str(quali.get('gene', 'no_gene_name')) + gene = gene.strip('\'[]') + locus = str(quali.get('locus_tag', 'no_locus_tag')) + locus = locus.strip('\'[]') + old_locus = str(quali.get('old_locus_tag', 'no_old_locus_tag')) + old_locus = old_locus.strip('\'[]') + product = str(quali.get('product', 'no_product_name')) + product = product.strip('\'[]') + protein_id = str(quali.get('protein_id', 'no_protein_id')) + protein_id = protein_id.strip('\'[]') + fileout.write("%s\t%s\t%s\t%s\t%s\n" %(gene, locus, old_locus, product, protein_id)) else: continue fileout.close() diff --git a/retrieve_genbank_record.to.fasta.py b/retrieve_genbank_record.to.fasta.py index 9a574b1..7233bd6 100755 --- a/retrieve_genbank_record.to.fasta.py +++ b/retrieve_genbank_record.to.fasta.py @@ -8,27 +8,27 @@ Entrez.email = "carden24@mail.ubc.ca" #inputs -filelist=open(sys.argv[1],'r') +filelist = open(sys.argv[1], 'r') #output -fileout=open(sys.argv[2],'w') +fileout = open(sys.argv[2], 'w') #create a list with the names of the sequences requested -requestedsequences=[] +requestedsequences = [] for line in filelist: - line=line.strip('\n') + line = line.strip('\n') requestedsequences.append(line) print "%d Sequences requested" % len(requestedsequences) print requestedsequences -handle = Entrez.efetch(db="protein", id=requestedsequences, rettype="fasta", retmode="text") -records=SeqIO.parse(handle,"fasta") +handle = Entrez.efetch(db = "protein", id = requestedsequences, rettype = "fasta", retmode = "text") +records = SeqIO.parse(handle, "fasta") for record in records: - seq_name=record.id - seq_description=record.description - seq_sequence =record.seq - fileout.write (">%s %s\n%s\n" %(seq_name,seq_description,seq_sequence)) + seq_name = record.id + seq_description = record.description + seq_sequence = record.seq + fileout.write (">%s %s\n%s\n" %(seq_name, seq_description, seq_sequence)) diff --git a/retrieve_genbank_record.to.fasta.v2.py b/retrieve_genbank_record.to.fasta.v2.py index 2a0f536..21ac2fb 100755 --- a/retrieve_genbank_record.to.fasta.v2.py +++ b/retrieve_genbank_record.to.fasta.v2.py @@ -8,46 +8,46 @@ Entrez.email = "carden24@mail.ubc.ca" #inputs -filelist=open(sys.argv[1],'r') +filelist = open(sys.argv[1], 'r') #output -fileout=open(sys.argv[2],'w') +fileout = open(sys.argv[2], 'w') #create a list 
diff --git a/retrieve_genbank_record.to.fasta.v2.py b/retrieve_genbank_record.to.fasta.v2.py
index 2a0f536..21ac2fb 100755
--- a/retrieve_genbank_record.to.fasta.v2.py
+++ b/retrieve_genbank_record.to.fasta.v2.py
@@ -8,46 +8,46 @@
 Entrez.email = "carden24@mail.ubc.ca"
 
 #inputs
-filelist=open(sys.argv[1],'r')
+filelist = open(sys.argv[1], 'r')
 
 #output
-fileout=open(sys.argv[2],'w')
+fileout = open(sys.argv[2], 'w')
 
 #create a list with the names of the sequences requested
-requestedsequences=[]
+requestedsequences = []
 for line in filelist:
-    line=line.strip('\n')
+    line = line.strip('\n')
     requestedsequences.append(line)
 
 print "%d Sequences requested" % len(requestedsequences)
 print requestedsequences
 
 handle = Entrez.efetch(db="protein", id=requestedsequences, rettype="gb", retmode="text")
-records=SeqIO.parse(handle,"genbank")
+records = SeqIO.parse(handle, "genbank")
 
 for record in records:
     feat=record.features
     for f in feat:
-        if f.type=="CDS":
-            quali=f.qualifiers
-            gene=str(quali.get('gene','no_gene_name'))
-            gene=gene.strip('\'[]')
+        if f.type == "CDS":
+            quali = f.qualifiers
+            gene = str(quali.get('gene', 'no_gene_name'))
+            gene = gene.strip('\'[]')
             print gene
-            product=str(quali.get('product','no_product_name'))
-            product=product.strip('\'[]')
+            product = str(quali.get('product', 'no_product_name'))
+            product = product.strip('\'[]')
 #            print product
 #            description=gene+'-'+product
-            protein_id=str(quali.get('protein_id','no_protein_id'))
+            protein_id = str(quali.get('protein_id', 'no_protein_id'))
 #            protein_id=str(f.qualifiers['protein_id'])
-            protein_id=protein_id.strip('\'[]')
-            translated_protein=str(quali.get('translation','no_translation'))
+            protein_id = protein_id.strip('\'[]')
+            translated_protein = str(quali.get('translation', 'no_translation'))
 #            translated_protein=str(f.qualifiers['translation'])
-            translated_protein=translated_protein.strip('\'[]')
-            if protein_id=='no_protein_id':
+            translated_protein = translated_protein.strip('\'[]')
+            if protein_id == 'no_protein_id':
                 continue
             else:
-                fileout.write(">%s %s\n%s\n" %(protein_id,gene,translated_protein))
+                fileout.write(">%s %s\n%s\n" %(protein_id, gene, translated_protein))
         else:
             continue
 
 fileout.close()
diff --git a/retrieve_taxonomy_from_accession_numbers_v2.py b/retrieve_taxonomy_from_accession_numbers_v2.py
index 3183b2b..d405009 100644
--- a/retrieve_taxonomy_from_accession_numbers_v2.py
+++ b/retrieve_taxonomy_from_accession_numbers_v2.py
@@ -22,8 +22,8 @@
 print "%d Sequence(s) requested" % len(requestedsequences)
 print ''
 
 handle = Entrez.efetch(db="nuccore", id=requestedsequences, rettype="gb", retmode="text")
-records = SeqIO.parse(handle,"genbank")
+records = SeqIO.parse(handle, "genbank")
 
 for record in records:
 #    print record.id
diff --git a/retrieve_taxonomy_from_gis.py b/retrieve_taxonomy_from_gis.py
index 90b9cc6..8ae8799 100644
--- a/retrieve_taxonomy_from_gis.py
+++ b/retrieve_taxonomy_from_gis.py
@@ -25,7 +25,7 @@
 handle = Entrez.efetch(db="protein", id=requestedsequences, rettype="gb", retmode="text")
-records = SeqIO.parse(handle,"genbank")
+records = SeqIO.parse(handle, "genbank")
 
 ##print records
 for record in records:
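
Only fragments of the two taxonomy scripts are touched by this patch, but once a GenBank record has been parsed, the lineage lives in record.annotations. A minimal standalone sketch of the lookup (the accession reused here is the one that appears in the score_blast2.py example comment below; any protein accession would do):

    from Bio import Entrez, SeqIO

    Entrez.email = "carden24@mail.ubc.ca"
    handle = Entrez.efetch(db="protein", id=["CAH00655.1"], rettype="gb", retmode="text")
    for record in SeqIO.parse(handle, "genbank"):
        organism = record.annotations.get("organism", "no_organism")
        lineage = record.annotations.get("taxonomy", [])  # list of ranks, most general first
        print("%s\t%s\t%s" % (record.id, organism, "; ".join(lineage)))
    handle.close()
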
diff --git a/score_blast2.py b/score_blast2.py
index b5fa6d7..6725a3b 100755
--- a/score_blast2.py
+++ b/score_blast2.py
@@ -4,20 +4,20 @@
 import pickle
 import sys
 
-threshold=float(0.4)
-filedict=open(sys.argv[1],'rb')
-filein=open(sys.argv[2],'r')
+threshold = float(0.4)
+filedict = open(sys.argv[1], 'rb')
+filein = open(sys.argv[2], 'r')
 
-outy=sys.argv[3]
-out1=outy+'family.out'
-out2=outy+'subfamily.out'
+outy = sys.argv[3]
+out1 = outy + 'family.out'
+out2 = outy + 'subfamily.out'
 
-fileout1=open(out1,'w')
-fileout2=open(out2,'w')
+fileout1 = open(out1, 'w')
+fileout2 = open(out2, 'w')
 
 cazy_or_foly_dict = pickle.load(filedict)
-familydict={}
-subfamilydict={}
+familydict = {}
+subfamilydict = {}
 
 #blast output
 #HS6_179:1:1101:10145:166587/1	gi|49642693|emb|CAH00655.1|	58.33	24	4e-04	35.0	79
@@ -27,29 +27,29 @@
 
 for line in filein:
-    line=line.split('\t') #split line from blast output
-    subject=line[1] #get subject
-    bits=float(line[6]) #get raw score
-    dict_entry=cazy_or_foly_dict.get(subject) #get info for subject in dictionary
-    maxbits=float(dict_entry[3]) #get maximum raw score ratio for the subject vs itself
-    scoreratio=bits/float(maxbits) #calculate bits score ratio
-    if scoreratio>=threshold: #if bits score ratio is higher than treshold
-        family=dict_entry[2] #obtain family
-        fff=familydict.get(family,0) #Entry family name in new dictionary, if absent use 0 if not get count
-        ggg=fff+1 #update count
-        familydict[family]=ggg #update entry in dictionary
-        subfamily=dict_entry[1] #do the same for subfamily
-        sss=subfamilydict.get(subfamily,0)
-        ttt=sss+1
-        subfamilydict[subfamily]=ttt
+    line = line.split('\t')  # split line from blast output
+    subject = line[1]  # get subject
+    bits = float(line[6])  # get raw score
+    dict_entry = cazy_or_foly_dict.get(subject)  # get info for subject in dictionary
+    maxbits = float(dict_entry[3])  # get maximum raw score for the subject vs itself
+    scoreratio = bits / float(maxbits)  # calculate bit score ratio
+    if scoreratio >= threshold:  # keep hit if bit score ratio is at least the threshold
+        family = dict_entry[2]  # obtain family
+        fff = familydict.get(family, 0)  # look up current family count, starting at 0 if absent
+        ggg = fff + 1  # update count
+        familydict[family] = ggg  # update entry in dictionary
+        subfamily = dict_entry[1]  # do the same for subfamily
+        sss = subfamilydict.get(subfamily, 0)
+        ttt = sss + 1
+        subfamilydict[subfamily] = ttt
     else:
        continue
 
 #write familydict dictionary
 for key1, value1 in familydict.iteritems():
-    fileout1.write("%s\t%s\n" %(key1,value1))
+    fileout1.write("%s\t%s\n" %(key1, value1))
 
 #write subfamily dictionary
 for key2, value2 in subfamilydict.iteritems():
-    fileout2.write("%s\t%s\n" %(key2,value2))
+    fileout2.write("%s\t%s\n" %(key2, value2))
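
The get()/add-one/store triple used above for familydict and subfamilydict is the classic counting idiom; collections.Counter expresses the same thing more compactly. A standalone sketch with invented family names and score ratios:

    from collections import Counter

    threshold = 0.4
    # (family, bit score ratio) pairs; values made up for illustration
    hits = [("GH5", 0.62), ("GH5", 0.48), ("CE1", 0.30)]

    familydict = Counter()
    for family, scoreratio in hits:
        if scoreratio >= threshold:
            familydict[family] += 1

    print(familydict)  # Counter({'GH5': 2})
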
diff --git a/score_blast_and_normalize.py b/score_blast_and_normalize.py
index c0efe96..3af3ea3 100755
--- a/score_blast_and_normalize.py
+++ b/score_blast_and_normalize.py
@@ -5,43 +5,43 @@
 import pickle
 import sys
 
-filedict=open(sys.argv[1],'rb')
-filein=open(sys.argv[3],'r')
-fileassembly_dict=open(sys.argv[2],'r')
+filedict = open(sys.argv[1], 'rb')
+filein = open(sys.argv[3], 'r')
+fileassembly_dict = open(sys.argv[2], 'r')
 
-outy=sys.argv[4]
-out1=outy+'family.out'
-out2=outy+'subfamily.out'
-fileout1=open(out1,'w')
-fileout2=open(out2,'w')
+outy = sys.argv[4]
+out1 = outy + 'family.out'
+out2 = outy + 'subfamily.out'
+fileout1 = open(out1, 'w')
+fileout2 = open(out2, 'w')
 
 cazy_or_foly_dict = pickle.load(filedict)
 assembly_dict = pickle.load(fileassembly_dict)
-familydict={}
-subfamilydict={}
+familydict = {}
+subfamilydict = {}
 
 for line in filein:
-    line=line.split('\t') #split line from fasta
-    query=line[0] #get query
-    coverage=assembly_dict.get(query) #get fold coverage from assembly dictionary
-    fold=float(coverage[9])
-    subject=line[1] #get subject
-    dict_entry=cazy_or_foly_dict.get(subject) #get info for subject in dictionary
+    line = line.split('\t')  # split line from blast output
+    query = line[0]  # get query
+    coverage = assembly_dict.get(query)  # get fold coverage from assembly dictionary
+    fold = float(coverage[9])
+    subject = line[1]  # get subject
+    dict_entry = cazy_or_foly_dict.get(subject)  # get info for subject in dictionary
 #    print subject
 #    print fold
-    family=dict_entry[2] #obtain family
-    fff=familydict.get(family,0) #Entry family name in new dictionary, if absent use 0 if not get count
-    ggg=fff+fold #update count
-    familydict[family]=ggg #update entry in dictionary
-    subfamily=dict_entry[1] #do the same for subfamily
-    sss=subfamilydict.get(subfamily,0)
-    ttt=sss+fold
-    subfamilydict[subfamily]=ttt
+    family = dict_entry[2]  # obtain family
+    fff = familydict.get(family, 0)  # look up current family count, starting at 0 if absent
+    ggg = fff + fold  # add this contig's fold coverage to the count
+    familydict[family] = ggg  # update entry in dictionary
+    subfamily = dict_entry[1]  # do the same for subfamily
+    sss = subfamilydict.get(subfamily, 0)
+    ttt = sss + fold
+    subfamilydict[subfamily] = ttt
 
 #write familydict dictionary
 for key1, value1 in familydict.iteritems():
-    fileout1.write("%s\t%s\n" %(key1,value1))
+    fileout1.write("%s\t%s\n" %(key1, value1))
 
 #write subfamily dictionary
 for key2, value2 in subfamilydict.iteritems():
-    fileout2.write("%s\t%s\n" %(key2,value2))
+    fileout2.write("%s\t%s\n" %(key2, value2))
diff --git a/score_blast_for_redundancy.py b/score_blast_for_redundancy.py
index c1ab3fa..d3be19c 100644
--- a/score_blast_for_redundancy.py
+++ b/score_blast_for_redundancy.py
@@ -8,28 +8,28 @@
 import sys
 
 #threshold=float(0.4)
-filedict=open(sys.argv[1],'rb')
-filein=open(sys.argv[2],'r')
-outy=sys.argv[3]
-out1=outy+'.family.out'
-fileout1=open(out1,'w')
+filedict = open(sys.argv[1], 'rb')
+filein = open(sys.argv[2], 'r')
+outy = sys.argv[3]
+out1 = outy + '.family.out'
+fileout1 = open(out1, 'w')
 
 cazy_or_foly_dict = pickle.load(filedict)
-subfamilydict={}
+subfamilydict = {}
 
 for line in filein:
-    line=line.lstrip(" ")
-    line=line.rstrip("\n ")
-    line=line.split(' ') #split line from fasta
-    subject_count=line[0] #get subject count
-    subject=line[1] #get subject
-    dict_entry=cazy_or_foly_dict.get(subject) #get info for subject in dictionary
-    family=dict_entry[1] #obtain family
+    line = line.lstrip(" ")
+    line = line.rstrip("\n ")
+    line = line.split(' ')  # split line into count and subject
+    subject_count = line[0]  # get subject count
+    subject = line[1]  # get subject
+    dict_entry = cazy_or_foly_dict.get(subject)  # get info for subject in dictionary
+    family = dict_entry[1]  # obtain subfamily
 #    print subject
 #    print subject_count
 #    print family
-    fileout1.write("%s\t%s\t%s\n" %(subject,subject_count,family))
+    fileout1.write("%s\t%s\t%s\n" %(subject, subject_count, family))
 
 fileout1.close()
 filein.close()
 filedict.close()
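
score_blast_and_normalize.py differs from score_blast2.py in one essential way: each hit contributes its contig's fold coverage rather than a flat count of 1, which normalizes family abundances by how deeply each contig was sequenced. The idea in isolation, with invented numbers:

    from collections import defaultdict

    fold_coverage = {"contig_1": 12.5, "contig_2": 3.0}  # assumed assembly statistics
    hits = [("contig_1", "GH5"), ("contig_2", "GH5")]    # (query, family) pairs, made up

    familydict = defaultdict(float)
    for query, family in hits:
        familydict[family] += fold_coverage[query]  # weight by coverage, not by 1

    print(dict(familydict))  # {'GH5': 15.5}
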
diff --git a/score_blast_for_redundancy.v2.py b/score_blast_for_redundancy.v2.py
index a580898..539bfe4 100644
--- a/score_blast_for_redundancy.v2.py
+++ b/score_blast_for_redundancy.v2.py
@@ -8,28 +8,28 @@
 import sys
 
 #threshold=float(0.4)
-filedict=open(sys.argv[1],'rb')
-filein=open(sys.argv[2],'r')
-outy=sys.argv[3]
-out1=outy+'.family.out'
-fileout1=open(out1,'w')
+filedict = open(sys.argv[1], 'rb')
+filein = open(sys.argv[2], 'r')
+outy = sys.argv[3]
+out1 = outy + '.family.out'
+fileout1 = open(out1, 'w')
 
 cazy_or_foly_dict = pickle.load(filedict)
-subfamilydict={}
+subfamilydict = {}
 
 for line in filein:
-    line=line.lstrip(" ")
-    line=line.rstrip("\n ")
-    line=line.split(' ') #split line from fasta
-    subject_count=line[0] #get subject count
-    subject=line[1] #get subject
-    dict_entry=cazy_or_foly_dict.get(subject) #get info for subject in dictionary
-    family=dict_entry[2] #obtain family
+    line = line.lstrip(" ")
+    line = line.rstrip("\n ")
+    line = line.split(' ')  # split line into count and subject
+    subject_count = line[0]  # get subject count
+    subject = line[1]  # get subject
+    dict_entry = cazy_or_foly_dict.get(subject)  # get info for subject in dictionary
+    family = dict_entry[2]  # obtain family
 #    print subject
 #    print subject_count
 #    print family
-    fileout1.write("%s\t%s\t%s\n" %(subject,subject_count,family))
+    fileout1.write("%s\t%s\t%s\n" %(subject, subject_count, family))
 
 fileout1.close()
 filein.close()
 filedict.close()
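
score_blast_for_redundancy.py and its v2 differ only in which dictionary column they report (dict_entry[1], the subfamily, versus dict_entry[2], the family). A hypothetical merged version that takes the column index as a fourth command-line argument would remove the duplication; this is a sketch of that idea, not part of the patch:

    import pickle
    import sys

    def report_counts(dict_path, counts_path, out_path, column):
        # column: 1 for the subfamily field, 2 for the family field
        with open(dict_path, 'rb') as filedict:
            cazy_or_foly_dict = pickle.load(filedict)
        fileout = open(out_path + '.family.out', 'w')
        for line in open(counts_path):
            fields = line.strip().split(' ')
            subject_count, subject = fields[0], fields[1]
            dict_entry = cazy_or_foly_dict.get(subject)
            fileout.write("%s\t%s\t%s\n" % (subject, subject_count, dict_entry[column]))
        fileout.close()

    if __name__ == '__main__':
        report_counts(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]))
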