Skip to content

Commit

Permalink
update, please dont break
Browse files Browse the repository at this point in the history
  • Loading branch information
carden24 committed Oct 26, 2015
1 parent 8109335 commit 2546909
Show file tree
Hide file tree
Showing 48 changed files with 979 additions and 408 deletions.
2 changes: 2 additions & 0 deletions GET_LINEAGES_NCBI.EC.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Author Roli Wilhelm

#!/usr/bin/python
import sys, os, re, getopt, glob, subprocess, os.path, numpy as np, time
import timeit
Expand Down
2 changes: 2 additions & 0 deletions GET_LINEAGES_NCBI.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#Author Roli Wilhelm

#!/usr/bin/python
import sys, os, re, getopt, glob, subprocess, os.path, numpy as np, time
import timeit
Expand Down
146 changes: 66 additions & 80 deletions HMM_search_and_parse_and_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@


#config = load_config()
script_info={}
script_info = {}
script_info['brief_description'] = """Filters sequence according to a minimum
size parameter"""
script_info['script_description'] = """HMMER parser. Runs hmmscan, filters
Expand Down Expand Up @@ -64,12 +64,12 @@
'\n(d) \'all\' -- Extract hits, contigs, and all\
proteins from hits\n')

#Compiling frequently used regular expression patterns
# Compiling frequently used regular expression patterns
hmm_pattern = re.compile('[.](hmm)')
query_pattern = re.compile('[.](fasta$|fas$|faa$|fsa$|fa$)')


# checks if the supplied arguments are adequate
# Checks if the supplied arguments are adequate
def valid_arguments(opts, args):
if (opts.input_model == None or opts.input_fp == None ):
return True
Expand Down Expand Up @@ -98,13 +98,12 @@ def update_progress(progress):
sys.stderr.flush()


#Get HMM length function
# Get HMM length function
def get_hmm_len(input_model):
# hmmshortname = re.sub('[.](hmm)','',input_model, re.I)
hmmshortname = re.sub(hmm_pattern,'',input_model, re.I)
hmm_leng_file = hmmshortname+".length.txt"
hmm_fileout = open(hmm_leng_file,'w')
hmm_filein = open(input_model,'r')
hmmshortname = re.sub(hmm_pattern, '', input_model, re.I)
hmm_leng_file = hmmshortname + ".length.txt"
hmm_fileout = open(hmm_leng_file, 'w')
hmm_filein = open(input_model, 'r')
for line in hmm_filein:
if line.startswith('NAME'):
line = line.strip('\n')
Expand All @@ -121,32 +120,28 @@ def get_hmm_len(input_model):
continue
hmm_fileout.close()
hmm_filein.close()
os.system(' '.join(['cp',hmm_leng_file,'all.hmm.ps.len']))
os.system(' '.join(['cp', hmm_leng_file, 'all.hmm.ps.len']))


#Function to run hmmscan and parse
# Function to run hmmscan and parse
def run_hmm_scan(model, query, output):
    """Run hmmscan for a query file against an HMM database and parse the result.

    Two shell commands are issued through ``os.system``:
    ``hmmscan <model> <query>`` with stdout redirected to
    ``<output>/<query base>_<model base>.hmm.out``, followed by
    ``sh hmmscan-parser.sh`` on that file with stdout redirected to
    ``<output>/<query base>_<model base>.txt``.

    Args:
        model: path of the HMM database file.
        query: path of the query fasta file.
        output: directory in which both result files are written.
    """
    # NOTE(review): re.I sits in the `count` position of re.sub here, so it
    # limits the number of substitutions instead of making the match case
    # insensitive. It is kept unchanged on purpose: the filtering step builds
    # the very same file names with the same call, and the names must agree.
    hmmshortname = re.sub(hmm_pattern, '', model, re.I)
    # The extension has to be replaced by an empty string. Replacing it with a
    # space would put a blank inside the output path, which splits the shell
    # redirection target and no longer matches the '.txt' path read back by
    # filtering_by_evalue_and_coverage.
    shortname = re.sub(query_pattern, '', query, re.I)
    prefix = output + "/" + shortname + "_" + hmmshortname
    output_file = prefix + '.hmm.out'
    output_file2 = prefix + '.txt'
    print('Running hmmscan...')
    os.system(' '.join(['hmmscan', model, query, ">", output_file]))
    print('Parsing results...')
    os.system(' '.join(['sh', 'hmmscan-parser.sh', output_file, '>', output_file2]))

#Filtering by evalue and coverage
def filtering_by_evalue_and_coverage(model,query,output,evalue,coverage):
#removes extension, case insensitive search
# hmmshortname = re.sub('[.](hmm)','',model, re.I)
# Filtering by evalue and coverage
def filtering_by_evalue_and_coverage(model, query, output, evalue, coverage):
# Removes extension, case insensitive search
hmmshortname = re.sub(hmm_pattern,'',model, re.I)
#finds file format removes extension, case insensitive search
# shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I)
shortname = re.sub(query_pattern,'',query, re.I)
# Finds file format removes extension, case insensitive search
shortname = re.sub(query_pattern, '', query, re.I)
output_file2 = output+"/" + shortname + "_" + hmmshortname + '.txt'
hmm_table = open(output_file2, 'r')
output_file3 = output + "/" + shortname + "_" + hmmshortname+'.filtered.txt'
Expand All @@ -157,67 +152,64 @@ def filtering_by_evalue_and_coverage(model,query,output,evalue,coverage):
line2 = line.strip('\n').split('\t')
result_evalue = float(line2[2])
result_model_coverage = float(line2[7])
if (result_evalue <=evalue) and (result_model_coverage*100 >= coverage):
if (result_evalue <= evalue) and (result_model_coverage * 100 >= coverage):
hmm_filtered_table.write('%s' %line)
else:
continue
hmm_table.close()
hmm_filtered_table.close()


#Function to extract hits from filtered results
# Function to extract hits from filtered results
def extract_protein_hits(query,model,output):
#removes extension, case insensitive search
hmmshortname = re.sub(hmm_pattern,'',model, re.I)
# hmmshortname = re.sub('[.](hmm)', '', model, re.I)
#finds file format removes extension, case insensitive search
# shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I)
shortname = re.sub(query_pattern,'',query, re.I)
input_file4 = output+"/"+shortname+"_"+hmmshortname+'.filtered.txt'
# Removes extension, case insensitive search
hmmshortname = re.sub(hmm_pattern, '', model, re.I)
# Finds file format removes extension, case insensitive search
shortname = re.sub(query_pattern, '', query, re.I)
input_file4 = output + "/" + shortname + "_" + hmmshortname + '.filtered.txt'
hmm_filtered_table2 = open(input_file4, 'r')

print ' Extracting proteins for %s and HMM database=%s' %(query,model)
#Create dictionary with protein:[list of model it hits]
print ' Extracting proteins for %s and HMM database=%s' %(query, model)
# Create dictionary with protein:[list of model it hits]
protein_hit_dictionary = {}
all_models_hits = []
for line3 in hmm_filtered_table2:
line4 = line3.strip('\n').split('\t')
protein_hit = line4[0]
model_of_protein_hit = line4[1].rstrip(' ')

#update list of proteins
# Update list of proteins
all_models_hits.append(model_of_protein_hit)

#Get list of proteins hits, if non existent create empty list
# Get list of proteins hits, if non existent create empty list
models = protein_hit_dictionary.get(protein_hit, [])
#Append current model hit to list
# Append current model hit to list
models.append(model_of_protein_hit)
#Update dictionary entry
# Update dictionary entry
protein_hit_dictionary[protein_hit] = models

#Print message
count_of_models=list(set(all_models_hits))
count_of_proteins=len(protein_hit_dictionary.keys())
# Print message
count_of_models = list(set(all_models_hits))
count_of_proteins = len(protein_hit_dictionary.keys())

print ' Extracting %s unique proteins corresponding to %s HMM models' \
%(count_of_proteins,len(count_of_models))

#open one output file per model
#Generate list of output files
#for item in all_models_hits:
# Open one output file per model
# Generate list of output files
# For item in all_models_hits:
files = [open(output + '/' + shortname + '_' + hmmshortname + '_' + item + '.fasta', 'w') \
for item in set(all_models_hits)]

#Open original file, find if name is in hit list,
#Then get models hits and write to model result files
# Open original file, find if name is in hit list,
# Then get models hits and write to model result files
filein = open(query, 'r')
for record in SeqIO.parse(filein,"fasta"):
for record in SeqIO.parse(filein, "fasta"):
name = record.name
if name in protein_hit_dictionary.keys():
what_models_list = protein_hit_dictionary.get(name)
#Iterate this list
# Iterate this list
for what_model in what_models_list:
#Find index
# Find index
index = count_of_models.index(what_model)
files[index].write('>%s\n%s\n' % (name, record.seq))
#Close files
Expand All @@ -226,46 +218,44 @@ def extract_protein_hits(query,model,output):


#Function to extract contigs
def extract_contigs(query,model,output,assembly_file):
# removes extension, case insensitive search
# hmmshortname = re.sub('[.](hmm)', '', model, re.I)
hmmshortname = re.sub(hmm_pattern,'',model, re.I)
#finds file format removes extension, case insensitive search
# shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','', query, re.I)
shortname = re.sub(query_pattern,'',query, re.I)
def extract_contigs(query, model, output, assembly_file):
# Removes extension, case insensitive search
hmmshortname = re.sub(hmm_pattern, '', model, re.I)
# Finds file format removes extension, case insensitive search
shortname = re.sub(query_pattern, '', query, re.I)
input_file4 = output + "/" + shortname + "_" + hmmshortname +'.filtered.txt'
hmm_filtered_table2 = open(input_file4, 'r')

print ' Extracting contigs for file=%s and HMM database=%s' %(query,model)
print ' Extracting contigs for file=%s and HMM database=%s' %(query, model)
#Create dictionary with protein:[list of model it hits]
protein_model_dictionary = {}
for line3 in hmm_filtered_table2:
line4 = line3.strip('\n').split('\t')
protein_hit = line4[0]
model_of_protein_hit = line4[1].rstrip(' ')
#Get list of proteins hits, if non existent create empty list
# Get list of proteins hits, if non existent create empty list
models = protein_model_dictionary.get(protein_hit, [])
#Append current model hit to list
# Append current model hit to list
models.append(model_of_protein_hit)
#Update dictionary entry
# Update dictionary entry
protein_model_dictionary[protein_hit] = models

#Create protein-contig dictionary
# Create protein-contig dictionary
contigs_list = []
#parse through list and add to contigs_list
# Parse through list and add to contigs_list
for protein in protein_model_dictionary.keys():
contig = protein.rsplit('_', 1)
contigs_list.append(contig[0])
contigs_list = list(set(contigs_list))

#Open original file, find if name is in hit list,
#Then get models hits and write to model result files
assembly_in = open(assembly_file,'r')
contigs_file = output+"/" + shortname + "_" + hmmshortname +'_contigs.fasta'
# Open original file, find if name is in hit list,
# Then get models hits and write to model result files
assembly_in = open(assembly_file, 'r')
contigs_file = output + "/" + shortname + "_" + hmmshortname + '_contigs.fasta'
contigs_out = open(contigs_file, 'w')
print ' Looking for %s contigs' %len(contigs_list)
progress_counter = 0
for record in SeqIO.parse(assembly_in,"fasta"):
for record in SeqIO.parse(assembly_in, "fasta"):
name = record.name
if name in contigs_list:
progress_counter = progress_counter + 1
Expand All @@ -278,13 +268,11 @@ def extract_contigs(query,model,output,assembly_file):
print 'Some contigs were not found'


#Function to extract all proteins from contig
# Function to extract all proteins from contig
def extract_all_proteins_from_contigs(query, model, output):
# Removes extension, case insensitive search
hmmshortname = re.sub(hmm_pattern,'',model, re.I)
# hmmshortname = re.sub('[.](hmm)','',model, re.I)
hmmshortname = re.sub(hmm_pattern, '', model, re.I)
# Finds file format removes extension, case insensitive search
# shortname = re.sub('[.](fasta$|fas$|faa$|fsa$|fa$)','',query, re.I)
shortname = re.sub(query_pattern, '', query, re.I)
input_file4 = output + "/" + shortname + "_" + hmmshortname + '.filtered.txt'
hmm_filtered_table2 = open(input_file4, 'r')
Expand All @@ -311,7 +299,7 @@ def extract_all_proteins_from_contigs(query, model, output):
# Open one output file per model
# Generate list of output files
files = [open(output + '/' + shortname + '_' + hmmshortname + '_' \
+ contigs + '.fasta','w') for contigs in (contigs_list)]
+ contigs + '.fasta', 'w') for contigs in (contigs_list)]

# Open original file, find if name is in hit list,
# Then get models hits and write to model result files
Expand Down Expand Up @@ -346,7 +334,7 @@ def main(argv):
raise IOError,\
"Cannot open hmmscan-parser.sh. Please copy it to the local directory"

# initialize the input file and model, loading parameters
# Initialize the input file and model, loading parameters
input_model = opts.input_model
input_fp = opts.input_fp
output_dir = opts.output_dir
Expand All @@ -357,8 +345,7 @@ def main(argv):

# Creates a model length dictionary
print 'Checking model length...'
# hmmshortname = re.sub('[.](hmm)', '', input_model, re.I)
hmmshortname = re.sub(hmm_pattern,'',input_model, re.I)
hmmshortname = re.sub(hmm_pattern, '', input_model, re.I)
hmm_leng_file = hmmshortname + ".length.txt"
print ' Created %s file' % hmm_leng_file
get_hmm_len(input_model)
Expand All @@ -380,7 +367,6 @@ def main(argv):
extract_protein_hits(input_fp, input_model, output_dir)
elif extract_mode == 'contigs':
extract_contigs(input_fp, input_model, output_dir, assembly_file)

elif extract_mode == 'all':
extract_protein_hits(input_fp, input_model, output_dir)
extract_contigs(input_fp, input_model, output_dir, assembly_file)
Expand Down
84 changes: 84 additions & 0 deletions add_coverage_to_fasta_contigs_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/python
# File created on 31 Jan 2014.

__author__ = "Erick Cardenas Poire"
__copyright__ = "Copyright 2014"
__credits__ = [""]
__version__ = "1.0"
__maintainer__ = "Erick Cardenas Poire"
__status__ = "Release"

from Bio import SeqIO
import sys
from os import makedirs, sys, listdir, environ, path
import re
import inspect
from commands import getstatusoutput
from optparse import OptionParser
import shutil

# Descriptive metadata about this script.
script_info = {
    'brief_description': "Adds coverage information from one file and modifies fasta header",
    'script_description': (
        "Adds coverage information from one file and modifies fasta header\n"
        "REQUIRED: You must have a fasta and coverage file with same base name"
    ),
    'script_usage': [],
}

# Help text handed to the option parser and printed when the input is missing.
usage = (
    "\n"
    "Need to run it like this:\n"
    "./add.coverage.to.fasta.py -i input_file\n"
    "For more options: ./add.coverage.to.fasta.py -h"
)

# Command-line interface: a single option naming the input fasta file.
parser = OptionParser(usage=usage)
parser.add_option(
    "-i", "--input_file",
    dest="input_fp",
    help='the input fasta file/input dir [REQUIRED]'
)


def create_an_inputs_and_output(input_file):
    """Derive the coverage and output file names that accompany a fasta file.

    A trailing fasta extension (fasta, fas, fna, faa, fsa or fa, in any letter
    case) is removed from ``input_file`` to obtain the base name. A name that
    does not end in one of these extensions is used unchanged as the base.

    Args:
        input_file: path of the input fasta file.

    Returns:
        A three-item list: ``[input_file, <base>.cov, <base>.new.fasta]``.
    """
    # `flags` must be given by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.I would only cap the number of
    # substitutions and leave the match case sensitive.
    shortname = re.sub(r'[.](fasta$|fas$|fna$|faa$|fsa$|fa$)', '', input_file,
                       flags=re.I)
    return [input_file, shortname + ".cov", shortname + ".new.fasta"]

def valid_arguments(opts, args):
    """Report whether the required input option is missing.

    Despite its name, this returns True when the arguments are NOT usable
    (no input file was supplied) and False when an input file is present;
    the caller prints the usage text and exits on a True result.

    Args:
        opts: parsed options object exposing an ``input_fp`` attribute.
        args: remaining positional arguments (not inspected).

    Returns:
        True if ``opts.input_fp`` is None, otherwise False.
    """
    return opts.input_fp is None

def main(argv):
    """Append per-sequence coverage to the headers of a fasta file.

    Reads a tab-separated coverage file (sequence id in the first column,
    coverage in the second) that shares the input's base name with a ``.cov``
    extension, then writes ``<base>.new.fasta`` in which every header reads
    ``>id coverage=<value>``. Sequences absent from the coverage file get
    ``coverage=0``.

    Args:
        argv: command-line arguments, without the program name.
    """
    # Parse the arguments that were actually passed in rather than silently
    # falling back to sys.argv.
    (opts, args) = parser.parse_args(argv)
    if valid_arguments(opts, args):
        # valid_arguments returns True when the required input is missing.
        print(usage)
        sys.exit(0)

    # Resolve the fasta input, its coverage companion and the output path.
    fasta_fp, coverage_fp, output_fp = create_an_inputs_and_output(opts.input_fp)

    # Build the sequence id -> coverage lookup.
    coverage_dictionary = {}
    with open(coverage_fp, 'r') as coverage_file_in:
        for line in coverage_file_in:
            # Drop the line terminator first; otherwise a two-column file
            # leaves a newline in the coverage value and breaks the header.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                # Blank or malformed line: nothing to record.
                continue
            coverage_dictionary[fields[0]] = fields[1]

    with open(output_fp, 'w') as fileout:
        for seq_record in SeqIO.parse(fasta_fp, format="fasta"):
            # The default is the string '0' so the concatenation below also
            # works for sequences missing from the coverage file.
            coverage = coverage_dictionary.get(seq_record.id, '0')
            description = "coverage=" + coverage
            fileout.write('>%s %s\n%s\n' % (seq_record.id, description, seq_record.seq))

# Script entry point: run main() with the command-line arguments, excluding
# the program name.
# NOTE(review): this call is not guarded, so it also runs on import.
main(sys.argv[1:])
Loading

0 comments on commit 2546909

Please sign in to comment.