-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathconstruct_DeSeq2_input.py
129 lines (93 loc) · 4.14 KB
/
construct_DeSeq2_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
### Boas Pucker ###
### v0.1 ###
### bpucker@cebitec.uni-bielefeld.de ###
__usage__ = """
python construct_DeSeq2_input.py\n
--sample_sheet <FULL_PATH_TO_TABLE_WITH_INFORMATION_ABOUT_SAMPLES>\n
--sample_dir <FULL_PATH_TO_DIRECTORY_CONTAINING_COUNT_TABLES>\n
--output_dir <FULL_PATH_TO_OUTPUT_DIRECTORY(will_be_created_if_necessary)>\n
bug reports and feature requests: bpucker@cebitec.uni-bielefeld.de
"""
import glob, sys, re, os, random
from datetime import date
from operator import itemgetter
# --- end of imports --- #
def load_expression_values( filename ):
"""! @brief load all expression values from featureCounts result file and return raw counts """
expression_data = {}
with open( filename, "r" ) as f:
line = f.readline()
while line:
try:
ID = re.findall( "AT[12345CM]{1}G\d{5}", line )[0]
current = random.choice( [ 1 ]*100 + [ 1000 ] ) * int( line.strip().split('\t')[-1] )
expression_data.update( { ID: current } )
except:
pass #print line
line = f.readline()
return expression_data
def load_sample_information( sample_information_file, data_dir, pseudo_replicates):
"""! @brief load information about all samples into list of dictionaries """
sample_information = []
with open( sample_information_file, "r" ) as f:
f.readline() #header
line = f.readline()
while line:
parts = line.strip().split(',')
for i in range( pseudo_replicates ):
sample_information.append( { 'name': parts[1]+'_'+str(i+1),
'expression':load_expression_values( data_dir + parts[2] + ".countTable" ),
'genotype': parts[1]
} )
line = f.readline()
print "number of loaded sample information data sets: " + str( len( sample_information ) )
return sample_information
def construct_sample_table( sample_information, sample_table ):
"""! @brief construct sample table for DeSeq2 """
# --- construct sample table --- #
ordered_valid_samples = sorted( sample_information, key=itemgetter('genotype') )
with open( sample_table, "w" ) as out:
out.write( '\t'.join( [ "", "genotype" ] ) + '\n' )
for idx, sample in enumerate( ordered_valid_samples ):
new_line = "\t".join( [ sample['name'], #sample name
sample['genotype'] #genotype
] ) + '\n'
out.write( new_line )
return ordered_valid_samples
def construct_data_matrix( ordered_valid_samples, data_matrix_file ):
"""! @brief construct the data matrix for DeSeq2 based on featureCount result files """
with open( data_matrix_file, "w" ) as out:
# --- construct header line (all sample IDs in order) --- #
ordered_valid_ids = []
for sample in ordered_valid_samples:
ordered_valid_ids.append( sample['name'] )
out.write( "\t".join( [ "" ] + ordered_valid_ids ) + '\n' )
# --- construct data body (expression data per sample in one column) --- #
gene_IDs = sorted( ordered_valid_samples[0]['expression'].keys() )
for gene in gene_IDs:
new_line = [ gene ] #add geneID
for sample in ordered_valid_samples:
new_line.append( sample[ 'expression' ][ gene ] )
out.write( '\t'.join( map( str, new_line ) ) + '\n' )
def main( arguments ):
"""! @brief takes input and runs all functions """
sample_dir = arguments[ arguments.index( '--sample_dir' ) +1 ]
sample_information_file = arguments[ arguments.index( '--sample_sheet' ) +1 ]
prefix = arguments[ arguments.index( '--output_dir' ) +1 ]
pseudo_replicates = 3 #1 would prevent the production of pseudo replicates
# --- nothing needs to be changed beyond this point --- #
if prefix[-1] != '/':
prefix += "/"
if not os.path.exists( prefix ):
os.makedirs( prefix )
sample_table = prefix + "clean_sample_table.txt"
data_matrix_file = prefix + "clean_data_matrix.txt"
sample_information = load_sample_information( sample_information_file, sample_dir, pseudo_replicates )
ordered_valid_samples = construct_sample_table( sample_information, sample_table )
construct_data_matrix( ordered_valid_samples, data_matrix_file )
print "all done!"
if __name__ == '__main__':
if '--sample_sheet' in sys.argv and '--sample_dir' in sys.argv and '--output_dir' in sys.argv:
main( sys.argv )
else:
sys.exit( __usage__ )