# Haybaler
# Sophia Poertner, Nov 2020 - April 2021
# Lisa Hollstein, Jan 2022
# Combine your Wochenende .bam.txt or reporting output from multiple samples into one matrix per stat.
# Usage: bash run_haybaler.sh
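# Example invocation (hypothetical file names; run_haybaler.sh normally assembles the
# ';'-prefixed file list and the paths for you):
# python3 haybaler.py -i ";sample1.rep.us.csv;sample2.rep.us.csv" -p . -op haybaler_output -o haybaler.csv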
import pandas as pd
import click
import os
import re
version = "0.31 - Jan 2022"
# changelog
# 0.31 shorten names, remove identifier and longer elements leaving only Genus_species
# 0.30 read all samples in one call. Filter out taxa with values below a readcount and RPMM limit
# 0.23 improve file input and arg handling
# 0.22 bugfix, correct gc_ref and chr_length for new chromosomes
# 0.21 fix ordering problems
# 0.20 add find_order and sort_new functions, so taxa with highest readcounts come first
# 0.11 add heatmap prep and R scripts
# 0.10 initial commits, improvements, testing
def read_csv(filename, filepath):
    file = pd.read_csv(filepath + '/' + filename, decimal=",", index_col=0)
    return file

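# txt_to_df below parses Wochenende .bam.txt files, i.e. idxstats-like tab-separated lines
# without a header line. An illustrative (made-up) input row:
# 1_AB123456_Escherichia_coli	4641652	1337	0
# (reference name, chromosome length, mapped read count, unmapped read segments)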
def txt_to_df(filename, filepath):
    with open(filepath + '/' + filename) as infile, open('tmp.csv', 'w') as outfile:
        # add column names (not given in the txt file) and save as a temporary outfile
        outfile.write("species,chr_length,read_count,unmapped_read_segments\n")
        # collapse whitespace and replace it with commas (tab-separated to comma-separated)
        for line in infile:
            outfile.write(" ".join(line.split()).replace(' ', ','))
            outfile.write("\n")
    file = pd.read_csv("tmp.csv", decimal=",", index_col=0)
    if os.path.exists("tmp.csv"):  # delete the temporary outfile again
        os.remove("tmp.csv")
    del file['unmapped_read_segments']  # this column is not needed downstream
    return file

def join_dfs(file, name, path, column, input_name):
    sample = input_name[:input_name.find(".")]  # shorten the sample name
    sub_df = file[[column]].copy()  # new df with just the wanted column
    sub_df = sub_df.rename(columns={column: sample})  # rename the column to the sample name
    if os.path.isfile(path + "/" + column + "_" + name):  # the file for the wanted stat already exists
        old = pd.read_csv(path + "/" + column + "_" + name, decimal=",", index_col=0, sep='\t')
        old.fillna(0.0, inplace=True)
        if sample not in old.columns:  # no duplicate samples
            new_chr = []  # collect chromosomes which are new in this sample
            for chromosome in file.index:
                if chromosome not in old.index:
                    new_chr.append(chromosome)
            # get a df with the chr_length and gc_ref of the new chromosomes
            if 'gc_ref' in file:
                new_chr_df = file.loc[new_chr, ['chr_length', 'gc_ref']]
            else:
                new_chr_df = file.loc[new_chr, ['chr_length']]
            # append the new chromosome rows to the old df (DataFrame.append was removed in pandas 2.0)
            old = pd.concat([old, new_chr_df])
            new = pd.concat([old, sub_df], axis=1, sort=False)  # add the new column to the old df
            if 'gc_ref' not in new and 'gc_ref' in file:
                # move gc_ref directly behind chr_length so the fixed columns stay in front
                gc = file[['gc_ref']].copy()
                new = pd.concat([new, gc], axis=1, sort=False)
                tmp = new['gc_ref'].to_list()
                del new['gc_ref']
                new.insert(1, 'gc_ref', tmp)
        else:
            new = old
    else:
        # the file for the wanted stat does not exist yet: create it from the columns which are
        # always the same plus the current sample
        if 'gc_ref' in file:
            new = file[['chr_length', 'gc_ref', column]].copy()
        else:
            new = file[['chr_length', column]].copy()
        new = new.rename(columns={column: sample})
    new.fillna(0.0, inplace=True)
    new = new.astype(float)
    new = new.round(2)
    return new

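# A per-stat matrix written by join_dfs looks like this (illustrative values; the "species"
# index label is restored later by adding_species):
# species	chr_length	gc_ref	sample1	sample2
# 1_AB123456_Escherichia_coli	4641652.0	0.51	1337.0	42.0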
# calculate in which order the organisms should appear in the output files:
# the organism with the highest read count across all samples comes first
def find_order(df):
    samples = []  # list of all sample columns
    for column in df.columns:
        if column != 'chr_length' and column != 'gc_ref':
            samples.append(column)
    sum_organisms = []  # list of the sums from all samples for each organism (row sums)
    for organism in df.index:
        tmp_organism = []  # list of the stats from all samples for one organism
        for column in samples:
            tmp_organism.append(float(df.at[organism, column]))
        sum_organisms.append(sum(tmp_organism))
    df['sum_organisms'] = sum_organisms  # add a column with the sums to the df
    df = df.sort_values(by='sum_organisms', ascending=False)  # sort the df by the sums
    df = df.drop(['sum_organisms'], axis=1)  # delete the column with the sums again
    order = df.index
    return df, order

# sort a new df so it matches the previously calculated order
def sort_new(df, order):
    order_df = pd.DataFrame(index=order)  # empty df whose index holds the organisms in the right order
    new = pd.concat([order_df, df], axis=1, sort=False)  # concat onto order_df so the df takes over its order
    return new

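# Worked example (illustrative names and numbers): if the read counts sum to
# Escherichia_coli = 1379.0 and Staphylococcus_aureus = 77.0 across the sample columns,
# find_order returns the index [Escherichia_coli, Staphylococcus_aureus], and sort_new then
# reindexes the matrices of all other stats (RPMM, etc.) to that same row order.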
def adding_species(path, column, name):
    # when concatenating two dfs, the index name "species" gets lost, so it has to be added back afterwards
    with open(path + "/" + column + "_" + name, 'r+') as f:
        content = f.read()
        if not content.split()[0] == "species":
            f.seek(0, 0)
            f.write("species" + content)

def get_taxa_to_exclude(file, limit, taxa_to_exclude, path):
    reason = file.replace("_haybaler.csv", "_below_" + str(limit))
    reason = reason.replace(path + "/", "")
    df = pd.read_csv(file, decimal=",", index_col=0, sep='\t')
    # gc_ref may be absent (e.g. for plain .bam.txt input), hence errors='ignore'
    df = df.drop(['chr_length', 'gc_ref'], axis=1, errors='ignore')
    # find the rows where all values are below the limit and add them to the exclusion list
    below_limit = df[(df.astype('float') < limit).all(axis=1)].index
    taxa_to_exclude.extend(below_limit)
    # also build a df recording which taxa are excluded and why
    taxa_to_exclude_df = pd.DataFrame(index=below_limit)
    taxa_to_exclude_df[reason] = "yes"
    return taxa_to_exclude, taxa_to_exclude_df

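# With the default limits and the default output name haybaler.csv, the resulting
# excluded_taxa.csv looks like this (illustrative row; taxa missing from one filter are
# filled with "no" in main):
# species	read_count_below_10	RPMM_below_300
# 1_XY000001_Rare_organism	yes	no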
def exclude_taxa(file, path, taxa_to_exclude):
    df = pd.read_csv(path + "/" + file, decimal=",", index_col=0, sep='\t')  # read csv
    df = df[~df.index.isin(taxa_to_exclude)]  # exclude the taxa
    df.to_csv(path + "/" + file, sep="\t")  # save again as csv

def shorten_names(output_path, col, output_file):
    short = pd.read_csv(output_path + "/" + col + "_" + output_file, index_col=0, sep="\t")
    rownames = list(short.index)
    for row in rownames:
        new_name = row
        split_name = row.split("_")
        for n in range(len(split_name)):
            if split_name[n] == "organism" or split_name[n] == "candidatus":
                # if "organism" or "candidatus" is in the name, the species name consists of the two following words
                new_name = split_name[n] + "_" + split_name[n + 1] + "_" + split_name[n + 2]
                new_name = subspecies(new_name, n, 3, split_name)  # check for subspecies
                short = change_name(new_name, row, short)
                break
            try:
                # use the character codes to find the species name:
                # it consists of at least two words, genus name and specific epithet,
                # and only the first word starts with a capital letter
                first_letter = ord(split_name[n][0])  # must be a capital letter
                second_letter = ord(split_name[n][1])  # must be a lowercase letter
                second_first = ord(split_name[n + 1][0])  # must be a lowercase letter
                # code points of lowercase letters lie between 97 and 122, of capital letters between 65 and 90
                if 64 < first_letter < 91 and 96 < second_letter < 123 and 96 < second_first < 123:
                    new_name = split_name[n] + "_" + split_name[n + 1]
                    new_name = subspecies(new_name, n, 2, split_name)  # check for subspecies
                    short = change_name(new_name, row, short)
                    break
            except IndexError:  # name part too short or nothing follows it: cannot be a genus/species pair
                pass
    save_name = output_path + "/" + col + "_" + output_file
    try:
        save_name = save_name.split(".")[-2] + "_short.csv"
    except IndexError:  # no "." in the file name: just append the suffix
        save_name = save_name + "_short.csv"
    index = short.index
    if index.is_unique:  # only save if all row names are unique
        short.to_csv(save_name, sep='\t')

def subspecies(new_name, n, count, split_name):
    # bounds checks guard against names that end directly after the species part
    if n + count < len(split_name) and split_name[n + count] == "subsp":
        count += 1
        # skip empty parts caused by consecutive underscores
        while n + count < len(split_name) and len(split_name[n + count]) == 0:
            count += 1
        if n + count < len(split_name):
            new_name = new_name + "_subsp_" + split_name[n + count]
    return new_name

def change_name(new_name, row, short):
    short.rename(index={row: new_name}, inplace=True)
    return short

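# Shortening examples (made-up identifiers):
#   1_AB123456.1_Escherichia_coli_strain_K12        -> Escherichia_coli
#   2_CD789012.1_Salmonella_enterica_subsp_enterica -> Salmonella_enterica_subsp_enterica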
@click.command()
@click.option('--input_files', '-i', help='Names of the input files, separated by ";"', required=True)
@click.option('--input_path', '-p', help='Path of the input files', required=True)
@click.option('--output_path', '-op', help='Name of the output path')
@click.option('--output_file', '-o', help='Name of the output file')
@click.option('--readcount_limit', '-l', default=10,
              help='Minimum read count per sample. Chromosomes with fewer than this many reads '
                   'in every sample are filtered out. Default = 10')
@click.option('--rpmm_limit', '-r', default=300,
              help='Minimum RPMM per sample. Chromosomes with less than this RPMM '
                   'in every sample are filtered out. Default = 300')
def main(input_files, input_path, output_path, output_file, readcount_limit, rpmm_limit):
    # the file list is expected to start with ';', so the first (empty) element of the split is dropped
    list_input_files = input_files.split(";")[1:]
    col_list = []
    # debug prints messages on input and progress
    debug = False  # True or False
    if debug:
        print("INFO: Haybaler debug is on.")
    for input_file in list_input_files:
        try:
            if input_file.endswith('.csv'):
                if not re.search("rep.u*s.csv", input_file):
                    print("WARNING: Input file {0} does not match the typical file names. Only bam.txt, "
                          "rep.s.csv and rep.us.csv work as input files.".format(input_file))
                file = read_csv(input_file, input_path)
                if debug:
                    print(input_file)
            elif input_file.endswith('.txt'):
                if not re.search("bam.txt", input_file):
                    print("WARNING: Input file {0} does not match the typical file names. Only bam.txt, "
                          "rep.s.csv and rep.us.csv work as input files.".format(input_file))
                file = txt_to_df(input_file, input_path)
                if debug:
                    print(input_file)
            else:
                raise Exception(
                    "Input file {0} has the wrong file format. Only txt and csv are supported.".format(input_file))
        except FileNotFoundError:
            raise Exception("Failed to find or read input file: {0}".format(input_file))
        except AttributeError:
            raise Exception("No input file given. Please specify an input file. Try --help for help.")
        # make a separate file for each stat, i.e. for each column in the input file
        # (or add the stats to already existing files)
        for col in file.columns:
            if col != "chr_length" and col != "gc_ref":  # these columns are the same in every sample, no extra file needed
                if debug:
                    print(col)
                df = join_dfs(file, output_file, output_path, col, input_file)
                if col == 'read_count':
                    df, order = find_order(df)
                elif 'order' in locals():
                    df = sort_new(df, order)
                else:
                    print("Sorting skipped for file {0}: it was not possible to calculate the correct "
                          "order.".format(input_file))
                df.to_csv(output_path + "/" + col + "_" + output_file, sep='\t')
                adding_species(output_path, col, output_file)
                col_list.append(col)
    taxa_to_exclude = []
    excluded_taxa_readcount = None
    excluded_taxa_RPMM = None
    if os.path.isfile(output_path + "/" + "read_count_" + output_file):
        taxa_to_exclude, excluded_taxa_readcount = get_taxa_to_exclude(output_path + "/" + "read_count_" + output_file,
                                                                       readcount_limit, taxa_to_exclude, output_path)
    if os.path.isfile(output_path + "/" + "RPMM_" + output_file):
        taxa_to_exclude, excluded_taxa_RPMM = get_taxa_to_exclude(output_path + "/" + "RPMM_" + output_file, rpmm_limit,
                                                                  taxa_to_exclude, output_path)
    # check whether the dfs of the taxa excluded by read count and by RPMM exist; concat them if both do
    if excluded_taxa_readcount is not None and excluded_taxa_RPMM is not None:
        excluded_taxa_df = pd.concat([excluded_taxa_readcount, excluded_taxa_RPMM], axis=1)
    elif excluded_taxa_readcount is not None:
        excluded_taxa_df = excluded_taxa_readcount
    elif excluded_taxa_RPMM is not None:
        excluded_taxa_df = excluded_taxa_RPMM
    else:
        excluded_taxa_df = pd.DataFrame()
    excluded_taxa_df.fillna("no", inplace=True)
    excluded_taxa_df.to_csv(output_path + "/excluded_taxa.csv", sep="\t")  # save the excluded taxa and the reasons
    # when concatenating two dfs, the index name "species" gets lost, so it has to be added back afterwards
    try:
        with open(output_path + "/excluded_taxa.csv", 'r+') as f:
            content = f.read()
            if not content.split()[0] == "species":
                f.seek(0, 0)
                f.write("species" + content)
    except FileNotFoundError:
        print("WARNING: Output file excluded_taxa.csv could not be created and written.")
    for haybaler_csv in os.listdir(output_path):
        if haybaler_csv.endswith(output_file):
            exclude_taxa(haybaler_csv, output_path, taxa_to_exclude)  # drop the excluded taxa from every haybaler csv
    # recreate all the output csvs with only species names as row names
    for col in list(set(col_list)):
        shorten_names(output_path, col, output_file)

if __name__ == '__main__':
    main()