merge_humann.py

import argparse as ap
import os
import re

import numpy as np
import pandas as pd

parser = ap.ArgumentParser(description='Combine per-sample HUMAnN result tables')
parser.add_argument('-f', '--file', type=str, required=True,
                    help='text file listing HUMAnN result files, one path per line')
parser.add_argument('-p', '--prefix', type=str, required=True,
                    help='prefix of the output file')
parser.add_argument('-d', '--datatype', type=str, required=True,
                    help='label for the feature column (e.g. gene family or pathway)')
parser.add_argument('-o', '--out', type=str, default='./', required=False,
                    help='output directory')
args = parser.parse_args()

def Filename_to_filepath(filelist):
    """Map each sample name to the path of its HUMAnN result file."""
    file2path = {}
    with open(filelist, 'r') as f:
        files = f.readlines()
    for File in files:
        File = File.strip()
        if os.path.exists(File):
            # Extract the sample ID (digits, optionally with one or two
            # underscore-separated digit groups) from paths such as
            # .../humann3/<sampleID>_genefamilies.tsv
            FileName = str(re.match(
                r'\S+(?:humann2_card|humann2_kegg|humann2_vfdb|humann2|humann3)'
                r'/(\d+|\d+_\d+|\d+_\d+_\d+)'
                r'_(?:genefamilies\.tsv|genefamilies_des\.tsv|'
                r'pathway_abundance_mean\.tsv|pathway_abundance_mean_des\.tsv|'
                r'pathway_abundance_median\.tsv|pathway_abundance_median_des\.tsv|'
                r'pathabundance\.tsv)', File)[1])
            file2path[FileName] = File
    return file2path
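
# Illustrative sketch only: the paths below are hypothetical examples of
# file-list entries the pattern above is meant to accept, not files from
# this repository.
#   /path/to/humann3/101_2_genefamilies.tsv  -> sample ID "101_2"
#   /path/to/humann2/7_pathabundance.tsv     -> sample ID "7"
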
def Read_File(filenames, samplenames, type):
    """Read one HUMAnN table and keep its first two columns."""
    # '#'-prefixed header lines and blank lines are skipped
    dat = pd.read_table(filenames,
                        sep='\t',
                        skip_blank_lines=True,
                        comment='#',
                        header=None)
    res = dat.iloc[:, 0:2].copy()
    # name the feature column after the data type and the abundance
    # column after the sample
    res.columns = [type, samplenames]
    return res
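
# Rough sketch (assumed, not taken from this repository) of a per-sample
# HUMAnN table as read above: a '#'-prefixed header line that read_table
# skips as a comment, then tab-separated feature / abundance rows, e.g.
#   # Gene Family <tab> SAMPLE_Abundance-RPKs
#   UniRef90_EXAMPLE1 <tab> 12.5
#   UniRef90_EXAMPLE2 <tab> 0.8
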
def Merge_humann(filedict, type):
    """Outer-join all per-sample tables on the feature column."""
    df = pd.DataFrame()
    for key in filedict:
        temp_df = Read_File(filedict[key], key, type)
        if df.empty:
            df = temp_df
        else:
            df = pd.merge(df,
                          temp_df,
                          on=type,
                          how='outer')
    # features absent from a sample get an abundance of 0
    df_na = df.replace(np.nan, 0)
    res = df_na.copy()
    # sort by the feature column
    res.sort_values(type, inplace=True)
    # drop every row whose feature name occurs more than once
    res.drop_duplicates(
        subset=type,
        keep=False,
        inplace=True)
    return res
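
# Made-up illustration of the outer merge above: with two samples S1 and S2,
# the union of features is kept and gaps are then filled with 0.
#   feature      S1      feature      S2        feature      S1   S2
#   UniRef90_A   10      UniRef90_A    4   ->   UniRef90_A   10    4
#   UniRef90_B    3      UniRef90_C    7        UniRef90_B    3    0
#                                                UniRef90_C    0    7
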
def Make_dir(dirpath):
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)

def main():
    file = args.file
    out = args.out
    type = args.datatype
    prefix = args.prefix
    file2dic = Filename_to_filepath(file)
    df_res = Merge_humann(file2dic, type)
    Make_dir(out)
    # the output is tab-separated even though the extension is .csv
    outfile_name = out + "/" + prefix + ".csv"
    df_res.to_csv(outfile_name, sep='\t', encoding='utf-8', index=False)
    print('Congratulations, the program ended without problems')


if __name__ == '__main__':
    main()
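
Example invocation (a minimal sketch; the file-list name and output directory
below are hypothetical, and -d only sets the label of the feature column in
the merged table):

    python merge_humann.py -f humann_files.txt -d genefamily -p merged_genefamilies -o merged_out

humann_files.txt should list one existing HUMAnN result path per line; the
merged, tab-separated table is written to merged_out/merged_genefamilies.csv.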