-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathfeatures.py
executable file
·87 lines (67 loc) · 3.18 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
"""
Analyze ProteinGraphML training data, from input pickle file.
E.g. diabetes_pid_TrainingData.pkl
Example output:
./features.py diabetes_pid_TrainingData.pkl
INFO:rows: 2105; columns: 22056
INFO:Y: 1975: 0
INFO:Y: 130: 1
INFO:PPI columns : 0/22057 (0.00%)
INFO:cell line columns : 18997/22057 (86.13%)
INFO:GO columns : 785/22057 (3.56%)
INFO:IPR columns : 362/22057 (1.64%)
INFO:KEGG/R-HSA columns : 340/22057 (1.54%)
INFO:KEGG/hsa columns : 117/22057 (0.53%)
INFO:ACH columns : 1156/22057 (5.24%)
INFO:0. 6: features: 2868 / 22054 ; missing 0 / 22054; zeros 19186 / 22054
INFO:1. 16903: features: 1242 / 22054 ; missing 0 / 22054; zeros 20812 / 22054
INFO:2. 9740: features: 2500 / 22054 ; missing 0 / 22054; zeros 19554 / 22054
INFO:3. 3599: features: 21674 / 22054 ; missing 0 / 22054; zeros 380 / 22054
INFO:4. 15387: features: 2933 / 22054 ; missing 0 / 22054; zeros 19121 / 22054
"""
import sys,os,pickle,logging
import pandas as pd
# Column-tag categories reported by the script, in output order.
# Each entry is (label used in the log line, regex, require whole-tag match?).
COLUMN_CATEGORIES = (
    ("PPI", r"[0-9]+\s*", True),            # purely numeric tags
    ("cell line", r"[0-9]+:[A-Z]", False),  # cell lines? co-expression?
    ("GO", r"GO:", False),
    ("IPR", r"IPR", False),
    # NOTE(review): "R-HSA" looks like a Reactome-style prefix rather than
    # KEGG; the label is kept as-is so the output format does not change.
    ("KEGG/R-HSA", r"R-HSA", False),
    ("KEGG/hsa", r"hsa", False),
    ("ACH", r".*\(ACH-", False),
)

# Number of leading columns (after reset_index) that are skipped when
# building a row's feature vector.
# NOTE(review): presumably protein_id plus two label/metadata columns --
# confirm against the code that writes the training pickle.
N_LEADING_NON_FEATURE_COLS = 3


def count_column_categories(columns):
    """Count how many column tags fall into each known category.

    Args:
        columns: iterable of column tags (e.g. ``df.columns``).

    Returns:
        List of ``(label, count)`` tuples, one per entry of
        ``COLUMN_CATEGORIES`` and in the same order.
    """
    # dtype=object keeps the .str accessor usable even for an empty input.
    tags = pd.Series(list(columns), dtype=object)
    counts = []
    for label, pattern, whole_tag in COLUMN_CATEGORIES:
        matcher = tags.str.fullmatch if whole_tag else tags.str.match
        counts.append((label, int(matcher(pattern).sum())))
    return counts


def feature_counts(fvec):
    """Split one feature vector into non-empty, missing and zero counts.

    Args:
        fvec: pandas Series holding the feature values of a single row.

    Returns:
        Tuple ``(non_empty, missing, zeros)`` where ``missing`` counts NaN/None
        entries, ``zeros`` counts entries equal to 0, and ``non_empty`` is the
        remainder of ``fvec.size``.
    """
    missing = int(fvec.isna().sum())  # isnull() is an alias of isna()
    zeros = int((fvec == 0).sum())
    return fvec.size - missing - zeros, missing, zeros


def main(argv):
    """Load a pickled training DataFrame and log summary statistics.

    Logs the frame shape, the distribution of the ``Y`` column, the number of
    columns per tag category, and per-row feature counts for rows with Y == 1.

    Args:
        argv: command-line argument list; ``argv[1]`` is the pickle file path.

    Returns:
        Process exit status: 0 on success, 1 on a usage error.
    """
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
    if len(argv) < 2:
        logging.error(f"Syntax: {argv[0]} PICKLEFILE")
        # Stop here: continuing would fail with IndexError on argv[1].
        return 1

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only run this script on pickle files from a trusted source.
    with open(argv[1], 'rb') as f:
        df = pickle.load(f)
    logging.info(f"rows: {df.shape[0]}; columns: {df.shape[1]}")

    # Move the first index level (read below as "protein_id") into a column.
    df.reset_index(level=0, inplace=True)

    tag = "Y"
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for key, val in df[tag].value_counts().items():
        logging.info(f"{tag}: {val:6d}: {key}")

    ncols = df.shape[1]
    for label, n in count_column_categories(df.columns):
        logging.info(f"{label} columns : {n}/{ncols} ({100*n/ncols:.2f}%)")

    ###
    # Training positives
    # Counts and categories of non-empty features.
    # iterrows() yields (index label, row), so no positional lookup by label
    # is needed and each row is materialised only once.
    for i, row in df[df["Y"] == 1].iterrows():
        protein_id = int(row["protein_id"])
        fvec = row.iloc[N_LEADING_NON_FEATURE_COLS:]
        present, missing, zeros = feature_counts(fvec)
        logging.info(f"{i}. {protein_id:5d}: features: {present} / {fvec.size} ; missing {missing} / {fvec.size}; zeros {zeros} / {fvec.size}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))