-
Notifications
You must be signed in to change notification settings - Fork 14
/
create_fingerprints.py
111 lines (83 loc) · 3.23 KB
/
create_fingerprints.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
import numpy as np
from aux_functions import to_numpyarray_to_list
from rdkit.Chem import rdMolDescriptors
def get_morgan(molecule, length=512, radius=2):
    """Compute a Morgan bit-vector fingerprint for a single molecule.

    Args:
        molecule: Molecule handed to RDKit (presumably an RDKit ``Mol`` --
            confirm against the caller).
        length: Number of bits requested from RDKit (``nBits``).
        radius: Morgan radius. 2 corresponds to ECFP4, 3 to ECFP6, etc.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        The fingerprint object returned by RDKit, or ``np.nan`` when the
        RDKit call raised.
    """
    try:
        desc = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            molecule, radius, nBits=length
        )
    except Exception as e:
        # Deliberate best effort: one bad molecule must not abort a whole
        # DataFrame.apply run, so report it and fall back to NaN.
        print(e)
        print('error ' + str(molecule))
        desc = np.nan
    return desc
def get_maccs(molecule):
    """Compute the MACCS keys fingerprint for a single molecule.

    Unlike the hashed fingerprints in this module, no length is passed to
    RDKit here.

    Args:
        molecule: Molecule handed to RDKit (presumably an RDKit ``Mol`` --
            confirm against the caller).

    Returns:
        The fingerprint object returned by RDKit, or ``np.nan`` when the
        RDKit call raised.
    """
    try:
        return rdMolDescriptors.GetMACCSKeysFingerprint(molecule)
    except Exception as err:
        # Best effort: report the failure and let the caller see NaN.
        print(err)
        print("error" + str(molecule))
    return np.nan
def get_atompairs(molecule, length=512):
    """Compute a hashed atom-pair bit-vector fingerprint for one molecule.

    Args:
        molecule: Molecule handed to RDKit (presumably an RDKit ``Mol`` --
            confirm against the caller).
        length: Number of bits requested from RDKit (``nBits``).

    Returns:
        The fingerprint object returned by RDKit, or ``np.nan`` when the
        RDKit call raised.
    """
    try:
        return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
            molecule, nBits=length
        )
    except Exception as err:
        # Best effort: report the failure and let the caller see NaN.
        print(err)
        print("error" + str(molecule))
    return np.nan
def get_topological_torsion(molecule, length=512):
    """Compute a hashed topological-torsion bit-vector fingerprint.

    Args:
        molecule: Molecule handed to RDKit (presumably an RDKit ``Mol`` --
            confirm against the caller).
        length: Number of bits requested from RDKit (``nBits``).

    Returns:
        The fingerprint object returned by RDKit, or ``np.nan`` when the
        RDKit call raised.
    """
    try:
        return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
            molecule, nBits=length
        )
    except Exception as err:
        # Best effort: report the failure and let the caller see NaN.
        print(err)
        print("error" + str(molecule))
    return np.nan
def create_ecfp4_fingerprint(df_molecules, length=512, write=False):
    """Build a DataFrame with one column per Morgan (ECFP4) fingerprint bit.

    Args:
        df_molecules: DataFrame with a "mols" column; every entry is passed
            to ``get_morgan``. The frame itself is not modified.
        length: Number of bits requested from ``get_morgan``.
        write: When true, also save the result to
            "./dataframes/ecfp4.csv" (the directory is created if missing).

    Returns:
        DataFrame whose columns are named "ECFP4_<n>", where <n> is the
        expanded column label plus one.
    """
    # Only the "mols" column is read, so no copy of the whole frame is needed.
    # to_numpyarray_to_list presumably turns each RDKit bit vector into a
    # plain list -- confirm in aux_functions.
    fingerprints = (
        df_molecules["mols"]
        .apply(lambda mol: get_morgan(mol, length))
        .apply(to_numpyarray_to_list)
    )
    # Expand each per-molecule sequence into one column per bit.
    ecfp_df = fingerprints.apply(pd.Series)
    ecfp_df = ecfp_df.rename(columns=lambda x: 'ECFP4_' + str(x + 1))
    if write:
        import os

        csv_path = "./dataframes/ecfp4.csv"
        # to_csv does not create missing directories; make sure the target
        # folder exists so the write does not fail on a fresh checkout.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        ecfp_df.to_csv(csv_path)
    return ecfp_df
def create_maccs_fingerprint(df_molecules, write=False):
    """Build a DataFrame with one column per MACCS fingerprint position.

    Args:
        df_molecules: DataFrame with a "mols" column; every entry is passed
            to ``get_maccs``. The frame itself is not modified.
        write: When true, also save the result to
            "./dataframes/maccs.csv" (the directory is created if missing).

    Returns:
        DataFrame whose columns are named "MACCS_<n>", where <n> is the
        expanded column label plus one.
    """
    # Only the "mols" column is read, so no copy of the whole frame is needed.
    # to_numpyarray_to_list presumably turns each RDKit bit vector into a
    # plain list -- confirm in aux_functions.
    fingerprints = df_molecules["mols"].apply(get_maccs).apply(to_numpyarray_to_list)
    # Expand each per-molecule sequence into one column per position.
    maccs_df = fingerprints.apply(pd.Series)
    # NOTE(review): RDKit documents its MACCS vector as having an unused
    # bit 0; if the helper keeps every bit, "MACCS_k" holds key k-1 rather
    # than key k. Names are left unchanged for compatibility -- confirm.
    maccs_df = maccs_df.rename(columns=lambda x: 'MACCS_' + str(x + 1))
    if write:
        import os

        csv_path = "./dataframes/maccs.csv"
        # to_csv does not create missing directories; make sure the target
        # folder exists so the write does not fail on a fresh checkout.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        maccs_df.to_csv(csv_path)
    return maccs_df
def create_atompairs_fingerprint(df_molecules, length=512, write=False):
    """Build a DataFrame with one column per atom-pair fingerprint bit.

    Args:
        df_molecules: DataFrame with a "mols" column; every entry is passed
            to ``get_atompairs``. The frame itself is not modified.
        length: Number of bits requested from ``get_atompairs``.
        write: When true, also save the result to
            "./dataframes/atom_pairs.csv" (the directory is created if
            missing).

    Returns:
        DataFrame whose columns are named "ATOMPAIR_<n>", where <n> is the
        expanded column label plus one.
    """
    # Only the "mols" column is read, so no copy of the whole frame is needed.
    # to_numpyarray_to_list presumably turns each RDKit bit vector into a
    # plain list -- confirm in aux_functions.
    fingerprints = (
        df_molecules["mols"]
        .apply(lambda mol: get_atompairs(mol, length))
        .apply(to_numpyarray_to_list)
    )
    # Expand each per-molecule sequence into one column per bit.
    atom_pairs_df = fingerprints.apply(pd.Series)
    atom_pairs_df = atom_pairs_df.rename(columns=lambda x: 'ATOMPAIR_' + str(x + 1))
    if write:
        import os

        csv_path = "./dataframes/atom_pairs.csv"
        # to_csv does not create missing directories; make sure the target
        # folder exists so the write does not fail on a fresh checkout.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        atom_pairs_df.to_csv(csv_path)
    return atom_pairs_df
def create_topological_torsion_fingerprint(df_molecules, length=512, write=False):
    """Build a DataFrame with one column per topological-torsion bit.

    Args:
        df_molecules: DataFrame with a "mols" column; every entry is passed
            to ``get_topological_torsion``. The frame itself is not modified.
        length: Number of bits requested from ``get_topological_torsion``.
        write: When true, also save the result to
            "./dataframes/topological_torsion.csv" (the directory is created
            if missing).

    Returns:
        DataFrame whose columns are named "TT<n>", where <n> is the expanded
        column label plus one.
    """
    # Only the "mols" column is read, so no copy of the whole frame is needed.
    # to_numpyarray_to_list presumably turns each RDKit bit vector into a
    # plain list -- confirm in aux_functions.
    fingerprints = (
        df_molecules["mols"]
        .apply(lambda mol: get_topological_torsion(mol, length))
        .apply(to_numpyarray_to_list)
    )
    # Expand each per-molecule sequence into one column per bit.
    tt_df = fingerprints.apply(pd.Series)
    # The prefix has no underscore, unlike the sibling builders; it is kept
    # as-is so existing consumers of these column names keep working.
    tt_df = tt_df.rename(columns=lambda x: 'TT' + str(x + 1))
    if write:
        import os

        csv_path = "./dataframes/topological_torsion.csv"
        # to_csv does not create missing directories; make sure the target
        # folder exists so the write does not fail on a fresh checkout.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        tt_df.to_csv(csv_path)
    return tt_df