-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfeature.py
49 lines (39 loc) · 1.72 KB
/
feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from tqdm import tqdm
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
def mol2alt_sentence(mol, radius):
    """Return the Morgan (ECFP) substructure identifiers of a molecule as an 'alternating sentence'.

    Same as mol2sentence() except that it only returns the alternating
    sentence: for every atom, the identifiers found at radius 0, 1, ...,
    ``radius`` are emitted one after another before moving on to the next atom
    (atom 0 radius 0, atom 0 radius 1, ..., atom 1 radius 0, ...).

    NOTE: Words are ALWAYS ordered according to atom order in the input mol object.
    NOTE: An atom may have no identifier recorded at a given radius; such gaps
    are skipped, so the sentence can be shorter than
    ``number_of_atoms * (radius + 1)``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Molecule to fingerprint.
    radius : int or float
        Fingerprint radius. It is truncated with ``int()`` before use, so an
        integral float such as ``1.0`` is accepted.

    Returns
    -------
    list of str
        The alternating sentence: Morgan identifiers converted to strings.
    """
    # Use one integer radius everywhere: the fingerprint call and the radii
    # table below must agree, and the RDKit call needs an int.
    max_radius = int(radius)
    radii = list(range(max_radius + 1))

    # bitInfo is filled in place: {identifier: ((atom_idx, radius), ...)}.
    # The fingerprint object itself is not needed.
    info = {}
    AllChem.GetMorganFingerprint(mol, max_radius, bitInfo=info)

    # {atom index: {fp radius: identifier}}; None marks "nothing recorded".
    # Dict insertion order keeps the atoms in the order mol yields them.
    dict_atoms = {atom.GetIdx(): dict.fromkeys(radii) for atom in mol.GetAtoms()}
    for identifier, environments in info.items():
        for atom_idx, radius_at in environments:
            dict_atoms[atom_idx][radius_at] = identifier

    # Flatten atom by atom, radius by radius. Compare against None explicitly
    # so that a legitimate identifier equal to 0 is not dropped as falsy.
    return [
        str(per_radius[r])
        for per_radius in dict_atoms.values()
        for r in radii
        if per_radius[r] is not None
    ]