-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathcanonicalize_products.py
34 lines (28 loc) · 1.14 KB
/
canonicalize_products.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pandas as pd
from rdkit import Chem
from tqdm import tqdm
# Script: rewrite the product side of every reaction in the 'rxn_smiles'
# column so that its atoms are written in canonical order while each atom
# keeps its original atom-map number.

# Paths are left blank on purpose; set both before running the script.
input_csv_file = ''
output_csv_file = ''


def _canonicalize_product(product):
    """Return `product` in canonical atom order with atom-map numbers kept.

    The map numbers are stripped, the molecule is round-tripped through a
    canonical SMILES, and the map numbers are then copied back onto the
    canonical molecule via a substructure match against the original.

    Args:
        product: atom-mapped product SMILES string.

    Returns:
        The re-ordered, still atom-mapped SMILES. The input string is
        returned unchanged when it cannot be parsed or when the canonical
        molecule cannot be matched back onto the original.
    """
    mol = Chem.MolFromSmiles(product)
    if mol is None:
        # RDKit signals an unparseable SMILES by returning None; keep the
        # row as-is instead of aborting the whole conversion.
        return product

    # original atom index -> original atom-map number
    index2mapnums = {
        atom.GetIdx(): atom.GetAtomMapNum() for atom in mol.GetAtoms()
    }

    # canonicalize the product smiles (on a copy, with map numbers cleared)
    mol_cano = Chem.RWMol(mol)
    for atom in mol_cano.GetAtoms():
        atom.SetAtomMapNum(0)
    smi_cano = Chem.MolToSmiles(mol_cano)
    mol_cano = Chem.MolFromSmiles(smi_cano)
    if mol_cano is None:
        return product

    matches = mol.GetSubstructMatches(mol_cano)
    if not matches:
        return product

    # Use the first match: each entry is looked up as an atom index of `mol`
    # and paired positionally with the atoms of `mol_cano`.
    for atom, mat in zip(mol_cano.GetAtoms(), matches[0]):
        atom.SetAtomMapNum(index2mapnums[mat])
    # canonical=False: write the re-mapped molecule without canonicalizing again.
    return Chem.MolToSmiles(mol_cano, canonical=False)


csv = pd.read_csv(input_csv_file)
reaction_list = csv['rxn_smiles']
# Each entry is expected to look like "<reactants>>><product>".
reactant_smiles_list = [rxn.split('>>')[0] for rxn in reaction_list]
product_smiles_list = [rxn.split('>>')[1] for rxn in reaction_list]

reaction_list_new = [
    reactants + '>>' + _canonicalize_product(product)
    for reactants, product in zip(reactant_smiles_list,
                                  tqdm(product_smiles_list))
]

csv['rxn_smiles'] = reaction_list_new
csv.to_csv(output_csv_file, index=False)