# -*- coding: utf-8 -*-
# Apo-Holo Juxtaposition - AHoJ
"""
@author: ChrisX

Given a protein structure (PDB code), with optionally specified chain(s), ligand(s) and their position,
find its equivalent apo and holo forms, for each chain separately.
When no ligands are specified, the program detects and considers all ligands in the query.

When looking for apo forms from a holo query, the user can specify:
  - minimum arguments: PDB code
  - maximum arguments: PDB code, chain(s), ligand(s), position
    (when a position is specified, only one ligand can be defined)
"""
VERSION = '0.5.0'
import shutil
import copy
import pathlib
from common import get_workdir, load_dict_binary, tmalign2, write_file, save_dict_binary
from residue_mapping import (map_pdb_resnum_to_uniprot, group_mapped_res_by_chain, examine_cndt_mapped_bs_res,
                             remove_negative_duplicate_cndt_bs_res_pos, evaluate_candidate_bs_rsds,
                             good_candidates_from_residue_mapping, bad_candidates_from_residue_mapping,
                             get_scores_from_residue_mapping)
import __main__
__main__.pymol_argv = ['pymol', '-qc'] # Quiet and no GUI
# import pymol.cmd as cmd
#import psico.fitting
#import psico.fullinit
import pymol2
import ast
import gzip
import os
import wget
import time
import argparse
import sys
from dataclasses import dataclass
import pandas as pd
from concurrent.futures import ThreadPoolExecutor  # multi-threading
import threading
#from concurrent.futures import ProcessPoolExecutor; import multiprocessing # multi-processing (doesn't work atm)
#import rich.traceback
#rich.traceback.install(show_locals=True, extra_lines=4, max_frames=1)
_global_lock = threading.Lock() # multi-threading
# global_lock = multiprocessing.Manager().Lock() # multi-processing (must be moved to main)
##########################################################################################################
# Define functions
##########################################################################################################
def download_mmCIF_gz2(pdb_id, pdb_dir):  # Version 2 of download mmCIF gz (without exception handling)
    # in pdb_dir mimic directory structure of FTP/rsynced whole PDB
    # e.g.: 4ZZW is in {pdb_dir}/zz/4zzw.cif.gz
    urlA = 'https://files.rcsb.org/download/'
    ext = '.cif.gz'
    url = urlA + pdb_id.upper() + ext
    pdb_id = pdb_id.lower()
    middle_bit = pdb_id[1:3]
    subdir = f'{pdb_dir}/{middle_bit}'
    file_path = f'{subdir}/{pdb_id}{ext}'
    if not os.path.isfile(file_path):
        pathlib.Path(subdir).mkdir(exist_ok=True)
        print(f'Downloading: {pdb_id + ext}')
        wget.download(url, subdir)
    return file_path
# TODO(rdk): solve problem where structure is downloaded at the same time by multiple processes and saved as '3hku.cif (1).gz'
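# Usage sketch (hypothetical workdir): download_mmCIF_gz2('4ZZW', workdir + '/structures')
# fetches https://files.rcsb.org/download/4ZZW.cif.gz once and returns
# '<workdir>/structures/zz/4zzw.cif.gz' on this and every subsequent call.
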
def download_mmCIF_lig(lig_id, destination_path):  # Download mmCIF for ligands (without exception handling)
    urlA = 'https://files.rcsb.org/ligands/view/'
    urlB = '.cif'
    url = urlA + lig_id.upper() + urlB
    file_path = destination_path + '/' + lig_id + urlB
    if not os.path.isfile(file_path):
        print('Downloading: ', lig_id + urlB)
        wget.download(url, destination_path)
    return file_path
def download_sifts_xml_gz(pdb_id, sifts_dir):
    urlA = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/split_xml/'
    ext = '.xml.gz'
    pdb_id = pdb_id.lower()  # lowercase first so the middle-bit subdir matches the FTP layout
    middle_bit = pdb_id[1:3]
    url = urlA + f'{middle_bit}/' + pdb_id + ext
    subdir = f'{sifts_dir}/{middle_bit}'
    file_path = f'{subdir}/{pdb_id}{ext}'
    if not os.path.isfile(file_path):
        pathlib.Path(subdir).mkdir(exist_ok=True)
        print(f'Downloading: {pdb_id + ext}')
        wget.download(url, subdir)
    return file_path
def merge_fragmented_unp_overlaps(fragmented_overlap_dict):
    merged_overlap_dict = dict()
    for key, values in fragmented_overlap_dict.items():  # key = candidate structchain, values = <query_structchain %UNP_overlap>
        if len(values) > 1:
            temp_dict = dict()
            for value in values:
                chain = value.split()[0]
                overlap = value.split()[1]
                if chain in temp_dict.keys():  # structchain already has a UNP fragment
                    new_overlap = float(temp_dict[chain]) + float(overlap)
                    temp_dict[chain] = str(round(new_overlap, 1))
                else:
                    temp_dict[chain] = overlap
            temp_list = list()
            for x, y in temp_dict.items():
                temp_list.append(x + ' ' + y)
            merged_overlap_dict[key] = temp_list
        else:
            merged_overlap_dict[key] = values
    return merged_overlap_dict
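# Example (hypothetical values): {'1abcB': ['3favA 40.0', '3favA 25.5', '3favB 90.1']}
# merges the two 3favA fragments into {'1abcB': ['3favA 65.5', '3favB 90.1']}.
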
def add_log(msg, log_file):  # Create error log
    msg = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + '\t' + msg
    with open(log_file, 'a') as file:
        file.write(msg + '\n')
def next_job(path_pattern):  # Create incrementing directory name for each job
    i = 1
    while os.path.exists(path_pattern % i):  # First do an exponential search
        i = i * 2
    # Result lies somewhere in the interval (i/2..i]
    # We call this interval (a..b] and narrow it down until a + 1 = b
    a, b = (i // 2, i)
    while a + 1 < b:
        c = (a + b) // 2  # interval midpoint
        a, b = (c, b) if os.path.exists(path_pattern % c) else (a, c)
    return path_pattern % b
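# Sketch: if results/job_1 .. results/job_5 exist, the exponential probe tests 1, 2, 4, 8
# (8 is free), then bisects (4..8] down to (5..6], so next_job('results/job_%s')
# returns 'results/job_6'.
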
def search_query_history(pathQRS, new_query_name, past_queries_filename):
    # Find past job under the same query name; if found, return the job id
    dict_q = dict()
    try:
        with open(pathQRS + '/' + past_queries_filename, 'r') as in_q:
            for line in in_q:
                dict_q[line.split('-')[0]] = line.split('-')[1][:-1]
        if new_query_name in dict_q.keys():
            return dict_q[new_query_name]
        else:
            return 0
    except Exception:
        return 0
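# Index format sketch (an assumption inferred from the parsing above): the past-queries file
# holds one '<query_name>-<job_id>' entry per line, e.g. '3fav all zn-job_4' (hypothetical entry).
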
def wrong_input_error():
    print('\n=== ERROR: Wrong input format ===')
    print('-use a whitespace character to separate input arguments\n-chains are case-sensitive')
    print('\nInput format structure:\n<pdb_id> <chain> <ligand/residue> <position> or\n<pdb_id> <chain> <ligandA,ligandB> or\n<pdb_id> <chainA,chainB> <ligand> or\n<pdb_id> <chain> or\n<pdb_id>')
    print('\nInput examples:\n"3fav A ZN 101"\n"3fav A,B ZN"\n"3fav ! ZN"\n"3fav * ZN"\n"3fav ALL ZN"\n"3fav"\n')
    sys.exit(1)  # exit with error

def wrong_input_error_b():
    print('\n=== ERROR: Wrong input ===')
    print('-Chain identifier has wrong case, or ligand is misspelled or does not exist in the structure')
    sys.exit(1)  # exit with error

def print_dict_readable(input_dict, header_msg):
    print(header_msg)
    for i, j in input_dict.items():
        print(i, j)
# Join list of ligand codes to a single string, sorted
# separator: '-'
# if the list is empty, return '-'
def join_ligands(ligands):
    if not ligands:
        return '-'
    return '-'.join(sorted(ligands))
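# e.g. join_ligands(['ZN', 'HEM', 'MG']) -> 'HEM-MG-ZN'; join_ligands([]) -> '-'
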
def is_not_blank(s):
    return bool(s and not s.isspace())
##########################################################################################################
@dataclass
class Query:
    struct: str
    chains: str   # maybe even change to list
    ligands: str  # maybe even change to list
    position: str
    autodetect_lig: bool
    water_as_ligand_auto: bool
    nonstd_rsds_as_lig_auto: bool
    d_aa_as_lig_auto: bool
@dataclass
class QueryResult:
    """ Result of a single query """
    result_dir: str
    num_apo_chains: int = 0
    num_holo_chains: int = 0
    error: str = None

    def __str__(self):
        return f"QueryResult[apo: {self.num_apo_chains}, holo: {self.num_holo_chains}, error: {self.error}, dir: {self.result_dir}]"
@dataclass
class CandidateChainResult:
    """ Result of candidate chain evaluation """
    # TODO remodel, include/remove attributes
    query_struct: str
    query_chain: str
    candidate_struct: str
    candidate_chain: str
    #query_lig_positions: dict
    passed: bool = False
    discard_reason: str = None
    tm_score: float = None
    tm_score_i: float = None
    rmsd: float = None
    #bndg_rsd_ratio: str
    #bndg_rsd_percent: str
    apo_holo_dict_instance: dict = None
    apo_holo_dict_H_instance: dict = None
    cndt_lig_positions_instance: dict = None
    binding_sites_instance: dict = None
@dataclass
class PrecompiledData:
    """
    Pre-compiled data needed for computation:
    UniProt-PDB mapping
    """
    dict_SIFTS: dict   # regular SIFTS dictionary
    dict_rSIFTS: dict  # reverse SIFTS (SPnum) dictionary
def verify_ligands(ligand_names, pathLIGS):
    for lig_id in ligand_names:
        lig_path = download_mmCIF_lig(lig_id, pathLIGS)
        lig_name, lig_syn = [], []  # defensive init in case the expected fields are missing
        with open(lig_path, 'r') as in_lig:
            for line in in_lig:
                if line.startswith('_chem_comp.name'):
                    lig_name = line.split()[1:]
                if line.startswith('_chem_comp.pdbx_synonyms'):
                    lig_syn = line.split()[1:]
                    print('Verifying ligand:\t', lig_id, '\t> ', lig_id, ' '.join(lig_name), ' '.join(lig_syn))
                    break
def parse_query(query: str, autodetect_lig: bool = False, water_as_ligand_auto: bool = False,
                nonstd_rsds_as_lig_auto: bool = False, d_aa_as_lig_auto: bool = False) -> Query:
    # Parse single-line input (line-by-line mode, 1 structure per line)
    # If only the first argument is specified (structure but no chains), consider all chains
    print(f"Parsing query '{query}'")
    query = query.strip()
    parts = query.split()
    struct = parts[0].lower()
    chains = 'ALL'
    ligands = None
    position = None

    # Define non-ligands (3-letter names of amino acids and h2o)
    std_rsds = "ALA CYS ASP GLU PHE GLY HIS ILE LYS LEU MET ASN PRO GLN ARG SER THR VAL TRP TYR".split()
    nonstd_rsds = "SEP TPO PSU MSE MSO 1MA 2MG 5MC 5MU 7MG H2U M2G OMC OMG PSU YG PYG PYL SEC PHA".split()
    d_rsds = "DAL DAR DSG DAS DCY DGN DGL DHI DIL DLE DLY MED DPN DPR DSN DTH DTR DTY DVA".split()

    if len(struct) != 4:
        raise ValueError(f"Invalid query '{query}': '{struct}' is not a valid PDB structure code")

    if len(parts) == 1:
        autodetect_lig = 1  # overrides cmd line param
    elif len(parts) == 2:
        chains = parts[1]
        autodetect_lig = 1
    elif len(parts) == 3:
        chains = parts[1]
        ligands = parts[2].upper()  # adjust case, ligands = upper
    # When a position is specified, there has to be a single ligand/residue specified
    elif len(parts) == 4 and len(parts[2]) < 4 and len(parts[2].split(',')) == 1:
        try:
            chains = parts[1]
            ligands = parts[2].upper()
            position = str(int(parts[3]))  # test if int
        except Exception:
            raise ValueError(f"Invalid query '{query}': wrong number of parts")
    else:
        raise ValueError(f"Invalid query '{query}': wrong number of parts")

    if chains == '*' or chains == 'all':
        chains = 'ALL'
    elif chains == '!' and ligands is not None:  # only process chain(s) that include query ligand(s)
        pass
    elif not all(chain.isalnum() for chain in chains.split(',')):  # chains must be alphanumeric
        raise ValueError(f"Invalid query '{query}': only alphanumeric characters allowed as chains or '!/*'")

    # If ligand is HOH or a std/non-std residue, make sure that:
    # i) there is just one specified (handled earlier)
    # ii) there is a fourth argument (position)
    if ligands is not None:
        for ligand in ligands.split(','):
            ligand = ligand.upper()
            if ligand in std_rsds and position is None:
                raise ValueError(f"Invalid query '{query}': specify index position of residue")
            elif ligand == 'HOH' and position is not None:
                water_as_ligand_auto = 1
            elif ligand == 'HOH' and position is None:
                raise ValueError(f"Invalid query '{query}': specify index position of HOH")
            elif ligand in nonstd_rsds:
                nonstd_rsds_as_lig_auto = 1
            elif ligand in d_rsds:
                d_aa_as_lig_auto = 1
            elif ligand == '*':  # override any other ligands in input and auto-detect all ligands
                ligands = None
                autodetect_lig = 1
                break

    return Query(struct=struct, chains=chains, ligands=ligands, position=position,
                 autodetect_lig=autodetect_lig, water_as_ligand_auto=water_as_ligand_auto,
                 nonstd_rsds_as_lig_auto=nonstd_rsds_as_lig_auto, d_aa_as_lig_auto=d_aa_as_lig_auto)
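# Worked examples (illustrative, not exhaustive):
#   parse_query('3fav A ZN 101') -> Query(struct='3fav', chains='A', ligands='ZN', position='101', ...)
#   parse_query('3fav')          -> Query(struct='3fav', chains='ALL', ligands=None, position=None, ...)
#                                   with autodetect_lig switched on
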
def load_precompiled_data_txt(workdir) -> PrecompiledData:
    pathSIFTS = workdir + '/SIFTS'
    fileSIFTSdict = pathSIFTS + '/' + "pdb_chain_uniprot_dict.txt"        # regular SIFTS file stripped
    fileRSIFTS = pathSIFTS + '/' + "pdb_chain_uniprot_REVERSE_SPnum.txt"  # pre-compiled rSIFTS file (reverse_SIFTS_SPnum.py)
    print('Loading SIFTS dictionary')  # Load normal SIFTS dictionary as dict_SIFTS
    with open(fileSIFTSdict, 'r') as input1:
        data = input1.read()
    dict_SIFTS = ast.literal_eval(data)
    print('Loading reverse SIFTS dictionary')  # Load reverse SIFTS (SPnum) dictionary as dict_rSIFTS
    with open(fileRSIFTS, 'r') as input2:
        data = input2.read()
    dict_rSIFTS = ast.literal_eval(data)
    return PrecompiledData(dict_SIFTS=dict_SIFTS, dict_rSIFTS=dict_rSIFTS)
def load_precompiled_data_bin(workdir) -> PrecompiledData:
    pathSIFTS = workdir + '/SIFTS'
    #fileSIFTSdict = pathSIFTS + '/pdb_chain_uniprot_dict.bin'
    #fileRSIFTS = pathSIFTS + '/pdb_chain_uniprot_REVERSE_SPnum.bin'
    # Observed residues only
    fileSIFTSdict = pathSIFTS + '/uniprot_segments_observed_dict.bin'
    fileRSIFTS = pathSIFTS + '/uniprot_segments_observed_REVERSE_SPnum.bin'
    print('Loading SIFTS dictionary')
    dict_SIFTS = load_dict_binary(fileSIFTSdict)
    print('Loading reverse SIFTS dictionary')
    dict_rSIFTS = load_dict_binary(fileRSIFTS)
    return PrecompiledData(dict_SIFTS=dict_SIFTS, dict_rSIFTS=dict_rSIFTS)

def load_precompiled_data(workdir) -> PrecompiledData:
    """Load pre-compiled data generated by prepare.py"""
    res = load_precompiled_data_bin(workdir)
    print('Done loading pre-compiled data\n')
    return res
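# Shape sketch (an assumption inferred from usage in process_query below): dict_SIFTS maps a
# structchain string such as '3favA' to its UniProt accession, e.g. dict_SIFTS['3favA'] -> 'P69905'
# (hypothetical value), while dict_rSIFTS holds the reverse UniProt -> structchain mapping
# with SwissProt residue numbering.
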
def compile_query_report(user_query, normalized_input, job_id, query_chain_states, query_lig_positions,
                         num_apo, num_holo, apo_dict, holo_dict, include_query_chains=1):
    query_report = list()      # Final report as a list that will be printed/saved
    results_dict_eq = dict()   # Results excluding query chains
    results_dict_iq = dict()   # Results including query chains
    no_pair_iq = list()        # Query chains without a single apo-holo pair
    paired_iq = list()         # Query chains with at least one apo-holo pair
    no_pair_eq = list()
    paired_eq = list()

    normalized_query = normalized_input.split(':')[0]
    normalized_settings = normalized_input.split(':')[1]
    normalized_settings = normalized_settings.split('-')[0]

    query_report.append('user_query\t\t' + user_query)
    query_report.append('normalized_query\t' + normalized_query)
    query_report.append('normalized_settings\t' + normalized_settings)
    query_report.append('job_id\t\t\t' + job_id)
    query_report.append('=== Results summary ===')
    query_report.append(f'query_chains_num\t{len(query_chain_states)}')
    query_report.append('query_chain_states\t' + str(query_chain_states))    # Write dict
    query_report.append('query_lig_positions\t' + str(query_lig_positions))  # Write dict
    query_report.append('total_apo_chains\t' + str(num_apo))
    query_report.append('total_holo_chains\t' + str(num_holo) + '\n')

    try:
        # Get number of results for each query chain (both excluding and including query chains)
        for key in query_chain_states:
            add_apo = 0
            add_holo = 0
            if query_chain_states[key] == 'apo':
                add_apo = 1
            elif query_chain_states[key] == 'holo':
                add_holo = 1

            # Apo
            if apo_dict.get(key) is not None:
                total_apo = len(apo_dict.get(key)) + add_apo
            else:
                total_apo = 0 + add_apo
            results_dict_eq[key + '_apo'] = total_apo - add_apo
            results_dict_iq[key + '_apo'] = total_apo

            # Holo
            if holo_dict.get(key) is not None:
                total_holo = len(holo_dict.get(key)) + add_holo
            else:
                total_holo = 0 + add_holo
            results_dict_eq[key + '_holo'] = total_holo - add_holo
            results_dict_iq[key + '_holo'] = total_holo

        # Find chains without any apo-holo pairs (iq dict)
        for key, value in results_dict_iq.items():
            if value == 0:
                no_pair_iq.append(key.split('_')[0])  # remove _apo/_holo suffix

        # Find chains without any apo-holo pairs (eq dict)
        for key, value in results_dict_eq.items():
            if value == 0:
                no_pair_eq.append(key.split('_')[0])  # remove _apo/_holo suffix

        # Subtract to find paired chains (for both iq and eq)
        for key in query_chain_states:
            if key not in no_pair_iq:
                paired_iq.append(key)
            if key not in no_pair_eq:
                paired_eq.append(key)

        # Remove possible duplicate chains
        no_pair_iq = list(dict.fromkeys(no_pair_iq))
        no_pair_eq = list(dict.fromkeys(no_pair_eq))

        # Including query chains
        paired_chains_iq = len(query_chain_states) - len(no_pair_iq)
        paired_chains_iq_pcnt = round(paired_chains_iq / len(query_chain_states) * 100)
        non_paired_chains_iq_pcnt = round(len(no_pair_iq) / len(query_chain_states) * 100)
        query_report.append('#paired_chains_iq\t' + str(paired_chains_iq))
        query_report.append('%paired_chains_iq\t' + str(paired_chains_iq_pcnt))
        query_report.append('paired_chains_iq\t' + ','.join(i.split('_')[0] for i in paired_iq))
        query_report.append('#non-paired_chains_iq\t' + str(len(no_pair_iq)))
        query_report.append('%non-paired_chains_iq\t' + str(non_paired_chains_iq_pcnt))
        query_report.append('non-paired_chains_iq\t' + ','.join(no_pair_iq))

        # Excluding query chains
        paired_chains_eq = len(query_chain_states) - len(no_pair_eq)
        paired_chains_eq_pcnt = round(paired_chains_eq / len(query_chain_states) * 100)
        non_paired_chains_eq_pcnt = round(len(no_pair_eq) / len(query_chain_states) * 100)
        query_report.append('\n#paired_chains_eq\t' + str(paired_chains_eq))
        query_report.append('%paired_chains_eq\t' + str(paired_chains_eq_pcnt))
        query_report.append('paired_chains_eq\t' + ','.join(i.split('_')[0] for i in paired_eq))
        query_report.append('#non-paired_chains_eq\t' + str(len(no_pair_eq)))
        query_report.append('%non-paired_chains_eq\t' + str(non_paired_chains_eq_pcnt))
        query_report.append('non-paired_chains_eq\t' + ','.join(no_pair_eq))

        # Append results as key-value pairs to report
        query_report.append('\n- Results excluding query chains -')  # Excluding query
        for key, value in results_dict_eq.items():
            query_report.append(key + '\t\t' + str(value))
        query_report.append('- Results including query chains -')    # Including query
        for key, value in results_dict_iq.items():
            query_report.append(key + '_iq\t\t' + str(value))
    except Exception as ex:
        print(f'\n*Exception while compiling query report: {ex}')
    return query_report
def write_query_report(query_report_list, report_filename, save_path):
    report_filepath = save_path + '/' + report_filename
    with open(report_filepath, 'w') as queryi_out:
        queryi_out.write('\n'.join(query_report_list))
def write_ligands_csv(query_lig_positions, cndt_lig_positions, path_results):  # Write dict(s) to csv
    # We don't want to edit the original dicts while computation is still running;
    # deepcopy because they are dicts of lists
    cndt_lig_positions = copy.deepcopy(cndt_lig_positions)
    query_lig_positions = copy.deepcopy(query_lig_positions)
    filename_csv = path_results + '/ligands.csv'
    header = "#chain, ligand_positions\n"
    with open(filename_csv, 'w') as csv_out:
        csv_out.write(header)
        # Write query ligands
        for key, values in query_lig_positions.items():
            for idx, item in enumerate(values):  # replace " " with "_"
                values[idx] = item.replace(" ", "_")
            csv_out.write("%s,%s\n" % (key, '-'.join(values)))
        # Write (holo or apo) candidate ligands
        for key, values in cndt_lig_positions.items():
            for idx, item in enumerate(values):  # replace " " with "_"
                values[idx] = item.replace(" ", "_")
            csv_out.write("%s,%s\n" % (key, '-'.join(values)))
def write_binding_sites_csv(binding_sites_dict, path_results):  # Write dict to csv
    # Deepcopy so we don't edit the original dict while computation is still running
    binding_sites_dict = copy.deepcopy(binding_sites_dict)
    filename_csv = path_results + '/binding_sites.csv'
    header = "chain,ligand,binding_residues\n"
    with open(filename_csv, 'w') as csv_out:
        csv_out.write(header)
        # Format dict values and write
        for key, values in binding_sites_dict.items():
            chain = key.split('.')[0]
            ligand = key.split('.')[1]
            csv_out.write("%s,%s,%s\n" % (chain, ligand, ' '.join(values)))
def sort_results_pd(dict_name, column_names, sort=1):  # Transfer results from dict to dataframe and sort them
    if len(dict_name) > 0:
        df = pd.DataFrame.from_dict(dict_name, orient='index').T.melt(var_name='query_chain', value_name='values').dropna(subset=['values'])
        df[column_names] = df['values'].str.split(' ', expand=True)
        df.drop('values', axis=1, inplace=True)
        # Convert selected columns to numeric (for correct sorting)
        df[[' %Mapped_bndg_rsds', ' %UniProt_overlap', ' Resolution', ' R-free']] = df[[' %Mapped_bndg_rsds', ' %UniProt_overlap', ' Resolution', ' R-free']].apply(pd.to_numeric, errors='coerce').fillna('-')
        if sort == 1:
            df.sort_values(by=['query_chain', ' %Mapped_bndg_rsds', ' %UniProt_overlap', ' Resolution', ' R-free'],
                           ascending=[True, False, False, True, True], inplace=True)
    else:
        column_names.insert(0, 'query_chain')
        df = pd.DataFrame(columns=column_names)
    return df
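# Input shape sketch (hypothetical values): dict_name = {'3favA': ['1abcA X-RAY 1.8 0.22 ...']},
# where each space-separated string follows column_names; melt/split expands each string into
# one dataframe row per candidate chain, keyed by its query_chain.
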
def write_results_apo_csv(apo_holo_dict, path_results):
    # Write CSV file
    filename_csv = path_results + '/results_apo.csv'
    header = "query_chain, apo_chain, Exp.method, Resolution, R-free, %UniProt_overlap, Mapped_bndg_rsds, %Mapped_bndg_rsds, RMSD, TM_score, iTM_score, ligands\n"
    with open(filename_csv, 'w') as csv_out:
        csv_out.write(header)
        for key, values in apo_holo_dict.items():
            for value in values:
                csv_out.write("%s,%s\n" % (key, ','.join(value.split())))

def write_results_holo_csv(apo_holo_dict_H, path_results):
    # Write CSV file
    filename_csv = path_results + '/results_holo.csv'
    header = "query_chain, holo_chain, Exp.method, Resolution, R-free, %UniProt_overlap, Mapped_bndg_rsds, %Mapped_bndg_rsds, RMSD, TM_score, iTM_score, ligands\n"
    with open(filename_csv, 'w') as csv_out:
        csv_out.write(header)
        for key, values in apo_holo_dict_H.items():
            for value in values:
                csv_out.write("%s,%s\n" % (key, ','.join(value.split())))
def write_discarded_chains(discarded_chains_dict, path_results):
    filename_txt = path_results + '/discarded_chains.txt'
    #header = 'Discarded_chain, Discard_reason'
    with open(filename_txt, 'w') as txt_out:
        #txt_out.write(header)
        for key, values in discarded_chains_dict.items():
            txt_out.write("%s\t%s\n" % (key, ' | '.join(values)))
def write_pymol_script(path_results):
    script_dir = os.path.dirname(os.path.realpath(__file__))
    pymol_file = "load_results_into_PyMOL.pml"
    # copyfile() is fast and does not copy permissions or metadata, which is what we want
    shutil.copyfile(f"{script_dir}/{pymol_file}", f"{path_results}/{pymol_file}")
def process_query(query, workdir, args, data: PrecompiledData = None) -> QueryResult:
    """
    Process a single-line query.
    :param query: single-line query with format "<pdb_id> <chains> <ligands>" (see README.md)
    :param workdir: global work directory
    :param args: all parsed cmd line args
    :param data: if not provided, will be loaded on the fly
    :return: QueryResult
    """
    ''' Test input (overrides argparse) '''
    #multiline_input = '3fav all zn\n1a73 a zn,MG,HEM\n5ok3 all tpo'
    #query = '1a0u'  # hem, big search
    #query = '1a73 a zn'  #',MG,HEM'
    #query = '5ok3 all tpo'  # phosphothreonine, no apos
    #query = '2ZB1 all gk4'
    #query = '7l1f all F86'  # too long
    #query = '1SI4 cyn'
    #query = '2v7c a'
    #query = '5gss all gsh'  # slow
    #query = '1jq8 so4'
    #query = '1l5h b CLF'
    #query = '1DB1 vdx'  # vitamin D3 study
    #query = '3IXJ all 586'  # beta-secretase 1 with inhibitor (cryptic?) # too long
    #query = '2jds all L20'  # cAMP-dependent protein kinase w inhibitor # 202 chains, 145 structs, long
    #query = '1pzo all cbt'  # TEM-1 beta-lactamase with core-disrupting inhibitor # 115 chains, 58 structs, longish
    #query = '1qsh d heg'  # long
    #query = '3N3I all roc'  # HIV mutations
    #query = '2whh all ppn'  # related to example above; multiple identical structchains in values of dict

    # Fast examples
    #query = '2v0v'  # Fully apo structure
    #query = '3CQV all hem'  # apo-haemoglobin study [OK]
    #query = '3fav all zn'  # [OK]
    #query = '1py2 d frh'  # 228 chains, 180 valid, long - run only on one chain [OK*]
    #query = '2hka all c3s'  # bovine NPC2 complex with cholesterol sulfate [OK]
    #query = '2v57 a,c prl'  # apo-holo SS changes in TetR-like transcriptional regulator LfrR in complex with proflavine [OK]
    # Init independent pymol instance
    pm = pymol2.PyMOL()
    pm.start()
    cmd = pm.cmd
    # Basic
    res_threshold = args.res_threshold
    include_nmr = args.include_nmr
    xray_only = args.xray_only
    lig_free_sites = args.lig_free_sites
    #autodetect_lig = args.autodetect_lig
    #reverse_search = args.reverse_search  # should be renamed to "start with apo" or "broad search"
    water_as_ligand_usr = args.water_as_ligand

    # Advanced
    overlap_threshold = args.overlap_threshold
    bndgrsds_threshold = args.bndgrsds_threshold
    lig_scan_radius = args.lig_scan_radius
    min_tmscore = args.min_tmscore
    nonstd_rsds_as_lig_usr = args.nonstd_rsds_as_lig
    d_aa_as_lig_usr = args.d_aa_as_lig

    # Experimental
    #beyond_hetatm = args.beyond_hetatm
    look_in_archive = args.look_in_archive

    # Internal
    apo_chain_limit = args.apo_chain_limit
    intrfc_lig_radius = args.intrfc_lig_radius
    hoh_scan_radius = args.hoh_scan_radius  # TODO replace with dynamic function for scan radius
    bndg_res_radius = args.bndg_res_radius

    # Saving
    save_apo = args.save_apo
    save_holo = args.save_holo
    #save_session = args.save_session
    #multisave = args.multisave

    # Adjust input, resolve conflicts
    autodetect_lig = 0  # default OFF
    #if reverse_search == 1:
    #    autodetect_lig = 1
    lig_scan_radius = str(lig_scan_radius)      # needs to be str
    intrfc_lig_radius = str(intrfc_lig_radius)  # needs to be str
    hoh_scan_radius = str(hoh_scan_radius)      # needs to be str
    bndg_res_radius = str(bndg_res_radius)      # needs to be str
    #cndtlig_scan_radius = lig_scan_radius  # why is this "not used", since it is required later on? -local vrbl
    #broad_search_mode = False  # previously called "reverse_mode"
    # Set directories, create job_id
    path_root = workdir
    pathSTRUCTS = path_root + '/structures'  # Directory with ALL pdb structures (used for fetch/download)
    pathLIGS = path_root + '/ligands'        # Directory with ALL pdb ligands (used for fetch/download)
    pathQRS = path_root + '/queries'         # Directory/index with parameters of previously run jobs
    pathXML = path_root + '/rsd_mappings'    # Directory with xml files for residue mappings

    # TODO make next job_id generation less clumsy
    global _global_lock
    if _global_lock is None:
        _global_lock = threading.Lock()  # to allow unit tests
    with _global_lock:
        generated_path_job_results = next_job(path_root + '/results/job_%s')
        if args.out_dir is not None:
            path_results = args.out_dir  # user-defined
        else:
            path_results = generated_path_job_results  # generated
        if not os.path.isdir(path_results):
            os.makedirs(path_results)  # must be created inside lock to ensure each next_job is unique
    print('Results directory:\t', path_results)

    path_results_structs = path_results + '/structure_files'  # Subdirectory to store mmCIF files
    if not os.path.isdir(path_results_structs):
        os.makedirs(path_results_structs)
    job_id = os.path.basename(os.path.normpath(path_results))

    if data is None:
        data = load_precompiled_data(workdir)
    dict_SIFTS = data.dict_SIFTS
    dict_rSIFTS = data.dict_rSIFTS
    # Get additional info
    # script_name = os.path.basename(__file__)
    # log_file_dnld = path_root + '/download_errors.log'
    print('PyMOL version: ', cmd.get_version()[0:3])

    # Create directories if they don't exist
    print('Setting up directories')
    if os.path.isdir(pathSTRUCTS):
        print(f'Structure directory:\t{pathSTRUCTS}')
    else:
        print(f'Creating structure directory:\t{pathSTRUCTS}')
        os.makedirs(pathSTRUCTS)
    if os.path.isdir(pathLIGS):
        print(f'Ligands directory:\t\t{pathLIGS}')
    else:
        print(f'Creating ligands directory:\t\t{pathLIGS}')
        os.makedirs(pathLIGS)
    if os.path.isdir(pathQRS):
        print(f'Queries directory:\t\t{pathQRS}')
    else:
        print(f'Creating queries directory:\t\t{pathQRS}')
        os.makedirs(pathQRS)
    if os.path.isdir(pathXML):
        print(f'XML file directory:\t\t{pathXML}')
    else:
        print(f'Creating XML file directory:\t\t{pathXML}')
        os.makedirs(pathXML)
    print('Done\n')
    # Parse input
    print('Parsing input')
    try:
        q = parse_query(query, autodetect_lig)  # don't pass the rest; use function defaults
    except ValueError as e:
        print(e)
        wrong_input_error()

    user_chains = q.chains
    struct = q.struct
    ligand_names = q.ligands
    position = q.position
    autodetect_lig = q.autodetect_lig
    water_as_ligand_auto = q.water_as_ligand_auto
    nonstd_rsds_as_lig_auto = q.nonstd_rsds_as_lig_auto
    d_aa_as_lig_auto = q.d_aa_as_lig_auto

    # Parse chains
    if user_chains != 'ALL' and user_chains != '!':
        user_chains = ''.join(user_chains)
        user_chains = user_chains.split(',')
        user_chains_bundle = '+'.join(user_chains)
        # Convert chains to structchain combos
        user_structchains = list()
        for user_chain in user_chains:
            user_structchain = struct.lower() + user_chain  #.upper()
            user_structchains.append(user_structchain)

    # Verify input structure here  # TODO move verifications into parse_query to fail fast
    try:
        struct_path = download_mmCIF_gz2(struct, pathSTRUCTS)
        print('Verifying structure:', struct, '\t> ', struct_path.split('/')[-1])
    except Exception:
        raise ValueError(f"Invalid PDB ID in query '{query}': use a valid 4-letter PDB code")

    # Verify ligand/residue names here
    if autodetect_lig == 0 or ligand_names is not None:
        try:
            verify_ligands(ligand_names.split(','), pathLIGS)
        except Exception as ex:
            print(ex)
            raise ValueError(f"Invalid ligands in query '{query}': use PDB ligand names")

    # Parse ligands
    if (autodetect_lig == 1 and ligand_names is not None) or autodetect_lig == 0:
        ligand_names = ''.join(ligand_names)
        ligand_names = ligand_names.split(',')
        ligand_names_bundle = '+'.join(ligand_names)
    # Print input info
    print('')
    print('Input structure:\t', struct)
    if user_chains == "!":
        print('Input chains:\t\t', 'LIG-BINDING ONLY')
    else:
        print('Input chains:\t\t', user_chains)
    if user_chains != 'ALL' and user_chains != '!':
        print('Input structchains:\t', user_structchains)
    if autodetect_lig == 1 and ligand_names is None:
        print('Input ligands:\t\t', 'auto-detect')
    elif autodetect_lig == 1 and ligand_names is not None:  # this scenario should no longer occur; could be removed
        print('Input ligands:\t\t', ligand_names, '+ auto-detect')
    else:
        print('Input ligands:\t\t', ligand_names)
    if position is not None:
        print('Input position:\t\t', position)
    print('Done')

    # Toggle "!" chain switch on = examine only chains with defined ligands
    only_lig_chains = 0
    if user_chains == "!":
        only_lig_chains = 1
    # Finished parsing query and configuring settings; store final settings and define non-ligands here
    # Merge user settings and auto settings from parse_query
    if water_as_ligand_auto == 1 or water_as_ligand_usr == 1:
        water_as_ligand = 1
    else:
        water_as_ligand = 0
    if nonstd_rsds_as_lig_auto == 1 or nonstd_rsds_as_lig_usr == 1:
        nonstd_rsds_as_lig = 1
    else:
        nonstd_rsds_as_lig = 0
    if d_aa_as_lig_auto == 1 or d_aa_as_lig_usr == 1:
        d_aa_as_lig = 1
    else:
        d_aa_as_lig = 0

    # Define non-ligands (3-letter names of amino acids and h2o)
    nolig_resn = "ALA CYS ASP GLU PHE GLY HIS ILE LYS LEU MET ASN PRO GLN ARG SER THR VAL TRP TYR".split()
    std_rsds = list(nolig_resn)
    if water_as_ligand == 0:
        nolig_resn.append('HOH')
    # Non-standard residues
    nonstd_rsds = "SEP TPO PSU MSE MSO 1MA 2MG 5MC 5MU 7MG H2U M2G OMC OMG PSU YG PYG PYL SEC PHA".split()
    if nonstd_rsds_as_lig == 0:
        nolig_resn.extend(nonstd_rsds)
    # D-amino acids
    d_aminoacids = "DAL DAR DSG DAS DCY DGN DGL DHI DIL DLE DLY MED DPN DPR DSN DTH DTR DTY DVA".split()
    if d_aa_as_lig == 0:
        nolig_resn.extend(d_aminoacids)

    # Pass final settings to a string
    settings_str = ('res' + str(res_threshold) + '_NMR' + str(include_nmr) + '_xrayonly' + str(xray_only)
                    + '_ligfree' + str(lig_free_sites) + '_autodtctlig' + str(autodetect_lig)
                    + '_h2olig' + str(water_as_ligand) + '_overlap' + str(overlap_threshold)
                    + '_ligrad' + str(lig_scan_radius) + '_tmscore' + str(min_tmscore)
                    + '_nonstdaas' + str(nonstd_rsds_as_lig) + '_daas' + str(d_aa_as_lig))
    ## Find Apo candidates (rSIFTS)
    # Find & VERIFY input chains by UniProt ID (if they don't exist in UniProt, we cannot process them)
    # Allow non-UniProt chains, because ligands can be assigned to non-protein chains
    print(f'\nFinding & verifying query chains "{user_chains}" by UniProt ID')
    discarded_chains = dict()  # Discarded chains (format: structchain + '\t' + discard_msg)
    usr_structchains_unverified = list()
    qr_uniprot_ids = dict()

    # Trace back "ALL" / "*" / "!" chains from the SIFTS file
    if user_chains == 'ALL' or user_chains == "!":
        user_chains = list()
        user_structchains = list()
        for key in dict_SIFTS:
            if key[:4] == struct:
                user_chains.append(key[4:])
                user_structchains.append(key)
                print(key, dict_SIFTS[key])
                qr_uniprot_ids[key] = dict_SIFTS[key]
    else:
        for user_structchain in user_structchains[:]:
            try:
                print(user_structchain, dict_SIFTS[user_structchain])
                qr_uniprot_ids[user_structchain] = dict_SIFTS[user_structchain]
            except KeyError:
                user_structchains.remove(user_structchain)
                usr_structchains_unverified.append(user_structchain)

    # Put the UniProt-verified chains into a list
    query_unp_chains = list()
    for user_structchain in user_structchains:
        usr_chain = user_structchain[4:]
        query_unp_chains.append(usr_chain)

    # Map ligands with non-protein chains to protein chains [not allowed in broad search]
    non_protein_lig_chains = dict()
    remap_to_UNP = False
    if len(usr_structchains_unverified) > 0 and ligand_names is not None:
        print(f'-ligand assigned non-protein chain(s): {usr_structchains_unverified}, attempting to map to protein chains')
        for unverified_structchain in usr_structchains_unverified[:]:
            if position is not None:
                non_protein_lig_expression = struct + ' and chain ' + unverified_structchain[4:] + ' and resn ' + ligand_names_bundle + ' and resi ' + str(position)
            else: