-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathcontact_pdb.py
executable file
·683 lines (603 loc) · 27 KB
/
contact_pdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
#!/usr/bin/env python
# 2016-01-28 Chengxin Zhang
# Usage text written to stderr when the script is started without arguments
# or without an input file. Wording fixes only: "tab-delimited", the "list"
# column header for the two-file mode, the missing "npy"/"lnat" choices and
# the description of -cutoff_long.
docstring='''
contact_pdb.py [options] pdb.pdb
Calculate residue contacts in single chain PDB file "pdb.pdb"
Options:
-cutoff=8 distance cutoff (in Angstrom) for contact
default value is 22 for -infmt=dist and 8 otherwise
-atom={CA,CB} calculate distance between "CA" for all residues, or "CA" for
gly and "CB" for other 19 amino acids
-outfmt={stat,list,dist,npy} output format:
"list": tab-delimited list listing residue index for contact pairs
"dist": tab-delimited list listing residue distances for all pairs
"stat": statistics on number of contacts at short/medm/long/all
range and protein length L
"npy": matrix for CNN training
-range={all,short,medium,long} sequence separation range x
"all": 1<=x
"short": 6<=x<12
"medium": 12<=x<24
"long": 24<=x (most useful)
(default): 6<=x
contact_pdb.py [options] pdb.pdb contact.map
Calculate accuracy of residue contacts map "contact.map" according to
single chain PDB file "pdb.pdb"
"contact.map" could be of NN-BAYES contact map format (resi1 resi2 p)
or CASP Residue-Residue Separation Distance Prediction Format
(resi1 resi2 dist_lower dist_upper p). See
http://predictioncenter.org/casproll/index.cgi?page=format#RR
Options:
-cutoff=8
-atom={CB,CA} atom with which contact is considered.
CB - CA for GLY, CB for other 19 AA
CA - CA for all 20 AA
-range={all,short,medium,long}
-outfmt={stat,lnat,list,dist} output format:
"list": list showing if predicted contact pairs are TRUE
#resi1 resi2 TRUE/FALSE p
"dist": the same as above but actual distance is also reported
#resi1 resi2 dist TRUE/FALSE p
"stat": statistics on accuracy (ACC): (short2 stands for short range
contact ACC for top L/2, L=protein length)
#short1 short2 short5 medm1 medm2 medm5 long1 long2 long5
all1 all2 all5
"lnat": statistics on accuracy (ACC): (contact ACC for top Lnative,
where Lnative is the number of native contacts)
#short medm long all
default value is "lnat" if -infmt=dist, and "stat" otherwise
-cutoff_all=0 # ignore contact prediction p<=0
-cutoff_short=0.5 # ignore short range contact prediction p<=0.5
-cutoff_medium=0.4 # ignore medium range contact prediction p<=0.4
-cutoff_long=0.3 # ignore long range contact prediction p<=0.3
-offset=0 add "offset" to residue index in predicted contact map
-infmt={rr,gremlin,pdb,dist} input format of contact map
rr - CASP RR, NeBcon, or mfDCA format
gremlin - matrix of confidence score
pdb - pdb coordinate file
dist - fitted distance prediction by ResTriplet2
'''
import sys,os
import re
import gzip
# see http://predictioncenter.org/casp12/doc/rr_help.html for contact range
# definition. Note that NeBcon/NN-BAYES uses a different definition for long
# range contact
short_range_def=6 # short_range_def <= separation < medm_range_def
medm_range_def=12 # medm_range_def <= separation < long_range_def
long_range_def=24 # long_range_def <= separation. 25 in NeBcon/NN-BAYES
def read_pseudo_contact_map(infile="model1.pdb",atom_sele="CB",
    cutoff=8, sep_range=str(short_range_def),offset=0):
    '''Treat the PDB model "infile" as a contact prediction.

    Residue pairs of the model closer than "cutoff" (and within "sep_range")
    are turned into pseudo confidence scores p=1-dist/cutoff, so that closer
    pairs score higher. "offset" is added to both residue indices.

    Return a list of (resi1,resi2,p) tuples sorted by decreasing p (ties are
    broken by decreasing residue indices). The list is empty if the model
    has no contact. "cutoff" must be non-zero.
    '''
    res_dist_list=calc_res_dist(infile,atom_sele)
    res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff)
    cutoff=float(cutoff)
    # score goes first so that a plain tuple sort ranks by confidence
    scored=[(1-dist/cutoff,i+offset,j+offset) for i,j,dist in res_con_list]
    scored.sort(reverse=True)
    # a real list (not a one-shot zip iterator) so that callers may iterate,
    # slice and measure it on both Python 2 and 3
    return [(i,j,p) for p,i,j in scored]
def read_distance_map(infile="model1.pdb",offset=0):
    '''Read a predicted distance map from the text file "infile".

    Each non-blank line must start with four whitespace separated fields:
    resi1 resi2 mu sigma (further fields are ignored). "offset" is added to
    both residue indices.

    Return a list of (resi1,resi2,mu,sigma) tuples in file order.
    '''
    dist_pred_list=[]
    # default text mode: the 'U' flag was removed in Python 3.11, and
    # splitlines() already copes with all newline conventions
    with open(infile) as fp:
        for line in fp.read().splitlines():
            fields=line.split()
            if not fields: # tolerate blank lines
                continue
            i,j,mu,sigma=fields[:4]
            dist_pred_list.append(
                (int(i)+offset,int(j)+offset,float(mu),float(sigma)))
    return dist_pred_list
def read_contact_map(infile="contact.map",
    cutoff_all=0,cutoff_short=0,cutoff_medium=0,cutoff_long=0,
    sep_range=str(short_range_def),offset=0,infmt="rr"):
    '''Read a predicted contact map and return a list of (resi1,resi2,cscore)
    tuples, where cscore is the confidence score of the predicted contact.

    infile - file name ("-" for stdin, "*.gz" is decompressed on the fly)
    infmt  - "rr": one residue pair per line, in one of the layouts
                 resi1 resi2 cscore
                 resi1 resi2 dist_lower dist_upper cscore
                 resi1 X resi2 X value cscore   (X: one capital letter)
             lines not matching any layout are skipped.
             anything else: whitespace separated score matrix; only the
             upper triangle is read, indices start from 1, and neither
             "offset" nor the cscore cutoffs are applied.
    sep_range - "all", "short", "medium", "long", or a minimum sequence
             separation given as an integer string
    cutoff_all, cutoff_short, cutoff_medium, cutoff_long - ("rr" only) drop
             pairs whose cscore is <= the cutoff of their separation range
    offset - ("rr" only) added to both residue indices
    '''
    def in_sep_range(separation):
        # True if the sequence separation is selected by sep_range
        if sep_range=="all":
            return True
        if sep_range=="short":
            return short_range_def<=separation<medm_range_def
        if sep_range=="medium":
            return medm_range_def<=separation<long_range_def
        if sep_range=="long":
            return long_range_def<=separation
        return int(sep_range)<=separation

    if infile=='-':
        text=sys.stdin.read()
    else:
        # binary mode works for plain and gzip files on Python 2 and 3;
        # 'rU' is rejected by Python 3
        opener=gzip.open if infile.endswith(".gz") else open
        with opener(infile,'rb') as fp:
            text=fp.read()
        if not isinstance(text,str): # bytes on Python 3
            text=text.decode()
    lines=text.strip().splitlines()

    res_pred_list=[]
    if infmt!="rr": # score matrix
        for i,line in enumerate(lines):
            for j,cscore in enumerate(line.split()):
                if j<=i or not in_sep_range(j-i):
                    continue
                res_pred_list.append((i+1,j+1,float(cscore)))
        return res_pred_list

    pattern=re.compile(
        r'(^\d+\s+\d+\s+\d+\s+\d+\s+[-+.e\d]+)|'
        r'(^\d+\s+\d+\s+[-+.e\d]+)|'
        r'(^\d+\s+[A-Z]\s+\d+\s+[A-Z]\s+[-+.e\d]+\s+[-+.e\d]+)')
    for line in lines:
        match=pattern.match(line.strip())
        if not match: # empty line, header or remark
            continue
        fields=[group for group in match.groups() if group][0].split()
        if len(fields)==6: # resi1 X resi2 X value cscore
            fields=[fields[0],fields[2],fields[5]]
        if not len(fields) in (3,5):
            continue
        resi_idx1=int(fields[0])+offset
        resi_idx2=int(fields[1])+offset
        cscore=float(fields[-1])
        seperation=abs(resi_idx1-resi_idx2)
        if not in_sep_range(seperation):
            continue
        # short range ends *before* medm_range_def; the original "<="
        # wrongly applied the short cutoff to separation==medm_range_def
        if cscore<=cutoff_all or \
            (cscore<=cutoff_short and seperation<medm_range_def) or \
            (cscore<=cutoff_medium and
                medm_range_def<=seperation<long_range_def) or \
            (cscore<=cutoff_long and long_range_def<=seperation):
            continue
        res_pred_list.append((resi_idx1,resi_idx2,cscore))
    return res_pred_list
def calc_res_dist(infile="pdb.pdb",atom_sele="CA"):
    '''Calculate residue distances of the first chain in the first model of
    PDB file "infile" ("-" for stdin, "*.gz" is decompressed on the fly).

    atom_sele - name of the atom representing each residue, e.g.
        "CA" for alpha carbon
        "CB" for beta carbon; residues without that atom (such as gly)
             fall back to their alpha carbon

    Return a list of (resi1,resi2,dist) tuples for all residue pairs with
    resi1<resi2, where dist is the euclidean distance in Angstrom. The list
    is empty if the file has no ATOM record or only one residue. Residues
    are keyed by residue sequence number only; insertion codes and alternate
    locations are not distinguished.
    '''
    if infile=='-':
        text=sys.stdin.read()
    else:
        # binary mode works for plain and gzip files on Python 2 and 3;
        # 'rU' is rejected by Python 3
        opener=gzip.open if infile.endswith(".gz") else open
        with opener(infile,'rb') as fp:
            text=fp.read()
        if not isinstance(text,str): # bytes on Python 3
            text=text.decode()
    struct=text.split("ENDMDL")[0] # first model only

    # PDB ATOM record columns (1-based):
    # 13 - 16  atom name
    # 22       chain identifier
    # 23 - 26  residue sequence number
    # 31 - 38, 39 - 46, 47 - 54  orthogonal x, y, z coordinates
    model=[r for r in struct.splitlines() if r.startswith("ATOM ")]
    if not model:
        return []
    chain_id=model[0][21] # first chain

    chain=dict() # key is residue sequence number, value is (x,y,z)
    for r in model:
        if r[21]!=chain_id:
            continue
        resSeq=int(r[22:26])
        name=r[12:16].strip()
        # the selected atom always wins; CA is only a placeholder used
        # while (or if) the selected atom has not been seen
        if name==atom_sele or (name=="CA" and not resSeq in chain):
            chain[resSeq]=(float(r[30:38]),float(r[38:46]),float(r[46:54]))

    residues=sorted(chain)
    res_dist_list=[]
    for i,idx1 in enumerate(residues):
        x1,y1,z1=chain[idx1]
        for idx2 in residues[i+1:]:
            x2,y2,z2=chain[idx2]
            dx=x1-x2
            dy=y1-y2
            dz=z1-z2
            res_dist_list.append((idx1,idx2,(dx*dx+dy*dy+dz*dz)**.5))
    return res_dist_list
def compare_res_contact(res_dist_list,res_pred_list,cutoff=8):
    '''Compare native residue distances "res_dist_list" (calculated from pdb)
    to predicted residue contacts "res_pred_list".

    res_dist_list - iterable of (resi1,resi2,dist)
    res_pred_list - iterable of (resi1,resi2,cscore)
    cutoff        - pairs with dist<cutoff are native contacts

    Only pairs present in both inputs with identical (resi1,resi2) order are
    compared. Return a list, sorted by decreasing cscore, of tuples
    (resi1,resi2,dist,contact,cscore), where contact is the string "TRUE" or
    "FALSE". The list is empty if the inputs share no residue pair.
    '''
    res_dist_dict=dict() # key is residue pair, value is distance
    for i,j,dist in res_dist_list:
        res_dist_dict[(i,j)]=dist
    res_pred_dict=dict() # key is residue pair, value is cscore
    for i,j,cscore in res_pred_list:
        res_pred_dict[(i,j)]=cscore

    ranked=[]
    for pair in set(res_dist_dict).intersection(res_pred_dict):
        dist=res_dist_dict[pair]
        # cscore goes first so that a plain tuple sort ranks by confidence
        ranked.append((res_pred_dict[pair],pair[0],pair[1],dist,
            str(dist<cutoff).upper()))
    ranked.sort(reverse=True)
    # built directly instead of via zip(*...), which cannot be unpacked when
    # there is no common pair
    return [(i,j,dist,contact,cscore) for cscore,i,j,dist,contact in ranked]
def compare_res_dist(res_dist_list,dist_pred_list,cutoff=8):
    '''Compare native residue distances "res_dist_list" (calculated from pdb)
    to predicted residue distances "dist_pred_list".

    res_dist_list  - iterable of (resi1,resi2,dist)
    dist_pred_list - iterable of (resi1,resi2,mu,sigma): predicted distance
                     and its predicted deviation
    cutoff         - unused; kept for interface compatibility

    Only pairs present in both inputs with identical (resi1,resi2) order are
    compared. Return a list, sorted by increasing sigma, of tuples
    (resi1,resi2,dist,mu,err,sigma), where err=abs(dist-mu). The list is
    empty if the inputs share no residue pair.
    '''
    res_dist_dict=dict() # key is residue pair, value is distance
    for i,j,dist in res_dist_list:
        res_dist_dict[(i,j)]=dist
    res_pred_dict=dict() # key is residue pair, value is (sigma,mu)
    for i,j,mu,sigma in dist_pred_list:
        res_pred_dict[(i,j)]=(sigma,mu)

    ranked=[]
    for pair in set(res_dist_dict).intersection(res_pred_dict):
        dist=res_dist_dict[pair]
        sigma,mu=res_pred_dict[pair]
        # sigma goes first so that a plain tuple sort ranks by deviation
        ranked.append((sigma,pair[0],pair[1],dist,mu,abs(dist-mu)))
    ranked.sort()
    # built directly instead of via zip(*...), which cannot be unpacked when
    # there is no common pair
    return [(i,j,dist,mu,err,sigma) for sigma,i,j,dist,mu,err in ranked]
def calc_lnat_acc_dist(cmp_list,con_num_dict,sep_range=str(short_range_def)):
    '''Calculate distance prediction error for the top Lnative predictions
    of each separation range.

    cmp_list     - output of "compare_res_dist": tuples of
                   (resi1,resi2,dist,mu,err,sigma), best prediction first
    con_num_dict - number of native contacts per range, with keys
                   "short", "medm", "long" and "all"
    sep_range    - "short", "medium" or "long" restricts the evaluation to
                   that range; any other value evaluates all ranges and "all"

    Return (ACC,coef_sigma,top_pred), three dicts keyed by range name:
    ACC is the root mean square of err, coef_sigma the mean of err/sigma
    (both 0 if nothing was selected), top_pred the selected predictions.
    '''
    # cmp_list is scanned and sliced several times, so a one-shot iterator
    # (e.g. zip on Python 3) must be materialized first
    cmp_list=list(cmp_list)

    def in_range(lower,upper=None):
        # predictions whose sequence separation x satisfies lower<=x<upper
        return [e for e in cmp_list if lower<=abs(e[0]-e[1]) and
            (upper is None or abs(e[0]-e[1])<upper)]

    top_pred=dict() # top short, medm, long, all prediction
    if not sep_range in ["medium","long"]:
        top_pred["short"]=in_range(short_range_def,medm_range_def
            )[:con_num_dict["short"]]
    if not sep_range in ["short","long"]:
        top_pred["medm"]=in_range(medm_range_def,long_range_def
            )[:con_num_dict["medm"]]
    if not sep_range in ["short","medium"]:
        top_pred["long"]=in_range(long_range_def)[:con_num_dict["long"]]
    if not sep_range in ["short","medium","long"]:
        top_pred["all"]=cmp_list[:con_num_dict["all"]]

    ACC=dict() # dRMSD
    coef_sigma=dict() # coefficient of sigma
    for key in top_pred:
        selected=top_pred[key]
        if selected:
            ACC[key]=(sum([e[4]*e[4] for e in selected])/len(selected))**.5
            coef_sigma[key]=sum([e[4]/e[5] for e in selected])/len(selected)
        else:
            ACC[key]=0
            coef_sigma[key]=0
    return ACC,coef_sigma,top_pred
def calc_lnat_acc_contact(cmp_list,con_num_dict,sep_range=str(short_range_def)):
    '''Calculate contact prediction accuracy for the top Lnative predictions
    of each separation range.

    cmp_list     - output of "compare_res_contact": tuples of
                   (resi1,resi2,dist,contact,cscore), best prediction first
    con_num_dict - number of native contacts per range, with keys
                   "short", "medm", "long" and "all"
    sep_range    - "short", "medium" or "long" restricts the evaluation to
                   that range; any other value evaluates all ranges and "all"

    Return (ACC,top_pred), two dicts keyed by range name: ACC is the number
    of correct ("TRUE") selected predictions divided by the number of native
    contacts of the range (0 if nothing was selected), top_pred the selected
    predictions.
    '''
    # cmp_list is scanned and sliced several times, so a one-shot iterator
    # (e.g. zip on Python 3) must be materialized first
    cmp_list=list(cmp_list)

    def in_range(lower,upper=None):
        # predictions whose sequence separation x satisfies lower<=x<upper
        return [e for e in cmp_list if lower<=abs(e[0]-e[1]) and
            (upper is None or abs(e[0]-e[1])<upper)]

    top_pred=dict() # top short, medm, long, all prediction
    if not sep_range in ["medium","long"]:
        top_pred["short"]=in_range(short_range_def,medm_range_def
            )[:con_num_dict["short"]]
    if not sep_range in ["short","long"]:
        top_pred["medm"]=in_range(medm_range_def,long_range_def
            )[:con_num_dict["medm"]]
    if not sep_range in ["short","medium"]:
        top_pred["long"]=in_range(long_range_def)[:con_num_dict["long"]]
    if not sep_range in ["short","medium","long"]:
        top_pred["all"]=cmp_list[:con_num_dict["all"]]

    ACC=dict() # accuracy
    for key in top_pred:
        ACC[key]=0
        if top_pred[key]:
            correct=len([e for e in top_pred[key] if e[3]=="TRUE"])
            ACC[key]=1.*correct/con_num_dict[key]
    return ACC,top_pred
def calc_acc_contact(cmp_list,L,sep_range=str(short_range_def)):
    '''Calculate contact prediction accuracy for the top L, L/2 and L/5
    predictions of each separation range.

    cmp_list  - output of "compare_res_contact": tuples of
                (resi1,resi2,dist,contact,cscore), best prediction first
    L         - length of the protein
    sep_range - "short", "medium" or "long" restricts the evaluation to
                that range; any other value evaluates all ranges and "all"

    Return (ACC,top_pred), two dicts keyed by range name followed by the
    divisor, e.g. "long2" for the top L/2 long range predictions: ACC is the
    number of correct ("TRUE") selected predictions divided by int(L/divisor)
    (0 if nothing was selected), top_pred the selected predictions.
    '''
    # cmp_list is scanned and sliced several times, so a one-shot iterator
    # (e.g. zip on Python 3) must be materialized first
    cmp_list=list(cmp_list)

    # (key prefix, lower bound, upper bound) of sequence separation;
    # None means unbounded
    range_list=[]
    if not sep_range in ["medium","long"]:
        range_list.append(("short",short_range_def,medm_range_def))
    if not sep_range in ["short","long"]:
        range_list.append(("medm",medm_range_def,long_range_def))
    if not sep_range in ["short","medium"]:
        range_list.append(("long",long_range_def,None))
    if not sep_range in ["short","medium","long"]:
        range_list.append(("all",None,None))

    top_pred=dict() # top L, L/2, L/5 prediction
    ACC=dict() # accuracy
    for prefix,lower,upper in range_list:
        candidates=[e for e in cmp_list if
            (lower is None or lower<=abs(e[0]-e[1])) and
            (upper is None or abs(e[0]-e[1])<upper)]
        for divisor in (1,2,5):
            key=prefix+str(divisor)
            top_num=int(L/float(divisor))
            top_pred[key]=candidates[:top_num]
            if top_pred[key]:
                # normalized by the requested number of predictions, even
                # if fewer predictions are available
                ACC[key]=1.*len([e for e in top_pred[key] if e[3]=="TRUE"]
                    )/top_num
            else:
                ACC[key]=0
    return ACC,top_pred
def calc_acc_dist(cmp_list,con_num_dict,sep_range=str(short_range_def)):
    '''Calculate distance prediction error for the top L, L/2 and L/5
    predictions of each separation range.

    cmp_list     - output of "compare_res_dist": tuples of
                   (resi1,resi2,dist,mu,err,sigma), best prediction first
    con_num_dict - either the protein length L itself (this is what the
                   command line driver passes) or a dict with key "L", such
                   as the one returned by "calc_contact_num"
    sep_range    - "short", "medium" or "long" restricts the evaluation to
                   that range; any other value evaluates all ranges and "all"

    Return (ACC,top_pred), two dicts keyed by range name followed by the
    divisor, e.g. "long2" for the top L/2 long range predictions: ACC is the
    root mean square of err (0 if nothing was selected), top_pred the
    selected predictions.
    '''
    # the original body read a global "L" that only exists when the file is
    # run as a script; take the length from the argument instead
    if isinstance(con_num_dict,dict):
        L=con_num_dict["L"]
    else:
        L=con_num_dict
    # cmp_list is scanned and sliced several times, so a one-shot iterator
    # (e.g. zip on Python 3) must be materialized first
    cmp_list=list(cmp_list)

    # (key prefix, lower bound, upper bound) of sequence separation;
    # None means unbounded
    range_list=[]
    if not sep_range in ["medium","long"]:
        range_list.append(("short",short_range_def,medm_range_def))
    if not sep_range in ["short","long"]:
        range_list.append(("medm",medm_range_def,long_range_def))
    if not sep_range in ["short","medium"]:
        range_list.append(("long",long_range_def,None))
    if not sep_range in ["short","medium","long"]:
        range_list.append(("all",None,None))

    top_pred=dict() # top L, L/2, L/5 prediction
    ACC=dict() # dRMSD
    for prefix,lower,upper in range_list:
        candidates=[e for e in cmp_list if
            (lower is None or lower<=abs(e[0]-e[1])) and
            (upper is None or abs(e[0]-e[1])<upper)][:L]
        for divisor in (1,2,5):
            key=prefix+str(divisor)
            # divisor 1 keeps the whole top-L list
            selected=candidates if divisor==1 else candidates[:int(L/divisor)]
            top_pred[key]=selected
            if selected:
                ACC[key]=(sum([e[4]*e[4] for e in selected]
                    )/len(selected))**.5
            else:
                ACC[key]=0
    return ACC,top_pred
def calc_contact_num(res_con_list,L):
    '''Count the contacts of "res_con_list" in each separation range.

    res_con_list - sized collection of (resi1,resi2,dist) tuples
    L            - protein length, copied into the result unchanged

    Return a dict with keys "short", "medm", "long" (contacts per sequence
    separation range), "all" (total number of contacts) and "L".
    '''
    total=len(res_con_list)
    n_short=0
    n_medm=0
    n_long=0
    for resi1,resi2,dist in res_con_list:
        separation=abs(resi2-resi1)
        # the three ranges are disjoint; closer pairs only count in "all"
        if long_range_def<=separation:
            n_long+=1
        elif medm_range_def<=separation:
            n_medm+=1
        elif short_range_def<=separation:
            n_short+=1
    return {"short":n_short,"medm":n_medm,"long":n_long,"all":total,"L":L}
def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8):
    '''Select residue contacts from "res_dist_list", the residue pair
    distances (resi1,resi2,dist) returned by calc_res_dist.

    cutoff - distance cutoff (in Angstrom) for contact, usu between 6 and 12;
             pairs with dist<cutoff are kept, 0 disables the distance filter
    sep_range - range of sequence separations x
        "all":     1<=x
        "short":   short_range_def <= x < medm_range_def
        "medium":  medm_range_def <= x < long_range_def
        "long":    long_range_def <= x (most useful)
        otherwise: int(sep_range) <= x

    Return the selected tuples as a list, in input order.
    '''
    cutoff=float(cutoff)
    if cutoff:
        close_pairs=[pair for pair in res_dist_list if pair[2]<cutoff]
    else:
        close_pairs=res_dist_list

    # predicate on the sequence separation of a pair
    if sep_range=="all":
        selected=lambda x: 1<=x
    elif sep_range=="short":
        selected=lambda x: short_range_def<=x<medm_range_def
    elif sep_range=="medium":
        selected=lambda x: medm_range_def<=x<long_range_def
    elif sep_range=="long":
        selected=lambda x: long_range_def<=x
    else:
        # converted per pair, so a bad value only fails if there are pairs
        selected=lambda x: int(sep_range)<=x
    return [pair for pair in close_pairs if selected(abs(pair[0]-pair[1]))]
if __name__=="__main__":
    # Command line driver. With one file: report native contacts of the PDB
    # structure. With two files: score the prediction (2nd file) against the
    # PDB structure (1st file).
    if len(sys.argv)<2:
        sys.stderr.write(docstring)
        sys.exit()

    atom_sele="CB"
    cutoff=0 # 0 = not set; replaced by a format dependent default below
    outfmt="" # "" = not set; replaced by a format dependent default below
    sep_range=str(short_range_def) # "6"
    cutoff_all   =0
    cutoff_short =0
    cutoff_medium=0
    cutoff_long  =0
    offset       =0
    infmt="rr"

    file_list=[]
    for arg in sys.argv[1:]:
        if arg.startswith("-cutoff="):
            cutoff=float(arg[len("-cutoff="):])
        elif arg.startswith("-atom="):
            atom_sele=arg[len("-atom="):]
        elif arg.startswith("-range="):
            sep_range=arg[len("-range="):]
            if sep_range=="medm":
                sep_range="medium"
        elif arg.startswith("-outfmt="):
            outfmt=arg[len("-outfmt="):]
        elif arg.startswith("-infmt="):
            infmt=arg[len("-infmt="):]
        elif arg.startswith("-cutoff_all="):
            cutoff_all=float(arg[len("-cutoff_all="):])
        elif arg.startswith("-cutoff_short="):
            cutoff_short=float(arg[len("-cutoff_short="):])
        elif arg.startswith("-cutoff_medium="):
            cutoff_medium=float(arg[len("-cutoff_medium="):])
        elif arg.startswith("-cutoff_long="):
            cutoff_long=float(arg[len("-cutoff_long="):])
        elif arg.startswith("-offset="):
            offset=int(arg[len("-offset="):])
        elif arg.startswith("-") and len(arg)>1:
            sys.stderr.write("ERROR! Unknown argument %s\n"%arg)
            sys.exit()
        else: # a lone "-" is a file name (stdin)
            file_list.append(arg)

    if not file_list:
        sys.stderr.write(docstring+"\nERROR! No PDB file")
        sys.exit()

    if outfmt=="npy":
        sep_range="all"

    if cutoff==0:
        if infmt=="dist":
            cutoff=22
        else:
            cutoff=8
    if outfmt=="":
        if infmt=="dist":
            outfmt="lnat"
        else:
            outfmt="stat"

    # materialized because both lists are traversed more than once below
    res_dist_list=list(calc_res_dist(file_list[0],atom_sele))
    res_con_list=list(calc_res_contact(res_dist_list,sep_range,cutoff))

    # protein length = span of residue numbers that take part in any pair
    resi_list=[resi for res_pair in res_dist_list for resi in res_pair[:2]]
    if not resi_list:
        sys.stderr.write("ERROR! Fewer than two residues in %s\n"%file_list[0])
        sys.exit(1)
    L=max(resi_list)-min(resi_list)+1

    # result keys reported for each -range value; any other value reports
    # every range
    range_key_dict={"short":["short"],"medium":["medm"],"long":["long"]}
    all_key_list=["short","medm","long","all"]

    if len(file_list)==1: # calculate residue contact
        if outfmt=="npy":
            import numpy as np
            tmap=np.zeros((L,L),dtype=np.float32
                )+np.diag(np.ones(L,dtype=np.float32))
        for res_pair in res_con_list:
            if outfmt.startswith("dist"):
                sys.stdout.write("%d\t%d\t%.1f\n"%(res_pair[0],res_pair[1],res_pair[2]))
            elif outfmt=="list":
                sys.stdout.write("%d\t%d\n"%(res_pair[0],res_pair[1]))
            elif outfmt=="npy":
                # NOTE(review): indexing by resi-1 assumes residue numbers
                # within 1..L - confirm for structures not numbered from 1
                tmap[res_pair[0]-1][res_pair[1]-1]= \
                tmap[res_pair[1]-1][res_pair[0]-1]=1
        if outfmt.startswith("stat") or outfmt.startswith("lnat"):
            con_num_dict=calc_contact_num(res_con_list,L)
            key_list=["short","medm","long","all","L"]
            sys.stderr.write('\t'.join(key_list)+'\n')
            sys.stdout.write('\t'.join([str(con_num_dict[key]
                ) for key in key_list])+'\n')
        elif outfmt=="npy":
            # np.save writes bytes; Python 3 exposes the binary stream of
            # stdout as sys.stdout.buffer
            np.save(getattr(sys.stdout,"buffer",sys.stdout),
                tmap.reshape(L*L))
        if not cutoff and outfmt=="list":
            sys.stderr.write("\nWARNING! cutoff not set\n\n")
    elif len(file_list)==2: # calculate contact prediction accuracy
        if infmt=="dist":
            dist_pred_list=read_distance_map(file_list[1],offset)
            cmp_list=list(compare_res_dist(res_dist_list,dist_pred_list))
        else:
            if infmt=="pdb":
                res_pred_list=read_pseudo_contact_map(file_list[1],atom_sele,
                    cutoff, sep_range, offset)
            else:
                res_pred_list=read_contact_map(file_list[1],
                    cutoff_all,cutoff_short,cutoff_medium,cutoff_long,
                    sep_range,offset,infmt)
            cmp_list=list(compare_res_contact(
                res_dist_list,res_pred_list,cutoff))

        if not outfmt.startswith("stat") and not outfmt.startswith("lnat"):
            for res_pair in cmp_list: #resi1,resi2,dist,contact,p
                if outfmt.startswith("dist"):
                    sys.stdout.write("%d\t%d\t%.1f\t%s\t%.3f\n"%(res_pair[0],
                        res_pair[1],res_pair[2],res_pair[3],res_pair[4]))
                elif outfmt=="list":
                    sys.stdout.write("%d\t%d\t%s\t%.3f\n"%(res_pair[0],
                        res_pair[1],res_pair[3],res_pair[4]))
        elif outfmt.startswith("lnat"):
            con_num_dict=calc_contact_num(res_con_list,L)
            key_list=range_key_dict.get(sep_range,all_key_list)
            if infmt!="dist":
                ACC,top_pred=calc_lnat_acc_contact(cmp_list,con_num_dict,sep_range)
                sys.stderr.write('\t'.join(key_list)+'\n')
                sys.stdout.write('\t'.join(['%.3f'%ACC[key] for key \
                    in key_list])+'\n')
            else:
                ACC,coef_sigma,top_pred=calc_lnat_acc_dist(cmp_list,con_num_dict,sep_range)
                sys.stderr.write('\t'.join(["stat"]+key_list)+'\n')
                sys.stdout.write('\t'.join(["drmsd"]+['%.3f'%ACC[key
                    ] for key in key_list])+'\n')
                sys.stdout.write('\t'.join(["scoef"]+['%.3f'%coef_sigma[key
                    ] for key in key_list])+'\n')
        elif outfmt=="stat":
            if infmt!="dist":
                ACC,top_pred=calc_acc_contact(cmp_list,L,sep_range)
            else:
                ACC,top_pred=calc_acc_dist(cmp_list,L,sep_range)
            # short1 short2 short5 medm1 ... : top L, L/2 and L/5 per range
            key_list=[key+divisor for key in range_key_dict.get(
                sep_range,all_key_list) for divisor in "125"]
            sys.stderr.write('\t'.join(key_list)+'\n')
            sys.stdout.write('\t'.join(['%.3f'%ACC[key] for key in key_list]
                )+'\n')
        if not cutoff:
            sys.stderr.write("\nWARNING! cutoff not set\n\n")
    else:
        sys.stderr.write(docstring+"ERROR! Too many arguments.\n")