-
Notifications
You must be signed in to change notification settings - Fork 1
/
genotype.py
1407 lines (1209 loc) · 49.3 KB
/
genotype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import copy
import helper
import subprocess
import statistics
import operator
import os
# Help text for the `genotype` sub-command: a short tool description followed
# by example command lines for WGS, targeted-sequencing (TS) and SNP-array
# inputs, plus the less common control-locus and imputation variants.
# NOTE(review): presumably handed to the argument parser as its description;
# the consumer is not visible in this part of the file -- confirm.
# The string carries an f-prefix but contains no replacement fields.
DESCRIPTION = f'''tool description:
call star alleles in target gene from genomic data
getting help:
stargazer.py genotype -h
main usages:
genotype with WGS data (or TS data)
stargazer.py genotype -o OUTPUT_PREFIX -d wgs (or -d ts) -t TARGET_GENE --vcf VCF -c CONTROL_GENE --gdf GDF
genotype with WGS-generated (or TS-generated) VCF only
stargazer.py genotype -o OUTPUT_PREFIX -d wgs (or -d ts) -t TARGET_GENE --vcf VCF
genotype with SNP array-generated VCF only
stargazer.py genotype -o OUTPUT_PREFIX -d chip -t TARGET_GENE --vcf VCF
other usages:
genotype with WGS data (or TS data), using the CNSR as the control locus
stargazer.py genotype -o OUTPUT_PREFIX -d wgs (or -d ts) -t TARGET_GENE --vcf VCF --gdf GDF --control_type cnsr
genotype with WGS data (or TS data), using a custom region as the control locus
stargazer.py genotype -o OUTPUT_PREFIX -d wgs (or -d ts) -t TARGET_GENE --vcf VCF --gdf GDF --control_type custom --region REGION
genotype with TS data, using one or more known SV-free samples as the control set for inter-sample normalization of read depth
stargazer.py genotype -o OUTPUT_PREFIX -d ts -t TARGET_GENE --vcf VCF -c CONTROL_GENE --gdf GDF --sample_list [SAMPLE [SAMPLE ...]]
genotype with SNP array-generated VCF only, using the Beagle program's imputation algorithm
stargazer.py genotype -o OUTPUT_PREFIX -d chip -t TARGET_GENE --vcf VCF --impute
'''
def cyp2a6(sample):
    """Run the CYP2A6 structural-variant callers on ``sample`` in priority order.

    Every caller returns immediately once ``sample.gt`` is set or when the
    sample's SV call does not match, so the order of the tables below is
    significant: the first rule that fits decides the genotype.
    """
    single_sv_alleles = (
        ("gc_e1e4", "*34"),
        ("gc_e1e2", "*12"),
    )
    tandem_alleles = (
        ("dup2", "*2", "*S1"),
        ("dup2", "*1", "*S1"),
        ("gc_e9", "*1", "*S2"),
        ("dup7", "*1", "*S3"),
        ("dup7b", "*1", "*S6"),
    )
    for sv_name, star_name in single_sv_alleles:
        call_sv1(sample, sv_name, star_name)
    for sv_name, first, second in tandem_alleles:
        call_tandem(sample, sv_name, first, second)
    # Fall back to samples carrying an SV on both haplotypes.
    cyp2a6_svcomb(sample)
def cyp2b6(sample):
    """Call the CYP2B6 allele *29 for a sample whose only SV is ``gc_i4e9``."""
    sv_name, star_name = "gc_i4e9", "*29"
    call_sv1(sample, sv_name, star_name)
def cyp2d6(sample):
    """Run the CYP2D6 structural-variant callers on ``sample`` in priority order.

    Every caller returns immediately once ``sample.gt`` is set or when the
    sample's SV call does not match, so rule order matters.
    """
    # The only tandem whose candidate list is re-sorted by rank afterwards.
    call_tandem(sample, 'del1', '*S2', '*1', ordered=True)
    tandem_alleles = (
        ('gc_i1e9_5', '*68x5', '*4'),
        ('gc_i1e9', '*68', '*4'),
        ('gc_i1e9', '*S1', '*1'),
        ('gc_e9', '*4N', '*4'),
        ('gc_e9_3', '*36x3', '*10'),
        ('gc_e9_7', '*36x7', '*10'),
        ('gc_e9', '*36', '*10'),
        ('gc_e9', '*83', '*2'),
        ('gc_7to6_i4', '*13A', '*2'),
    )
    for sv_name, first, second in tandem_alleles:
        call_tandem(sample, sv_name, first, second)
    single_sv_alleles = (
        ('gc_e1e7', '*13C'),
        ('gc_7to6_i1', '*13B'),
    )
    for sv_name, star_name in single_sv_alleles:
        call_sv1(sample, sv_name, star_name)
    # Fall back to samples carrying an SV on both haplotypes.
    cyp2d6_svcomb(sample)
def cyp2e1(sample):
    """Call the CYP2E1 allele *S1 for a sample whose only SV is ``dup_e7e9``."""
    sv_name, star_name = "dup_e7e9", "*S1"
    call_sv1(sample, sv_name, star_name)
def gstm1(sample):
    """Genotype GSTM1 structural variation; all work is done by ``gstm1_svcomb``."""
    return gstm1_svcomb(sample)
def gstt1(sample):
    """Genotype GSTT1 structural variation; all work is done by ``gstt1_svcomb``."""
    return gstt1_svcomb(sample)
def slc22a2(sample):
    """Call the SLC22A2 SV alleles (*S1 for ``del_i9``, *S2 for ``del_e11``)."""
    for sv_name, star_name in (("del_i9", "*S1"), ("del_e11", "*S2")):
        call_sv1(sample, sv_name, star_name)
def slco1b1(sample):
    """Call the SLCO1B1 allele *S3 for a sample whose only SV is ``dup1``."""
    sv_name, star_name = "dup1", "*S3"
    call_sv1(sample, sv_name, star_name)
def ugt1a4(sample):
    """Call the UGT1A4 SV alleles (*S1 for ``del_i1``, *S2 for ``del2``)."""
    for sv_name, star_name in (('del_i1', '*S1'), ('del2', '*S2')):
        call_sv1(sample, sv_name, star_name)
def ugt2b15(sample):
    """Call the UGT2B15 allele *S1 for a sample whose only SV is ``del_i3e6``."""
    sv_name, star_name = "del_i3e6", "*S1"
    call_sv1(sample, sv_name, star_name)
def ugt2b17(sample):
    """Genotype UGT2B17 structural variation; all work is done by ``ugt2b17_svcomb``."""
    return ugt2b17_svcomb(sample)
def new_tandem(sv, star_names):
    """Build a synthetic Star describing several alleles arranged in tandem.

    Args:
        sv (str): Structural-variant label stored on the new Star.
        star_names (list of str): Keys into ``args_.star_dict``, in tandem order.

    Returns:
        helper.Star: ``name`` is the member names joined with '+', ``score``
        is the sum of the member scores (or 'unknown' unless every member
        score is a float), and ``core`` is a deep copy of the de-duplicated
        union of the members' core variants.
    """
    members = [args_.star_dict[name] for name in star_names]
    if all(isinstance(member.score, float) for member in members):
        score = sum(member.score for member in members)
    else:
        score = 'unknown'
    core = list({variant for member in members for variant in member.core})
    tandem = helper.Star()
    tandem.name = '+'.join(star_names)
    tandem.score = score
    tandem.core = copy.deepcopy(core)
    tandem.sv = sv
    return tandem
def new_dup(sX, cnv):
    """Build a synthetic Star for ``sX`` present in multiple copies.

    Args:
        sX (Star): Allele being duplicated.
        cnv (str): CNV label of the form 'cnv<N>'; N is the copy count.

    Returns:
        helper.Star: named '<sX.name>x<N>', scored N times ``sX.score``
        (or 'unknown' when that score is a string), with a deep copy of
        ``sX.core`` and ``cnv`` as its SV label.
    """
    copies = int(cnv.replace('cnv', ''))
    if isinstance(sX.score, str):
        score = 'unknown'
    else:
        score = sX.score * copies
    dup = helper.Star()
    dup.name = sX.name + 'x' + str(copies)
    dup.score = score
    dup.core = copy.deepcopy(sX.core)
    dup.sv = cnv
    return dup
def remove_select(hap, stars):
    '''
    Remove, in place, every Star in ``hap`` whose name matches one in ``stars``.

    Args:
        hap (list of Stars): list to prune; the same list object is kept
        stars (list of Stars): alleles whose names select what is removed
    '''
    # Collect the names once instead of rebuilding the list per element.
    unwanted = {star.name for star in stars}
    # Slice assignment keeps the caller's list object (in-place contract).
    hap[:] = [star for star in hap if star.name not in unwanted]
def remove_sv(hap, l=None):
    '''
    Remove, in place, every SV-carrying Star from ``hap``.

    Args:
        hap (list of Stars): list to prune; the same list object is kept
        l (list of str, optional): names of SV-carrying Stars to keep

    A Star counts as SV-carrying when its ``sv`` attribute is truthy.
    '''
    # None replaces the former mutable default ([]) shared across calls.
    keep = () if l is None else l
    hap[:] = [star for star in hap if not star.sv or star.name in keep]
def which_has(sample, stars):
    '''
    Report which haplotype(s) list all of ``stars`` among their candidates.

    Args:
        sample (Sample)
        stars (list of str): star-allele names that must all be present

    Returns:
        int: 0 = neither haplotype, 1 = first only, 2 = second only, 3 = both
    '''
    wanted = set(stars)
    in_first = wanted.issubset(star.name for star in sample.hap[0].cand)
    in_second = wanted.issubset(star.name for star in sample.hap[1].cand)
    # Bit 0 encodes the first haplotype, bit 1 the second.
    return int(in_first) + 2 * int(in_second)
def which_severe(sample):
    '''
    Report which haplotype's leading non-SV candidate has the lowest rank.

    Works on deep copies of the candidate lists, so ``sample`` is not modified.

    Args:
        sample (Sample)

    Returns:
        int: 3 when both leading candidates share a name, 1 when the first
        haplotype's candidate sorts first by ``rank``, otherwise 2
    '''
    first = copy.deepcopy(sample.hap[0].cand)
    second = copy.deepcopy(sample.hap[1].cand)
    for candidates in (first, second):
        remove_sv(candidates)
    lead1, lead2 = first[0], second[0]
    if lead1.name == lead2.name:
        return 3
    # min() returns the first of equally ranked items, like a stable sort.
    lowest = min((lead1, lead2), key=lambda star: star.rank)
    return 1 if lowest.name == lead1.name else 2
def call_sv1(sample, sv, x):
    '''
    Call the final genotype of a sample that carries exactly one SV.

    Does nothing when the sample is already genotyped or its SV call is not
    ``["no_sv", sv]``. If the SV allele has core variants, they must all be
    observed on a haplotype for it to host the allele. When both haplotypes
    qualify, the leading non-SV candidates decide which one keeps the allele.

    Args:
        sample (Sample)
        sv (str): name of the structural variant
        x (str): name of the star allele with the SV
    '''
    if sample.gt or sample.sv != ["no_sv", sv]:
        return
    core = args_.star_dict[x].core
    if core:
        required = set(core)
        on_first = required.issubset(sample.hap[0].obs)
        on_second = required.issubset(sample.hap[1].obs)
    else:
        on_first = on_second = True
    if not (on_first or on_second):
        return
    # i = haplotype stripped of all SV alleles, j = haplotype that keeps x.
    if on_first and on_second:
        first = copy.deepcopy(sample.hap[0].cand)
        second = copy.deepcopy(sample.hap[1].cand)
        remove_sv(first)
        remove_sv(second)
        lead1, lead2 = first[0], second[0]
        if lead1.name == lead2.name:
            i, j = 0, 1
        else:
            lowest = min((lead1, lead2), key=lambda star: star.rank)
            i, j = (0, 1) if lowest.name == lead1.name else (1, 0)
    elif on_first:
        i, j = 1, 0
    else:
        i, j = 0, 1
    remove_sv(sample.hap[i].cand)
    remove_sv(sample.hap[j].cand, [x])
    sample.gt = True
def call_tandem(sample, sv, x, y, ordered = False):
    """
    Call a tandem allele made of two gene copies (e.g., CYP2D6*36+*10).

    Does nothing when the sample is already genotyped, when its SV call is
    not ``["no_sv", sv]``, or when no haplotype lists both alleles as
    candidates. The call is also abandoned when a core variant shared by both
    alleles has an allele fraction below 0.4 on the chosen haplotype.

    Args:
        sample (Sample)
        sv (str): name of the structural variant
        x (str): name of the 1st star allele in the tandem (e.g., '*36')
        y (str): name of the 2nd star allele in the tandem (e.g., '*10')
        ordered (bool): re-sort the chosen haplotype's candidates by rank
    """
    if sample.gt or sample.sv != ["no_sv", sv]:
        return
    pair = {x, y}
    in_first = pair.issubset(star.name for star in sample.hap[0].cand)
    in_second = pair.issubset(star.name for star in sample.hap[1].cand)
    if not (in_first or in_second):
        return
    # i = haplotype that receives the tandem, j = the other haplotype.
    if in_first and in_second:
        first = copy.deepcopy(sample.hap[0].cand)
        second = copy.deepcopy(sample.hap[1].cand)
        remove_sv(first)
        remove_sv(second)
        lead1, lead2 = first[0], second[0]
        if lead1 == lead2:
            i, j = 1, 0
        else:
            lowest = min((lead1, lead2), key=lambda star: star.rank)
            i, j = (1, 0) if lowest == lead1 else (0, 1)
    elif in_first:
        i, j = 0, 1
    else:
        i, j = 1, 0
    sX = args_.star_dict[x]
    sY = args_.star_dict[y]
    # Core variants shared by both alleles must be well supported (AF >= 0.4).
    shared = [snp for snp in sX.core if snp in sY.core]
    for snp in shared:
        observed = [obs for obs in sample.hap[i].obs if obs == snp]
        if observed[0].af < 0.4:
            return
    tandem = new_tandem(sv, [sX.name, sY.name])
    target = sample.hap[i].cand
    target.insert(0, tandem)
    remove_select(target, [sX, sY])
    remove_sv(target, [tandem.name])
    remove_sv(sample.hap[j].cand)
    if ordered:
        target.sort(key=lambda star: star.rank)
    sample.gt = True
def call_cnv3(sample):
    """Decide which haplotype carries the duplication in a three-copy sample.

    Only acts when the sample is not yet genotyped and its SV call is
    ``["no_sv", "cnv2"]``. When both leading candidates share a name the
    duplication is put on the first haplotype. Otherwise the choice is based
    on the gene-wide and main-allele mean allele fractions and on the fitted
    allele fractions for three copies. The sample may be left uncalled when
    the evidence does not match an expected pattern.
    """
    if sample.gt or sample.sv != ["no_sv", 'cnv2']:
        return

    def duplicate(index):
        # Mark haplotype ``index`` as carrying two copies and finish the call.
        sample.hap[index].add_dup(2)
        sample.gt = True

    hap1, hap2 = sample.hap[0], sample.hap[1]
    # Simplest case (e.g., *2/*2x2)
    if hap1.cand[0].name == hap2.cand[0].name:
        duplicate(0)
        return
    start = helper.get_hg19_start(args_.target_gene)
    end = helper.get_hg19_end(args_.target_gene)
    # -1 marks a missing mean; it is replaced by the complement of the other.
    gene1 = hap1.af_mean_gene(start, end)
    gene2 = hap2.af_mean_gene(start, end)
    if gene1 == -1:
        gene1 = 1 - gene2
    if gene2 == -1:
        gene2 = 1 - gene1
    diff_gene = gene2 - gene1
    main1 = hap1.af_mean_main
    main2 = hap2.af_mean_main
    if main1 == -1:
        main1 = 1 - main2
    if main2 == -1:
        main2 = 1 - main1
    diff_main = main2 - main1
    fit1, _ = hap1.fit_data(3, start, end)
    fit2, _ = hap2.fit_data(3, start, end)
    means = [round(fit1, 2), round(fit2, 2)]
    if means == [0.33, 0.67]:
        by_fit = 1
    elif means == [0.67, 0.33]:
        by_fit = 0
    else:
        by_fit = None
    # Both differences are zero, or they point in the same direction.
    agree = (diff_gene == 0 and diff_main == 0) or diff_gene * diff_main > 0
    # NOTE(review): branch nesting reconstructed from de-indented source --
    # confirm the main-allele rule applies only when the two differences disagree.
    if not agree and abs(diff_main) > abs(diff_gene):
        duplicate(1 if main2 > main1 else 0)
    elif by_fit is not None:
        duplicate(by_fit)
def call_cnv_plus(sample):
    '''
    Call a final genotype for a sample with more than three gene copies.

    Does nothing when the sample is already genotyped, when its second SV
    call is not a CNV, when the SV combination is not "no_sv + CNV" or
    "CNV + CNV", or when the total copy number is below four.

    The total copy number is the CNV count plus one for a "no_sv" partner,
    or the sum of both CNV counts. With an empty allele-fraction profile all
    extra copies go to the first haplotype. Otherwise ``fit_data`` decides
    how the copies are split between the haplotypes.

    Args:
        sample (Sample)
    '''
    if sample.gt:
        return
    second_sv = sample.sv[1]
    if 'cnv' not in second_sv:
        return
    first_sv = sample.sv[0]
    if first_sv == 'no_sv' and second_sv in ('cnv0', 'cnv2'):
        return
    if first_sv == 'no_sv':
        total_cn = int(second_sv.replace('cnv', '')) + 1
    elif 'cnv' in first_sv:
        total_cn = int(first_sv.replace('cnv', '')) + int(second_sv.replace('cnv', ''))
    else:
        # Another SV type paired with a CNV is not handled here; previously
        # this fell through with total_cn unbound (UnboundLocalError).
        return
    if total_cn < 4:
        return
    # allele fraction profile is not informative -- i.e. it's empty
    # af_mean_gene is a method; it must be called before comparing with -1
    # (the former comparison of the bound method itself was always False).
    af_mean = sample.hap[0].af_mean_gene(helper.get_hg19_start(args_.target_gene), helper.get_hg19_end(args_.target_gene))
    if af_mean == -1:
        sample.hap[0].add_dup(total_cn - 1)
        sample.gt = True
        return
    _, fit_cn = sample.hap[0].fit_data(total_cn, int(args_.gene_dict[args_.target_gene]['hg19_start']), int(args_.gene_dict[args_.target_gene]['hg19_end']))
    sample.hap[0].add_dup(fit_cn)
    sample.hap[1].add_dup(total_cn - fit_cn)
    sample.gt = True
def cyp2a6_svcomb(sample):
    """Genotype a CYP2A6 sample from its pair of SV calls alone.

    Each entry of ``sample.sv`` is translated into the star allele it implies.
    When exactly two alleles result, they become the only candidates of the
    two haplotypes and the sample is marked as genotyped.
    """
    if sample.gt:
        return
    plain_alleles = {'cnv0': '*4', 'gc_e1e2': '*12', 'gc_e1e4': '*34'}
    tandem_members = {
        'dup2': ['*1', '*S1'],
        'gc_e9': ['*1', '*S2'],
        'dup7': ['*1', '*S3'],
        'dup7x2': ['*1', '*S3', '*S3'],
        'dup7b': ['*1', '*S6'],
    }
    gt = []
    for sv in sample.sv:
        if sv in plain_alleles:
            gt.append(args_.star_dict[plain_alleles[sv]])
        elif sv == 'cnv2':
            gt.append(new_dup(args_.star_dict['*1'], sv))
        elif sv in tandem_members:
            gt.append(new_tandem(sv, tandem_members[sv]))
    if len(gt) != 2:
        return
    sample.hap[0].cand = [gt[0]]
    sample.hap[1].cand = [gt[1]]
    sample.gt = True
def cyp2d6_svcomb(sample):
    """Genotype a CYP2D6 sample that carries a combination of SVs.

    Each entry of ``sample.sv`` is first translated into the allele it
    implies. Tandem alleles are accepted only when ``which_has`` finds both
    members among a haplotype's candidates. If the sample also has a
    non-deletion CNV call, the partner allele is completed by
    ``svcomb_tandem_cnv`` or ``svcomb_sv1_cnv``. When exactly two alleles
    result, they become the only candidates of the two haplotypes and the
    sample is marked as genotyped.
    """
    if sample.gt:
        return
    single_alleles = {'cnv0': '*5', 'gc_7to6_i1': '*13B', 'gc_e1e7': '*13C'}
    # (SV name, tandem members), tried in this order; first match wins.
    # Fix: '*13A' previously lacked its asterisk, so *13A+*2 was never built.
    tandem_rules = (
        ('gc_i1e9', ['*68', '*4']),
        ('gc_i1e9', ['*S1', '*1']),
        ('gc_e9', ['*4N', '*4']),
        ('gc_e9', ['*36', '*10']),
        ('gc_e9', ['*83', '*2']),
        ('gc_7to6_i4', ['*13A', '*2']),
    )
    gt = []
    for sv in sample.sv:
        if sv in single_alleles:
            gt.append(args_.star_dict[single_alleles[sv]])
            continue
        for rule_sv, members in tandem_rules:
            if sv == rule_sv and which_has(sample, members):
                gt.append(new_tandem(sv, members))
                break
    # The last non-deletion CNV call, if any, describes the partner allele.
    cnv = None
    for sv in sample.sv:
        if 'cnv' in sv and sv != 'cnv0':
            cnv = sv
    if cnv:
        called = [star.name for star in gt]
        for _, members in tandem_rules:
            if '+'.join(members) in called:
                svcomb_tandem_cnv(gt, sample, members, cnv)
                break
        else:
            if '*13B' in called:
                svcomb_sv1_cnv(gt, sample, '*13B', cnv)
    if len(gt) == 2:
        sample.hap[0].cand = [gt[0]]
        sample.hap[1].cand = [gt[1]]
        sample.gt = True
def svcomb_sv1_cnv(gt, sample, sX_name, cnv):
    """Add the duplicated partner of a single-SV allele to ``gt``.

    Args:
        gt (list of Stars): alleles called so far; modified in place
        sample (Sample)
        sX_name (str): name of the SV-carrying allele already in ``gt``
        cnv (str): CNV label ('cnv<N>') describing the partner's copy number

    Nothing is added when no haplotype lists ``sX_name`` as a candidate.
    """
    carrier = which_has(sample, [sX_name])
    if not carrier:
        return
    # The partner is the haplotype without the SV allele (the first if both have it).
    partner = 0 if carrier == 3 else 2 - carrier
    candidates = copy.deepcopy(sample.hap[partner].cand)
    remove_sv(candidates)
    gt.insert(partner, new_dup(candidates[0], cnv))
def svcomb_tandem_cnv(gt, sample, tandem, cnv):
    """Add the duplicated partner of a tandem allele to ``gt``.

    Args:
        gt (list of Stars): alleles called so far; modified in place
        sample (Sample)
        tandem (list of str): names of the tandem's first and second member
        cnv (str): CNV label ('cnv<N>') describing the partner's copy number

    If only one haplotype lists both members, the other haplotype's leading
    non-SV candidate is duplicated and inserted at that haplotype's index.
    If both do, the second member found on the first haplotype is duplicated
    and appended.
    """
    carrier = which_has(sample, [tandem[0], tandem[1]])
    if not carrier:
        return
    if carrier == 3:
        second_member = next((star for star in sample.hap[0].cand if star.name == tandem[1]), None)
        if second_member is not None:
            gt.append(new_dup(second_member, cnv))
        return
    partner = 2 - carrier
    candidates = copy.deepcopy(sample.hap[partner].cand)
    remove_sv(candidates)
    gt.insert(partner, new_dup(candidates[0], cnv))
def gstt1_svcomb(sample):
    """Call GSTT1 *2/*2 when both SV calls of the sample are ``cnv0``.

    Samples already genotyped, or with fewer than two ``cnv0`` calls, are
    left untouched.
    """
    if sample.gt:
        return
    deleted = [args_.star_dict["*2"] for sv in sample.sv if sv == 'cnv0']
    if len(deleted) != 2:
        return
    sample.hap[0].cand = [deleted[0]]
    sample.hap[1].cand = [deleted[1]]
    sample.gt = True
def gstm1_svcomb(sample):
    """Call GSTM1 *2/*2 when both SV calls of the sample are ``cnv0``.

    Samples already genotyped, or with fewer than two ``cnv0`` calls, are
    left untouched.
    """
    if sample.gt:
        return
    deleted = [args_.star_dict["*2"] for sv in sample.sv if sv == 'cnv0']
    if len(deleted) != 2:
        return
    sample.hap[0].cand = [deleted[0]]
    sample.hap[1].cand = [deleted[1]]
    sample.gt = True
def ugt2b17_svcomb(sample):
    """Call UGT2B17 *2/*2 when both SV calls of the sample are ``cnv0``.

    Samples already genotyped, or with fewer than two ``cnv0`` calls, are
    left untouched.
    """
    if sample.gt:
        return
    deleted = [args_.star_dict["*2"] for sv in sample.sv if sv == 'cnv0']
    if len(deleted) != 2:
        return
    sample.hap[0].cand = [deleted[0]]
    sample.hap[1].cand = [deleted[1]]
    sample.gt = True
##############################################################################
def remove_extra_s1():
    """Drop '*1' from every candidate list unless it is the list's only entry.

    Applies, in place, to both haplotype candidate lists and to the diplotype
    candidate list of every sample in ``args_.samples``.
    """
    def drop_s1(stars):
        # A list of length one is left alone, whatever it contains.
        if len(stars) == 1:
            return
        stars[:] = [star for star in stars if star.name != '*1']

    for sample in args_.samples.values():
        drop_s1(sample.hap[0].cand)
        drop_s1(sample.hap[1].cand)
        drop_s1(sample.dip_cand)
def write_result_file():
    """Write the per-sample genotype table to '<output_prefix>.stargazer-genotype.txt'.

    The file is tab-separated with one header row and one row per sample,
    sorted by sample name. Columns that only make sense for a genotyped
    sample (status 'g') hold '.' for samples with status 'ng'.
    """
    def format_af(value):
        # -1 marks a missing allele fraction.
        return '.' if value == -1 else '{:.2f}'.format(value)

    def format_list(items):
        return ','.join([str(item) for item in items]) if items else '.'

    def observed_in(hap, attr):
        # Summaries of observed variants that belong to the main allele's
        # ``core`` or ``tag`` list.
        return format_list([snp.summary() for snp in hap.obs if snp in getattr(hap.cand[0], attr)])

    header = ['name', 'status', 'hap1_main', 'hap2_main', 'hap1_cand', 'hap2_cand', 'hap1_score', 'hap2_score', 'dip_score', 'phenotype', 'dip_sv', 'hap1_sv', 'hap2_sv', 'ssr', 'dip_cand', 'hap1_main_core', 'hap2_main_core', 'hap1_main_tag', 'hap2_main_tag', 'hap1_af_mean_gene', 'hap2_af_mean_gene', 'hap1_af_mean_main', 'hap2_af_mean_main']
    with open(args_.output_prefix + '.stargazer-genotype.txt', 'w') as f:
        f.write('\t'.join(header) + '\n')
        for name in sorted(args_.samples):
            sample = args_.samples[name]
            hap1, hap2 = sample.hap[0], sample.hap[1]
            row = dict.fromkeys(header, '.')
            row['name'] = name
            row['status'] = 'g' if sample.gt else 'ng'
            if sample.gt:
                row['hap1_main'] = hap1.cand[0].name
                row['hap2_main'] = hap2.cand[0].name
                row['hap1_score'] = str(hap1.cand[0].score)
                row['hap2_score'] = str(hap2.cand[0].score)
                row['dip_score'] = str(sample.dip_score)
                row['phenotype'] = sample.pt
                row['hap1_sv'] = hap1.sv
                row['hap2_sv'] = hap2.sv
            # The remaining columns are filled for every sample: status is
            # only ever 'g' or 'ng' here, never 'qc'.
            row['hap1_cand'] = ','.join([star.name for star in hap1.cand])
            row['hap2_cand'] = ','.join([star.name for star in hap2.cand])
            row['dip_sv'] = ','.join(sample.sv)
            row['ssr'] = sample.ssr
            row['dip_cand'] = ','.join([star.name for star in sample.dip_cand])
            row['hap1_main_core'] = observed_in(hap1, 'core')
            row['hap2_main_core'] = observed_in(hap2, 'core')
            row['hap1_main_tag'] = observed_in(hap1, 'tag')
            row['hap2_main_tag'] = observed_in(hap2, 'tag')
            gene_start = helper.get_hg19_start(args_.target_gene)
            gene_end = helper.get_hg19_end(args_.target_gene)
            row['hap1_af_mean_gene'] = format_af(hap1.af_mean_gene(gene_start, gene_end))
            row['hap2_af_mean_gene'] = format_af(hap2.af_mean_gene(gene_start, gene_end))
            row['hap1_af_mean_main'] = format_af(hap1.af_mean_main)
            row['hap2_af_mean_main'] = format_af(hap2.af_mean_main)
            f.write('\t'.join([row[column] for column in header]) + '\n')
def predict_phenotypes():
    """Assign a phenotype to every sample from its diplotype score.

    Reads '<program_dir>/phenotype_table.txt' (tab-separated, with 'gene',
    'name' and 'rules' columns) and keeps the rows of the target gene. A rule
    list is a comma-separated series of '<operator>:<score>' items. The first
    phenotype whose rules all hold for ``sample.dip_score`` is stored in
    ``sample.pt``. A score of 'unknown' gives the phenotype 'unknown'. If no
    phenotype matches, ``sample.pt`` is not assigned.
    """
    comparators = {'<': operator.lt, '<=': operator.le, '>': operator.gt, '>=': operator.ge, '==': operator.eq}
    rules_by_phenotype = {}
    with open(args_.program_dir + '/phenotype_table.txt') as f:
        header = next(f).strip().split('\t')
        for line in f:
            fields = line.strip().split('\t')
            if fields[header.index('gene')] != args_.target_gene:
                continue
            name = fields[header.index('name')]
            rules_by_phenotype[name] = fields[header.index('rules')].strip(',').split(',')

    def holds(rule, dip_score):
        # A rule looks like '>=:1.25'.
        parts = rule.split(':')
        symbol = parts[0]
        threshold = float(parts[1])
        return comparators[symbol](dip_score, threshold)

    for sample in args_.samples.values():
        if sample.dip_score == 'unknown':
            sample.pt = 'unknown'
            continue
        for phenotype, rules in rules_by_phenotype.items():
            if all(holds(rule, sample.dip_score) for rule in rules):
                sample.pt = phenotype
                break
def order_haplotypes():
    """Swap the haplotypes of genotyped samples so the main alleles are in sorted order.

    The order is the one produced by ``helper.sort_star_names``. Samples
    without a final genotype are skipped.
    """
    for sample in args_.samples.values():
        if not sample.gt:
            continue
        first_name = sample.hap[0].cand[0].name
        second_name = sample.hap[1].cand[0].name
        leading = helper.sort_star_names([first_name, second_name])[0]
        if leading == second_name:
            sample.hap[0], sample.hap[1] = sample.hap[1], sample.hap[0]
def log_genotype_mode():
    """Append the 'Step 1/9' section, describing the run configuration, to the log file."""
    with open(args_.log, 'a') as f:
        paralog = helper.get_paralog(args_.target_gene)
        paralog_label = 'N/A' if paralog == '.' else paralog.upper()

        if args_.control_gene == '.' and args_.control_type == 'known':
            control_type_label = 'N/A'
        elif args_.control_gene != '.':
            control_type_label = 'Control gene'
        elif args_.control_type == 'cnsr':
            control_type_label = 'Copy number-stable region'
        else:
            control_type_label = 'Custom locus'

        if args_.control_gene == '.':
            control_gene_label = 'N/A'
        else:
            control_gene_label = args_.control_gene.upper()

        if args_.control_region:
            control_region_label = 'chr' + args_.control_region
        else:
            control_region_label = 'N/A'

        if args_.data_type == 'ts':
            source_label = 'Targeted sequencing'
        elif args_.data_type == 'wgs':
            source_label = 'Whole genome sequencing'
        else:
            source_label = 'Single nucleotide polymorphism array'

        lines = [
            f'\n{args_.line_break}\n',
            'Step 1/9: Determining genotype mode...\n\n',
            'Status: Completed\n\n',
            f'Target gene: {args_.target_gene.upper()}\n',
            'Target paralog: {}\n'.format(paralog_label),
            f'Target region: chr{args_.target_region}\n\n',
            'Control type: {}\n'.format(control_type_label),
            'Control gene: {}\n'.format(control_gene_label),
            'Control region: {}\n\n'.format(control_region_label),
            'Input data source: {}\n'.format(source_label),
            f'VCF-only mode is on: {args_.vcf_only}\n',
            f'Imputation mode is on: {args_.impute}\n\n',
            f'Enzyme function: {helper.get_function(args_.target_gene)}\n',
            f'PharmVar member: {helper.get_pv_member(args_.target_gene)}\n',
            f'DPSV member: {helper.get_dpsv_member(args_.target_gene)}\n',
            f'\n{args_.line_break}\n',
        ]
        f.writelines(lines)
def assess_vcf(input_vcf):
    """Validate the input VCF and record how it should be processed.

    Unless VCF-only mode is on, the VCF sample columns are first checked
    against the GDF header (same size, same names, same order). Every record
    is then inspected for a GT field, an AD field and the genotype separator
    used by its samples.

    Side effects on ``args_``:
        vcf_empty: set to True when the VCF has no records
        vcf_ad: set to True when more than 80% of the records carry AD
        vcf_sep: '|' if every record is phased, '/' if every record is
            unphased, otherwise 'b'
    The 'Step 2/9' section is appended to the log file.

    Args:
        input_vcf: object with ``header`` (column names) and ``data`` (one
            list of string fields per record).

    Raises:
        TypeError: VCF and GDF samples disagree, or a TS run has fewer than
            five samples.
        ValueError: a record lacks the GT field, or an autosomal genotype has
            no separator.
    """
    if not args_.vcf_only:
        # check whether the sample list is identical between VCF and GDF
        with open(args_.gdf) as f:
            gdf_samples = [x.replace('Depth_for_', '') for x in f.readline().strip().split('\t')[3:]]
        vcf_samples = input_vcf.header[9:]
        if len(gdf_samples) != len(vcf_samples):
            raise TypeError(f'The sample size differs between the VCF file (N={len(vcf_samples)}) and the GDF file (N={len(gdf_samples)})')
        if len(set(gdf_samples + vcf_samples)) != len(vcf_samples):
            raise TypeError(f'Two different sets of samples were detected from the VCF file and the GDF file')
        for i in range(len(vcf_samples)):
            if vcf_samples[i] != gdf_samples[i]:
                raise TypeError(f"The order of samples differs between the VCF file ('{vcf_samples[i]}') and the GDF file ('{gdf_samples[i]}') at sample index {i}")
        # make sure there are at least five samples when using TS data
        if len(vcf_samples) < 5 and args_.data_type == 'ts':
            raise TypeError(f"Genotyping with TS data requires at least five samples (the current sample size is {len(vcf_samples)})")
    log_dict = {'row': 0, 'AD': 0, 'phased': 0, 'unphased': 0, 'both': 0}
    for fields in input_vcf.data:
        chrom = fields[0].replace('chr', '')
        pos = fields[1]
        fmt = fields[8].split(':')
        # Check GT field (fix: the exception used to be built but never raised)
        if 'GT' not in fmt:
            raise ValueError('GT field not found [{}]'.format(pos))
        # Check AD field
        if 'AD' in fmt:
            log_dict['AD'] += 1
        # Check phasing status: collect the separators used by the samples
        gt_index = fmt.index('GT')
        separators = set()
        for sample_field in fields[9:]:
            gt = sample_field.split(':')[gt_index]
            if '/' in gt:
                separators.add('/')
            elif '|' in gt:
                separators.add('|')
            elif chrom != 'X' and chrom != 'Y':
                # Genotypes without a separator are tolerated on chrX/chrY only.
                raise ValueError('Genotype separator not found for autosomal chromosome chr{}:{} GT=[{}]'.format(chrom, pos, gt))
        log_dict['row'] += 1
        if len(separators) == 1:
            if '|' in separators:
                log_dict['phased'] += 1
            else:
                log_dict['unphased'] += 1
        else:
            log_dict['both'] += 1
    # Check if input VCF is empty
    if log_dict['row'] == 0:
        args_.vcf_empty = True
    # Determine AD mode
    if log_dict['row'] > 0 and log_dict['AD'] / log_dict['row'] > 0.8:
        args_.vcf_ad = True
    # Determine phasing mode
    if log_dict['phased'] == log_dict['row']:
        args_.vcf_sep = '|'
    elif log_dict['unphased'] == log_dict['row']:
        args_.vcf_sep = '/'
    else:
        args_.vcf_sep = 'b'
    with open(args_.log, 'a') as f:
        f.write('Step 2/9: Assessing input VCF...\n\n')
        f.write('Status: Completed\n\n')
        f.write('Samples total: {}\n'.format(len(input_vcf.header[9:])))
        f.write('Markers total: {}\n\n'.format(log_dict['row']))
        f.write('Markers with allelic depth: {}\n\n'.format(log_dict['AD']))
        f.write('Markers unphased: {}\n'.format(log_dict['unphased']))
        f.write('Markers phased: {}\n'.format(log_dict['phased']))
        f.write('Markers partially phased: {}\n'.format(log_dict['both']))
        f.write(f'\n{args_.line_break}\n')
def process_vcf(input_vcf):
    """Normalise the input VCF records and return them as a new VCF object.

    For every record the sample fields are reduced to 'GT' or 'GT:AD'
    (depending on ``args_.vcf_ad``), QUAL is blanked, FILTER is set to
    'PASS' or the failed checks ('IA' invalid allele, 's50' high
    missingness), and INFO is set to the phasing status ('PS=D4' for a fully
    phased input, otherwise 'PS=A'). Allelic imbalance is only counted for
    the log. The 'Step 3/9' section is appended to the log file.

    The record lists of ``input_vcf.data`` are modified in place and shared
    with the returned object.

    Args:
        input_vcf: object with ``header`` and ``data`` (one list of string
            fields per record).

    Returns:
        The object created by ``helper.copy_vcf(input_vcf, ['header'])``,
        with fresh ``meta`` lines and the processed records.
    """
    log_dict = {'IA': 0, 'allelic_imbalance': 0, 's50': 0}
    processed_vcf = helper.copy_vcf(input_vcf, ['header'])
    processed_vcf.meta = [
        '##fileformat=VCFv4.2\n',
        '##fileDate={}\n'.format(args_.date),
        '##source={}\n'.format(args_.stargazer_version),
        '##reference=hg19\n',
        '##INFO=<ID=FE,Number=A,Type=String,Description="Functional Effect">\n',
        '##INFO=<ID=PS,Number=1,Type=String,Description="Phasing Status (A, in preparation; B1, ready for phasing as is; B2, ready for phasing after conformation to reference VCF; C1, excluded from phasing because marker is absent in reference VCF; C2, excluded from phasing because marker has different REF allele; C3, excluded from phasing because marker has no overlapping ALT alleles; D1, statistically phased; D2, manually phased with certainty; D3, manually phased without certainty; D4, already phased; D5, manually phased by extension; E, omitted during statistical phasing)">\n',
        '##INFO=<ID=RV,Number=A,Type=String,Description="Reverting Variation">\n',
        '##INFO=<ID=SO,Number=A,Type=String,Description="=Sequence Ontology">\n',
        '##INFO=<ID=VI,Number=A,Type=String,Description="Variant Impact">\n',
        '##FILTER=<ID=IA,Description="Invalid Allele">\n',
        '##FILTER=<ID=s50,Description="Less than 50% of samples have data">\n',
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">\n',
        # Fix: a missing comma used to fuse the GT and HE lines into one entry.
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=HE,Number=4,Type=Integer,Description="Matching scores computed by the phase-by-extension algorithm">\n',
    ]

    def major_allele_ratio(sample_field):
        # For a heterozygous 'GT:AD' field with non-zero depth, return the
        # share of reads on the best-covered allele; otherwise False.
        gt = sample_field.split(':')[0]
        if '.' in gt:
            return False
        if '|' in gt:
            gt = gt.split('|')
        else:
            gt = gt.split('/')
        if gt[0] == gt[1]:
            return False
        ad = [int(y) for y in sample_field.split(':')[1].split(',')]
        if sum(ad) == 0:
            return False
        return max(ad) / sum(ad)

    for fields in input_vcf.data:
        chrom = fields[0].replace('chr', '')
        ref = fields[3]
        alt = fields[4].split(',')
        fmt = fields[8].split(':')
        flt = []
        inf = ['PS=D4'] if args_.vcf_sep == '|' else ['PS=A']

        def conform(sample_field):
            # Rewrite one sample field as 'GT' or 'GT:AD'.
            gt_field = sample_field.split(':')[fmt.index('GT')]
            # Determine genotype separator
            if '/' in gt_field:
                gt_sep = '/'
            elif '|' in gt_field:
                gt_sep = '|'
            else:
                gt_sep = ''
            # Unphase genotype if input VCF is partially phased
            if args_.vcf_sep == 'b' and gt_sep == '|':
                if gt_field == '.|.':
                    gt_field = './.'
                else:
                    gt_field = '/'.join(sorted(gt_field.split('|'), key = lambda allele: int(allele)))
                # Fix: keep the separator in step with the rewritten genotype;
                # the AD repair below split on the stale '|' and raised
                # IndexError.
                gt_sep = '/'
            # Conform genotype for sex chromosomes if necessary
            if not gt_sep and (chrom == 'X' or chrom == 'Y'):
                gt_field = '0|' + gt_field if args_.vcf_sep == '|' else '0/' + gt_field
            if not args_.vcf_ad:
                return gt_field
            # Get AD field information
            ad_field = sample_field.split(':')[fmt.index('AD')]
            if gt_field == './.':
                ad_field = ','.join(['0'] * (len(alt) + 1))
            # Some tools such as Genalice produce variable-length AD fields (i.e., single AD value if sample is homozygous for ALT allele)
            if len(ad_field.split(',')) == 1 and gt_sep and gt_field.split(gt_sep)[0] == gt_field.split(gt_sep)[1] and gt_field.split(gt_sep)[0] != '0' and len(alt) == 1:
                if 'DP' in fmt:
                    dp_field = sample_field.split(':')[fmt.index('DP')]
                    ad_field = str(int(dp_field) - int(ad_field)) + ',' + ad_field
                else:
                    ad_field = '0,' + ad_field
            return gt_field + ':' + ad_field

        fields[9:] = [conform(x) for x in fields[9:]]
        # Check invalid allele
        if ref == 'I' or ref == 'D' or '.' in alt or 'D' in alt:
            flt.append('IA')
            log_dict['IA'] += 1
        # Check high missingness
        if ['.' in x.split(':')[0] for x in fields[9:]].count(True) / len(fields[9:]) > 0.5:
            flt.append('s50')
            log_dict['s50'] += 1
        # Check allelic imbalance (each sample is evaluated once)
        if args_.vcf_ad:
            ratios = [ratio for ratio in map(major_allele_ratio, fields[9:]) if ratio]
            if ratios:
                median = statistics.median(ratios)
                if median > 0.8 or median < 0.2:
                    log_dict['allelic_imbalance'] += 1
        fields[5] = '.'
        fields[6] = ';'.join(flt) if flt else 'PASS'
        fields[7] = ';'.join(inf)
        fields[8] = 'GT:AD' if args_.vcf_ad else 'GT'
        processed_vcf.data.append(fields)
    with open(args_.log, 'a') as f:
        f.write('Step 3/9: Processing input VCF...\n\n')
        f.write('Status: Completed\n\n')
        f.write('Markers with allelic imbalance: {}\n'.format(log_dict['allelic_imbalance']))
        f.write('Markers with high missingness: {}\n'.format(log_dict['s50']))
        f.write('Markers with invalid allele: {}\n'.format(log_dict['IA']))
        f.write(f'\n{args_.line_break}\n')
    return processed_vcf
def adjust_vcf(processed_vcf):
    '''
    Conform multiallelic indel loci to the representation used in the star allele table.

    For example, the UGT1A1 gene has a locus which defines three star alleles:
    *28 (234668879:CAT>CATAT), *36 (234668879:CAT>C), and *37 (234668879:CAT>CATATAT).
    A VCF file can represent this locus in many different ways:

        File      Star Alleles   POS         REF   ALT
        1st VCF   *28            234668879   C     CAT
        2nd VCF   *36            234668879   CAT   C
        3rd VCF   *37            234668879   C     CATAT
        4th VCF   *28,*36        234668879   CAT   CATAT,C
        5th VCF   *28,*37        234668879   C     CAT,CATAT
        6th VCF   *28,*36,*37    234668879   CAT   CATAT,C,CATATAT
        Ref VCF   *28            234668879   C     CAT

    Each ALT allele is matched to a core variant at the same position with the
    same length change. REF and ALT are rewritten only when every ALT allele
    finds a match and all matches agree on a single REF sequence.

    Returns a copy of ``processed_vcf`` made by ``helper.copy_vcf`` with the
    affected records updated.
    '''
    adjusted_vcf = helper.copy_vcf(processed_vcf, ['meta', 'header', 'data'])
    for fields in adjusted_vcf.data:
        pos, ref = fields[1], fields[3]
        alt = fields[4].split(',')
        # skip if the locus does not have an indel
        if len(ref) == 1 and all(len(allele) == 1 for allele in alt):
            continue
        # skip if the locus is not used to define a star allele
        star_list = [star for star in args_.star_dict.values() if pos in [snp.pos for snp in star.core]]
        if not star_list:
            continue

        def is_defined(allele):
            # True when some star allele lists exactly this REF>ALT change.
            change = f'{pos}:{ref}>{allele}'
            return any(change in [f'{snp.pos}:{snp.hg}>{snp.var}' for snp in star.core] for star in star_list)

        # skip if the locus is already formatted properly
        if all(is_defined(allele) for allele in alt):
            continue
        # try to find new sequences for REF and ALT
        snp_list = list({snp for star in star_list for snp in star.core})
        new_ref = []
        new_alt = []
        for allele in alt:
            matches = [snp for snp in snp_list if snp.pos == pos and (len(snp.hg) - len(ref) == len(snp.var) - len(allele))]
            if matches:
                new_ref.append(matches[0].hg)
                new_alt.append(matches[0].var)
        # skip if there are more than one sequence for REF
        if len(set(new_ref)) > 1:
            continue
        # skip if the number of sequences does not match for ALT
        if len(new_alt) != len(alt):
            continue
        # update the adjusted record
        fields[3] = new_ref[0]
        fields[4] = ','.join(new_alt)
    return adjusted_vcf
def conform_vcf(adjusted_vcf, ref_vcf):
    '''
    Conform the records of the input VCF to the reference VCF before phasing.

    Records whose FILTER is not 'PASS' are dropped. Every remaining record is
    looked up in ``ref_vcf`` by chromosome and position and its 'PS=A' INFO
    tag is replaced by one of:

        PS=B1  same REF and same ALT list: ready for phasing as is
        PS=B2  ALT alleles added and/or reordered to match the reference;
               genotypes and allelic depths are renumbered accordingly
        PS=C1  marker absent from the reference VCF
        PS=C2  marker has a different REF allele
        PS=C3  marker shares no ALT allele with the reference

    The step is skipped, and ``adjusted_vcf`` returned unchanged, when the
    input VCF is empty or already fully phased. The 'Step 4/9' section is
    appended to the log file in every case.

    Args:
        adjusted_vcf: VCF object to conform; its record lists are modified in
            place and shared with the result.
        ref_vcf: reference VCF object.

    Returns:
        A new VCF object (meta and header copied by ``helper.copy_vcf``)
        holding the retained records, or ``adjusted_vcf`` when skipped.
    '''
    log_dict = {'status': 'Skipped because input VCF is empty ', 'row': 0, 'filtered': 0, 'PS=B1': 0, 'PS=B2': 0, 'PS=C1': 0, 'PS=C2': 0, 'PS=C3': 0}

    def write_log():
        with open(args_.log, 'a') as f:
            f.write('Step 4/9: Conforming input VCF...\n\n')
            f.write('Status: {}\n\n'.format(log_dict['status']))
            f.write('Markers total: {}\n'.format(len(adjusted_vcf.data)))
            f.write('Markers filtered: {}\n'.format(log_dict['filtered']))
            f.write('Markers remaining: {}\n\n'.format(log_dict['row']))
            f.write('Markers phasable: {}\n'.format(log_dict['PS=B1'] + log_dict['PS=B2']))
            f.write('Markers ready: {}\n'.format(log_dict['PS=B1']))
            f.write('Markers conformed: {}\n\n'.format(log_dict['PS=B2']))
            f.write('Markers unphasable: {}\n'.format(log_dict['PS=C1'] + log_dict['PS=C2'] + log_dict['PS=C3']))
            f.write('Markers absent in reference VCF: {}\n'.format(log_dict['PS=C1']))
            f.write('Markers with different REF allele: {}\n'.format(log_dict['PS=C2']))
            f.write('Markers with no overlapping ALT alleles: {}\n'.format(log_dict['PS=C3']))
            f.write(f'\n{args_.line_break}\n')

    def set_status(info, status):
        # Count the outcome and replace the provisional 'PS=A' tag with it.
        log_dict[status] += 1
        return [status if x == 'PS=A' else x for x in info]

    if args_.vcf_empty:
        write_log()
        return adjusted_vcf
    if args_.vcf_sep == '|':
        log_dict['status'] = 'Skipped because input VCF is already fully phased'
        write_log()
        return adjusted_vcf
    log_dict['status'] = 'Completed'
    conformed_vcf = helper.copy_vcf(adjusted_vcf, ['meta', 'header'])
    ref_total = len(ref_vcf.data)
    for fields1 in adjusted_vcf.data:
        chr1, pos1, ref1, alt1, flt1, inf1 = fields1[0], fields1[1], fields1[3], fields1[4].split(','), fields1[6], fields1[7].split(';')
        if flt1 != 'PASS':
            log_dict['filtered'] += 1
            continue
        log_dict['row'] += 1
        is_found = False
        for i in range(ref_total):
            fields2 = ref_vcf.data[i]
            chr2, pos2, ref2, alt2 = fields2[0], fields2[1], fields2[3], fields2[4].split(',')
            # Keep looking if not found
            if chr1 != chr2 or pos1 != pos2:
                continue
            # Although found, will not be phased
            if ref1 != ref2:
                # Check if the next line matches. Fix: the look-ahead is now
                # bounds-checked; it raised IndexError on the last record.
                if i + 1 < ref_total:
                    fields3 = ref_vcf.data[i + 1]
                    if pos1 == fields3[1] and ref1 == fields3[3]:
                        continue
                inf1 = set_status(inf1, 'PS=C2')
                is_found = True
                break
            # There are no overlapping ALT alleles
            if not set(alt1) & set(alt2):
                is_found = True
                inf1 = set_status(inf1, 'PS=C3')
                break
            # Found and perfectly matched, no need to conform
            if alt1 == alt2:
                is_found = True
                inf1 = set_status(inf1, 'PS=B1')
                break
            # Although found, missing one or more ALT alleles
            if set(alt1).issubset(alt2) and len(alt2) > len(alt1):
                diff = len(alt2) - len(alt1)
                for allele in alt2:
                    if allele not in alt1:
                        alt1.append(allele)
                if args_.vcf_ad:
                    # Pad the allelic depths with zeros for the added alleles.
                    fields1[9:] = [x + ',0' * diff for x in fields1[9:]]
            # Although same ALT alleles, wrong order
            # NOTE(review): indentation was lost in this source; the break
            # below is assumed to belong to this branch -- confirm.
            if set(alt1) == set(alt2):
                is_found = True
                # Old allele index -> index of the same allele in the reference.
                mapping = {0: 0}
                for k, allele in enumerate(alt1):
                    mapping[k + 1] = alt2.index(allele) + 1
                fields1[4] = ','.join(alt2)  # update ALT alleles
                inf1 = set_status(inf1, 'PS=B2')

                def renumber(x):
                    gt = x.split(':')[0].split('/')
                    for k in (0, 1):
                        if gt[k] != '0' and gt[k] != '.':
                            gt[k] = str(mapping[int(gt[k])])
                    if not args_.vcf_ad:
                        return '/'.join(gt)
                    ad1 = x.split(':')[1].split(',')
                    # String placeholders keep the join below type-safe.
                    ad2 = ['0'] * len(ad1)
                    for k, depth in enumerate(ad1):
                        ad2[mapping[k]] = depth
                    return '/'.join(gt) + ':' + ','.join(ad2)

                fields1[9:] = [renumber(x) for x in fields1[9:]]
                break
        if not is_found:
            inf1 = set_status(inf1, 'PS=C1')
        fields1[7] = ';'.join(inf1)
        conformed_vcf.data.append(fields1)
    write_log()
    return conformed_vcf
def phase_vcf(conformed_vcf):
log_dict = {'status': 'Skipped because input VCF is empty', 'attempted': 0, 'PS=D1': 0, 'PS=E': 0}