-
Notifications
You must be signed in to change notification settings - Fork 0
/
spikeIn_denovo_mutations.py
103 lines (69 loc) · 3.02 KB
/
spikeIn_denovo_mutations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import io
import numpy as np
import pandas as pd
## Read a VCF file (used for both the child VCF and the de novo mutations VCF)
## Return the '##' header lines as a list and the remaining records as a pandas DataFrame
def read_vcf(path):
    """Read a VCF file into its '##' header lines and a table of records.

    Lines starting with '##' are collected verbatim (trailing newline
    included). Every other line -- the '#CHROM' column-header line and the
    variant records -- is parsed by pandas as tab-separated text.

    Args:
        path: Path to a VCF file; it is opened as plain text.

    Returns:
        A ``(header_lines, data_lines)`` tuple. ``header_lines`` is a list of
        the '##' lines. ``data_lines`` is a DataFrame whose '#CHROM' column
        has been renamed to 'CHROM' and whose eight fixed VCF columns are
        read as strings.
    """
    header_lines = []
    data = []
    # The context manager closes the file even if reading fails part-way.
    with open(path, 'r') as f:
        for each_line in f:
            if each_line.startswith('##'):
                header_lines.append(each_line)
            else:
                data.append(each_line)

    # Read the fixed columns as strings so values such as POS and QUAL are
    # kept exactly as written in the file instead of being coerced to numbers.
    fixed_column_dtypes = {
        '#CHROM': str, 'POS': str, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    data_lines = pd.read_table(
        io.StringIO(u''.join(data)),
        dtype=fixed_column_dtypes,
    ).rename(columns={'#CHROM': 'CHROM'})
    return header_lines, data_lines
def spikeIn_DNMs(childVcf_header,childVcf_data,output_dir,num_DNMs,denovo_db_file):
    """Spike randomly sampled de novo mutations (DNMs) into a child VCF.

    DNM candidates are read from ``denovo_db_file``. Candidates whose
    (CHROM, POS) already occurs in the child VCF, and candidates on contigs
    whose name contains 'KI', 'GL', 'chrX' or 'chrY' (case-insensitive), are
    discarded. ``num_DNMs`` of the remaining candidates are sampled at
    random, given default QUAL/FILTER/INFO/FORMAT values and a heterozygous
    '0/1' genotype, and outer-merged into the child records.

    Side effects:
        * Writes the sampled DNMs to ``<output_dir>/DNM_sampled.csv``.
        * Writes the child header lines followed by the tab-separated
          spiked-in table to standard output.

    Args:
        childVcf_header: List of '##' header lines of the child VCF.
        childVcf_data: DataFrame of child VCF records. The final merge
            requires the columns CHROM, POS, ID, REF, ALT, QUAL, FILTER,
            INFO, FORMAT and Child_genotype to be present.
        output_dir: Directory that receives DNM_sampled.csv.
        num_DNMs: Number of de novo mutations to spike in.
        denovo_db_file: Path to the VCF of known de novo mutations.

    Raises:
        ValueError: If ``num_DNMs`` exceeds the number of eligible DNMs.
    """
    ## Change the file according to hg19 or hg38
    _, DNM_data = read_vcf(denovo_db_file)

    ## Find common SNPs between denovo-db and Child VCFs
    common_df = pd.merge(DNM_data, childVcf_data, on=['CHROM','POS'])

    ## Filter out SNPs from denovo-db which are common in Child VCF
    # https://stackoverflow.com/questions/28901683/pandas-get-rows-which-are-not-in-other-dataframe?noredirect=1&lq=1
    # Rows flagged 'left_only' exist in denovo-db but not in the child VCF.
    DNM_filter = DNM_data.merge(common_df.drop_duplicates(), on=['CHROM','POS'], how='left', indicator=True)
    DNM_filter_leftonly = DNM_filter.loc[DNM_filter['_merge'] == 'left_only'][['CHROM','POS','ID','REF','ALT']]

    ## Subset particular chromosome SNPs to be spiked-in (e.g. chr22)
    #DNM_filter_leftonly = DNM_filter_leftonly.loc[DNM_filter_leftonly['CHROM'] == 'chr22']

    ## Exclude spiking-in X, Y chromosome & GL000209.1 SNPs
    # NOTE(review): 'chrX'/'chrY' only match contigs carrying the 'chr'
    # prefix; confirm the denovo-db file uses that naming.
    DNM_filter_leftonly = DNM_filter_leftonly[~DNM_filter_leftonly.CHROM.str.contains('KI|GL|chrX|chrY',case=False)]

    ## Variable k is number of de novo mutations to be spiked-in.
    k = num_DNMs
    if k > len(DNM_filter_leftonly):
        # Fail with an explicit message instead of pandas' generic one.
        raise ValueError(
            "Cannot spike in {0} de novo mutations: only {1} eligible "
            "candidates remain after filtering".format(k, len(DNM_filter_leftonly))
        )

    ## Randomly select "k" de novo mutations
    DNM_sampled = DNM_filter_leftonly.sample(n=k)

    ## Insert columns with default values
    DNM_sampled.insert(5,"QUAL", "100")
    DNM_sampled.insert(6,"FILTER", "PASS")
    DNM_sampled.insert(7,"INFO", ".")
    DNM_sampled.insert(8,"FORMAT", "GT")
    DNM_sampled.insert(9,"Child_genotype", "0/1")
    # os.path.join avoids writing to the filesystem root when output_dir is ''.
    DNM_sampled.to_csv(os.path.join(output_dir, 'DNM_sampled.csv'))

    ## Spike-in de novo mutations in Child VCF (outer merge)
    spikedIn_df = pd.merge(childVcf_data, DNM_sampled, on=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','Child_genotype'], how='outer')
    spikedIn_df = spikedIn_df.rename(columns={'CHROM':'#CHROM'})
    # Assign the filled column back: an in-place fillna on a column selection
    # is not guaranteed to modify the parent frame.
    spikedIn_df['ID'] = spikedIn_df['ID'].fillna('.')

    # Write header lines and records straight to stdout. This works on both
    # Python 2 and 3, avoids building the whole table as one string, and does
    # not append a blank line after the last record.
    out = sys.stdout
    for x in childVcf_header:
        out.write(x.rstrip('\n') + '\n')
    spikedIn_df.to_csv(out, sep='\t', index=False)
if __name__ == "__main__":
    # Command line: <child.vcf> <output_dir> <num_DNMs> <denovo_db.vcf>
    if len(sys.argv) < 5:
        # The message goes to stderr so it never mixes with the VCF on stdout.
        sys.exit(
            "Usage: {0} <child.vcf> <output_dir> <num_DNMs> <denovo_db.vcf>".format(sys.argv[0])
        )

    # Parse the cheap arguments first so a bad num_DNMs fails before the
    # child VCF is loaded.
    output_dir = sys.argv[2]
    num_DNMs = int(sys.argv[3])
    denovo_db_file = sys.argv[4]

    ## Call function to read Child VCF file
    childVcf_header, childVcf_data = read_vcf(sys.argv[1])

    ## Call function to spike-in DNMs into the child VCF file.
    spikeIn_DNMs(childVcf_header, childVcf_data, output_dir, num_DNMs, denovo_db_file)