-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathChipSeq.py
182 lines (147 loc) · 8.2 KB
/
ChipSeq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from PCR import PCR
import scipy
import numpy as np
import pandas as pd
class ChipSeq:
def __init__( self, genomeBindingTable, fragExtract, pcr, nChipReads=-1, nControlReads=-1, generateIntervals=True ):
"""
The ChipSeq class ties together all the other classes in the simulation.
The class contain two key dataframes that contain the number of
amplified fragments (amplifiedTable) and the number of reads
(readsTable) for each genomic location.
"""
self.amplifiedTable = pd.DataFrame( {'name' : fragExtract.extractedTable['name'].values} )
self.readsTable = pd.DataFrame( {'name' : fragExtract.extractedTable['name'].values} )
self.controlFragmentLocationMatrix = []
self.chipFragmentLocationMatrix = []
self.controlReadsLocationMatrix = []
self.chipReadsLocationMatrix = []
self.perControlAmplified = []
self.perChipAmplified = []
self.bindingTable = genomeBindingTable
fragExtract = self.downsampleControl(fragExtract)
self.amplifiedTable.loc[:,'amp_control_fragments'], self.perControlAmplified = self.pcrAmplify( fragExtract, pcr, 'control' )
self.readsTable.loc[:,'unique_control_reads'], self.readsTable.loc[:,'control_reads'], self.controlFragmentNumbers = self.sampleReads( fragExtract, 'control', nControlReads, generateIntervals )
self.amplifiedTable.loc[:,'amp_chip_fragments'], self.perChipAmplified = self.pcrAmplify( fragExtract, pcr, 'chip' )
self.readsTable.loc[:,'unique_chip_reads'], self.readsTable.loc[:,'chip_reads'], self.chipFragmentNumbers = self.sampleReads( fragExtract, 'chip', nChipReads, generateIntervals )
def downsampleControl( self, fragExtract ):
"""
This function downsamples either the number of control fragments or number of
ChIP fragments to ensure that their total numbers are equal.
"""
#If there are more fragments in the input sample than in the ChIP sample
if fragExtract.extractedTable['ext_control_fragments'].sum() > fragExtract.extractedTable['ext_chip_fragments'].sum():
downsampling = fragExtract.extractedTable['ext_chip_fragments'].sum()*1.0/fragExtract.extractedTable['ext_control_fragments'].sum()
extControlFragments = scipy.stats.binom.rvs( fragExtract.extractedTable['ext_control_fragments'], downsampling, size=fragExtract.extractedTable.shape[0] )
fragExtract.extractedTable.loc[:,'ext_control_fragments'] = extControlFragments
else:
#If there are more fragments in the ChIP sample than the input sample.
downsampling = fragExtract.extractedTable['ext_control_fragments'].sum()*1.0/fragExtract.extractedTable['ext_chip_fragments'].sum()
extChipFragments = scipy.stats.binom.rvs( fragExtract.extractedTable['ext_chip_fragments'], downsampling, size=fragExtract.extractedTable.shape[0] )
if len(extChipFragments) < fragExtract.extractedTable.shape[0]:
extChipFragments = np.append( extChipFragments, np.zeros( fragExtract.extractedTable.shape[0] - len(extChipFragments) ) )
fragExtract.extractedTable.loc[:,'ext_chip_fragments'] = extChipFragments
return fragExtract
def pcrAmplify( self, fragExtract, pcr, fragmentSetStr ):
"""
This function calls the PCR sampleFromPCRdist() routine to simulate PCR
amplification of extracted fragments in both ChIP and control samples.
Arguments :
1) fragExtract --- An instance of the FragExtract class that contains
the total number of extracted fragments in the ChIP (the
"ext_chip_fragments" column) and control (the "ext_control_fragments")
column.
2) pcr --- An instance of the PCR class that is initialized with the PCR
efficiencies at each genomic location and a specified number of
amplification cycles.
3) fragmentSetStr --- A string that can be set to "chip" or "control"
depending on whether amplification is being simulated on fragments in
the ChIP or input sample, respectively.
"""
fragmentCounts = fragExtract.extractedTable['ext_{}_fragments'.format( fragmentSetStr )]
numLocations = fragExtract.extractedTable.shape[0]
amplified, perFragmentAmplified = pcr.sampleFromPCRdist( fragmentCounts, returnPerFragment=True )
amplified = np.ndarray.astype(amplified, dtype=np.int64)
return [amplified, perFragmentAmplified]
def sampleReads( self, fragExtract, fragmentSetStr, nReads, generateIntervals ):
"""
This function samples nReads from the amplified fragments in the ChIP
and control samples and returns the total and unique number of reads at
each location.
"""
if fragmentSetStr == 'control':
perFragmentAmplified = self.perControlAmplified
else:
perFragmentAmplified = self.perChipAmplified
amplified = self.amplifiedTable['amp_{}_fragments'.format( fragmentSetStr )].values
amplified = np.ndarray.astype(amplified, dtype=np.int64)
unamplified = fragExtract.extractedTable['ext_{}_fragments'.format( fragmentSetStr )]
totalAmplified = amplified.sum()
N = len( amplified )
if totalAmplified > nReads:
#Sample nReads from across all genomic locations such that each
#amplified fragment has an equal probability of being chosen.
readSample = hyperGeomSample( amplified, nReads )
else:
print("WARNING : The number of amplified fragments is less than the total read count. This can happen if the extraction efficiency, number of cells, or the PCR efficiency is too low. Conversely, the total read count is perhaps too high given the other parameters.")
readSample = amplified
uniques = np.zeros( N, dtype=np.int )
duplicates = np.zeros( N, dtype=np.int )
if generateIntervals:
readsToDuplicate = np.zeros( int(nReads), dtype=np.int )
else:
readsToDuplicate = []
#See the Methods section in the manuscript for details on how reads
#are sampled from the pool of amplified fragments.
uniqueReadIdx = 0
for i in range( N ):
if readSample[i] > 0:
locsToChoose = hyperGeomSample( perFragmentAmplified[i], readSample[i] )
mask = locsToChoose >= 1
uniques[i] = np.sum( mask, dtype=np.int )
if generateIntervals:
readsToDuplicate[uniqueReadIdx:(uniqueReadIdx+uniques[i])] = locsToChoose[mask]
else:
uniques[i] = 0
uniqueReadIdx += uniques[i]
if generateIntervals:
readsToDuplicate = readsToDuplicate[:uniqueReadIdx]
uniques = np.ndarray.astype(uniques,dtype=np.int64)
if fragmentSetStr == 'chip':
num = self.amplifiedTable.shape[0]
uniques = np.append( uniques, np.zeros( num - len(uniques), dtype=np.int64 ) )
readSample = np.append( readSample, np.zeros( num - len(readSample), dtype=np.int64 ) )
return [uniques,readSample,readsToDuplicate]
def hyperGeomSample( binCounts, totalDrawSize ):
"""
Draw samples from a multi-variate hypergeometric distribution.
Inputs ---
1) binCounts --- An array of n values M1,M2,...,Mn where Mi is the number of
objects in the i-th bin.
2) totalDrawSize --- The total number of objects to be sampled.
Returns an array of n value that represent the number of objects sampled
from each bin.
"""
numBins = len( binCounts )
sample = np.zeros( numBins, dtype=np.int64 )
totalCount = np.sum( binCounts )
sampleSize = 0
ctr = 0
idx = 0
while idx < numBins:
success = binCounts[idx]
failure = totalCount - success
drawSize = totalDrawSize - sampleSize
if drawSize == 0:
return sample
draw = np.random.hypergeometric( success, failure, drawSize )
if draw > totalDrawSize:
sample[ctr] = n
return sample
else:
totalCount -= binCounts[idx]
sample[ctr] = draw
sampleSize += draw
ctr += 1
idx += 1
return sample