Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
Adds stand-alone code for running Animate.
  • Loading branch information
vishakad authored Oct 4, 2018
1 parent a5e2d98 commit 312f9fe
Show file tree
Hide file tree
Showing 6 changed files with 1,222 additions and 0 deletions.
169 changes: 169 additions & 0 deletions ChipSeq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import numpy as np
import pandas as pd
import scipy
import scipy.stats

from PCR import PCR

class ChipSeq:
    """
    The ChipSeq class ties together all the other classes in the simulation.
    The class contains two key dataframes that contain the number of
    amplified fragments (amplifiedTable) and the number of reads
    (readsTable) for each genomic location.
    """

    def __init__(self, genomeBindingTable, fragExtract, pcr, nChipReads=-1, nControlReads=-1):
        """
        Run control/ChIP balancing, PCR amplification and read sampling.
        Arguments :
        1) genomeBindingTable --- Stored unchanged as self.bindingTable.
        2) fragExtract --- Object exposing an "extractedTable" dataframe with
        the columns "name", "ext_control_fragments" and "ext_chip_fragments".
        Its table is modified in place by downsampleControl().
        3) pcr --- Object exposing sampleFromPCRdist(); see pcrAmplify().
        4) nChipReads, nControlReads --- Number of reads to sample from the
        amplified ChIP and control fragments, respectively.

        After construction, amplifiedTable has the columns "name",
        "amp_control_fragments" and "amp_chip_fragments", and readsTable has
        the columns "name", "unique_control_reads", "control_reads",
        "unique_chip_reads" and "chip_reads".

        NOTE(review): the -1 defaults are forwarded unchanged to
        sampleReads(); a negative count is not a valid draw size, so callers
        presumably always pass explicit read counts --- confirm.
        """
        locationNames = fragExtract.extractedTable['name'].values
        self.amplifiedTable = pd.DataFrame({'name': locationNames})
        self.readsTable = pd.DataFrame({'name': locationNames})
        # These four lists are initialised here but never filled by this class.
        self.controlFragmentLocationMatrix = []
        self.chipFragmentLocationMatrix = []
        self.controlReadsLocationMatrix = []
        self.chipReadsLocationMatrix = []
        self.perControlAmplified = []
        self.perChipAmplified = []
        self.bindingTable = genomeBindingTable

        fragExtract = self.downsampleControl(fragExtract)

        # Order matters: sampleReads() reads the amplified column and the
        # per-fragment counts that the preceding pcrAmplify() call produced.
        ampControl, self.perControlAmplified = self.pcrAmplify(fragExtract, pcr, 'control')
        self.amplifiedTable['amp_control_fragments'] = ampControl
        uniqueControl, controlReads = self.sampleReads(fragExtract, 'control', nControlReads)
        self.readsTable['unique_control_reads'] = uniqueControl
        self.readsTable['control_reads'] = controlReads

        ampChip, self.perChipAmplified = self.pcrAmplify(fragExtract, pcr, 'chip')
        self.amplifiedTable['amp_chip_fragments'] = ampChip
        uniqueChip, chipReads = self.sampleReads(fragExtract, 'chip', nChipReads)
        self.readsTable['unique_chip_reads'] = uniqueChip
        self.readsTable['chip_reads'] = chipReads

    def downsampleControl(self, fragExtract):
        """
        This function downsamples either the number of control fragments or
        the number of ChIP fragments so that their expected totals are equal.
        The larger of the two samples is thinned binomially, location by
        location, with a keep probability equal to smaller total / larger
        total. fragExtract.extractedTable is modified in place and the same
        fragExtract object is returned.
        """
        table = fragExtract.extractedTable
        controlTotal = table['ext_control_fragments'].sum()
        chipTotal = table['ext_chip_fragments'].sum()

        if controlTotal > chipTotal:
            # More fragments in the input sample than in the ChIP sample.
            columnToThin = 'ext_control_fragments'
            keepProbability = controlTotal and chipTotal * 1.0 / controlTotal
        elif chipTotal > 0:
            # At least as many fragments in the ChIP sample as in the input.
            columnToThin = 'ext_chip_fragments'
            keepProbability = controlTotal * 1.0 / chipTotal
        else:
            # Both samples are empty: the ratio would be 0/0, and there is
            # nothing to downsample.
            return fragExtract

        thinned = scipy.stats.binom.rvs(
            table[columnToThin].values, keepProbability, size=table.shape[0]
        )
        table.loc[:, columnToThin] = thinned

        return fragExtract

    def pcrAmplify(self, fragExtract, pcr, fragmentSetStr):
        """
        This function calls the PCR sampleFromPCRdist() routine to simulate PCR
        amplification of extracted fragments in both ChIP and control samples.
        Arguments :
        1) fragExtract --- An instance of the FragExtract class that contains
        the total number of extracted fragments in the ChIP (the
        "ext_chip_fragments" column) and control (the "ext_control_fragments"
        column) samples.
        2) pcr --- An instance of the PCR class that is initialized with the PCR
        efficiencies at each genomic location and a specified number of
        amplification cycles.
        3) fragmentSetStr --- A string that can be set to "chip" or "control"
        depending on whether amplification is being simulated on fragments in
        the ChIP or input sample, respectively.
        Returns a two-element list : the amplified counts cast to 64-bit
        integers, and the second value returned by sampleFromPCRdist(),
        passed through unchanged.
        """
        fragmentCounts = fragExtract.extractedTable['ext_{}_fragments'.format(fragmentSetStr)]

        amplified, perFragmentAmplified = pcr.sampleFromPCRdist(fragmentCounts, returnPerFragment=True)
        amplified = np.asarray(amplified).astype(np.int64)

        return [amplified, perFragmentAmplified]

    def sampleReads(self, fragExtract, fragmentSetStr, nReads):
        """
        This function samples nReads from the amplified fragments in the ChIP
        and control samples and returns the total and unique number of reads at
        each location.
        Arguments :
        1) fragExtract --- Not used by this routine; kept so that existing
        callers do not need to change.
        2) fragmentSetStr --- "control" selects the control sample; any other
        value selects the ChIP sample.
        3) nReads --- Total number of reads to draw across all locations.
        Returns a two-element list [uniques, readSample] of 64-bit integer
        arrays with one entry per location. If the amplified pool does not
        contain more than nReads fragments, a warning is printed and every
        amplified fragment is treated as a read.
        """
        if fragmentSetStr == 'control':
            perFragmentAmplified = self.perControlAmplified
        else:
            perFragmentAmplified = self.perChipAmplified

        amplifiedColumn = self.amplifiedTable['amp_{}_fragments'.format(fragmentSetStr)]
        # astype() copies, so readSample never aliases the table's own data.
        amplified = np.asarray(amplifiedColumn.values).astype(np.int64)
        numLocations = len(amplified)

        if amplified.sum() > nReads:
            #Sample nReads from across all genomic locations such that each
            #amplified fragment has an equal probability of being chosen.
            readSample = hyperGeomSample(amplified, nReads)
        else:
            print("WARNING : The number of amplified fragments is less than the total read count. This can happen if the extraction efficiency, number of cells, or the PCR efficiency is too low. Conversely, the total read count is perhaps too high given the other parameters.")
            readSample = amplified

        uniques = np.zeros(numLocations, dtype=np.int64)

        #See the Methods section in the manuscript for details on how reads
        #are sampled from the pool of amplified fragments.
        for i in range(numLocations):
            if readSample[i] > 0:
                # Spread this location's reads over the entries of
                # perFragmentAmplified[i]; each entry that receives at least
                # one read counts as one unique read.
                readsPerEntry = hyperGeomSample(perFragmentAmplified[i], readSample[i])
                uniques[i] = np.count_nonzero(readsPerEntry >= 1)

        return [uniques, readSample]

def hyperGeomSample( binCounts, totalDrawSize ):
    """
    Draw samples from a multi-variate hypergeometric distribution.
    Inputs ---
    1) binCounts --- An array of n values M1,M2,...,Mn where Mi is the number of
    objects in the i-th bin.
    2) totalDrawSize --- The total number of objects to be sampled.
    Returns an array of n values (64-bit integers) that represent the number
    of objects sampled from each bin.
    Raises ValueError if totalDrawSize is negative. A draw size larger than
    the total number of objects is rejected by numpy with a ValueError.

    The sample is built one bin at a time : the count for bin i is a
    univariate hypergeometric draw in which bin i holds the "good" objects,
    all later bins hold the "bad" objects, and the draw size is whatever is
    still left to draw after the earlier bins.
    """
    if totalDrawSize < 0:
        raise ValueError(
            "totalDrawSize must be non-negative, got {}".format( totalDrawSize )
        )

    numBins = len( binCounts )
    sample = np.zeros( numBins, dtype=np.int64 )
    remainingCount = np.sum( binCounts )
    remainingDraws = totalDrawSize

    for idx in range( numBins ):
        if remainingDraws == 0:
            # Everything has been drawn; the later bins keep their zeros.
            # This also avoids asking numpy for a zero-sized draw.
            break

        inBin = binCounts[idx]
        draw = np.random.hypergeometric( inBin, remainingCount - inBin, remainingDraws )

        sample[idx] = draw
        remainingDraws -= draw
        # Objects in this bin are no longer available to later bins.
        remainingCount -= inBin

    return sample
47 changes: 47 additions & 0 deletions FragExtract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from scipy.stats import binom
from scipy.stats import beta
import numpy as np
import pandas as pd

class FragExtract:
    def __init__( self, pExtControl, pExtChip, boundTable ):
        """
        The FragExtract class contains routines needed for simulating fragment
        extraction from a pool of bound fragments.
        The arguments to the class constructor are three non-keyword quantities ---
        pExtControl, pExtChip and boundTable
        1) pExtControl --- An array of values of size N, where N is the number
        of binding locations, which specifies the extraction efficiency at
        each location for the input/control sample. Each value must lie between 0 and 1.
        2) pExtChip --- An array of values of the same dimension as pExtControl,
        which specifies the extraction efficiency at each genomic location in
        the ChIP sample. In general, we could set pExtChip and pExtControl to be different
        values but we keep them equal in all simulations.
        3) boundTable --- An instance of the GenomeBindingTable class. The
        instance should have the control_fragments and chip_fragments variables
        set to non-zero values.

        After construction, extractedTable is a pandas dataframe with the
        columns "name", "ext_control_fragments" and "ext_chip_fragments",
        holding one row per row of boundTable.locations.
        """
        locations = boundTable.locations

        #pExtChip - Antibody efficiency/ChIP extraction efficiency -- What is the probability of a fragment that is bound by the TF
        #of interest being picked up by the antibody in the ChIP sample?
        self.pExtChip = pExtChip

        #pExtControl - Probability of extracting a fragment from the control sample.
        self.pExtControl = pExtControl

        self.N = locations.shape[0]

        #The extracted fragments are binomially sampled from bound fragments.
        #The control sample is drawn before the ChIP sample so that a seeded
        #random stream yields the same values as before.
        extControlFragments = binom.rvs( locations['control_fragments'].values, self.pExtControl, size=self.N )
        extChipFragments = binom.rvs( locations['chip_fragments'].values, self.pExtChip, size=self.N )

        #extractedTable is a pandas dataframe that stores the number of
        #extracted fragments in the ChIP and input samples at each genomic location.
        #It is built in a single constructor call, with an explicit column
        #order, so the count columns keep the integer dtype of the draws.
        self.extractedTable = pd.DataFrame(
            {
                'name' : locations['name'].values,
                'ext_control_fragments' : extControlFragments,
                'ext_chip_fragments' : extChipFragments,
            },
            columns=['name', 'ext_control_fragments', 'ext_chip_fragments'],
        )
Loading

0 comments on commit 312f9fe

Please sign in to comment.