Add files via upload

Adds stand-alone code for running Animate.
vishakad · Oct 4, 2018 · 312f9fe · 312f9fe
1 parent a5e2d98
commit 312f9fe
Show file tree

Hide file tree

Showing 6 changed files with 1,222 additions and 0 deletions.
diff --git a/ChipSeq.py b/ChipSeq.py
@@ -0,0 +1,169 @@
+from PCR import PCR
+import scipy
+import numpy as np
+import pandas as pd
+
+class ChipSeq:
+    def __init__( self, genomeBindingTable, fragExtract, pcr, nChipReads=-1, nControlReads=-1 ):
+        """
+        The ChipSeq class ties together all the other classes in the simulation.
+        Constructing an instance runs the whole pipeline : the ChIP and control
+        fragment totals are equalised by downsampling, the extracted fragments
+        are PCR-amplified, and reads are sampled from the amplified pool.
+
+        The class contains two key dataframes, both of which carry the 'name'
+        column of fragExtract.extractedTable ---
+        1) amplifiedTable --- The number of amplified fragments at each genomic
+        location (columns 'amp_control_fragments' and 'amp_chip_fragments').
+        2) readsTable --- The number of reads at each genomic location (columns
+        'unique_control_reads', 'control_reads', 'unique_chip_reads' and
+        'chip_reads').
+
+        Arguments :
+        1) genomeBindingTable --- Stored as self.bindingTable; it is not read
+        anywhere else in this class.
+        2) fragExtract --- An object with an extractedTable dataframe that has
+        the columns 'name', 'ext_control_fragments' and 'ext_chip_fragments'.
+        Note that downsampleControl() overwrites one of the two fragment
+        columns of this dataframe in place.
+        3) pcr --- An object that provides
+        sampleFromPCRdist( fragmentCounts, returnPerFragment=True ).
+        4) nChipReads, nControlReads --- The number of reads to sample from the
+        ChIP and control samples, respectively.
+        NOTE(review): the default of -1 is forwarded unchanged to sampleReads()
+        and from there to hyperGeomSample() as the draw size. It does not look
+        like a usable value, so callers presumably always pass explicit read
+        counts --- confirm.
+        """
+        self.amplifiedTable = pd.DataFrame( {'name' : fragExtract.extractedTable['name'].values} )
+        self.readsTable = pd.DataFrame( {'name' : fragExtract.extractedTable['name'].values} )
+
+        #These four containers are initialised here but never filled in by
+        #this class.
+        self.controlFragmentLocationMatrix = []
+        self.chipFragmentLocationMatrix = []
+        self.controlReadsLocationMatrix = []
+        self.chipReadsLocationMatrix = []
+
+        #Per-location output of pcr.sampleFromPCRdist(); set by the
+        #pcrAmplify() calls below and read back by sampleReads().
+        self.perControlAmplified = []
+        self.perChipAmplified = []
+        self.bindingTable = genomeBindingTable
+
+        fragExtract = self.downsampleControl(fragExtract)
+
+        #The control sample is processed before the ChIP sample. The order is
+        #kept fixed so that a seeded random number stream gives the same result.
+        ampControl, self.perControlAmplified = self.pcrAmplify( fragExtract, pcr, 'control' )
+        self.amplifiedTable.loc[:,'amp_control_fragments'] = ampControl
+        uniqueControl, controlReads = self.sampleReads( fragExtract, 'control', nControlReads )
+        self.readsTable.loc[:,'unique_control_reads'] = uniqueControl
+        self.readsTable.loc[:,'control_reads'] = controlReads
+
+        ampChip, self.perChipAmplified = self.pcrAmplify( fragExtract, pcr, 'chip' )
+        self.amplifiedTable.loc[:,'amp_chip_fragments'] = ampChip
+        uniqueChip, chipReads = self.sampleReads( fragExtract, 'chip', nChipReads )
+        self.readsTable.loc[:,'unique_chip_reads'] = uniqueChip
+        self.readsTable.loc[:,'chip_reads'] = chipReads
+
+    def downsampleControl( self, fragExtract ):
+        """
+        This function downsamples either the number of control fragments or the
+        number of ChIP fragments so that their expected totals are equal.
+        Whichever sample has more fragments is thinned binomially, location by
+        location, with a keep-probability equal to the ratio of the smaller
+        total to the larger total.
+
+        Arguments :
+        1) fragExtract --- An object whose extractedTable dataframe has the
+        columns 'ext_control_fragments' and 'ext_chip_fragments'. The thinned
+        column is overwritten in place.
+
+        Returns the same fragExtract object. If both samples contain no
+        fragments at all, the table is left untouched.
+        """
+        #Imported here because the file only does "import scipy", which is not
+        #guaranteed to make the scipy.stats submodule available.
+        from scipy.stats import binom
+
+        table = fragExtract.extractedTable
+        numLocations = table.shape[0]
+        controlTotal = table['ext_control_fragments'].sum()
+        chipTotal = table['ext_chip_fragments'].sum()
+
+        if controlTotal > chipTotal:
+            #If there are more fragments in the input sample than in the ChIP sample
+            keepFraction = chipTotal*1.0/controlTotal
+            table.loc[:,'ext_control_fragments'] = binom.rvs( table['ext_control_fragments'], keepFraction, size=numLocations )
+        elif chipTotal > 0:
+            #If there are at least as many fragments in the ChIP sample as in
+            #the input sample.
+            keepFraction = controlTotal*1.0/chipTotal
+            table.loc[:,'ext_chip_fragments'] = binom.rvs( table['ext_chip_fragments'], keepFraction, size=numLocations )
+        #Otherwise both totals are zero : there is nothing to downsample, and
+        #the ratio would be the undefined 0/0.
+
+        return fragExtract
+
+    def pcrAmplify( self, fragExtract, pcr, fragmentSetStr ):
+        """
+        This function calls the PCR sampleFromPCRdist() routine to simulate PCR
+        amplification of extracted fragments in both ChIP and control samples.
+
+        Arguments : 
+        1) fragExtract --- An instance of the FragExtract class that contains
+        the total number of extracted fragments in the ChIP (the
+        "ext_chip_fragments" column) and control (the "ext_control_fragments"
+        column) samples.
+        2) pcr --- An instance of the PCR class that is initialized with the PCR
+        efficiencies at each genomic location and a specified number of
+        amplification cycles.
+        3) fragmentSetStr --- A string that can be set to "chip" or "control"
+        depending on whether amplification is being simulated on fragments in
+        the ChIP or input sample, respectively.
+
+        Returns the list [amplified, perFragmentAmplified]. amplified is the
+        first value returned by pcr.sampleFromPCRdist() converted to an int64
+        array, and perFragmentAmplified is the second value, passed through
+        unchanged. (sampleReads() indexes perFragmentAmplified by location and
+        uses each entry as a set of bin counts, so it is presumably the
+        per-fragment copy numbers at each location --- confirm against the PCR
+        class.)
+        """
+        fragmentCounts = fragExtract.extractedTable['ext_{}_fragments'.format( fragmentSetStr )]
+
+        amplified, perFragmentAmplified = pcr.sampleFromPCRdist( fragmentCounts, returnPerFragment=True )
+        #np.asarray() allows sampleFromPCRdist() to return any array-like.
+        amplified = np.asarray( amplified ).astype( np.int64 )
+
+        return [amplified, perFragmentAmplified]
+
+    def sampleReads( self, fragExtract, fragmentSetStr, nReads ):
+        """
+        This function samples nReads from the amplified fragments in the ChIP
+        or control sample and returns the unique and total number of reads at
+        each location.
+
+        Arguments :
+        1) fragExtract --- Not read by this function; kept so that the call
+        signature stays the same.
+        2) fragmentSetStr --- "control" selects self.perControlAmplified and
+        the 'amp_control_fragments' column of self.amplifiedTable. Any other
+        value selects self.perChipAmplified, and the column name is built from
+        the string itself, so in practice it must be "chip".
+        3) nReads --- The total number of reads to sample across all locations.
+
+        Returns the list [uniques, readSample], two arrays with one entry per
+        genomic location. readSample is the number of reads sampled at each
+        location and uniques is the number of distinct fragments those reads
+        came from.
+
+        pcrAmplify() must have been run for this fragment set first, since the
+        amplified counts are read back from self.amplifiedTable.
+        """
+        if fragmentSetStr == 'control':
+            perFragmentAmplified = self.perControlAmplified
+        else:
+            perFragmentAmplified = self.perChipAmplified
+
+        amplified = self.amplifiedTable['amp_{}_fragments'.format( fragmentSetStr )].values
+        amplified = np.asarray( amplified ).astype( np.int64 )
+
+        totalAmplified = amplified.sum()
+        N = len( amplified )
+
+        if totalAmplified > nReads:
+            #Sample nReads from across all genomic locations such that each
+            #amplified fragment has an equal probability of being chosen.
+            readSample = hyperGeomSample( amplified, nReads )
+        else:
+            #Warn only on a real shortfall. When the two numbers are equal,
+            #every amplified fragment is read exactly once and nothing is wrong.
+            if totalAmplified < nReads:
+                print("WARNING : The number of amplified fragments is less than the total read count. This can happen if the extraction efficiency, number of cells, or the PCR efficiency is too low.  Conversely, the total read count is perhaps too high given the other parameters.")
+            readSample = amplified
+
+        #np.int64 is used directly; the np.int alias no longer exists in
+        #current NumPy releases.
+        uniques = np.zeros( N, dtype=np.int64 )
+
+        #See the Methods section in the manuscript for details on how reads 
+        #are sampled from the pool of amplified fragments.
+        for i, numReadsHere in enumerate( readSample ):
+            if numReadsHere > 0:
+                #Distribute the reads at this location over its fragments; a
+                #fragment that receives at least one read counts as unique.
+                locsToChoose = hyperGeomSample( perFragmentAmplified[i], numReadsHere )
+                uniques[i] = np.count_nonzero( locsToChoose >= 1 )
+
+        return [uniques,readSample] 
+
+def hyperGeomSample( binCounts, totalDrawSize ):
+    """
+    Draw samples from a multi-variate hypergeometric distribution.
+
+    The bins are visited in order. For each bin, the number of objects taken
+    from it is drawn from a (univariate) hypergeometric distribution in which
+    the objects of that bin are the "successes", the objects of all bins not
+    yet visited are the "failures", and the number of draws is whatever is
+    still left of totalDrawSize.
+
+    Inputs ---
+    1) binCounts --- An array of n values M1,M2,...,Mn where Mi is the number of
+    objects in the i-th bin.
+    2) totalDrawSize --- The total number of objects to be sampled.
+
+    Returns an int64 array of n values that represent the number of objects
+    sampled from each bin. An empty binCounts gives an empty array.
+
+    Raises ValueError if totalDrawSize is negative or larger than the total
+    number of objects in all bins.
+    """
+    counts = np.asarray( binCounts )
+    numBins = len( counts )
+    sample = np.zeros( numBins, dtype=np.int64 )
+
+    if numBins == 0:
+        return sample
+
+    remainingCount = np.sum( counts )
+    remainingDraws = totalDrawSize
+
+    #np.random.hypergeometric() rejects both of these cases with a ValueError
+    #as well; checking up front gives a message that names the actual problem.
+    if remainingDraws < 0:
+        raise ValueError( "totalDrawSize must be non-negative, got {}".format( totalDrawSize ) )
+    if remainingDraws > remainingCount:
+        raise ValueError( "Cannot draw {} objects from bins that hold only {}".format( totalDrawSize, remainingCount ) )
+
+    for idx in range( numBins ):
+        if remainingDraws == 0:
+            #Everything has been drawn; the remaining bins stay at zero.
+            break
+
+        success = counts[idx]
+        failure = remainingCount - success
+
+        #The draw can never exceed remainingDraws, so no further bounds check
+        #is needed on the result.
+        draw = np.random.hypergeometric( success, failure, remainingDraws )
+
+        sample[idx] = draw
+        remainingCount -= success
+        remainingDraws -= draw
+
+    return sample
diff --git a/FragExtract.py b/FragExtract.py
@@ -0,0 +1,47 @@
+from scipy.stats import binom
+from scipy.stats import beta
+import numpy as np
+import pandas as pd
+
+class FragExtract:
+    def __init__( self, pExtControl, pExtChip, boundTable ):
+        """
+        The FragExtract class simulates the extraction of fragments from a pool
+        of bound fragments. All of the work is done in the constructor, which
+        fills in the extractedTable dataframe.
+
+        The constructor takes three non-keyword arguments ---
+        1) pExtControl --- The extraction efficiency in the input/control
+        sample, i.e. the probability that a control fragment is extracted. It
+        is passed to binom.rvs() as the success probability, so it may be a
+        single value or an array with one value per genomic location, and each
+        value must lie between 0 and 1.
+        2) pExtChip --- The extraction efficiency in the ChIP sample, i.e. the
+        probability that a fragment bound by the TF of interest is picked up by
+        the antibody. It takes the same form as pExtControl. The two
+        efficiencies may differ in general but are kept equal in all
+        simulations.
+        3) boundTable --- An object (in the simulation, an instance of the
+        GenomeBindingTable class) with a "locations" dataframe that has the
+        columns 'name', 'control_fragments' and 'chip_fragments'.
+
+        Attributes set on the instance ---
+        pExtControl, pExtChip --- The two arguments, stored unchanged.
+        N --- The number of rows in boundTable.locations.
+        extractedTable --- A pandas dataframe with the columns 'name',
+        'ext_control_fragments' and 'ext_chip_fragments', which stores the
+        number of extracted fragments in the input and ChIP samples at each
+        genomic location.
+        """
+        self.N = boundTable.locations.shape[0]
+
+        self.pExtControl = pExtControl
+        self.pExtChip = pExtChip
+
+        self.extractedTable = pd.DataFrame()
+        self.extractedTable.loc[:,'name'] = boundTable.locations['name'].values
+
+        #Each entry is (column to create, column of bound fragments to sample
+        #from, extraction probability). The control sample is drawn before the
+        #ChIP sample.
+        samplingPlan = (
+            ( 'ext_control_fragments', 'control_fragments', self.pExtControl ),
+            ( 'ext_chip_fragments', 'chip_fragments', self.pExtChip ),
+        )
+
+        for extractedColumn, boundColumn, pExtract in samplingPlan:
+            #The extracted fragments are binomially sampled from bound fragments.
+            numExtracted = binom.rvs( boundTable.locations[boundColumn], pExtract, size=self.N )
+            self.extractedTable.loc[:,extractedColumn] = numExtracted