Re-factor and expand input validation checks
darthshak committed Jan 4, 2019
1 parent 90cadb1 commit cbab230
Showing 2 changed files with 61 additions and 35 deletions.
GenomeBindingTable.py (23 changes: 14 additions & 9 deletions)
@@ -4,7 +4,7 @@
import pandas as pd

class GenomeBindingTable:
def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells, unboundEnergy=1.59, controlCellRatio=0.1, secondTFspEnergies=[], secondTFchemicalPotential=0, secondTFintEnergies=[], indirectLocations=[], chromAccessibility=[]):
def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells, names=[], unboundEnergy=1.59, controlCellRatio=0.1, secondTFspEnergies=[], secondTFchemicalPotential=0, secondTFintEnergies=[], indirectLocations=[], chromAccessibility=[]):
"""
The GenomeBindingTable class stores the number of bound fragments in the
ChIP and input samples at each
@@ -23,23 +23,24 @@ def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells
5) numCells --- Number of cells to be employed in the ChIP sample.
The keyword arguments are
6) unboundEnergy --- This is the binding energy of the unbound state (in
6) names --- Names for each genomic region.
7) unboundEnergy --- This is the binding energy of the unbound state (in
units of kBT) of a genomic location. By default, this is set to 1.59, so
that the occupancy of the highest affinity site (i.e. site with zero
energy) is 0.99.
6) controlCellRatio --- This is a fraction that determines the number of cells in the
8) controlCellRatio --- This is a fraction that determines the number of cells in the
ChIP sample that will be employed in the control sample. The default
value is 0.1, i.e., the input sample uses one tenth as many cells as the
ChIP sample.
7) secondTFspEnergies --- Binding energies of the second TF.
8) secondTFchemicalPotential --- Chemical potential of the second TF.
9) secondTFintEnergies --- An array that specifies the interaction
9) secondTFspEnergies --- Binding energies of the second TF.
10) secondTFchemicalPotential --- Chemical potential of the second TF.
11) secondTFintEnergies --- An array that specifies the interaction
energy between both TFs at each genomic location. Positive values
indicate a competitive interaction, negative values indicate a
cooperative interaction and zero indicates no interaction.
10) indirectLocations --- An array of location numbers that are
12) indirectLocations --- An array of location numbers that are
to be simulated as being indirectly bound.
11) chromAccessibility -- An array of values that specify the chromatin
13) chromAccessibility -- An array of values that specify the chromatin
accessibility at each genomic location. The values must lie between
0 and 1.
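
A minimal sketch of how the new `names` keyword might be used when constructing a `GenomeBindingTable` directly, based only on the signature above. The energies, region names, and the empty `sequences` list are invented for illustration; they are not taken from the repository.

```python
# Illustrative only: constructing a GenomeBindingTable with user-supplied
# region names. All values are made up; the empty list stands in for the
# optional sequences argument.
import numpy as np
from GenomeBindingTable import GenomeBindingTable

spEnergies = np.array( [0.0, 1.2, 2.5] )             # hypothetical TF A binding energies (kBT)
bgEnergy = np.ones( 3 )                              # hypothetical background energies
names = ['promoter_1', 'enhancer_7', 'control_3']    # user-supplied region names

table = GenomeBindingTable( [], spEnergies, bgEnergy,
                            chemicalPotential=3, numCells=100000,
                            names=names )

# The 'name' column now carries the supplied names instead of region_1, region_2, ...
print( table.locations[['name', 'energy_A']] )
```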
@@ -116,7 +117,10 @@ def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells
#to join entries with second tables from the fragment extraction,
#PCR amplification and sequencing processes.
self.locations = pd.DataFrame( columns=['name'] )
self.locations.loc[:,'name'] = ['region_' + str(idx) for idx in range( 1, self.N+1 )]
if len( names ) == 0:
self.locations.loc[:,'name'] = ['region_' + str(idx) for idx in range( 1, self.N+1 )]
else:
self.locations.loc[:,'name'] = names

#Binding energies of the TF A at each location.
self.locations.loc[:,'energy_A'] = spEnergies
@@ -215,3 +219,4 @@ def computeBindingProbabilities( self ):

self.locations.loc[:,'p_occ_chip'] = pTFbound * self.chromAccessibility
return [pTFbound,pBgBound]
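
The docstring above states that the default unboundEnergy of 1.59 makes a zero-energy (highest-affinity) site about 99% occupied. One way to see where that number comes from is a two-state Boltzmann occupancy that also involves the chemical potential (using the performChipSeq default of 3). This form is an assumption consistent with the docstring's numbers, not necessarily the exact expression inside computeBindingProbabilities.

```python
# Hedged sketch: a two-state occupancy consistent with the docstring's claim
# that unboundEnergy = 1.59 gives ~0.99 occupancy at a zero-energy site when
# the chemical potential is at its default of 3. The real implementation in
# computeBindingProbabilities may differ.
import numpy as np

def two_state_occupancy( energy, chemicalPotential=3, unboundEnergy=1.59 ):
    bound = np.exp( chemicalPotential - energy )
    unbound = np.exp( -unboundEnergy )
    return bound / ( bound + unbound )

print( two_state_occupancy( 0.0 ) )   # ~0.99 for the highest-affinity (zero-energy) site
print( two_state_occupancy( 3.0 ) )   # ~0.83, i.e. lower occupancy for a weaker site
```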

chipulate.py (73 changes: 47 additions & 26 deletions)
@@ -24,7 +24,7 @@ def makeArray( val, N ):

def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100,
pExt=1.0, pAmp=0.58, pcrCycles=15, bgEnergy=1,
chemicalPotential=3, secondTFspEnergies=[],
chemicalPotential=3, secondTFspEnergies=[], names=[],
secondTFchemicalPotential=0, chromAccessibility=[],
secondTFintEnergies=[], indirectLocations=[], controlCellRatio=1.0, generateIntervals=True ):
"""
@@ -66,6 +66,8 @@ def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100,
secondTFspEnergies --- Binding energies of the second TF.
names --- Names for each region.
secondTFchemicalPotential --- Chemical potential of the second TF.
secondTFintEnergies --- An array that specifies the interaction
@@ -157,7 +159,7 @@ def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100,
secondTFchemicalPotential=secondTFchemicalPotential,
secondTFintEnergies=secondTFintEnergies,
indirectLocations=indirectLocations,
controlCellRatio=controlCellRatio,
controlCellRatio=controlCellRatio, names=names,
chromAccessibility=chromAccessibility )

pExtControl = makeArray( pExtControl, N )
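
As a usage sketch, the new `names` keyword can be threaded straight through `performChipSeq`. The energies, names, and other values below are invented, `generateIntervals` is switched off because no genomic coordinates are supplied, and the import is an assumption: chipulate.py parses command-line arguments at module level, so importing it as a library outside its CLI may need care.

```python
# Illustrative only: calling performChipSeq with per-region names.
# Assumes performChipSeq can be imported from chipulate.py (the script calls
# parser.parse_args() at import time, which may require adjustment).
import numpy as np
from chipulate import performChipSeq

spEnergies = np.array( [0.0, 1.5, 3.0, 4.5] )     # hypothetical binding energies
names = ['siteA', 'siteB', 'siteC', 'siteD']       # hypothetical region names

outputDf, chipFragments, controlFragments = performChipSeq(
        spEnergies=spEnergies, names=names,
        numCells=10000, depth=50, generateIntervals=False )

# The per-region output table is keyed by the supplied names.
print( outputDf.head() )
```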
@@ -393,7 +395,7 @@ def makeFastq( bedFileNames, genomeFileName, readLength, outputDir="", readError

args = parser.parse_args()

def validateInput( df, args ):
def validateAndAutofillInput( df, args ):
numLocations = df.shape[0]
terminateFlag = False
allowedColumnNames = ['chr','start','end','name','summit','p_ext','p_amp','energy_A','energy_B','sequence','binding_type','int_energy']
Expand Down Expand Up @@ -473,7 +475,7 @@ def validateInput( df, args ):

return [terminateFlag,df]
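
The allowedColumnNames list above hints at the kind of check the renamed validateAndAutofillInput performs on the input table. The function body is collapsed in this diff, so the following is only an illustration of such a check; the helper name checkColumns and the toy DataFrame are hypothetical.

```python
# Hedged sketch of a column-name check of the sort validateAndAutofillInput
# presumably performs; not the actual implementation (collapsed in this diff).
import sys
import pandas as pd

allowedColumnNames = ['chr','start','end','name','summit','p_ext','p_amp',
                      'energy_A','energy_B','sequence','binding_type','int_energy']

def checkColumns( df ):
    terminateFlag = False
    unknown = [col for col in df.columns if col not in allowedColumnNames]
    if len( unknown ) > 0:
        print( "Unrecognized column(s) in input: {}".format( unknown ), file=sys.stderr )
        terminateFlag = True
    return terminateFlag

df = pd.DataFrame( {'chr': ['chr1'], 'start': [100], 'end': [600], 'energy_A': [1.2]} )
print( checkColumns( df ) )   # False: every column is in allowedColumnNames
```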

def validateBedFasta( args ):
def validateBedFastaAndAutofillInput( df, args ):
chromSizesFileName = args.chrom_size_file
genomeFileName = args.genome_file
readLength = args.read_length
Expand Down Expand Up @@ -517,7 +519,39 @@ def validateBedFasta( args ):
if readLength > 0 and fragmentLength > 0 and fragmentLength < readLength:
print("Fragment length specified ({} bp) is lower than the read length specified ({} bp). Read length must be less than fragment length.".format( fragmentLength, readLength), file=sys.stderr)

return terminateFlag

#If no summit was specified, assign each region a summit at its midpoint (half the region length).
if 'summit' not in df.columns:
starts = df['start'].values
ends = df['end'].values
df.loc[:,'summit'] = df.eval( '(end-start)/2' )
else:
if df.query('summit < 0').shape[0] > 0:
print("Summit positions must be positive.", file=sys.stderr)
terminateFlag = True

df.loc[:,'strand'] = '.'

if 'name' in df.columns:
df.loc[:,'name'] = df['name'].values
dups = df['name'].duplicated()
if np.sum( dups ) > 0:
print("The following sets of regions have identical names : ", file=sys.stderr)
print( df.loc[dups,['chr','start','end','summit','name']], file=sys.stderr )
print("Ensure that each ('chr','start','end','summit') entry has a unique name.", file=sys.stderr)
terminateFlag = True
else:
df.loc[:,'name'] = ['region_' + str(idx) for idx in range( 1, df.shape[0]+1 )]

#Ensure that no (chr,start,end,summit) positions are repeated.
dups = df[['chr','start','end','summit']].duplicated()
if np.sum( dups ) > 0:
print("The following regions have identical (chr,start,end,summit) coordinates : ", file=sys.stderr)
print( df.loc[dups,['chr','start','end','summit']], file=sys.stderr )
print("Ensure that the regions passed do not have duplicate (chr,start,end,summit) coordinates. You could specify different summits for each (chr,start,end) region, or delete the duplicate entries.", file=sys.stderr)
terminateFlag = True

return [terminateFlag,df]
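
The new duplicate-name and duplicate-coordinate checks above rely on pandas' duplicated(), which marks repeated values or rows after their first occurrence. A small self-contained illustration of that behaviour, with toy data not taken from the repository:

```python
# Toy illustration of the pandas idiom used by the new duplicate checks.
import numpy as np
import pandas as pd

df = pd.DataFrame( {'chr':    ['chr1', 'chr1', 'chr2'],
                    'start':  [100,    100,    500],
                    'end':    [600,    600,    900],
                    'summit': [250,    250,    200],
                    'name':   ['peak_1', 'peak_1', 'peak_2']} )

nameDups = df['name'].duplicated()                       # marks the second 'peak_1'
coordDups = df[['chr','start','end','summit']].duplicated()  # marks the repeated coordinate row

print( np.sum( nameDups ) )                              # 1
print( df.loc[coordDups, ['chr','start','end','summit']] )
```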

def main():
inputFileName = args.input_file
@@ -540,10 +574,10 @@ def main():
inputDf = pd.read_csv( inputFileName, sep="\t" )
numLocations = inputDf.shape[0]

terminateFlagInput, inputDf = validateInput( inputDf, args )
terminateFlagInput, inputDf = validateAndAutofillInput( inputDf, args )
if 'chr' in inputDf.columns and 'start' in inputDf.columns and 'end' in inputDf.columns:
generateIntervals = True
terminateFlagBedFasta = validateBedFasta( args )
terminateFlagBedFasta, inputDf = validateBedFastaAndAutofillInput( inputDf, args )

depth = args.depth
numCells = args.num_cells
@@ -560,7 +594,7 @@ def main():
libraryType = args.library_type

if terminateFlagBedFasta or terminateFlagInput :
print("Error encountered in input. Aborting.", file=sys.stderr)
print("Error(s) encountered in input. See output above. Aborting.", file=sys.stderr)
return 0

spEnergies = inputDf['energy_A']
@@ -589,34 +623,21 @@ def main():
else:
chromAccessibility = []

bedCols = []
if generateIntervals:
bedCols = ['chr','start','end','name','summit','strand']

#Assign random summits to each region.
if 'summit' not in inputDf.columns:
starts = inputDf['start'].values
ends = inputDf['end'].values
inputDf.loc[:,'summit'] = inputDf.eval( '(end-start)/2' )

inputDf.loc[:,'strand'] = '.'
chromSizesDf = pd.read_table( chromSizesFileName, sep="\t", header=None )
chromSizesDf = chromSizesDf[[0,1]].rename( {0 : 'chr', 1 : 'max'}, axis=1 )

outputDf, chipFragmentNumbers, controlFragmentNumbers = performChipSeq( sequences=sequences, spEnergies=spEnergies,
numCells=numCells, depth=depth, pAmp=pAmp,
pExt=pExt, pcrCycles=pcrCycles,
pExt=pExt, pcrCycles=pcrCycles, names=inputDf['name'].values,
bgEnergy=inputBgEnergy, controlCellRatio=controlCellRatio,
chemicalPotential=chemicalPotentialA,
secondTFspEnergies=secondTFspEnergies,
secondTFchemicalPotential=chemicalPotentialB,
chromAccessibility=chromAccessibility,
indirectLocations=indirectLocations, generateIntervals=generateIntervals )

if 'name' not in inputDf.columns:
inputDf.loc[:,'name'] = outputDf['name'].values

if generateIntervals:
bedCols = ['chr','start','end','name','summit','strand']
chromSizesDf = pd.read_table( chromSizesFileName, sep="\t", header=None )
chromSizesDf = chromSizesDf[[0,1]].rename( {0 : 'chr', 1 : 'max'}, axis=1 )

bedFileNames = makeBed( inputDf[bedCols], outputDf, chipFragmentNumbers, controlFragmentNumbers, chromSizesDf, outputDir=outputDir, readLength=readLength, fragmentLength=fragmentLength, fragmentJitter=fragmentJitter, libraryType=libraryType )

makeFastq( bedFileNames, genomeFileName, readLength, libraryType=libraryType )
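
For reference, main() reads a tab-separated input file whose columns must come from allowedColumnNames; 'energy_A' is always read, and the presence of 'chr', 'start' and 'end' switches on interval generation. A hypothetical minimal input could be built like this; all coordinates, names, energies, and the file name are invented for illustration.

```python
# Hypothetical minimal input table for chipulate's main().
# Columns are drawn from allowedColumnNames; values are made up.
import pandas as pd

inputDf = pd.DataFrame( {'chr':      ['chr1', 'chr1', 'chr2'],
                         'start':    [1000,   5000,   2000],
                         'end':      [1500,   5500,   2500],
                         'name':     ['region_1', 'region_2', 'region_3'],
                         'energy_A': [0.0,    1.8,    3.2]} )

# main() reads the input with pd.read_csv( inputFileName, sep="\t" ).
inputDf.to_csv( 'example_input.tsv', sep='\t', index=False )
print( pd.read_csv( 'example_input.tsv', sep='\t' ) )
```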
