From cbab2302c405b9f0416c2d64965804a6813c518d Mon Sep 17 00:00:00 2001 From: vishakad Date: Fri, 4 Jan 2019 15:22:57 +0530 Subject: [PATCH] Re-factor and expand input validation checks --- GenomeBindingTable.py | 23 ++++++++------ chipulate.py | 73 ++++++++++++++++++++++++++++--------------- 2 files changed, 61 insertions(+), 35 deletions(-) diff --git a/GenomeBindingTable.py b/GenomeBindingTable.py index 8b27ac1..75d86c3 100644 --- a/GenomeBindingTable.py +++ b/GenomeBindingTable.py @@ -4,7 +4,7 @@ import pandas as pd class GenomeBindingTable: - def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells, unboundEnergy=1.59, controlCellRatio=0.1, secondTFspEnergies=[], secondTFchemicalPotential=0, secondTFintEnergies=[], indirectLocations=[], chromAccessibility=[]): + def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells, names=[], unboundEnergy=1.59, controlCellRatio=0.1, secondTFspEnergies=[], secondTFchemicalPotential=0, secondTFintEnergies=[], indirectLocations=[], chromAccessibility=[]): """ The GenomeBindingTable class stores the number of bound fragments based on the number of bound fragments in ChIP and input samples at each @@ -23,23 +23,24 @@ def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells 5) numCells --- Number of cells to be employed in the ChIP sample. The keyword arguments are - 6) unboundEnergy --- This is the binding energy of the unbound state (in + 6) names --- Names for each genomic region. + 7) unboundEnergy --- This is the binding energy of the unbound state (in units of kBT) of a genomic location. By default, this is set to 1.59, so that the occupancy of the highest affinity site (i.e. site with zero energy) is 0.99. - 6) controlCellRatio --- This is a fraction that determines the number of cells in the + 8) controlCellRatio --- This is a fraction that determines the number of cells in the ChIP sample that will be employed in the control sample. The default value is 1.0 i.e. the same number of cells will be employed in both ChIP and input samples. - 7) secondTFspEnergies --- Binding energies of the second TF. - 8) secondTFchemicalPotential --- Chemical potential of the second TF. - 9) secondTFintEnergies --- An array that specifies the interaction + 9) secondTFspEnergies --- Binding energies of the second TF. + 10) secondTFchemicalPotential --- Chemical potential of the second TF. + 11) secondTFintEnergies --- An array that specifies the interaction energy between both TFs at each genomic location. Positive values indicate a competitive interaction, negative values indicate a cooperative interaction and zero indicates no interaction. - 10) indirectLocations --- An array of location numbers that are + 12) indirectLocations --- An array of location numbers that are to be simulated as being indirectly bound. - 11) chromAccessibility -- An array of values that specify the chromatin + 13) chromAccessibility -- An array of values that specify the chromatin accessibility at each genomic location. The values must lie between 0 and 1. @@ -116,7 +117,10 @@ def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells #to join entries with second tables from the fragment extraction, #PCR amplification and sequencing processes. self.locations = pd.DataFrame( columns=['name'] ) - self.locations.loc[:,'name'] = ['region_' + str(idx) for idx in range( 1, self.N+1 )] + if len( names ) == 0: + self.locations.loc[:,'name'] = ['region_' + str(idx) for idx in range( 1, self.N+1 )] + else: + self.locations.loc[:,'name'] = names #Binding energies of the TF A at each location. self.locations.loc[:,'energy_A'] = spEnergies @@ -215,3 +219,4 @@ def computeBindingProbabilities( self ): self.locations.loc[:,'p_occ_chip'] = pTFbound * self.chromAccessibility return [pTFbound,pBgBound] + diff --git a/chipulate.py b/chipulate.py index 4e596b0..7381a5c 100644 --- a/chipulate.py +++ b/chipulate.py @@ -24,7 +24,7 @@ def makeArray( val, N ): def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100, pExt=1.0, pAmp=0.58, pcrCycles=15, bgEnergy=1, - chemicalPotential=3, secondTFspEnergies=[], + chemicalPotential=3, secondTFspEnergies=[], names=[], secondTFchemicalPotential=0, chromAccessibility=[], secondTFintEnergies=[], indirectLocations=[], controlCellRatio=1.0, generateIntervals=True ): """ @@ -66,6 +66,8 @@ def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100, secondTFspEnergies --- Binding energies of the second TF. + names --- Names for each region. + secondTFchemicalPotential --- Chemical potential of the second TF. secondTFintEnergies --- An array that specifies the interaction @@ -157,7 +159,7 @@ def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100, secondTFchemicalPotential=secondTFchemicalPotential, secondTFintEnergies=secondTFintEnergies, indirectLocations=indirectLocations, - controlCellRatio=controlCellRatio, + controlCellRatio=controlCellRatio, names=names, chromAccessibility=chromAccessibility ) pExtControl = makeArray( pExtControl, N ) @@ -393,7 +395,7 @@ def makeFastq( bedFileNames, genomeFileName, readLength, outputDir="", readError args = parser.parse_args() -def validateInput( df, args ): +def validateAndAutofillInput( df, args ): numLocations = df.shape[0] terminateFlag = False allowedColumnNames = ['chr','start','end','name','summit','p_ext','p_amp','energy_A','energy_B','sequence','binding_type','int_energy'] @@ -473,7 +475,7 @@ def validateInput( df, args ): return [terminateFlag,df] -def validateBedFasta( args ): +def validateBedFastaAndAutofillInput( df, args ): chromSizesFileName = args.chrom_size_file genomeFileName = args.genome_file readLength = args.read_length @@ -517,7 +519,39 @@ def validateBedFasta( args ): if readLength > 0 and fragmentLength > 0 and fragmentLength < readLength: print("Fragment length specified ({} bp) is lower than the read length specified ({} bp). Read length must be less than fragment length.".format( fragmentLength, readLength), file=sys.stderr) - return terminateFlag + + #Assign random summits to each region if no summit was specified. + if 'summit' not in df.columns: + starts = df['start'].values + ends = df['end'].values + df.loc[:,'summit'] = df.eval( '(end-start)/2' ) + else: + if df.query('summit < 0').shape[0] > 0: + print("Summit positions must be positive.", file=sys.stderr) + terminateFlag = True + + df.loc[:,'strand'] = '.' + + if 'name' in df.columns: + df.loc[:,'name'] = df['name'].values + dups = df['name'].duplicated() + if np.sum( dups ) > 0: + print("The following sets of regions have identical names : ", file=sys.stderr) + print( df.loc[dups,['chr','start','end','summit','name']].unique().tolist(), file=sys.stderr ) + print("Ensure that each ('chr','start','end','summit') entry has a unique name.") + terminateFlag = True + else: + df.loc[:,'name'] = ['region_' + str(idx) for idx in range( 1, df.shape[0]+1 )] + + #Ensure that no (chr,start,end,summit) positions are repeated. + dups = df[['chr','start','end','summit']].duplicated() + if np.sum( dups ) > 0: + print("The following regions have identical (chr,start,end,summit) coordinates : ", file=sys.stderr) + print( df.loc[dups,['chr','start','end','summit']], file=sys.stderr ) + print("Ensure that the regions passed do not have duplicate (chr,start,end,summit) coordinates. You could specify different summits for each (chr,start,end) region, or delete the duplicate entries.", file=sys.stderr) + terminateFlag = True + + return [terminateFlag,df] def main(): inputFileName = args.input_file @@ -540,10 +574,10 @@ def main(): inputDf = pd.read_csv( inputFileName, sep="\t" ) numLocations = inputDf.shape[0] - terminateFlagInput, inputDf = validateInput( inputDf, args ) + terminateFlagInput, inputDf = validateAndAutofillInput( inputDf, args ) if 'chr' in inputDf.columns and 'start' in inputDf.columns and 'end' in inputDf.columns: generateIntervals = True - terminateFlagBedFasta = validateBedFasta( args ) + terminateFlagBedFasta, inputDf = validateBedFastaAndAutofillInput( inputDf, args ) depth = args.depth numCells = args.num_cells @@ -560,7 +594,7 @@ def main(): libraryType = args.library_type if terminateFlagBedFasta or terminateFlagInput : - print("Error encountered in input. Aborting.", file=sys.stderr) + print("Error(s) encountered in input. See output above. Aborting.", file=sys.stderr) return 0 spEnergies = inputDf['energy_A'] @@ -589,23 +623,9 @@ def main(): else: chromAccessibility = [] - bedCols = [] - if generateIntervals: - bedCols = ['chr','start','end','name','summit','strand'] - - #Assign random summits to each region. - if 'summit' not in inputDf.columns: - starts = inputDf['start'].values - ends = inputDf['end'].values - inputDf.loc[:,'summit'] = inputDf.eval( '(end-start)/2' ) - - inputDf.loc[:,'strand'] = '.' - chromSizesDf = pd.read_table( chromSizesFileName, sep="\t", header=None ) - chromSizesDf = chromSizesDf[[0,1]].rename( {0 : 'chr', 1 : 'max'}, axis=1 ) - outputDf, chipFragmentNumbers, controlFragmentNumbers = performChipSeq( sequences=sequences, spEnergies=spEnergies, numCells=numCells, depth=depth, pAmp=pAmp, - pExt=pExt, pcrCycles=pcrCycles, + pExt=pExt, pcrCycles=pcrCycles, names=inputDf['name'].values, bgEnergy=inputBgEnergy, controlCellRatio=controlCellRatio, chemicalPotential=chemicalPotentialA, secondTFspEnergies=secondTFspEnergies, @@ -613,10 +633,11 @@ def main(): chromAccessibility=chromAccessibility, indirectLocations=indirectLocations, generateIntervals=generateIntervals ) - if 'name' not in inputDf.columns: - inputDf.loc[:,'name'] = outputDf['name'].values - if generateIntervals: + bedCols = ['chr','start','end','name','summit','strand'] + chromSizesDf = pd.read_table( chromSizesFileName, sep="\t", header=None ) + chromSizesDf = chromSizesDf[[0,1]].rename( {0 : 'chr', 1 : 'max'}, axis=1 ) + bedFileNames = makeBed( inputDf[bedCols], outputDf, chipFragmentNumbers, controlFragmentNumbers, chromSizesDf, outputDir=outputDir, readLength=readLength, fragmentLength=fragmentLength, fragmentJitter=fragmentJitter, libraryType=libraryType ) makeFastq( bedFileNames, genomeFileName, readLength, libraryType=libraryType )