From cbab2302c405b9f0416c2d64965804a6813c518d Mon Sep 17 00:00:00 2001
From: vishakad <vishakadatta@gmail.com>
Date: Fri, 4 Jan 2019 15:22:57 +0530
Subject: [PATCH] Re-factor and expand input validation checks

---
 GenomeBindingTable.py | 23 ++++++++------
 chipulate.py          | 73 ++++++++++++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/GenomeBindingTable.py b/GenomeBindingTable.py
index 8b27ac1..75d86c3 100644
--- a/GenomeBindingTable.py
+++ b/GenomeBindingTable.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 class GenomeBindingTable:
-    def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells, unboundEnergy=1.59, controlCellRatio=0.1, secondTFspEnergies=[], secondTFchemicalPotential=0, secondTFintEnergies=[], indirectLocations=[], chromAccessibility=[]):
+    def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells, names=[], unboundEnergy=1.59, controlCellRatio=0.1, secondTFspEnergies=[], secondTFchemicalPotential=0, secondTFintEnergies=[], indirectLocations=[], chromAccessibility=[]):
         """
         The GenomeBindingTable class stores the number of bound fragments based
         on the number of bound fragments in ChIP and input samples at each
@@ -23,23 +23,24 @@ def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells
         5) numCells --- Number of cells to be employed in the ChIP sample.
 
         The keyword arguments are
-        6) unboundEnergy --- This is the binding energy of the unbound state (in
+        6) names --- Names for each genomic region. If empty, regions are named region_1, ..., region_N.
+        7) unboundEnergy --- This is the binding energy of the unbound state (in
         units of kBT) of a genomic location. By default, this is set to 1.59, so
         that the occupancy of the highest affinity site (i.e. site with zero
         energy) is 0.99. 
-        6) controlCellRatio --- This is a fraction that determines the number of cells in the
+        8) controlCellRatio --- This is a fraction that determines the number of cells in the
         ChIP sample that will be employed in the control sample. The default
         value is 1.0 i.e. the same number of cells will be employed in both ChIP
         and input samples.
-        7) secondTFspEnergies --- Binding energies of the second TF. 
-        8) secondTFchemicalPotential --- Chemical potential of the second TF. 
-        9) secondTFintEnergies --- An array that specifies the interaction
+        9) secondTFspEnergies --- Binding energies of the second TF. 
+        10) secondTFchemicalPotential --- Chemical potential of the second TF. 
+        11) secondTFintEnergies --- An array that specifies the interaction
         energy between both TFs at each genomic location. Positive values
         indicate a competitive interaction, negative values indicate a
         cooperative interaction and zero indicates no interaction. 
-        10) indirectLocations --- An array of location numbers that are
+        12) indirectLocations --- An array of location numbers that are
         to be simulated as being indirectly bound.  
-        11) chromAccessibility -- An array of values that specify the chromatin
+        13) chromAccessibility -- An array of values that specify the chromatin
         accessibility at each genomic location. The values must lie between
         0 and 1. 
 
@@ -116,7 +117,10 @@ def __init__( self, sequences, spEnergies, bgEnergy, chemicalPotential, numCells
         #to join entries with second tables from the fragment extraction, 
         #PCR amplification and sequencing processes.
         self.locations = pd.DataFrame( columns=['name'] )
-        self.locations.loc[:,'name'] = ['region_' + str(idx) for idx in range( 1, self.N+1 )]
+        if len( names ) == 0:
+            self.locations.loc[:,'name'] = ['region_' + str(idx) for idx in range( 1, self.N+1 )]
+        else:
+            self.locations.loc[:,'name'] = names
 
         #Binding energies of the TF A at each location.
         self.locations.loc[:,'energy_A'] = spEnergies
@@ -215,3 +219,4 @@ def computeBindingProbabilities( self ):
 
         self.locations.loc[:,'p_occ_chip'] = pTFbound * self.chromAccessibility
         return [pTFbound,pBgBound]
+
diff --git a/chipulate.py b/chipulate.py
index 4e596b0..7381a5c 100644
--- a/chipulate.py
+++ b/chipulate.py
@@ -24,7 +24,7 @@ def makeArray( val, N ):
 
 def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100,
                    pExt=1.0, pAmp=0.58, pcrCycles=15, bgEnergy=1,
-                   chemicalPotential=3, secondTFspEnergies=[],
+                   chemicalPotential=3, secondTFspEnergies=[], names=[], 
                    secondTFchemicalPotential=0, chromAccessibility=[],
                    secondTFintEnergies=[], indirectLocations=[], controlCellRatio=1.0, generateIntervals=True ):
     """
@@ -66,6 +66,8 @@ def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100,
 
     secondTFspEnergies --- Binding energies of the second TF. 
 
+    names --- Names for each region.
+
     secondTFchemicalPotential --- Chemical potential of the second TF. 
 
     secondTFintEnergies --- An array that specifies the interaction
@@ -157,7 +159,7 @@ def performChipSeq( sequences=[], spEnergies=[], numCells=100000, depth=100,
                                    secondTFchemicalPotential=secondTFchemicalPotential,
                                    secondTFintEnergies=secondTFintEnergies,
                                    indirectLocations=indirectLocations,
-                                   controlCellRatio=controlCellRatio,
+                                   controlCellRatio=controlCellRatio, names=names,
                                    chromAccessibility=chromAccessibility  )
 
     pExtControl = makeArray( pExtControl, N )
@@ -393,7 +395,7 @@ def makeFastq( bedFileNames, genomeFileName, readLength, outputDir="", readError
 
 args = parser.parse_args()
 
-def validateInput( df, args ):
+def validateAndAutofillInput( df, args ):
     numLocations = df.shape[0]
     terminateFlag = False
     allowedColumnNames = ['chr','start','end','name','summit','p_ext','p_amp','energy_A','energy_B','sequence','binding_type','int_energy']
@@ -473,7 +475,7 @@ def validateInput( df, args ):
 
     return [terminateFlag,df]
 
-def validateBedFasta( args ):
+def validateBedFastaAndAutofillInput( df, args ):
     chromSizesFileName = args.chrom_size_file
     genomeFileName = args.genome_file
     readLength = args.read_length
@@ -517,7 +519,39 @@ def validateBedFasta( args ):
     if readLength > 0 and fragmentLength > 0 and fragmentLength < readLength:
         print("Fragment length specified ({} bp) is lower than the read length specified ({} bp). Read length must be less than fragment length.".format( fragmentLength, readLength), file=sys.stderr) 
 
-    return terminateFlag 
+
+    #Default each region's summit to half the region length, (end-start)/2, if no summit was specified.
+    if 'summit' not in df.columns:
+        df.loc[:,'summit'] = df.eval( '(end-start)/2' )
+    else:
+        if df.query('summit < 0').shape[0] > 0:
+            print("Summit positions must be positive.", file=sys.stderr)
+            terminateFlag = True
+
+    #The 'strand' column is selected for BED output later, so it must be set
+    #regardless of whether summits were supplied in the input.
+    df.loc[:,'strand'] = '.'
+
+    if 'name' in df.columns:
+        #Names supplied in the input must be unique; report any repeats.
+        dups = df['name'].duplicated()
+        if np.sum( dups ) > 0:
+            print("The following sets of regions have identical names : ", file=sys.stderr)
+            print( df.loc[dups,['chr','start','end','summit','name']].values.tolist(), file=sys.stderr )
+            print("Ensure that each ('chr','start','end','summit') entry has a unique name.", file=sys.stderr)
+            terminateFlag = True
+    else:
+        df.loc[:,'name'] =  ['region_' + str(idx) for idx in range( 1, df.shape[0]+1 )]
+
+    #Ensure that no (chr,start,end,summit) positions are repeated.
+    dups = df[['chr','start','end','summit']].duplicated()
+    if np.sum( dups ) > 0:
+        print("The following regions have identical (chr,start,end,summit) coordinates : ", file=sys.stderr)
+        print( df.loc[dups,['chr','start','end','summit']], file=sys.stderr )
+        print("Ensure that the regions passed do not have duplicate (chr,start,end,summit) coordinates. You could specify different summits for each (chr,start,end) region, or delete the duplicate entries.", file=sys.stderr)
+        terminateFlag = True
+
+    return [terminateFlag,df]
 
 def main():
     inputFileName = args.input_file
@@ -540,10 +574,10 @@ def main():
     inputDf = pd.read_csv( inputFileName, sep="\t" )
     numLocations = inputDf.shape[0]
 
-    terminateFlagInput, inputDf = validateInput( inputDf, args ) 
+    terminateFlagInput, inputDf = validateAndAutofillInput( inputDf, args ) 
     if 'chr' in inputDf.columns and 'start' in inputDf.columns and 'end' in inputDf.columns:
         generateIntervals = True
-        terminateFlagBedFasta = validateBedFasta( args )
+        terminateFlagBedFasta, inputDf = validateBedFastaAndAutofillInput( inputDf, args )
     
     depth = args.depth
     numCells = args.num_cells
@@ -560,7 +594,7 @@ def main():
     libraryType = args.library_type
 
     if terminateFlagBedFasta or terminateFlagInput :
-        print("Error encountered in input. Aborting.", file=sys.stderr)
+        print("Error(s) encountered in input. See output above. Aborting.", file=sys.stderr)
         return 0
 
     spEnergies = inputDf['energy_A']
@@ -589,23 +623,9 @@ def main():
     else:
         chromAccessibility = []
 
-    bedCols = []
-    if generateIntervals:
-        bedCols = ['chr','start','end','name','summit','strand']
-
-        #Assign random summits to each region.
-        if 'summit' not in inputDf.columns:
-            starts = inputDf['start'].values
-            ends = inputDf['end'].values
-            inputDf.loc[:,'summit'] = inputDf.eval( '(end-start)/2' )
-
-        inputDf.loc[:,'strand'] = '.'
-        chromSizesDf = pd.read_table( chromSizesFileName, sep="\t", header=None )
-        chromSizesDf = chromSizesDf[[0,1]].rename( {0 : 'chr', 1 : 'max'}, axis=1 )
-
     outputDf, chipFragmentNumbers, controlFragmentNumbers = performChipSeq( sequences=sequences, spEnergies=spEnergies,
                             numCells=numCells, depth=depth, pAmp=pAmp,
-                            pExt=pExt, pcrCycles=pcrCycles,
+                            pExt=pExt, pcrCycles=pcrCycles, names=inputDf['name'].values if 'name' in inputDf.columns else [],  #empty names -> default region_<idx> names
                             bgEnergy=inputBgEnergy, controlCellRatio=controlCellRatio,
                             chemicalPotential=chemicalPotentialA,
                             secondTFspEnergies=secondTFspEnergies,
@@ -613,10 +633,11 @@ def main():
                             chromAccessibility=chromAccessibility,
                             indirectLocations=indirectLocations, generateIntervals=generateIntervals )
 
-    if 'name' not in inputDf.columns:
-        inputDf.loc[:,'name'] = outputDf['name'].values
-
     if generateIntervals:
+        bedCols = ['chr','start','end','name','summit','strand']
+        chromSizesDf = pd.read_table( chromSizesFileName, sep="\t", header=None )
+        chromSizesDf = chromSizesDf[[0,1]].rename( {0 : 'chr', 1 : 'max'}, axis=1 )
+
         bedFileNames = makeBed( inputDf[bedCols], outputDf, chipFragmentNumbers, controlFragmentNumbers, chromSizesDf, outputDir=outputDir, readLength=readLength, fragmentLength=fragmentLength, fragmentJitter=fragmentJitter, libraryType=libraryType )
 
         makeFastq( bedFileNames, genomeFileName, readLength, libraryType=libraryType )