updated imbalanced training

phoenixding · May 22, 2018 · c42dfc5 · c42dfc5
1 parent d53eed3
commit c42dfc5
Show file tree

Hide file tree

Showing 8 changed files with 468,403 additions and 34 deletions.
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@
 
 # INTRODUCTION 
 <div style="text-align: justify"> 
-Current methods have relied primarily on the assumption that descendant cells are similar to their parents in terms of gene expression levels. 
+Most existing single-cell trajectory inference methods have relied primarily on the assumption that descendant cells are similar to their parents in terms of gene expression levels. 
 This assumption does not always hold for in-vivo studies, which often include infrequently sampled, unsynchronized and diverse cell populations. 
 Thus, additional information may be needed to determine the correct ordering and branching of progenitor cells and the set of transcription factors (TFs) 
 that are active during advancing stages of organogenesis. To enable such modeling we developed scdiff,
@@ -38,6 +38,10 @@ instructions.
 -- scikit-learn   
 -- scipy  
 -- numpy  
+-- matplotlib  
+-- pydiffmap  
+-- imbalanced_learn  
+
 The python setup.py script (or pip) will try to install these packages automatically.
 However, please install them manually if, for any reason, the automatic 
 installation fails. 
@@ -152,8 +156,8 @@ usage: scdiff [-h] -i INPUT -t TF_DNA -k CLUSTERS -o OUTPUT [-s SPEEDUP] [-l Lar
 						However, users are allowed to customize the cutoff based on their 
 						application scenario (e.g. log2 fold change 1.5). 
 
-	-e ETFLISTFILE, --etfListFile ETFLISTFILE
-						String, Optional, by default, scdiff recognizes 1.6k
+	-e ETFLISTFILE, --etfListFile ETFLISTFILE (String), optional  
+						By default, scdiff recognizes 1.6k
 						TFs (we collected in human and mouse). Users are able
 						to provide a customized list of TFs instead using this
 						option. It specifies the path to the TF list file, in
@@ -190,8 +194,8 @@ The input file has the following formatting requirements:
 
 * __-t/--tf_dna__  
 This specifies the TF-gene interaction data.  In other words, it specifies the TF targets. 
-Under the tf_dna directory, we provided a human TF-gene interaction file inferred using the strategy in our previous study (https://www.ncbi.nlm.nih.gov/pubmed/20219943). 
-Although this TF-gene interaction file is collected in human, it should be also able to apply to other close species such as mouse.   
+Under the tf_dna directory, we provided a [human TF-gene interaction file](tf_dna/Human_TF_targets.txt) and a [mouse TF-gene interaction file](tf_dna/Mouse_TF_targets.txt) inferred using the strategy in our previous study (https://www.ncbi.nlm.nih.gov/pubmed/20219943). 
+Although these TF-gene interactions were collected in human and mouse, they should also be applicable to other closely related species.
 Besides, in our previous work DREM (http://sb.cs.cmu.edu/drem/), we collected TF-gene interactions for common species including human, mouse, fly, E. coli, yeast, and Arabidopsis. 
 Please refer to  http://sb.cs.cmu.edu/drem/DREMmanual.pdf appendix B for complete details. 
 Those TF-gene interaction files can be downloaded from our DREM software (https://github.com/phoenixding/idrem/tree/master/TFInput).
@@ -208,7 +212,7 @@ You might need to unzip and re-format the file to satisfy the requirements. The
 		This column is not used in scdiff. 
 		 	
 	Example file:   
-	[example TF gene interaction file](tf_dna/human_predicted_100.txt.update)
+	[example TF gene interaction file](tf_dna/Human_TF_targets.txt)
 
 * __-k/--cluster__  
   This specifies the clustering parameter (String).   
@@ -242,7 +246,7 @@ You might need to unzip and re-format the file to satisfy the requirements. The
   each row represents a TF standard name and matches to the gene expression names.  
   We require that the predicted TFs be expressed (based on the expression data).
   
-  An example of the TF List file can be found under the "tf_dna" folder [HumanTFList.txt](tf_dna/HumanTFList.txt).
+  An example of the TF list file can be found under the "tf_list" folder: [Human_mouse_TFList.txt](tf_list/Human_mouse_TFList.txt).
   
 For other scdiff optional parameters, please refer to the [usage](#usage) section.   
 
@@ -771,8 +775,9 @@ Then, you will find the visualized result page in HTML under 'e1_out' directory.
 # CREDITS
  
 This software was developed by ZIV-system biology group @ Carnegie Mellon University.  
-Implemented by Jun Ding
+Implemented by Jun Ding.
 
+Please cite our paper [Reconstructing differentiation networks and their regulation from time series single cell expression data](https://genome.cshlp.org/content/early/2018/01/09/gr.225979.117). 
 
 # LICENSE 
  

diff --git a/example/example_out/example.E.json b/example/example_out/example.E.json
diff --git a/scdiff/scdiff.py b/scdiff/scdiff.py
@@ -33,6 +33,7 @@
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.cluster import KMeans
 from sklearn.cluster import Birch
+from imblearn.over_sampling import SMOTE
 
 from sklearn.metrics import silhouette_score
 from sklearn.linear_model import LogisticRegression
@@ -193,7 +194,7 @@ def bestK(T):
 		KET=self.KET
 		dCK = {}
 		dCK[KET[0]] = K0
-		K = range(2, 10) if self.largeType==None else range(2,7)
+		K = range(2, 10) if self.largeType==None else range(2,8)
 		print("learning K...")
 
 		#-----------------------------------------------------------------
@@ -572,7 +573,7 @@ def __init__(self,fromNode,toNode,Nodes,dTD,dTG,dMb,fChangeCut=1):
                 self.atf=self.getActiveTF(dTD,dTG,dMb)                                             # TF targets are significantly different between fromNode and toNode
 
 		#---------------------------------------------------------------------- 
-                self.B=self.getTransition(2,-2,dTD,dTG,dMb,fChangeCut)                  # transition offset
+                self.B=self.getTransition(dTD,dTG,dMb,fChangeCut)                       # transition offset
                 self.Q=self.getProcessVariance(MU=self.fromNode.E)                      # initial process variance
 
                 #self.fulltext = '' # for drawing purpose
@@ -688,11 +689,11 @@ def getActiveTF(self,dTD,dTG,dMb):
 
         #-------------------------------------------------------------------
         # regression model for each path
-        def getTransition(self,U,D,dTD,dTG,dMb,FCUT=1):
+        def getTransition(self,dTD,dTG,dMb,FCUT=1):
                 G = self.getFC()
                 dFC = {item[0].upper(): item[1] for item in G}
                 etfID = [item[1] for item in self.etf]
-                [X, Y] = buildTrain(G, dTG, etfID,self.GL)
+                [X, Y,U,D] = buildTrain(G, dTG, etfID,self.GL,FCUT)
                 LR = LogisticRegressionCV(penalty='l1', Cs=[1.5, 2, 3, 4, 5], solver='liblinear', multi_class='ovr')
                 dR = {0: U, 1: D, 2: 0}
                 HGL = [item.upper() for item in self.GL]
@@ -1463,17 +1464,28 @@ def batchScanPrior(A,dTD):
 
 #-----------------------------------------------------------------------
 # building training dataset for regression
-def buildTrain(G,dTG,etf,GL):
+def buildTrain(G,dTG,etf,GL,Fcut=1):
 	# G: differential genes for a given path
 	# dTD: DNA->TF dictionary
 	# TF candidate
-	Fcut=1.5
-	Ncut=0.5
-	#SZ=1000 # SAMPLE SIZE
-	UP=[item[0].upper() for item in G if item[1]>Fcut]
-	DN=[item[0].upper() for item in G if item[1]<-1*Fcut]
-	NN=[item[0].upper() for item in G if abs(item[1])<Ncut]
-
+	Ncut=Fcut/2.0
+
+	#UP=[item[0].upper() for item in G if item[1]>Fcut]
+	#DN=[item[0].upper() for item in G if item[1]<-1*Fcut]
+	#NN=[item[0].upper() for item in G if abs(item[1])<Ncut]
+	UP=[item for item in G if item[1]>Fcut]
+	DN=[item for item in G if item[1]<-1*Fcut]
+	NN=[item for item in G if abs(item[1])<Ncut]
+
+
+	U=sum([item[1] for item in UP])/len(UP)
+	D=sum([item[1] for item in DN])/len(DN)
+
+	UP=[item[0].upper() for item in UP]
+	DN=[item[0].upper() for item in DN]
+	NN=[item[0].upper() for item in NN]
+
+
 	XU=[]
 	XD=[]
 	XN=[]
@@ -1502,7 +1514,16 @@ def buildTrain(G,dTG,etf,GL):
 
 	X=XU+XD+XN
 	Y=YU+YD+YN
-	return [X,Y]
+
+	# to address the imbalanced training set, over-sample the minority classes with SMOTE
+	sm=SMOTE(random_state=0)
+	Xs,Ys=sm.fit_sample(X,Y)
+
+	Xs=list(Xs)
+	Ys=list(Ys)
+
+	#pdb.set_trace()
+	return [Xs,Ys,U,D]
 
 # parse Logistic regression result
 def parseLR(etf,LRC):
@@ -1545,6 +1566,23 @@ def buildVirtualAncestor(AllCells,VT=None):
 	virtual_ancestor=Cell('virtual_ancestor',TA,AE,'NA',GL)
 	return virtual_ancestor
 
+#-----------------------------------------------------------------------
+# difference between ACL and ACL_update
+
+def ClusteringDifference(ACL,ACL_update):
+
+	totaloverlap=0
+	for i in ACL_update:
+		ci=[]
+		for j in ACL:
+			ov=[item for item in i if item in j]
+			ci.append(len(ov))
+		maxci=max(ci)
+		totaloverlap+=maxci
+	pertotaloverlap=totaloverlap*1.0/sum([len(item) for item in ACL_update])
+	pdiff=1-pertotaloverlap
+	return pdiff
+
 #----------------------------------------------------------------------
 #=======================================================================
 # MAIN program starts here!
@@ -1641,10 +1679,11 @@ def  main():
 	scg_name=scg.split('/')[-1]
 	viz(scg_name,G1,output)
 
+	sflag=0
 	if (args.speedup=='1') or (args.speedup=='True'):
-		print("done!")
-		sys.exit(0)
-
+		sflag=1
+		pdiff=0.05 
+		
 	#=======================================================================
 	# start cell-reassignment
 	# starting Kalman Filter --Expression /Time
@@ -1658,7 +1697,10 @@ def  main():
 		G1.ReAssign()
 		G1.updateGraph()
 		ACL_update=[sorted([item.ID for item in K.cells]) for K in G1.Nodes]
-		condition = ((ACL != ACL_update) and (lct < maxLoop))
+		if sflag!=1:
+			condition = ((ACL != ACL_update) and (lct < maxLoop))
+		else:	
+			condition=((ClusteringDifference(ACL,ACL_update)>pdiff) and (lct<maxLoop))
 		lct+=1
 		viz(scg_name,G1,output)
 	print("done!")

diff --git a/scdiff/viz.py b/scdiff/viz.py
@@ -1,11 +1,14 @@
 #!/usr/bin/env python
 
 import pdb,sys,os,json
+import warnings
+warnings.filterwarnings('ignore')
 from sklearn.manifold import TSNE
 from sklearn.decomposition import PCA
 from sklearn.manifold import Isomap
 import pydiffmap.diffusion_map as pdm
 
+
 #-----------------------------------------------------------------------
 # export data ==>json
 def GtoJson(G1,GL,dTD):
@@ -29,12 +32,12 @@ def GtoJson(G1,GL,dTD):
 
 	# diffusion map 
 	dmk=min(len(xmatrix),10)
-	dfmap=pdm.DiffusionMap(n_evecs = 2, epsilon ='bgh', alpha = 0.5,k=dmk)
-	dfmap_matrix=dfmap.fit_transform(xmatrix)
-
-
-	#pdb.set_trace()
 
+	with warnings.catch_warnings():
+		warnings.simplefilter("ignore")
+		dfmap=pdm.DiffusionMap(n_evecs = 2, epsilon ='bgh', alpha = 0.5,k=dmk)
+		dfmap_matrix=dfmap.fit_transform(xmatrix)
+
 	CL=[]
 	for i in range(len(G1.Cells)):
 		jci=(G1.Cells[i]).__dict__

diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(  name='scdiff',
-		version='1.1.13',
+		version='1.1.15',
 		description='Single Cell Differentiation Model package',
 		author='Jun Ding',
 		author_email='jund@andrew.cmu.edu',
@@ -10,9 +10,8 @@
 		packages=['scdiff'],
 		package_data={'scdiff':['img/logo.gif','tfdata/HumanTFList.txt']},
 		entry_points={'console_scripts':['scdiff=scdiff.scdiff:main','scdiff_gui=scdiff.scdiff_gui:main']},
-		install_requires=['scipy','numpy','scikit-learn','pyDiffMap'],
+		install_requires=['scipy','numpy','scikit-learn','pydiffmap','matplotlib','imbalanced_learn'],
 		classifiers=[
-			'Development Status :: 3 - Alpha',
 			'License :: OSI Approved :: MIT License',
 			'Programming Language :: Python :: 2',
 			'Programming Language :: Python :: 3',

diff --git a/tf_dna/human_predicted_100.txt.update → tf_dna/Human_TF_targets.txt b/tf_dna/human_predicted_100.txt.update → tf_dna/Human_TF_targets.txt