Merge pull request #225 from broadinstitute/jv

debug cms_modeller bootstrap; commit intermediate updates
broadinstitute · Feb 26, 2018 · 3fbb697 · 3fbb697
2 parents aaeecdf + b215e76
commit 3fbb697
Show file tree

Hide file tree

Showing 19 changed files with 3,116 additions and 308 deletions.
diff --git a/.gitignore b/.gitignore
@@ -64,3 +64,7 @@ cms/VERSION
 cms/VERSION
 
 cms/logfile
+
+cms/model/.DS_Store
+
+cms/model/.DS_Store
diff --git a/cms/cms_modeller.py b/cms/cms_modeller.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 ## top-level script for demographic modeling as part of CMS 2.0. 
-## last updated: 10.08.16 vitti@broadinstitute.org
+## last updated: 02.26.18 vitti@broadinstitute.org
 
 import matplotlib as mp 
 mp.use('agg') 
@@ -39,8 +39,14 @@ def full_parser_cms_modeller():
 	bootstrap_parser = subparsers.add_parser('bootstrap', help='Perform bootstrap estimates of population summary statistics from per-site(/per-site-pair) calculations in order to finalize model target values.')
 	bootstrap_parser.add_argument('nBootstrapReps', action='store', type=int, help='number of bootstraps to perform in order to estimate standard error of the dataset (should converge for reasonably small n)')
 	bootstrap_parser.add_argument('--in_freqs', action='store', help='comma-delimited list of infiles with per-site calculations for population. One file per population -- for bootstrap estimates of genome-wide values, should first concatenate per-chrom files') 
+	bootstrap_parser.add_argument('--nFreqHistBins', action='store',type=int, default=6, help="number of bins for site frequency spectrum and p(der|freq)")
 	bootstrap_parser.add_argument('--in_ld', action='store', help='comma-delimited list of infiles with per-site-pair calculations for population. One file per population -- for bootstrap estimates of genome-wide values, should first concatenate per-chrom files') 
+	bootstrap_parser.add_argument('--mafcutoffdprime', action='store', type=float, default=.2, help="for D' calculations, only use sites with MAF > mafcutoffdprime") 	
+	bootstrap_parser.add_argument('--nphysdisthist', action='store', type=int, default=14, help="nbins for r2 LD calculations") 	
+
+
 	bootstrap_parser.add_argument('--in_fst', action='store', help='comma-delimited list of infiles with per-site calculations for population pair. One file per population-pair -- for bootstrap estimates of genome-wide values, should first concatenate per-chrom files') 	
+	bootstrap_parser.add_argument('--ngendisthist', action='store', type=int, default=17, help="nbins for D' LD calculations") 	
 	bootstrap_parser.add_argument('out', action='store', type=str, help='outfile prefix') 
 
 	##########################
@@ -137,6 +143,7 @@ def execute_bootstrap(args):
 	### FREQ STATS ##
 	#################
 	if args.in_freqs is not None: 
+		nhist = args.nFreqHistBins
 		inputestimatefilenames = ''.join(args.in_freqs)
 		inputfilenames = inputestimatefilenames.split(',')
 		npops = len(inputfilenames)
@@ -156,7 +163,7 @@ def execute_bootstrap(args):
 				totallen += sum(seqlens)
 				for i in range(len(allpi)):
 					nsnps += len(allpi[i])
-			print("TOTAL: logged frequency values for " + str(nsnps) + " SNPS across " + str(totalregions) + ".\n")
+			print("TOTAL: logged frequency values for " + str(nsnps) + " SNPS across " + str(totalregions) + ".")
 
 			####################################
 			#### PI: MEAN & BOOTSTRAP STDERR ###
@@ -215,20 +222,31 @@ def execute_bootstrap(args):
 	### LD ##
 	#########
 	if args.in_ld is not None:
+		nphysdisthist = args.nphysdisthist
+		ngendisthist = args.ngendisthist
 		inputestimatefilenames = ''.join(args.in_ld)
 		inputfilenames = inputestimatefilenames.split(',')
 		npops = len(inputfilenames)
+		#print('npops ' + str(npops)) #debug
 		for ipop in range(npops):
-			inputfilename = inputfilenames[i]
+			inputfilename = inputfilenames[ipop]
 			print("reading linkage disequilibrium statistics from: " + inputfilename)
 			writefile.write(str(ipop) + '\n')
-			alldists, allr2, allgendists, alldprime, nr2regions, ndprimeregions = readLDFile(ldfilename, dprimecutoff = mafcutoffdprime)
-			print("TOTAL: logged r2 values for " + str(allr2) + " SNP pairs.\n\tlogged D' values for " + str(alldprime) + " SNP pairs.\n")
+			N_r2regs, N_dprimeregs = 0, 0
+			N_r2snps, N_dprimesnps = 0, 0
+			allRegionDists, allRegionr2, allRegionGendists, allRegionDprime, nr2regions, ndprimeregions = readLDFile(inputfilename, dprimecutoff = args.mafcutoffdprime)
+			N_r2regs += nr2regions
+			N_r2snps += sum([len(x) for x in allRegionr2])
+			N_dprimeregs += ndprimeregions
+			N_dprimesnps += sum([len(x) for x in allRegionDprime])
+			print("\tlogged r2 values for " + str(N_r2snps) + " SNP pairs across " + str(N_r2regs) + " regions.")
+			print("\tlogged D' values for " + str(N_dprimesnps) + " SNP pairs across " + str(N_dprimeregs) + " regions.")
+
 
 			###################################
 			### r2: MEAN ACROSS ALL REGIONS ###
 			###################################
-			r2sums, physDistHist = estimater2decay(allRegionr2, allRegionDists)
+			r2sums, physDistHist = estimater2decay(allRegionr2, allRegionDists, nphysdisthist)
 			r2dist = [r2sums[u]/physDistHist[u] for u in range(len(r2sums))]
 			writefile.write(str(r2dist) + "\n")
 
@@ -247,7 +265,7 @@ def execute_bootstrap(args):
 					rep_all_physdist.append(flatregions[index_r2])
 
 				#add pseudocount for empty bins
-				repr2sum, repphysdisthist = estimater2decay(rep_all_r2, rep_all_physdist)
+				repr2sum, repphysdisthist = estimater2decay(rep_all_r2, rep_all_physdist, nphysdisthist)
 				for ibin in range(len(repphysdisthist)):
 					if repphysdisthist[ibin] == 0:
 						repphysdisthist[ibin] = 1
@@ -261,7 +279,7 @@ def execute_bootstrap(args):
 			####################################
 			### D': MEAN ACROSS ALL REGIONS ###
 			####################################
-			compLDhist, genDistHist = estimatedprimedecay(allRegionDprime, allRegionGendists)
+			compLDhist, genDistHist = estimatedprimedecay(allRegionDprime, allRegionGendists, ngendisthist)
 			#add pseudocounts
 			for ibin in range(len(genDistHist)):
 				if genDistHist[ibin] == 0:
@@ -285,7 +303,7 @@ def execute_bootstrap(args):
 					rep_all_dprime.append(flatdprime[index_dprime])
 					rep_all_gendist.append(flatgendist[index_dprime])
 
-				repcompLDhist, repgenDistHist = estimatedprimedecay(rep_all_dprime, rep_all_gendist)
+				repcompLDhist, repgenDistHist = estimatedprimedecay(rep_all_dprime, rep_all_gendist, ngendisthist)
 				for ibin in range(len(repgenDistHist)):
 					if repgenDistHist[ibin] == 0:
 						repgenDistHist[ibin] = 1
@@ -302,11 +320,13 @@ def execute_bootstrap(args):
 		inputestimatefilenames = ''.join(args.in_fst)
 		inputfilenames = inputestimatefilenames.split(',')
 		npopcomp = len(inputfilenames)
-		for icomp in range(len(npopcomp)):
+		for icomp in range(npopcomp):
 			fstfilename	= inputfilenames[icomp]
 			print("reading Fst values from: " + fstfilename)
 			if checkFileExists(fstfilename):
 				allfst, nregions = readFstFile(fstfilename)
+			else:
+				print('missing ' + fstfilename)
 			target_mean, target_se = estimateFstByBootstrap_bysnp(allfst, nrep = nbootstraprep)
 			writeline =  str(icomp) + "\t" + str(target_mean) + "\t" + str(target_se) + '\n'
 			writefile.write(writeline)

diff --git a/cms/combine/Makefile b/cms/combine/Makefile
@@ -1,18 +1,30 @@
 ## leaf Makefile for cms2.0/combine
-## last updated 06.19.17 	vitti@broadinstitute.org
+## last updated 12.01.17 	vitti@broadinstitute.org
 
 ######################
 ## DEFINE VARIABLES ##
 ######################
 
 CC = gcc
-CCFLAG = -O0 -ggdb3 -lm -Wall 
+CCFLAG = -O0 -ggdb3 -lm -Wall -lz 
 
 ##################
 ## DEFINE RULES ##
 ##################
 
-all : combine_scores_local combine_scores_gw write_xpehh_from_ihh  #install
+all : combine_scores_local_zipped  combine_scores_gw_zipped write_xpehh_from_ihh  #install
+
+# combine_scores_local combine_scores_gw
+
+#collate_scores.o : collate_scores.c
+#	$(CC) -c collate_scores.c
+
+#collate_scores : collate_scores.o cms_data.c cms_data.h
+#	$(CC) $(CCFLAG) -o collate_scores collate_scores.o cms_data.c
+
+#collate_scores_zipped : collate_scores.o cms_data_zipped.c cms_data.h
+#	$(CC) $(CCFLAG) -o collate_scores_zipped collate_scores.o cms_data_zipped.c
+
 
 combine_scores_local.o : combine_scores_local.c
 	$(CC) -c combine_scores_local.c
@@ -26,6 +38,12 @@ combine_scores_gw.o : combine_scores_gw.c
 combine_scores_gw : combine_scores_gw.o cms_data.c cms_data.h
 	$(CC) $(CCFLAG) -o combine_scores_gw combine_scores_gw.o cms_data.c
 
+combine_scores_local_zipped : combine_scores_local.o cms_data_zipped.c cms_data.h
+	$(CC) $(CCFLAG) -o combine_scores_local_zipped combine_scores_local.o cms_data_zipped.c
+
+combine_scores_gw_zipped : combine_scores_gw.o cms_data_zipped.c cms_data.h
+	$(CC) $(CCFLAG) -o combine_scores_gw_zipped combine_scores_gw.o cms_data_zipped.c
+
 write_xpehh_from_ihh.o : 
 	$(CC) -c write_xpehh_from_ihh.c
 
@@ -34,7 +52,9 @@ write_xpehh_from_ihh : write_xpehh_from_ihh.o cms_data.c pop_ihh_data.h  pop_ihh
 
 
 clean :
-	rm *.o && rm combine_scores_local && rm combine_scores_gw && rm write_xpehh_from_ihh #&& rm -R *.dSYM/
+	rm *.o && rm combine_scores_local_zipped && rm combine_scores_gw_zipped
+	# && rm write_xpehh_from_ihh && rm combine_scores_local_zipped && rm combine_scores_gw_zipped
+	#&& rm -R *.dSYM/
 #install:
 #	cp combine_scores /usr/local/ && cp write_xpehh_from_ihh /usr/local