Skip to content

Commit

Permalink
Merge pull request #225 from broadinstitute/jv
Browse files Browse the repository at this point in the history
debug cms_modeller bootstrap; commit intermediate updates
  • Loading branch information
josephvitti authored Feb 26, 2018
2 parents aaeecdf + b215e76 commit 3fbb697
Show file tree
Hide file tree
Showing 19 changed files with 3,116 additions and 308 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,7 @@ cms/VERSION
cms/VERSION

cms/logfile

cms/model/.DS_Store

cms/model/.DS_Store
40 changes: 30 additions & 10 deletions cms/cms_modeller.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
## top-level script for demographic modeling as part of CMS 2.0.
## last updated: 10.08.16 vitti@broadinstitute.org
## last updated: 02.26.18 vitti@broadinstitute.org

import matplotlib as mp
mp.use('agg')
Expand Down Expand Up @@ -39,8 +39,14 @@ def full_parser_cms_modeller():
bootstrap_parser = subparsers.add_parser('bootstrap', help='Perform bootstrap estimates of population summary statistics from per-site(/per-site-pair) calculations in order to finalize model target values.')
bootstrap_parser.add_argument('nBootstrapReps', action='store', type=int, help='number of bootstraps to perform in order to estimate standard error of the dataset (should converge for reasonably small n)')
bootstrap_parser.add_argument('--in_freqs', action='store', help='comma-delimited list of infiles with per-site calculations for population. One file per population -- for bootstrap estimates of genome-wide values, should first concatenate per-chrom files')
bootstrap_parser.add_argument('--nFreqHistBins', action='store',type=int, default=6, help="number of bins for site frequency spectrum and p(der|freq)")
bootstrap_parser.add_argument('--in_ld', action='store', help='comma-delimited list of infiles with per-site-pair calculations for population. One file per population -- for bootstrap estimates of genome-wide values, should first concatenate per-chrom files')
bootstrap_parser.add_argument('--mafcutoffdprime', action='store', type=float, default=.2, help="for D' calculations, only use sites with MAF > mafcutoffdprime")
bootstrap_parser.add_argument('--nphysdisthist', action='store', type=int, default=14, help="nbins for r2 LD calculations")


bootstrap_parser.add_argument('--in_fst', action='store', help='comma-delimited list of infiles with per-site calculations for population pair. One file per population-pair -- for bootstrap estimates of genome-wide values, should first concatenate per-chrom files')
bootstrap_parser.add_argument('--ngendisthist', action='store', type=int, default=17, help="nbins for D' LD calculations")
bootstrap_parser.add_argument('out', action='store', type=str, help='outfile prefix')

##########################
Expand Down Expand Up @@ -137,6 +143,7 @@ def execute_bootstrap(args):
### FREQ STATS ##
#################
if args.in_freqs is not None:
nhist = args.nFreqHistBins
inputestimatefilenames = ''.join(args.in_freqs)
inputfilenames = inputestimatefilenames.split(',')
npops = len(inputfilenames)
Expand All @@ -156,7 +163,7 @@ def execute_bootstrap(args):
totallen += sum(seqlens)
for i in range(len(allpi)):
nsnps += len(allpi[i])
print("TOTAL: logged frequency values for " + str(nsnps) + " SNPS across " + str(totalregions) + ".\n")
print("TOTAL: logged frequency values for " + str(nsnps) + " SNPS across " + str(totalregions) + ".")

####################################
#### PI: MEAN & BOOTSTRAP STDERR ###
Expand Down Expand Up @@ -215,20 +222,31 @@ def execute_bootstrap(args):
### LD ##
#########
if args.in_ld is not None:
nphysdisthist = args.nphysdisthist
ngendisthist = args.ngendisthist
inputestimatefilenames = ''.join(args.in_ld)
inputfilenames = inputestimatefilenames.split(',')
npops = len(inputfilenames)
#print('npops ' + str(npops)) #debug
for ipop in range(npops):
inputfilename = inputfilenames[i]
inputfilename = inputfilenames[ipop]
print("reading linkage disequilibrium statistics from: " + inputfilename)
writefile.write(str(ipop) + '\n')
alldists, allr2, allgendists, alldprime, nr2regions, ndprimeregions = readLDFile(ldfilename, dprimecutoff = mafcutoffdprime)
print("TOTAL: logged r2 values for " + str(allr2) + " SNP pairs.\n\tlogged D' values for " + str(alldprime) + " SNP pairs.\n")
N_r2regs, N_dprimeregs = 0, 0
N_r2snps, N_dprimesnps = 0, 0
allRegionDists, allRegionr2, allRegionGendists, allRegionDprime, nr2regions, ndprimeregions = readLDFile(inputfilename, dprimecutoff = args.mafcutoffdprime)
N_r2regs += nr2regions
N_r2snps += sum([len(x) for x in allRegionr2])
N_dprimeregs += ndprimeregions
N_dprimesnps += sum([len(x) for x in allRegionDprime])
print("\tlogged r2 values for " + str(N_r2snps) + " SNP pairs across " + str(N_r2regs) + " regions.")
print("\tlogged D' values for " + str(N_dprimesnps) + " SNP pairs across " + str(N_dprimeregs) + " regions.")


###################################
### r2: MEAN ACROSS ALL REGIONS ###
###################################
r2sums, physDistHist = estimater2decay(allRegionr2, allRegionDists)
r2sums, physDistHist = estimater2decay(allRegionr2, allRegionDists, nphysdisthist)
r2dist = [r2sums[u]/physDistHist[u] for u in range(len(r2sums))]
writefile.write(str(r2dist) + "\n")

Expand All @@ -247,7 +265,7 @@ def execute_bootstrap(args):
rep_all_physdist.append(flatregions[index_r2])

#add pseudocount for empty bins
repr2sum, repphysdisthist = estimater2decay(rep_all_r2, rep_all_physdist)
repr2sum, repphysdisthist = estimater2decay(rep_all_r2, rep_all_physdist, nphysdisthist)
for ibin in range(len(repphysdisthist)):
if repphysdisthist[ibin] == 0:
repphysdisthist[ibin] = 1
Expand All @@ -261,7 +279,7 @@ def execute_bootstrap(args):
####################################
### D': MEAN ACROSS ALL REGIONS ###
####################################
compLDhist, genDistHist = estimatedprimedecay(allRegionDprime, allRegionGendists)
compLDhist, genDistHist = estimatedprimedecay(allRegionDprime, allRegionGendists, ngendisthist)
#add pseudocounts
for ibin in range(len(genDistHist)):
if genDistHist[ibin] == 0:
Expand All @@ -285,7 +303,7 @@ def execute_bootstrap(args):
rep_all_dprime.append(flatdprime[index_dprime])
rep_all_gendist.append(flatgendist[index_dprime])

repcompLDhist, repgenDistHist = estimatedprimedecay(rep_all_dprime, rep_all_gendist)
repcompLDhist, repgenDistHist = estimatedprimedecay(rep_all_dprime, rep_all_gendist, ngendisthist)
for ibin in range(len(repgenDistHist)):
if repgenDistHist[ibin] == 0:
repgenDistHist[ibin] = 1
Expand All @@ -302,11 +320,13 @@ def execute_bootstrap(args):
inputestimatefilenames = ''.join(args.in_fst)
inputfilenames = inputestimatefilenames.split(',')
npopcomp = len(inputfilenames)
for icomp in range(len(npopcomp)):
for icomp in range(npopcomp):
fstfilename = inputfilenames[icomp]
print("reading Fst values from: " + fstfilename)
if checkFileExists(fstfilename):
allfst, nregions = readFstFile(fstfilename)
else:
print('missing ' + fstfilename)
target_mean, target_se = estimateFstByBootstrap_bysnp(allfst, nrep = nbootstraprep)
writeline = str(icomp) + "\t" + str(target_mean) + "\t" + str(target_se) + '\n'
writefile.write(writeline)
Expand Down
28 changes: 24 additions & 4 deletions cms/combine/Makefile
Original file line number Diff line number Diff line change
@@ -1,18 +1,30 @@
## leaf Makefile for cms2.0/combine
## last updated 06.19.17 vitti@broadinstitute.org
## last updated 12.01.17 vitti@broadinstitute.org

######################
## DEFINE VARIABLES ##
######################

CC = gcc
CCFLAG = -O0 -ggdb3 -lm -Wall
CCFLAG = -O0 -ggdb3 -lm -Wall -lz

##################
## DEFINE RULES ##
##################

all : combine_scores_local combine_scores_gw write_xpehh_from_ihh #install
all : combine_scores_local_zipped combine_scores_gw_zipped write_xpehh_from_ihh #install

# combine_scores_local combine_scores_gw

#collate_scores.o : collate_scores.c
# $(CC) -c collate_scores.c

#collate_scores : collate_scores.o cms_data.c cms_data.h
# $(CC) $(CCFLAG) -o collate_scores collate_scores.o cms_data.c

#collate_scores_zipped : collate_scores.o cms_data_zipped.c cms_data.h
# $(CC) $(CCFLAG) -o collate_scores_zipped collate_scores.o cms_data_zipped.c


combine_scores_local.o : combine_scores_local.c
$(CC) -c combine_scores_local.c
Expand All @@ -26,6 +38,12 @@ combine_scores_gw.o : combine_scores_gw.c
combine_scores_gw : combine_scores_gw.o cms_data.c cms_data.h
$(CC) $(CCFLAG) -o combine_scores_gw combine_scores_gw.o cms_data.c

combine_scores_local_zipped : combine_scores_local.o cms_data_zipped.c cms_data.h
$(CC) $(CCFLAG) -o combine_scores_local_zipped combine_scores_local.o cms_data_zipped.c

combine_scores_gw_zipped : combine_scores_gw.o cms_data_zipped.c cms_data.h
$(CC) $(CCFLAG) -o combine_scores_gw_zipped combine_scores_gw.o cms_data_zipped.c

write_xpehh_from_ihh.o :
$(CC) -c write_xpehh_from_ihh.c

Expand All @@ -34,7 +52,9 @@ write_xpehh_from_ihh : write_xpehh_from_ihh.o cms_data.c pop_ihh_data.h pop_ihh


clean :
rm *.o && rm combine_scores_local && rm combine_scores_gw && rm write_xpehh_from_ihh #&& rm -R *.dSYM/
rm *.o && rm combine_scores_local_zipped && rm combine_scores_gw_zipped
# && rm write_xpehh_from_ihh && rm combine_scores_local_zipped && rm combine_scores_gw_zipped
#&& rm -R *.dSYM/
#install:
# cp combine_scores /usr/local/ && cp write_xpehh_from_ihh /usr/local

Loading

0 comments on commit 3fbb697

Please sign in to comment.