From 39eedf4d996a4b00c81f9c110e4f7dd32f4cf709 Mon Sep 17 00:00:00 2001 From: Jon Bloom Date: Thu, 22 Mar 2018 12:13:58 -0400 Subject: [PATCH] added Direchlet mixture option to BaldingNicholsModel (#3206) --- python/hail/methods/statgen.py | 25 +++++--- src/main/scala/is/hail/HailContext.scala | 5 +- .../is/hail/stats/BaldingNicholsModel.scala | 59 +++++++++++++------ .../hail/stats/BaldingNicholsModelSuite.scala | 2 +- 4 files changed, 62 insertions(+), 29 deletions(-) diff --git a/python/hail/methods/statgen.py b/python/hail/methods/statgen.py index 1146c3f50f64..05e1400b9d40 100644 --- a/python/hail/methods/statgen.py +++ b/python/hail/methods/statgen.py @@ -1301,7 +1301,7 @@ def hwe_normalized_pca(dataset, k=10, compute_loadings=False, as_array=False): :math:`i` and :math:`j` of :math:`M`; in terms of :math:`C` it is .. math:: - + \frac{1}{m}\sum_{l\in\mathcal{C}_i\cap\mathcal{C}_j}\frac{(C_{il}-2p_l)(C_{jl} - 2p_l)}{2p_l(1-p_l)} where :math:`\mathcal{C}_i = \{l \mid C_{il} \text{ is non-missing}\}`. In @@ -1323,7 +1323,7 @@ def hwe_normalized_pca(dataset, k=10, compute_loadings=False, as_array=False): Parameters ---------- dataset : :class:`.MatrixTable` - Dataset. + Matrix table with entry-indexed ``GT`` field of type :py:data:`.tcall`. k : :obj:`int` Number of principal components. compute_loadings : :obj:`bool` @@ -1379,7 +1379,7 @@ def pca(entry_expr, k=10, compute_loadings=False, as_array=False): 1s encoding missingness of genotype calls. >>> eigenvalues, scores, _ = hl.pca(hl.int(hl.is_defined(dataset.GT)), - ... k=2) + ... k=2) Warning ------- @@ -1438,8 +1438,6 @@ def pca(entry_expr, k=10, compute_loadings=False, as_array=False): Parameters ---------- - dataset : :class:`.MatrixTable` - Dataset. entry_expr : :class:`.Expression` Numeric expression for matrix entries. k : :obj:`int` @@ -2258,10 +2256,11 @@ def realized_relationship_matrix(call_expr): fst=nullable(listof(numeric)), af_dist=oneof(UniformDist, BetaDist, TruncatedBetaDist), seed=int, - reference_genome=reference_genome_type) + reference_genome=reference_genome_type, + mixture=bool) def balding_nichols_model(n_populations, n_samples, n_variants, n_partitions=None, pop_dist=None, fst=None, af_dist=UniformDist(0.1, 0.9), - seed=0, reference_genome='default'): + seed=0, reference_genome='default', mixture=False): r"""Generate a matrix table of variants, samples, and genotypes using the Balding-Nichols model. @@ -2324,7 +2323,7 @@ def balding_nichols_model(n_populations, n_samples, n_variants, n_partitions=Non population allele frequencies by :math:`p_{k, m}`, and diploid, unphased genotype calls by :math:`g_{n, m}` (0, 1, and 2 correspond to homozygous reference, heterozygous, and homozygous variant, respectively). - + The generative model is then given by: .. math:: @@ -2354,6 +2353,7 @@ def balding_nichols_model(n_populations, n_samples, n_variants, n_partitions=Non - `ancestral_af_dist` (:class:`.tstruct`) -- Description of the ancestral allele frequency distribution. - `seed` (:py:data:`.tint32`) -- Random seed. + - `mixture` (:py:data:`.tbool`) -- Value of `mixture` parameter. Row fields: @@ -2397,6 +2397,12 @@ def balding_nichols_model(n_populations, n_samples, n_variants, n_partitions=Non Random seed. reference_genome : :obj:`str` or :class:`.ReferenceGenome` Reference genome to use. + mixture : :obj:`bool` + Treat `pop_dist` as the parameters of a Dirichlet distribution, + as in the Prichard-Stevens-Donnelly model. This feature is + EXPERIMENTAL and currently undocumented and untested. + If ``True``, the type of `pop` is :class:`.tarray` of + :py:data:`.tfloat64` and the value is the mixture proportions. Returns ------- @@ -2420,7 +2426,8 @@ def balding_nichols_model(n_populations, n_samples, n_variants, n_partitions=Non jvm_fst_opt, af_dist._jrep(), seed, - reference_genome._jrep) + reference_genome._jrep, + mixture) return MatrixTable(jmt) diff --git a/src/main/scala/is/hail/HailContext.scala b/src/main/scala/is/hail/HailContext.scala index b25b4a99b905..c11e4b57a727 100644 --- a/src/main/scala/is/hail/HailContext.scala +++ b/src/main/scala/is/hail/HailContext.scala @@ -603,8 +603,9 @@ class HailContext private(val sc: SparkContext, fst: Option[Array[Double]] = None, afDist: Distribution = UniformDist(0.1, 0.9), seed: Int = 0, - rg: ReferenceGenome = ReferenceGenome.defaultReference): MatrixTable = - BaldingNicholsModel(this, populations, samples, variants, popDist, fst, seed, nPartitions, afDist, rg) + rg: ReferenceGenome = ReferenceGenome.defaultReference, + mixture: Boolean = false): MatrixTable = + BaldingNicholsModel(this, populations, samples, variants, popDist, fst, seed, nPartitions, afDist, rg, mixture) def genDataset(): MatrixTable = VSMSubgen.realistic.gen(this).sample() diff --git a/src/main/scala/is/hail/stats/BaldingNicholsModel.scala b/src/main/scala/is/hail/stats/BaldingNicholsModel.scala index 244b44ec4789..18c0a1cdfd06 100644 --- a/src/main/scala/is/hail/stats/BaldingNicholsModel.scala +++ b/src/main/scala/is/hail/stats/BaldingNicholsModel.scala @@ -7,15 +7,22 @@ import is.hail.annotations._ import is.hail.expr.types._ import is.hail.rvd.OrderedRVD import is.hail.utils._ -import is.hail.variant.{Call, Call2, ReferenceGenome, MatrixTable} +import is.hail.variant.{Call2, ReferenceGenome, MatrixTable} import org.apache.commons.math3.random.JDKRandomGenerator object BaldingNicholsModel { - def apply(hc: HailContext, nPops: Int, nSamples: Int, nVariants: Int, - popDistArrayOpt: Option[Array[Double]], FstOfPopArrayOpt: Option[Array[Double]], - seed: Int, nPartitionsOpt: Option[Int], af_dist: Distribution, - rg: ReferenceGenome = ReferenceGenome.defaultReference): MatrixTable = { + def apply(hc: HailContext, + nPops: Int, + nSamples: Int, + nVariants: Int, + popDistArrayOpt: Option[Array[Double]], + FstOfPopArrayOpt: Option[Array[Double]], + seed: Int, + nPartitionsOpt: Option[Int], + af_dist: Distribution, + rg: ReferenceGenome = ReferenceGenome.defaultReference, + mixture: Boolean = false): MatrixTable = { val sc = hc.sc @@ -69,17 +76,24 @@ object BaldingNicholsModel { Rand.generator.setSeed(seed) val popDist_k = popDist - popDist_k :/= sum(popDist_k) - - val popDistRV = Multinomial(popDist_k) - val popOfSample_n: DenseVector[Int] = DenseVector.fill[Int](N)(popDistRV.draw()) + val popOfSample_n = DenseMatrix.zeros[Double](if (mixture) K else 1, N) + + if (mixture) { + val popDistRV = Dirichlet(popDist_k) + (0 until N).foreach(j => popOfSample_n(::, j) := popDistRV.draw()) + } else { + popDist_k :/= sum(popDist_k) + val popDistRV = Multinomial(popDist_k) + (0 until N).foreach(j => popOfSample_n(0, j) = popDistRV.draw()) + } + val popOfSample_nBc = sc.broadcast(popOfSample_n) val Fst_k = FstOfPop val Fst1_k = (1d - Fst_k) /:/ Fst_k val Fst1_kBc = sc.broadcast(Fst1_k) - val saSignature = TStruct("sample_idx" -> TInt32(), "pop" -> TInt32()) + val saSignature = TStruct("sample_idx" -> TInt32(), "pop" -> (if (mixture) TArray(TFloat64()) else TInt32())) val vaSignature = TStruct("ancestralAF" -> TFloat64(), "AF" -> TArray(TFloat64())) val ancestralAFAnnotation = af_dist match { @@ -88,7 +102,7 @@ object BaldingNicholsModel { case TruncatedBetaDist(a, b, min, max) => Annotation("TruncatedBetaDist", a, b, min, max) } val globalAnnotation = - Annotation(K, N, M, popDistArray: IndexedSeq[Double], FstOfPopArray: IndexedSeq[Double], ancestralAFAnnotation, seed) + Annotation(K, N, M, popDistArray: IndexedSeq[Double], FstOfPopArray: IndexedSeq[Double], ancestralAFAnnotation, seed, mixture) val ancestralAFAnnotationSignature = af_dist match { case UniformDist(min, max) => TStruct("type" -> TString(), "min" -> TFloat64(), "max" -> TFloat64()) @@ -103,7 +117,8 @@ object BaldingNicholsModel { "pop_dist" -> TArray(TFloat64()), "fst" -> TArray(TFloat64()), "ancestral_af_dist" -> ancestralAFAnnotationSignature, - "seed" -> TInt32()) + "seed" -> TInt32(), + "mixture" -> TBoolean()) val matrixType: MatrixType = MatrixType.fromParts( globalType = globalSignature, @@ -130,9 +145,11 @@ object BaldingNicholsModel { val ancestralAF = af_dist.getBreezeDist(perVariantRandomBasis).draw() - val popAF_k: IndexedSeq[Double] = Array.tabulate(K) { k => - new Beta(ancestralAF * Fst1_kBc.value(k), (1 - ancestralAF) * Fst1_kBc.value(k))(perVariantRandomBasis).draw() - } + val popAF_k: DenseVector[Double] = DenseVector( + Array.tabulate(K) { k => + new Beta(ancestralAF * Fst1_kBc.value(k), (1 - ancestralAF) * Fst1_kBc.value(k))(perVariantRandomBasis) + .draw() + }) region.clear() rvb.start(rvType) @@ -165,7 +182,11 @@ object BaldingNicholsModel { i = 0 val unif = new Uniform(0, 1)(perVariantRandomBasis) while (i < N) { - val p = popAF_k(popOfSample_nBc.value(i)) + val p = + if (mixture) + popOfSample_nBc.value(::, i) dot popAF_k + else + popAF_k(popOfSample_nBc.value(0, i).toInt) val pSq = p * p val x = unif.draw() val c = @@ -188,7 +209,11 @@ object BaldingNicholsModel { } } - val sampleAnnotations = (0 until N).map { i => Annotation(i, popOfSample_n(i)) }.toArray + val sampleAnnotations: Array[Annotation] = + if (mixture) + Array.tabulate(N)(i => Annotation(i, popOfSample_n(::, i).data.toIndexedSeq)) + else + Array.tabulate(N)(i => Annotation(i, popOfSample_n(0, i).toInt)) // FIXME: should use fast keys val ordrdd = OrderedRVD(matrixType.orvdType, rdd, None, None) diff --git a/src/test/scala/is/hail/stats/BaldingNicholsModelSuite.scala b/src/test/scala/is/hail/stats/BaldingNicholsModelSuite.scala index be38831bbb46..80d10ed496ab 100644 --- a/src/test/scala/is/hail/stats/BaldingNicholsModelSuite.scala +++ b/src/test/scala/is/hail/stats/BaldingNicholsModelSuite.scala @@ -2,7 +2,7 @@ package is.hail.stats import breeze.stats._ import is.hail.SparkSuite -import is.hail.variant.{Call, Locus, Variant} +import is.hail.variant.{Call, Variant} import is.hail.testUtils._ import org.apache.spark.sql.Row import org.testng.Assert.assertEquals