Skip to content

Commit

Permalink
[avocado-122] Tested and reintegrated EM algorithm.
Browse files Browse the repository at this point in the history
  • Loading branch information
fnothaft committed Jan 8, 2015
1 parent b5718a6 commit 343b70d
Show file tree
Hide file tree
Showing 9 changed files with 300 additions and 100 deletions.
4 changes: 4 additions & 0 deletions avocado-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,9 @@
<artifactId>scalatest_${scala.version.prefix}</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalanlp</groupId>
<artifactId>breeze_2.10</artifactId>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.avocado.calls.pileup
package org.bdgenomics.avocado.algorithms.em

import scala.math.pow
import breeze.stats.distributions.Binomial
import scala.math.{ abs, pow }

object EMForAlleles {

Expand All @@ -30,88 +31,66 @@ object EMForAlleles {
*
* Note: GL is currently an array of (numSnps) arrays of length (numInds),
*/
def emForMAF(Phi: Array[Double], GL: Array[Array[(Double, Double, Double)]]): Array[Double] = {
var eps = 1.0
val tol = 0.0001
val L = Phi.length
var phi_updates = Phi
while (eps > tol) {
var Phi_next = Array.fill(L) { 0.0 }
Phi_next.indices.foreach(i => {
GL(i).foreach(l => {
Phi_next(i) += (1.0 / (2.0 * GL(i).length)) * ((1.0 * l._2 * 2.0 * phi_updates(i) * (1 - phi_updates(i)) +
2.0 * l._3 * pow(phi_updates(i), 2.0)) / (l._1 * pow(1.0 - phi_updates(i), 2.0) +
l._2 * 2.0 * phi_updates(i) * (1.0 - phi_updates(i)) +
l._3 * pow(phi_updates(i), 2.0)))
def emForMAF(genotypeLikelihoods: Array[Array[Double]],
referenceFrequencyEstimate: Double,
maxIterations: Option[Int] = None,
targetTolerance: Option[Double] = None): Double = {
assert(maxIterations.isDefined || targetTolerance.isDefined,
"Must define at least one of the iteration or tolerance limits.")

// M is the total number of chromosomes in all samples
val samplePloidy = genotypeLikelihoods.map(_.length - 1)
val M = samplePloidy.sum.toDouble
val ploidies = samplePloidy.distinct.toArray

// loop until convergence
var psi = referenceFrequencyEstimate
var lastPsi = psi
var iter = 0
do {
// carry over psi from previous iteration
lastPsi = psi

// calculate the new prior distributions per ploidy
val ploidyDistributionMap = ploidies.map(m => {
// build distribution
val dist = Binomial(m, psi)

// evaluate for all m + 1 states, 0 to m inclusive
val stateArray = new Array[Double](m + 1)
(0 to m).foreach(i => {
stateArray(i) = dist.probabilityOf(i)
})
})
var eps = 0.0
phi_updates.indices.foreach(i => eps += pow(phi_updates(i) - Phi_next(i), 2.0))
phi_updates = Phi_next
}
return phi_updates
}

/**
* Helper function to compute Y iteratively
* For each site, executes the recursion in 4.2.3. Y(i) is Ynk vector for site i
*/
def compY(GL: Array[Array[(Double, Double, Double)]]): Array[Array[Double]] = {
val L = GL.length
val GLt = GL.transpose
val n = GLt.length
val M = 2 * n
var Y = Array.ofDim[Double](L, n + 1, M + 1)
// NOTE: this ordering may be suboptimal?
for (i <- 0 until L) {
for (k <- 0 to M) {
for (j <- 0 to n) { // 0 = 0 people not first person
if (j == 0) {
Y(i)(j)(k) = 1.0
} else if (k == 0) {
Y(i)(j)(k) = (1.0 / (2.0 * j * (2.0 * j - 1.0))) * ((2.0 * j - k) * (2.0 * j - k - 1.0) * Y(i)(j - 1)(k) * GL(i)(j)._1)
} else if (k == 1) {
Y(i)(j)(k) = (1.0 / (2.0 * j * (2.0 * j - 1.0))) * ((2.0 * j - k) * (2.0 * j - k - 1.0) * Y(i)(j - 1)(k) * GL(i)(j)._1 +
2.0 * k * (2.0 * j - k) * Y(i)(j - 1)(k - 1) * GL(i)(j)._2)
} else {
Y(i)(j)(k) = (1.0 / (2.0 * j * (2.0 * j - 1.0))) * ((2.0 * j - k) * (2.0 * j - k - 1.0) * Y(i)(j - 1)(k) * GL(i)(j)._1 +
2.0 * k * (2.0 * j - k) * Y(i)(j - 1)(k - 1) * GL(i)(j)._2 + k * (k - 1.0) *
Y(i)(j - 1)(k - 2) * GL(i)(j)._2)
}
}
}
}
(m + 1, stateArray)
}).toMap

var Yr = Array.ofDim[Double](L, M)
for (l <- 0 until L) Yr(l) = Y(l)(n)
return Yr
}
// per sample, calculate the contribution of each genotype state
// then, sum these contributions together and normalize
psi = genotypeLikelihoods.map(gls => {
// the length of the genotype likelihood array is equal to the sample ploidy plus 1
val ploidyP1 = gls.length

/**
* Main AFS EM function
* IN: Phi - an initial MAF vector of length number of SNPs
* GL - Array of arrays of likelihood triples P( D | g )
* (note these are NOT multiplied by P(g | phi)! )
* OUT: Phi - ML estimate of MAF's across SNPs
* Note: GL is currently an array of (numSnps) arrays of length (numInds), which is transposed
*/
def emForAFS(Phik: Array[Double], GL: Array[Array[(Double, Double, Double)]]): Array[Double] = {
val GLt = GL.transpose
val tol = 0.0001
val L = GL.length
val M = Phik.length
var eps = 1.0
var Y = compY(GL)
var phik_updates = Phik
while (eps > tol) {
var sums = Array.fill(L) { 0.0 }
sums.indices.foreach(a => phik_updates.indices.foreach(p => sums(a) += phik_updates(p) * Y(a)(p)))
val Phik_next = Array.fill(M) { 0.0 }
Phik_next.indices.foreach(i => Y.foreach(y => Phik_next(i) += (1.0 / L) * phik_updates(i) * y(i) / sums(i)))
eps = 0.0
phik_updates.indices.foreach(i => eps += pow(phik_updates(i) - Phik_next(i), 2.0))
phik_updates = Phik_next
}
phik_updates
// from this, recover the state prior probabilities
val prior = ploidyDistributionMap(ploidyP1)

// loop to sum
var num = 0.0
var denom = 0.0
(0 until ploidyP1).foreach(i => {
val contribution = gls(i) * prior(i)
num += i * contribution
denom += contribution
})

num / denom
}).sum / M

// increment iteration count
iter += 1
} while (targetTolerance.fold(true)(_ < abs(psi - lastPsi)) &&
maxIterations.fold(true)(_ > iter))

psi
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,19 @@
*/
package org.bdgenomics.avocado.genotyping

import breeze.stats.distributions.Binomial
import org.apache.commons.configuration.{ HierarchicalConfiguration, SubnodeConfiguration }
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models.{ ReferencePosition, VariantContext }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.util.PhredUtils
import org.bdgenomics.avocado.algorithms.em.EMForAlleles
import org.bdgenomics.avocado.models.{ AlleleObservation, Observation }
import org.bdgenomics.avocado.stats.AvocadoConfigAndStats
import org.bdgenomics.formats.avro.{ Contig, Genotype, GenotypeAllele, Variant }
import scala.annotation.tailrec
import scala.math.pow
import scala.math.{ max, min, pow }

object BiallelicGenotyper extends GenotyperCompanion {

Expand All @@ -36,15 +38,58 @@ object BiallelicGenotyper extends GenotyperCompanion {
/**
 * Builds a BiallelicGenotyper from the genotyper's configuration node.
 *
 * Keys read from the configuration, with their defaults:
 *  - `ploidy` (2)
 *  - `useEM` (true)
 *  - `emitGVCF` (true)
 *  - `maxEMIterations` (unset): iteration cap for EM, must be greater than 0.
 *  - `emTolerance` (unset): EM convergence tolerance, strictly between 0 and 1.
 *  - `referenceFrequency` (0.999): strictly between 0 and 1.
 *  - `emSaturationThreshold` (0.001): strictly between 0 and 1.
 *
 * @param stats Configuration and statistics handle; not read by this method.
 * @param config Configuration node holding the keys listed above.
 * @return The configured genotyper.
 * @throws IllegalArgumentException If a value is out of range, or if EM is
 *   enabled and neither `maxEMIterations` nor `emTolerance` is set.
 */
protected def apply(stats: AvocadoConfigAndStats,
                    config: SubnodeConfiguration): Genotyper = {

  // Accepts only values in the open interval (0, 1). The error messages promise
  // non-inclusive bounds, so 0.0 and 1.0 are rejected; writing the test as a
  // negated conjunction also rejects NaN, which would slip past `<`/`>` checks.
  def requireInOpenUnitInterval(value: Double, message: String): Double = {
    if (!(value > 0.0 && value < 1.0)) {
      throw new IllegalArgumentException(message)
    }
    value
  }

  // get finishing conditions for EM algorithms
  val useEM = config.getBoolean("useEM", true)
  val maxIterations = if (config.containsKey("maxEMIterations")) {
    val iterLimit = config.getInt("maxEMIterations")
    if (iterLimit <= 0) {
      throw new IllegalArgumentException("EM iteration limit must be greater than 0.")
    }
    Some(iterLimit)
  } else {
    None
  }
  val tolerance = if (config.containsKey("emTolerance")) {
    Some(requireInOpenUnitInterval(config.getDouble("emTolerance"),
      "EM tolerance must be between 0 and 1, non-inclusive."))
  } else {
    None
  }

  // without either limit the EM loop would have no stopping condition
  if (maxIterations.isEmpty && tolerance.isEmpty && useEM) {
    throw new IllegalArgumentException("At least one constraint must be defined for the EM algorithm.")
  }

  // what level do we saturate the reference frequency to if we encounter underflow?
  val referenceFrequency = requireInOpenUnitInterval(
    config.getDouble("referenceFrequency", 0.999),
    "Reference frequency must be between 0 and 1, non-inclusive.")
  val saturationThreshold = requireInOpenUnitInterval(
    config.getDouble("emSaturationThreshold", 0.001),
    "Saturation threshold must be between 0 and 1, non-inclusive.")

  new BiallelicGenotyper(config.getInt("ploidy", 2),
    useEM,
    config.getBoolean("emitGVCF", true),
    referenceFrequency,
    maxIterations,
    tolerance,
    saturationThreshold)
}
}

class BiallelicGenotyper(ploidy: Int = 2,
useEM: Boolean = false,
emitGVCF: Boolean = true) extends Genotyper with Logging {
emitGVCF: Boolean = true,
estimatedReferenceFrequency: Double = 0.999,
maxIterations: Option[Int] = Some(10),
tolerance: Option[Double] = Some(1e-3),
saturationThreshold: Double = 0.001) extends Genotyper with Logging {

val companion: GenotyperCompanion = BiallelicGenotyper

Expand Down Expand Up @@ -230,12 +275,33 @@ class BiallelicGenotyper(ploidy: Int = 2,
})

// compensate likelihoods on the basis of population statistics
val compensatedLikelihoodsPerSample = if (useEM) {
// TODO: connect up EM algorithm
???
val majorAlleleFrequency = if (useEM) {
min(1.0 - saturationThreshold,
max(EMForAlleles.emForMAF(likelihoodsPerSample.flatMap(s => {
// did we have any observations from this sample?
if (s._1.size > 0) {
Some(s._2)
} else {
None
}
}).toArray,
estimatedReferenceFrequency,
maxIterations,
tolerance), saturationThreshold))
} else {
likelihoodsPerSample
estimatedReferenceFrequency
}
val distribution = Binomial(ploidy, majorAlleleFrequency)
val statePriors = (0 to ploidy).map(g => distribution.probabilityOf(g))
val compensatedLikelihoodsPerSample = likelihoodsPerSample.map(s => {
// extract info
val (observations, likelihoods, likelihoodOtherAlt) = s

(0 to ploidy).foreach(i => {
likelihoods(i) *= statePriors(i)
})
(observations, likelihoods, likelihoodOtherAlt)
})

// construct variant
val variant = Variant.newBuilder()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.avocado.algorithms.em

import org.scalatest.FunSuite
import scala.math.abs

/**
 * Tests for EMForAlleles.emForMAF.
 *
 * Every case starts the estimate at 1.0 - 1e-3 and, apart from the first,
 * caps the run at 10 iterations; results are compared with a 1e-3 tolerance.
 */
class EMForAllelesSuite extends FunSuite {

  // starting estimate and iteration cap shared by the test cases
  private val startingEstimate: Double = 1.0 - 1e-3
  private val iterationCap: Option[Int] = Some(10)

  // diploid likelihood rows; defs, so every use gets a fresh array
  private def homRefLikelihoods: Array[Double] = Array(0.0000001, 0.0001, 0.1)
  private def hetLikelihoods: Array[Double] = Array(0.0001, 0.1, 0.0001)
  private def homAltLikelihoods: Array[Double] = Array(0.1, 0.0001, 0.0000001)

  /**
   * Approximate floating point equality.
   *
   * @param a First value.
   * @param b Second value.
   * @param eps Largest absolute difference (exclusive) still treated as equal.
   * @return True if a and b differ by less than eps.
   */
  def fpEquals(a: Double, b: Double, eps: Double = 1e-3): Boolean = {
    val difference = abs(a - b)
    difference < eps
  }

  test("cannot run EM without specifying an iteration limit or target tolerance") {
    val noObservations = Array(Array.empty[Double])

    intercept[AssertionError] {
      EMForAlleles.emForMAF(noObservations, startingEstimate)
    }
  }

  test("run EM on single sample, definite ref") {
    val likelihoods = Array(Array(0.0, 0.0, 1.0))
    val estimate = EMForAlleles.emForMAF(likelihoods,
      startingEstimate,
      maxIterations = iterationCap)

    assert(fpEquals(estimate, 1.0))
  }

  test("run EM on three samples, mix of hom ref, het, hom alt") {
    val likelihoods = Array(homRefLikelihoods, hetLikelihoods, homAltLikelihoods)
    val estimate = EMForAlleles.emForMAF(likelihoods,
      startingEstimate,
      maxIterations = iterationCap)

    assert(fpEquals(estimate, 0.5))
  }

  test("run EM on five samples, one hom alt, all others hom ref") {
    val likelihoods = Array.fill(4)(homRefLikelihoods) :+ homAltLikelihoods
    val estimate = EMForAlleles.emForMAF(likelihoods,
      startingEstimate,
      maxIterations = iterationCap)

    assert(fpEquals(estimate, 0.8))
  }

  test("run EM on five samples, with varying ploidy, M = 10, G = 7") {
    // three diploid rows, then one two-state row and one four-state row
    val twoStateLikelihoods = Array(0.1, 0.0001)
    val fourStateLikelihoods = Array(0.0001, 0.1, 0.0001, 0.0000001)
    val likelihoods = Array.fill(3)(homRefLikelihoods) ++
      Array(twoStateLikelihoods, fourStateLikelihoods)
    val estimate = EMForAlleles.emForMAF(likelihoods,
      startingEstimate,
      maxIterations = iterationCap)

    assert(fpEquals(estimate, 0.7))
  }
}
1 change: 1 addition & 0 deletions avocado-sample-configs/basic.properties
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
{
}
biallelicGenotyper = {
emTolerance = 0.001;
}
defPart =
{
Expand Down
Loading

0 comments on commit 343b70d

Please sign in to comment.