Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to indel realigner to improve performance and accuracy. #314

Merged
merged 1 commit into from
Jul 22, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.algorithms.consensus

import org.bdgenomics.adam.algorithms.realignmenttarget.IndelRealignmentTarget
import org.bdgenomics.adam.models.{ Consensus, ReferenceRegion }
import org.bdgenomics.adam.rich.RichADAMRecord
import org.apache.spark.rdd.RDD

/**
* Base contract for the strategies that supply consensus sequences during indel
* realignment. It extends Serializable; presumably so that implementations can be
* referenced from Spark closures (TODO confirm against the caller).
*
* An implementation provides three things: optional extra realignment targets,
* a read preprocessing step, and the consensus generation itself.
*/
abstract class ConsensusGenerator extends Serializable {

/**
* Generates targets to add to initial set of indel realignment targets, if additional
* targets are necessary.
*
* @return Returns an option which wraps an RDD of indel realignment targets. Implementations
*         that have no targets to contribute may return None.
*/
def targetsToAdd(): Option[RDD[IndelRealignmentTarget]]

/**
* Performs any preprocessing specific to this consensus generation algorithm, e.g.,
* indel normalization.
*
* @param reads Reads to preprocess.
* @param reference Reference sequence as a string. Presumably the reference bases covering
*                  `region` (TODO confirm against the caller).
* @param region Reference region being processed. Presumably the region that `reference`
*               was extracted from (TODO confirm against the caller).
* @return Preprocessed reads.
*/
def preprocessReadsForRealignment(reads: Iterable[RichADAMRecord],
reference: String,
region: ReferenceRegion): Iterable[RichADAMRecord]

/**
* For all reads in this region, generates the list of consensus sequences for realignment.
*
* @param reads Reads to generate consensus sequences from.
* @return Consensus sequences to use for realignment.
*/
def findConsensus(reads: Iterable[RichADAMRecord]): Iterable[Consensus]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.algorithms.consensus

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.variation.ADAMVariationContext._
import org.bdgenomics.adam.algorithms.realignmenttarget.IndelRealignmentTarget
import org.bdgenomics.adam.models._
import org.bdgenomics.adam.rich.RichADAMRecord

/**
 * Consensus generator backed by a file of known variants: realignment targets and
 * consensus sequences come from the indels in that file rather than from the reads.
 *
 * @param file Path of the variant file; it is loaded with `adamVCFLoad` and is also used
 *             to build the broadcast indel table.
 * @param sc Spark context used to load the file and to create the broadcast. It is marked
 *           transient because this class is Serializable (via ConsensusGenerator) while a
 *           SparkContext is not; without the annotation, serializing an instance would
 *           fail on this field. As a consequence `sc` is only usable on the JVM that
 *           constructed the instance.
 */
class ConsensusGeneratorFromKnowns(file: String,
                                   @transient private val sc: SparkContext) extends ConsensusGenerator {

  // Table of known indels built from `file`, broadcast once at construction time.
  val indelTable = sc.broadcast(IndelTable(file, sc))

  /**
   * Generates targets to add to the initial set of indel realignment targets.
   *
   * The variant file is loaded, variants whose reference and alternate alleles have the
   * same length (i.e. non-indels) are dropped, and each remaining variant becomes a target
   * spanning from its position to its position plus the reference allele length.
   *
   * @return Always a Some wrapping the RDD of targets derived from the known indels.
   */
  def targetsToAdd(): Option[RDD[IndelRealignmentTarget]] = {
    val rdd: RDD[ADAMVariantContext] = sc.adamVCFLoad(file)

    Some(rdd.map(_.variant.variant)
      .filter(v => v.getReferenceAllele.length != v.getVariantAllele.length)
      .map(v => ReferenceRegion(v.getContig.getContigName, v.getPosition, v.getPosition + v.getReferenceAllele.length))
      .map(r => new IndelRealignmentTarget(Some(r), r)))
  }

  /**
   * No preprocessing is needed when consensuses come from known indels.
   *
   * @param reads Reads to preprocess.
   * @param reference Reference sequence; unused here.
   * @param region Region being processed; unused here.
   * @return The input reads, unchanged.
   */
  def preprocessReadsForRealignment(reads: Iterable[RichADAMRecord],
                                    reference: String,
                                    region: ReferenceRegion): Iterable[RichADAMRecord] = {
    reads
  }

  /**
   * Looks up the known indels that overlap the region spanned by the given reads.
   *
   * The region runs from the smallest read start to the largest defined read end (plus
   * one) on the contig of the first read.
   *
   * @param reads Reads to generate consensus sequences from.
   * @return Consensus sequences for the known indels in the spanned region. Empty when
   *         `reads` is empty or when no read has a defined end, since no region can be
   *         built in those cases.
   */
  def findConsensus(reads: Iterable[RichADAMRecord]): Iterable[Consensus] = {
    val table = indelTable.value

    // reduceOption / headOption instead of reduce / head: an empty group of reads (or a
    // group in which no read has an end position) yields None instead of throwing.
    val consensuses: Option[Iterable[Consensus]] = for {
      start <- reads.map(_.record.getStart.toLong).reduceOption(_ min _)
      end <- reads.flatMap(_.end).reduceOption(_ max _)
      refId <- reads.headOption.map(_.record.getContig.getContigName)
    } yield {
      // NOTE(review): the "+ 1" presumably adjusts the end bound of the region; confirm
      // against the semantics of RichADAMRecord.end.
      table.getIndelsInRegion(ReferenceRegion(refId, start, end + 1))
    }

    consensuses.getOrElse(Iterable.empty[Consensus])
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.algorithms.consensus

import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.algorithms.realignmenttarget.IndelRealignmentTarget
import org.bdgenomics.adam.models.{ Consensus, ReferenceRegion, ReferencePosition }
import org.bdgenomics.adam.rich.RichADAMRecord
import org.bdgenomics.adam.rich.RichADAMRecord._
import org.bdgenomics.adam.rich.RichCigar._
import org.bdgenomics.adam.util.MdTag
import org.bdgenomics.adam.util.ImplicitJavaConversions._
import org.bdgenomics.adam.util.NormalizationUtils._
import org.bdgenomics.formats.avro.ADAMRecord

/**
 * Consensus generator that derives candidate consensus sequences from the reads
 * themselves, using the alignments recorded in their CIGAR strings.
 */
class ConsensusGeneratorFromReads extends ConsensusGenerator {

  /**
   * Read-derived consensuses never contribute additional realignment targets.
   *
   * @return Always None.
   */
  def targetsToAdd(): Option[RDD[IndelRealignmentTarget]] = None

  /**
   * Normalizes the indel of every read that shows evidence of exactly one indel; all
   * other reads pass through untouched.
   *
   * @param reads Reads to process.
   * @param reference Reference sequence; not used by this implementation.
   * @param region Region being processed; not used by this implementation.
   * @return The reads, with single-indel reads left aligned.
   */
  def preprocessReadsForRealignment(reads: Iterable[RichADAMRecord],
                                    reference: String,
                                    region: ReferenceRegion): Iterable[RichADAMRecord] = {
    reads.map(read => {
      // exactly two alignment blocks (sequence matches) means a single indel separates them
      if (read.samtoolsCigar.numAlignmentBlocks != 2) {
        read
      } else {
        leftAlignSingleIndel(read)
      }
    })
  }

  /**
   * Left aligns the indel of a read and rebuilds the record so that its CIGAR and its
   * mismatching positions (MD tag) reflect the shifted alignment.
   */
  private def leftAlignSingleIndel(read: RichADAMRecord): RichADAMRecord = {
    val shiftedCigar = leftAlignIndel(read)
    val shiftedMdTag = MdTag.moveAlignment(read, shiftedCigar)

    val normalized: RichADAMRecord = ADAMRecord.newBuilder(read)
      .setCigar(shiftedCigar.toString)
      .setMismatchingPositions(shiftedMdTag.toString())
      .build()

    normalized
  }

  /**
   * Generates consensus sequences from reads with indels.
   *
   * Only reads that carry an MD tag are considered. Each of them is offered to
   * `Consensus.generateAlternateConsensus`; the consensuses that come back are
   * de-duplicated before being returned.
   *
   * @param reads Reads to generate consensus sequences from.
   * @return The distinct consensus sequences found.
   */
  def findConsensus(reads: Iterable[RichADAMRecord]): Iterable[Consensus] = {
    val taggedReads = reads.filter(_.mdTag.isDefined)

    // a read contributes a consensus only if an alternate consensus can be generated for it
    val candidates = taggedReads.flatMap(read => {
      Consensus.generateAlternateConsensus(read.getSequence,
        ReferencePosition(read.getContig.getContigName,
          read.getStart),
        read.samtoolsCigar)
    })

    candidates.toSeq.distinct
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.algorithms.consensus

import org.bdgenomics.adam.algorithms.smithwaterman.SmithWatermanConstantGapScoring
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rich.RichADAMRecord
import org.bdgenomics.adam.rich.RichADAMRecord._
import org.bdgenomics.adam.rich.RichCigar._
import org.bdgenomics.adam.util.MdTag
import org.bdgenomics.formats.avro.ADAMRecord

/**
 * Consensus generator that first realigns every read against the reference with
 * constant-gap-scoring Smith-Waterman, and then falls back on the read-based consensus
 * generation of ConsensusGeneratorFromReads.
 *
 * @param wMatch Smith-Waterman weight passed through for a match.
 * @param wMismatch Smith-Waterman weight passed through for a mismatch.
 * @param wInsert Smith-Waterman weight passed through for an insertion.
 * @param wDelete Smith-Waterman weight passed through for a deletion.
 */
class ConsensusGeneratorFromSmithWaterman(wMatch: Double,
                                          wMismatch: Double,
                                          wInsert: Double,
                                          wDelete: Double) extends ConsensusGeneratorFromReads {

  /**
   * Attempts realignment of all reads using Smith-Waterman. Accepts all realignments that
   * have one or fewer indels; reads whose realignment has more alignment blocks keep their
   * original alignment. The resulting reads are then handed to the parent implementation
   * for indel normalization.
   *
   * @param reads Reads to process.
   * @param reference Reference sequence the reads are realigned against.
   * @param region Region being processed; its start is used as the offset of the
   *               realigned reads.
   * @return Reads carrying their accepted Smith-Waterman realignment, after the parent's
   *         preprocessing has been applied.
   */
  override def preprocessReadsForRealignment(reads: Iterable[RichADAMRecord],
                                             reference: String,
                                             region: ReferenceRegion): Iterable[RichADAMRecord] = {
    val rds: Iterable[RichADAMRecord] = reads.map(r => {
      // the read sequence is needed both for the alignment and for the new MD tag
      val sequence = r.record.getSequence.toString

      val sw = new SmithWatermanConstantGapScoring(sequence,
        reference,
        wMatch,
        wMismatch,
        wInsert,
        wDelete)

      // if we realign with fewer than three alignment blocks, then take the new alignment
      if (sw.cigarX.numAlignmentBlocks <= 2) {
        // NOTE(review): the reference is shifted by sw.xStart while the tag start stays at
        // region.start, whereas the read start below is sw.xStart + region.start. Confirm
        // this is intended for MdTag.
        val mdTag = MdTag(sequence,
          reference.drop(sw.xStart),
          sw.cigarX,
          region.start)

        val newRead: RichADAMRecord = ADAMRecord.newBuilder(r)
          .setStart(sw.xStart + region.start)
          .setCigar(sw.cigarX.toString)
          .setMismatchingPositions(mdTag.toString())
          .build()

        newRead
      } else {
        r
      }
    })

    super.preprocessReadsForRealignment(rds, reference, region)
  }
}
Loading