bigdatagenomics · carlyeks · Jul 22, 2014 · Mar 19, 2014
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala
@@ -0,0 +1,53 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.algorithms.consensus
+
+import org.bdgenomics.adam.algorithms.realignmenttarget.IndelRealignmentTarget
+import org.bdgenomics.adam.models.{ Consensus, ReferenceRegion }
+import org.bdgenomics.adam.rich.RichADAMRecord
+import org.apache.spark.rdd.RDD
+
+/** Strategy providing targets, read preprocessing and consensus sequences for indel realignment. */
+abstract class ConsensusGenerator extends Serializable {
+
+  /**
+   * Generates targets to add to the initial set of indel realignment targets, if any are needed.
+   *
+   * @return An option wrapping an RDD of additional targets; None if there is nothing to add.
+   */
+  def targetsToAdd(): Option[RDD[IndelRealignmentTarget]]
+
+  /**
+   * Applies preprocessing specific to this consensus generation algorithm, e.g., indel normalization.
+   *
+   * @param reads Reads to preprocess.
+   * @param reference Reference sequence to use; presumably that of region (TODO confirm).
+   * @param region Region being realigned; presumably the one the reads fall in (TODO confirm).
+   * @return Preprocessed reads.
+   */
+  def preprocessReadsForRealignment(reads: Iterable[RichADAMRecord],
+                                    reference: String,
+                                    region: ReferenceRegion): Iterable[RichADAMRecord]
+
+  /**
+   * For all reads in this region, generates the list of consensus sequences for realignment.
+   * @param reads Reads to generate consensus sequences from.
+   * @return Consensus sequences to use for realignment.
+   */
+  def findConsensus(reads: Iterable[RichADAMRecord]): Iterable[Consensus]
+}
diff --git a/...rc/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromKnowns.scala b/...rc/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromKnowns.scala
@@ -0,0 +1,79 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.algorithms.consensus
+
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.bdgenomics.adam.rdd.ADAMContext._
+import org.bdgenomics.adam.rdd.variation.ADAMVariationContext._
+import org.bdgenomics.adam.algorithms.realignmenttarget.IndelRealignmentTarget
+import org.bdgenomics.adam.models._
+import org.bdgenomics.adam.rich.RichADAMRecord
+
+/**
+ * Generates consensus sequences from a file of known indels. The Spark context is kept in a
+ * transient field: this class is Serializable (via ConsensusGenerator), SparkContext is not.
+ */
+class ConsensusGeneratorFromKnowns(file: String, @transient sc: SparkContext) extends ConsensusGenerator {
+
+  // indel table built from file, broadcast so that findConsensus can query it by region
+  val indelTable = sc.broadcast(IndelTable(file, sc))
+
+  /**
+   * Generates one realignment target for each variant in file whose reference and variant
+   * alleles differ in length, i.e., for each indel.
+   *
+   * @return Returns an option which wraps an RDD of indel realignment targets.
+   */
+  def targetsToAdd(): Option[RDD[IndelRealignmentTarget]] = {
+    val rdd: RDD[ADAMVariantContext] = sc.adamVCFLoad(file)
+    Some(rdd.map(_.variant.variant)
+      .filter(v => v.getReferenceAllele.length != v.getVariantAllele.length)
+      .map(v => ReferenceRegion(v.getContig.getContigName, v.getPosition, v.getPosition + v.getReferenceAllele.length))
+      .map(r => new IndelRealignmentTarget(Some(r), r)))
+  }
+
+  /**
+   * Known indels need no read preprocessing, so the reads are returned unchanged.
+   * @param reads Reads to pass through; reference and region are ignored.
+   * @return The input reads, unmodified.
+   */
+  def preprocessReadsForRealignment(reads: Iterable[RichADAMRecord],
+                                    reference: String,
+                                    region: ReferenceRegion): Iterable[RichADAMRecord] = {
+    reads
+  }
+
+  /**
+   * Looks up the known indels that fall in the region spanned by the given reads.
+   *
+   * @param reads Reads to generate consensus sequences from.
+   * @return Consensus sequences the table reports for that span; empty if no read has an end.
+   */
+  def findConsensus(reads: Iterable[RichADAMRecord]): Iterable[Consensus] = {
+    // reduceOption guards the empty case, on which reduce and head would throw
+    reads.flatMap(_.end).reduceOption(_ max _) match {
+      case None => Iterable.empty
+      case Some(end) =>
+        val start = reads.map(_.record.getStart.toLong).reduce(_ min _)
+        val refId = reads.head.record.getContig.getContigName
+        // get the known indels in the region spanned by these reads
+        indelTable.value.getIndelsInRegion(ReferenceRegion(refId, start, end + 1))
+    }
+  }
+}
diff --git a/...src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReads.scala b/...src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReads.scala
@@ -0,0 +1,86 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.algorithms.consensus
+
+import org.apache.spark.rdd.RDD
+import org.bdgenomics.adam.algorithms.realignmenttarget.IndelRealignmentTarget
+import org.bdgenomics.adam.models.{ Consensus, ReferenceRegion, ReferencePosition }
+import org.bdgenomics.adam.rich.RichADAMRecord
+import org.bdgenomics.adam.rich.RichADAMRecord._
+import org.bdgenomics.adam.rich.RichCigar._
+import org.bdgenomics.adam.util.MdTag
+import org.bdgenomics.adam.util.ImplicitJavaConversions._
+import org.bdgenomics.adam.util.NormalizationUtils._
+import org.bdgenomics.formats.avro.ADAMRecord
+
+/** Generates consensus sequences from the indel evidence carried by the reads themselves. */
+class ConsensusGeneratorFromReads extends ConsensusGenerator {
+
+  /**
+   * Consensuses derived from reads need no extra realignment targets.
+   *
+   * @return Always None.
+   */
+  def targetsToAdd(): Option[RDD[IndelRealignmentTarget]] = None
+
+  /**
+   * Left-normalizes the indel of every read whose CIGAR has exactly two alignment blocks,
+   * i.e., a single indel. Other reads pass through untouched; reference and region are unused.
+   *
+   * @param reads Reads to process.
+   * @return The reads, with single-indel reads replaced by left-aligned copies.
+   */
+  def preprocessReadsForRealignment(reads: Iterable[RichADAMRecord],
+                                    reference: String,
+                                    region: ReferenceRegion): Iterable[RichADAMRecord] = {
+    reads.map {
+      // two alignment blocks (sequence matches) mean exactly one indel sits between them
+      case singleIndel if singleIndel.samtoolsCigar.numAlignmentBlocks == 2 =>
+        leftNormalize(singleIndel)
+      case other => other
+    }
+  }
+
+  /** Builds a copy of the read with its indel left-aligned and its MD tag moved to match. */
+  private def leftNormalize(read: RichADAMRecord): RichADAMRecord = {
+    val shiftedCigar = leftAlignIndel(read)
+    val shiftedMdTag = MdTag.moveAlignment(read, shiftedCigar)
+
+    ADAMRecord.newBuilder(read)
+      .setCigar(shiftedCigar.toString)
+      .setMismatchingPositions(shiftedMdTag.toString())
+      .build()
+  }
+
+  /**
+   * Generates consensus sequences from reads with indels. Only reads with an MD tag are used.
+   *
+   * @param reads Reads to generate consensus sequences from.
+   * @return The distinct consensus sequences found.
+   */
+  def findConsensus(reads: Iterable[RichADAMRecord]): Iterable[Consensus] = {
+    val withMdTag = reads.filter(_.mdTag.isDefined)
+    // a read contributes a consensus only if one can be generated for its alignment
+    val candidates = withMdTag.flatMap(r => {
+      val alignmentStart = ReferencePosition(r.getContig.getContigName, r.getStart)
+      Consensus.generateAlternateConsensus(r.getSequence, alignmentStart, r.samtoolsCigar)
+    })
+    candidates.toSeq.distinct
+  }
+
+}
diff --git a/.../scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromSmithWaterman.scala b/.../scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromSmithWaterman.scala
@@ -0,0 +1,74 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.algorithms.consensus
+
+import org.bdgenomics.adam.algorithms.smithwaterman.SmithWatermanConstantGapScoring
+import org.bdgenomics.adam.models.ReferenceRegion
+import org.bdgenomics.adam.rich.RichADAMRecord
+import org.bdgenomics.adam.rich.RichADAMRecord._
+import org.bdgenomics.adam.rich.RichCigar._
+import org.bdgenomics.adam.util.MdTag
+import org.bdgenomics.formats.avro.ADAMRecord
+
+/**
+ * Read-based consensus generator that first realigns each read with Smith-Waterman.
+ *
+ * @param wMatch Scoring weight forwarded to SmithWatermanConstantGapScoring, as are the
+ *               remaining three weights (wMismatch, wInsert, wDelete), in this order.
+ */
+class ConsensusGeneratorFromSmithWaterman(wMatch: Double,
+                                          wMismatch: Double,
+                                          wInsert: Double,
+                                          wDelete: Double) extends ConsensusGeneratorFromReads {
+
+  /**
+   * Attempts realignment of all reads using Smith-Waterman. Accepts all realignments that have
+   * one or fewer indels, then hands the reads to the parent class for its own preprocessing.
+   *
+   * @param reads Reads to process.
+   * @param reference Sequence that each read is aligned against.
+   * @param region Region whose start offsets the alignment start found within reference.
+   * @return Reads, realigned where accepted, after the parent's preprocessing.
+   */
+  override def preprocessReadsForRealignment(reads: Iterable[RichADAMRecord],
+                                             reference: String,
+                                             region: ReferenceRegion): Iterable[RichADAMRecord] = {
+    val rds: Iterable[RichADAMRecord] = reads.map(r => {
+      val readSequence = r.record.getSequence.toString
+      val sw = new SmithWatermanConstantGapScoring(readSequence, reference,
+        wMatch, wMismatch, wInsert, wDelete)
+
+      // if we realign with fewer than three alignment blocks, then take the new alignment
+      if (sw.cigarX.numAlignmentBlocks <= 2) {
+        // NOTE(review): tag starts at region.start, read at region.start + xStart; confirm intent
+        val mdTag = MdTag(readSequence, reference.drop(sw.xStart), sw.cigarX, region.start)
+
+        val newRead: RichADAMRecord = ADAMRecord.newBuilder(r)
+          .setStart(sw.xStart + region.start)
+          .setCigar(sw.cigarX.toString)
+          .setMismatchingPositions(mdTag.toString())
+          .build()
+        newRead
+      } else {
+        r
+      }
+    })
+
+    super.preprocessReadsForRealignment(rds, reference, region)
+  }
+}