[ADAM-511] Provide Ordering for ReferenceRegion

As covered in issue bigdatagenomics#511, ReferenceRegion shouldn't extend Ordered, but rather should be ordered by an Ordering value which can be imported implicitly. This will fix some "diverging implicit expansion" errors, and bring us more in line with the Scala documentation on Ordered (which is intended to be used in classes that have "a single, natural ordering" -- a description that doesn't apply to ReferenceRegion). This commit makes these changes, with the new Ordering provided implicitly in ReferenceRegionContext. It also includes corresponding changes to the ReferenceRegionWithOrientation handling, and (finally) includes changes to downstream classes that use(d) the Region.compareTo method. Also bundled in to this commit are two additional changes to ReferenceRegion: * Adding explanations to ReferenceRegion asserts. * Adding 'extend' to ReferenceRegionWithOrientation, allowing a strand-specific extension of the region in a single direction.
tdanford · Dec 2, 2014 · e2a92a5 · e2a92a5
1 parent 3477a39
commit e2a92a5
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 27 deletions.
diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala
@@ -23,7 +23,7 @@ import org.apache.hadoop.mapreduce.Job
 import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion }
+import org.bdgenomics.adam.models.{ ReferenceRegionContext, SequenceDictionary, ReferenceRegion }
 import org.bdgenomics.adam.projections.Projection
 import org.bdgenomics.adam.projections.AlignmentRecordField._
 import org.bdgenomics.adam.rdd.ADAMContext._
@@ -32,6 +32,8 @@ import org.bdgenomics.adam.rich.ReferenceMappingContext._
 import org.bdgenomics.formats.avro.AlignmentRecord
 import scala.io._
 
+import ReferenceRegionContext._
+
 /**
  * CalculateDepth (accessible as the command 'depth' through the CLI) takes two arguments,
  * an Read file and a VCF (or equivalent) file, and calculates the number of reads

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala
@@ -22,6 +22,7 @@ import org.apache.spark.rdd.RDD
 import org.bdgenomics.adam.rdd.features.GeneFeatureRDD._
 import org.bdgenomics.adam.util.SequenceUtils
 import org.bdgenomics.formats.avro.{ Strand, Feature }
+import ReferenceRegionContext._
 
 /**
  * A 'gene model' is a small, hierarchical collection of objects: Genes, Transcripts, and Exons.

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala
@@ -52,7 +52,7 @@ object ReferenceRegionWithOrientation {
  *                       reverse strand of the reference region.
  */
 case class ReferenceRegionWithOrientation(region: ReferenceRegion,
-                                          negativeStrand: Boolean) extends Ordered[ReferenceRegionWithOrientation] {
+                                          negativeStrand: Boolean) {
   def width: Long = region.width
 
   def contains(other: ReferencePositionWithOrientation): Boolean = {
@@ -67,13 +67,30 @@ case class ReferenceRegionWithOrientation(region: ReferenceRegion,
     region.overlaps(other.region) && negativeStrand == other.negativeStrand
   }
 
-  def compare(that: ReferenceRegionWithOrientation): Int = {
-    val regionCompare = region.compare(that.region)
-    if (regionCompare != 0) {
-      regionCompare
-    } else {
-      negativeStrand.compare(that.negativeStrand)
-    }
+  /**
+   * Creates a new ReferenceRegionWithOrientation, with either the start or
+   * end field modified so that the overall region has a width equal to the
+   * given target length.
+   *
+   * If the region is on the negative strand, then the start value is modified,
+   * otherwise, if the region is on the positive strand, then the end value is
+   * incremented.
+   *
+   * @param targetLength the desired width of the new region
+   * @return The new region whose width==targetLength
+   */
+  def extend(targetLength: Long): ReferenceRegionWithOrientation = {
+    require(targetLength >= 0)
+    require(targetLength > width)
+
+    val diff = targetLength - width
+
+    if (negativeStrand)
+      ReferenceRegionWithOrientation(
+        ReferenceRegion(region.referenceName, region.start - diff, region.end), negativeStrand)
+    else
+      ReferenceRegionWithOrientation(
+        ReferenceRegion(region.referenceName, region.start, region.end + diff), negativeStrand)
   }
 
   def toReferenceRegion: ReferenceRegion = region
@@ -85,6 +102,23 @@ case class ReferenceRegionWithOrientation(region: ReferenceRegion,
   def end: Long = region.end
 }
 
+object ReferenceRegionContext {
+  implicit val referenceRegionOrdering = ReferenceRegionOrdering
+  implicit val referenceRegionWithOrientationOrdering = ReferenceRegionWithOrientationOrdering
+}
+
+object ReferenceRegionWithOrientationOrdering extends Ordering[ReferenceRegionWithOrientation] {
+
+  def compare(this_ : ReferenceRegionWithOrientation, that: ReferenceRegionWithOrientation): Int = {
+    val regionCompare = ReferenceRegionOrdering.compare(this_.region, that.region)
+    if (regionCompare != 0) {
+      regionCompare
+    } else {
+      this_.negativeStrand.compare(that.negativeStrand)
+    }
+  }
+}
+
 object ReferenceRegion {
 
   /**
@@ -135,6 +169,17 @@ object ReferenceRegion {
   }
 }
 
+object ReferenceRegionOrdering extends Ordering[ReferenceRegion] {
+
+  override def compare(this_ : ReferenceRegion, that: ReferenceRegion): Int =
+    if (this_.referenceName != that.referenceName)
+      this_.referenceName.compareTo(that.referenceName)
+    else if (this_.start != that.start)
+      this_.start.compareTo(that.start)
+    else
+      this_.end.compareTo(that.end)
+}
+
 /**
  * Represents a contiguous region of the reference genome.
  *
@@ -144,10 +189,10 @@ object ReferenceRegion {
  *            which is <i>not</i> in the region -- i.e. [start, end) define a 0-based
  *            half-open interval.
  */
-case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Ordered[ReferenceRegion] with Interval {
+case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Interval {
 
-  assert(start >= 0)
-  assert(end >= start)
+  assert(start >= 0, "ReferenceRegion doesn't allow a negative start coordinate (here provided %d)".format(start))
+  assert(end >= start, "ReferenceRegion requires end >= start (here provided %d,%d)".format(start, end))
 
   def width: Long = end - start
 
@@ -255,14 +300,6 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend
   def overlaps(other: ReferenceRegion): Boolean =
     referenceName == other.referenceName && end > other.start && start < other.end
 
-  def compare(that: ReferenceRegion): Int =
-    if (referenceName != that.referenceName)
-      referenceName.compareTo(that.referenceName)
-    else if (start != that.start)
-      start.compareTo(that.start)
-    else
-      end.compareTo(that.end)
-
   def length(): Long = {
     end - start
   }

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala
@@ -21,8 +21,9 @@ import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.spark.rdd.RDD
 import scala.math._
-import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion }
+import org.bdgenomics.adam.models.{ ReferenceRegionContext, ReferenceRegionOrdering, SequenceDictionary, ReferenceRegion }
 import PairingRDD._
+import ReferenceRegionContext._
 
 /**
  * A base is 'covered' by a region set if any region in the set contains the base itself.
@@ -97,7 +98,7 @@ class Coverage(val window: Long) extends Serializable {
       case (None, None)         => 0
       case (None, Some(r2))     => -1
       case (Some(r1), None)     => 1
-      case (Some(r1), Some(r2)) => r1.compareTo(r2)
+      case (Some(r1), Some(r2)) => ReferenceRegionOrdering.compare(r1, r2)
     }
 
   case class OrientedPoint(chrom: String, pos: Long, polarity: Boolean) extends Ordered[OrientedPoint] with Serializable {

diff --git a/.../src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala b/.../src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala
@@ -36,6 +36,7 @@ import parquet.hadoop.metadata.CompressionCodecName
 import parquet.hadoop.util.ContextUtil
 import scala.math.max
 import scala.Some
+import ReferenceRegionContext._
 
 class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) extends ADAMSequenceDictionaryRDDAggregator[NucleotideContigFragment](rdd) {
 
@@ -67,7 +68,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e
       assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " +
         kv1 + ", " + kv2)
 
-      (kv1._1.merge(kv2._1), if (kv1._1.compare(kv2._1) <= 0) {
+      (kv1._1.merge(kv2._1), if (ReferenceRegionOrdering.compare(kv1._1, kv2._1) <= 0) {
         kv1._2 + kv2._2
       } else {
         kv2._2 + kv1._2
@@ -83,7 +84,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e
         .map(kv => getString(kv))
         .reduce(reducePairs)
 
-      assert(pair._1.compare(region) == 0,
+      assert(ReferenceRegionOrdering.compare(pair._1, region) == 0,
         "Merging fragments returned a different region than requested.")
 
       pair._2

diff --git a/...core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala b/...core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala
@@ -21,7 +21,7 @@ import com.esotericsoftware.kryo.io.{ Input, Output }
 import com.esotericsoftware.kryo.{ Kryo, Serializer }
 import htsjdk.samtools.CigarOperator
 import org.apache.spark.Logging
-import org.bdgenomics.adam.models.ReferenceRegion
+import org.bdgenomics.adam.models.{ ReferenceRegionOrdering, ReferenceRegion }
 import org.bdgenomics.adam.rdd.ADAMContext._
 import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.formats.avro.AlignmentRecord
@@ -50,7 +50,8 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] {
    * @param b Indel realignment target to compare.
    * @return Comparison done by starting position.
    */
-  def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int = a.readRange compare b.readRange
+  def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int =
+    ReferenceRegionOrdering.compare(a.readRange, b.readRange)
 
   /**
    * Check to see if an indel realignment target contains the given read.
@@ -75,7 +76,7 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] {
   def lt(target: IndelRealignmentTarget, read: RichAlignmentRecord): Boolean = {
     val region = read.readRegion
 
-    region.forall(r => target.readRange.compare(r) < 0)
+    region.forall(r => ReferenceRegionOrdering.compare(target.readRange, r) < 0)
   }
 
   /**