From 4a28c964e52001e9574a5a545406fc82afabbac2 Mon Sep 17 00:00:00 2001
From: Timothy Danford <tdanford@gmail.com>
Date: Fri, 28 Nov 2014 15:14:20 -0500
Subject: [PATCH] [ADAM-511] Provide Ordering for ReferenceRegion

As covered in issue #511, ReferenceRegion shouldn't extend Ordered, but
rather should be ordered by an Ordering value which can be imported
implicitly.  This will fix some "diverging implicit expansion" errors,
and bring us more in line with the Scala documentation on Ordered (which
is intended to be used in classes that have "a single, natural ordering"
-- a description that doesn't apply to ReferenceRegion).

This commit makes these changes, with the new Ordering provided
implicitly in ReferenceRegionContext.  It also includes corresponding
changes to the ReferenceRegionWithOrientation handling, and (finally)
includes changes to downstream classes that use(d) the Region.compareTo
method.

Also bundled in to this commit are two additional changes to
ReferenceRegion:
* Adding explanations to ReferenceRegion asserts.
* Adding 'extend' to ReferenceRegionWithOrientation, allowing a
  strand-specific extension of the region in a single direction.
---
 .../bdgenomics/adam/cli/CalculateDepth.scala  |  4 +-
 .../org/bdgenomics/adam/models/Gene.scala     |  1 +
 .../adam/models/ReferenceRegion.scala         | 75 ++++++++++++++-----
 .../org/bdgenomics/adam/rdd/Coverage.scala    |  5 +-
 ...NucleotideContigFragmentRDDFunctions.scala |  5 +-
 .../realignment/IndelRealignmentTarget.scala  |  7 +-
 6 files changed, 70 insertions(+), 27 deletions(-)

diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala
index 450a6cf688..204850b2a1 100644
--- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala
+++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala
@@ -23,7 +23,7 @@ import org.apache.hadoop.mapreduce.Job
 import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion }
+import org.bdgenomics.adam.models.{ ReferenceRegionContext, SequenceDictionary, ReferenceRegion }
 import org.bdgenomics.adam.projections.Projection
 import org.bdgenomics.adam.projections.AlignmentRecordField._
 import org.bdgenomics.adam.rdd.ADAMContext._
@@ -32,6 +32,8 @@ import org.bdgenomics.adam.rich.ReferenceMappingContext._
 import org.bdgenomics.formats.avro.AlignmentRecord
 import scala.io._
 
+import ReferenceRegionContext._
+
 /**
  * CalculateDepth (accessible as the command 'depth' through the CLI) takes two arguments,
  * an Read file and a VCF (or equivalent) file, and calculates the number of reads
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala
index b1491f7261..34bdd52d7c 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala
@@ -22,6 +22,7 @@ import org.apache.spark.rdd.RDD
 import org.bdgenomics.adam.rdd.features.GeneFeatureRDD._
 import org.bdgenomics.adam.util.SequenceUtils
 import org.bdgenomics.formats.avro.{ Strand, Feature }
+import ReferenceRegionContext._
 
 /**
  * A 'gene model' is a small, hierarchical collection of objects: Genes, Transcripts, and Exons.
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala
index 86bd7d036e..1aa7bdd61c 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala
@@ -52,7 +52,7 @@ object ReferenceRegionWithOrientation {
  *                       reverse strand of the reference region.
  */
 case class ReferenceRegionWithOrientation(region: ReferenceRegion,
-                                          negativeStrand: Boolean) extends Ordered[ReferenceRegionWithOrientation] {
+                                          negativeStrand: Boolean) {
   def width: Long = region.width
 
   def contains(other: ReferencePositionWithOrientation): Boolean = {
@@ -67,13 +67,30 @@ case class ReferenceRegionWithOrientation(region: ReferenceRegion,
     region.overlaps(other.region) && negativeStrand == other.negativeStrand
   }
 
-  def compare(that: ReferenceRegionWithOrientation): Int = {
-    val regionCompare = region.compare(that.region)
-    if (regionCompare != 0) {
-      regionCompare
-    } else {
-      negativeStrand.compare(that.negativeStrand)
-    }
+  /**
+   * Creates a new ReferenceRegionWithOrientation, with either the start or
+   * end field modified so that the overall region has a width equal to the
+   * given target length.
+   *
+   * If the region is on the negative strand, then the start value is modified,
+   * otherwise, if the region is on the positive strand, then the end value is
+   * incremented.
+   *
+   * @param targetLength the desired width of the new region
+   * @return The new region whose width==targetLength
+   */
+  def extend(targetLength: Long): ReferenceRegionWithOrientation = {
+    require(targetLength >= 0)
+    require(targetLength > width)
+
+    val diff = targetLength - width
+
+    if (negativeStrand)
+      ReferenceRegionWithOrientation(
+        ReferenceRegion(region.referenceName, region.start - diff, region.end), negativeStrand)
+    else
+      ReferenceRegionWithOrientation(
+        ReferenceRegion(region.referenceName, region.start, region.end + diff), negativeStrand)
   }
 
   def toReferenceRegion: ReferenceRegion = region
@@ -85,6 +102,23 @@ case class ReferenceRegionWithOrientation(region: ReferenceRegion,
   def end: Long = region.end
 }
 
+object ReferenceRegionContext {
+  implicit val referenceRegionOrdering = ReferenceRegionOrdering
+  implicit val referenceRegionWithOrientationOrdering = ReferenceRegionWithOrientationOrdering
+}
+
+object ReferenceRegionWithOrientationOrdering extends Ordering[ReferenceRegionWithOrientation] {
+
+  def compare(this_ : ReferenceRegionWithOrientation, that: ReferenceRegionWithOrientation): Int = {
+    val regionCompare = ReferenceRegionOrdering.compare(this_.region, that.region)
+    if (regionCompare != 0) {
+      regionCompare
+    } else {
+      this_.negativeStrand.compare(that.negativeStrand)
+    }
+  }
+}
+
 object ReferenceRegion {
 
   /**
@@ -135,6 +169,17 @@ object ReferenceRegion {
   }
 }
 
+object ReferenceRegionOrdering extends Ordering[ReferenceRegion] {
+
+  override def compare(this_ : ReferenceRegion, that: ReferenceRegion): Int =
+    if (this_.referenceName != that.referenceName)
+      this_.referenceName.compareTo(that.referenceName)
+    else if (this_.start != that.start)
+      this_.start.compareTo(that.start)
+    else
+      this_.end.compareTo(that.end)
+}
+
 /**
  * Represents a contiguous region of the reference genome.
  *
@@ -144,10 +189,10 @@ object ReferenceRegion {
  *            which is <i>not</i> in the region -- i.e. [start, end) define a 0-based
  *            half-open interval.
  */
-case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Ordered[ReferenceRegion] with Interval {
+case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Interval {
 
-  assert(start >= 0)
-  assert(end >= start)
+  assert(start >= 0, "ReferenceRegion doesn't allow a negative start coordinate (here provided %d)".format(start))
+  assert(end >= start, "ReferenceRegion requires end >= start (here provided %d,%d)".format(start, end))
 
   def width: Long = end - start
 
@@ -255,14 +300,6 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend
   def overlaps(other: ReferenceRegion): Boolean =
     referenceName == other.referenceName && end > other.start && start < other.end
 
-  def compare(that: ReferenceRegion): Int =
-    if (referenceName != that.referenceName)
-      referenceName.compareTo(that.referenceName)
-    else if (start != that.start)
-      start.compareTo(that.start)
-    else
-      end.compareTo(that.end)
-
   def length(): Long = {
     end - start
   }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala
index 5686f4ee27..10542aa0fd 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala
@@ -21,8 +21,9 @@ import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.spark.rdd.RDD
 import scala.math._
-import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion }
+import org.bdgenomics.adam.models.{ ReferenceRegionContext, ReferenceRegionOrdering, SequenceDictionary, ReferenceRegion }
 import PairingRDD._
+import ReferenceRegionContext._
 
 /**
  * A base is 'covered' by a region set if any region in the set contains the base itself.
@@ -97,7 +98,7 @@ class Coverage(val window: Long) extends Serializable {
       case (None, None)         => 0
       case (None, Some(r2))     => -1
       case (Some(r1), None)     => 1
-      case (Some(r1), Some(r2)) => r1.compareTo(r2)
+      case (Some(r1), Some(r2)) => ReferenceRegionOrdering.compare(r1, r2)
     }
 
   case class OrientedPoint(chrom: String, pos: Long, polarity: Boolean) extends Ordered[OrientedPoint] with Serializable {
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala
index d1c38d2884..847c6cd70c 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala
@@ -36,6 +36,7 @@ import parquet.hadoop.metadata.CompressionCodecName
 import parquet.hadoop.util.ContextUtil
 import scala.math.max
 import scala.Some
+import ReferenceRegionContext._
 
 class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) extends ADAMSequenceDictionaryRDDAggregator[NucleotideContigFragment](rdd) {
 
@@ -67,7 +68,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e
       assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " +
         kv1 + ", " + kv2)
 
-      (kv1._1.merge(kv2._1), if (kv1._1.compare(kv2._1) <= 0) {
+      (kv1._1.merge(kv2._1), if (ReferenceRegionOrdering.compare(kv1._1, kv2._1) <= 0) {
         kv1._2 + kv2._2
       } else {
         kv2._2 + kv1._2
@@ -83,7 +84,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e
         .map(kv => getString(kv))
         .reduce(reducePairs)
 
-      assert(pair._1.compare(region) == 0,
+      assert(ReferenceRegionOrdering.compare(pair._1, region) == 0,
         "Merging fragments returned a different region than requested.")
 
       pair._2
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala
index 1960e41bdb..c878335fa8 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala
@@ -21,7 +21,7 @@ import com.esotericsoftware.kryo.io.{ Input, Output }
 import com.esotericsoftware.kryo.{ Kryo, Serializer }
 import htsjdk.samtools.CigarOperator
 import org.apache.spark.Logging
-import org.bdgenomics.adam.models.ReferenceRegion
+import org.bdgenomics.adam.models.{ ReferenceRegionOrdering, ReferenceRegion }
 import org.bdgenomics.adam.rdd.ADAMContext._
 import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.formats.avro.AlignmentRecord
@@ -50,7 +50,8 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] {
    * @param b Indel realignment target to compare.
    * @return Comparison done by starting position.
    */
-  def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int = a.readRange compare b.readRange
+  def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int =
+    ReferenceRegionOrdering.compare(a.readRange, b.readRange)
 
   /**
    * Check to see if an indel realignment target contains the given read.
@@ -75,7 +76,7 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] {
   def lt(target: IndelRealignmentTarget, read: RichAlignmentRecord): Boolean = {
     val region = read.readRegion
 
-    region.forall(r => target.readRange.compare(r) < 0)
+    region.forall(r => ReferenceRegionOrdering.compare(target.readRange, r) < 0)
   }
 
   /**