Skip to content

Commit

Permalink
[ADAM-511] Provide Ordering for ReferenceRegion
Browse files Browse the repository at this point in the history
As covered in issue bigdatagenomics#511, ReferenceRegion shouldn't extend Ordered, but
rather should be ordered by an Ordering value which can be imported
implicitly.  This will fix some "diverging implicit expansion" errors,
and bring us more in line with the Scala documentation on Ordered (which
is intended to be used in classes that have "a single, natural ordering"
-- a description that doesn't apply to ReferenceRegion).

This commit makes these changes, with the new Ordering provided
implicitly in ReferenceRegionContext.  It also includes corresponding
changes to the ReferenceRegionWithOrientation handling, and (finally)
includes changes to downstream classes that use(d) the Region.compareTo
method.

Also bundled in to this commit are two additional changes to
ReferenceRegion:
* Adding explanations to ReferenceRegion asserts.
* Adding 'extend' to ReferenceRegionWithOrientation, allowing a
  strand-specific extension of the region in a single direction.
  • Loading branch information
tdanford committed Dec 2, 2014
1 parent 3477a39 commit e2a92a5
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion }
import org.bdgenomics.adam.models.{ ReferenceRegionContext, SequenceDictionary, ReferenceRegion }
import org.bdgenomics.adam.projections.Projection
import org.bdgenomics.adam.projections.AlignmentRecordField._
import org.bdgenomics.adam.rdd.ADAMContext._
Expand All @@ -32,6 +32,8 @@ import org.bdgenomics.adam.rich.ReferenceMappingContext._
import org.bdgenomics.formats.avro.AlignmentRecord
import scala.io._

import ReferenceRegionContext._

/**
* CalculateDepth (accessible as the command 'depth' through the CLI) takes two arguments,
* an Read file and a VCF (or equivalent) file, and calculates the number of reads
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.rdd.features.GeneFeatureRDD._
import org.bdgenomics.adam.util.SequenceUtils
import org.bdgenomics.formats.avro.{ Strand, Feature }
import ReferenceRegionContext._

/**
* A 'gene model' is a small, hierarchical collection of objects: Genes, Transcripts, and Exons.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ object ReferenceRegionWithOrientation {
* reverse strand of the reference region.
*/
case class ReferenceRegionWithOrientation(region: ReferenceRegion,
negativeStrand: Boolean) extends Ordered[ReferenceRegionWithOrientation] {
negativeStrand: Boolean) {
def width: Long = region.width

def contains(other: ReferencePositionWithOrientation): Boolean = {
Expand All @@ -67,13 +67,30 @@ case class ReferenceRegionWithOrientation(region: ReferenceRegion,
region.overlaps(other.region) && negativeStrand == other.negativeStrand
}

def compare(that: ReferenceRegionWithOrientation): Int = {
val regionCompare = region.compare(that.region)
if (regionCompare != 0) {
regionCompare
} else {
negativeStrand.compare(that.negativeStrand)
}
/**
* Creates a new ReferenceRegionWithOrientation, with either the start or
* end field modified so that the overall region has a width equal to the
* given target length.
*
* If the region is on the negative strand, then the start value is modified,
* otherwise, if the region is on the positive strand, then the end value is
* incremented.
*
* @param targetLength the desired width of the new region
* @return The new region whose width==targetLength
*/
def extend(targetLength: Long): ReferenceRegionWithOrientation = {
require(targetLength >= 0)
require(targetLength > width)

val diff = targetLength - width

if (negativeStrand)
ReferenceRegionWithOrientation(
ReferenceRegion(region.referenceName, region.start - diff, region.end), negativeStrand)
else
ReferenceRegionWithOrientation(
ReferenceRegion(region.referenceName, region.start, region.end + diff), negativeStrand)
}

def toReferenceRegion: ReferenceRegion = region
Expand All @@ -85,6 +102,23 @@ case class ReferenceRegionWithOrientation(region: ReferenceRegion,
def end: Long = region.end
}

object ReferenceRegionContext {
implicit val referenceRegionOrdering = ReferenceRegionOrdering
implicit val referenceRegionWithOrientationOrdering = ReferenceRegionWithOrientationOrdering
}

object ReferenceRegionWithOrientationOrdering extends Ordering[ReferenceRegionWithOrientation] {

def compare(this_ : ReferenceRegionWithOrientation, that: ReferenceRegionWithOrientation): Int = {
val regionCompare = ReferenceRegionOrdering.compare(this_.region, that.region)
if (regionCompare != 0) {
regionCompare
} else {
this_.negativeStrand.compare(that.negativeStrand)
}
}
}

object ReferenceRegion {

/**
Expand Down Expand Up @@ -135,6 +169,17 @@ object ReferenceRegion {
}
}

object ReferenceRegionOrdering extends Ordering[ReferenceRegion] {

override def compare(this_ : ReferenceRegion, that: ReferenceRegion): Int =
if (this_.referenceName != that.referenceName)
this_.referenceName.compareTo(that.referenceName)
else if (this_.start != that.start)
this_.start.compareTo(that.start)
else
this_.end.compareTo(that.end)
}

/**
* Represents a contiguous region of the reference genome.
*
Expand All @@ -144,10 +189,10 @@ object ReferenceRegion {
* which is <i>not</i> in the region -- i.e. [start, end) define a 0-based
* half-open interval.
*/
case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Ordered[ReferenceRegion] with Interval {
case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Interval {

assert(start >= 0)
assert(end >= start)
assert(start >= 0, "ReferenceRegion doesn't allow a negative start coordinate (here provided %d)".format(start))
assert(end >= start, "ReferenceRegion requires end >= start (here provided %d,%d)".format(start, end))

def width: Long = end - start

Expand Down Expand Up @@ -255,14 +300,6 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend
def overlaps(other: ReferenceRegion): Boolean =
referenceName == other.referenceName && end > other.start && start < other.end

def compare(that: ReferenceRegion): Int =
if (referenceName != that.referenceName)
referenceName.compareTo(that.referenceName)
else if (start != that.start)
start.compareTo(that.start)
else
end.compareTo(that.end)

def length(): Long = {
end - start
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import scala.math._
import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion }
import org.bdgenomics.adam.models.{ ReferenceRegionContext, ReferenceRegionOrdering, SequenceDictionary, ReferenceRegion }
import PairingRDD._
import ReferenceRegionContext._

/**
* A base is 'covered' by a region set if any region in the set contains the base itself.
Expand Down Expand Up @@ -97,7 +98,7 @@ class Coverage(val window: Long) extends Serializable {
case (None, None) => 0
case (None, Some(r2)) => -1
case (Some(r1), None) => 1
case (Some(r1), Some(r2)) => r1.compareTo(r2)
case (Some(r1), Some(r2)) => ReferenceRegionOrdering.compare(r1, r2)
}

case class OrientedPoint(chrom: String, pos: Long, polarity: Boolean) extends Ordered[OrientedPoint] with Serializable {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import parquet.hadoop.metadata.CompressionCodecName
import parquet.hadoop.util.ContextUtil
import scala.math.max
import scala.Some
import ReferenceRegionContext._

class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) extends ADAMSequenceDictionaryRDDAggregator[NucleotideContigFragment](rdd) {

Expand Down Expand Up @@ -67,7 +68,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e
assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " +
kv1 + ", " + kv2)

(kv1._1.merge(kv2._1), if (kv1._1.compare(kv2._1) <= 0) {
(kv1._1.merge(kv2._1), if (ReferenceRegionOrdering.compare(kv1._1, kv2._1) <= 0) {
kv1._2 + kv2._2
} else {
kv2._2 + kv1._2
Expand All @@ -83,7 +84,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e
.map(kv => getString(kv))
.reduce(reducePairs)

assert(pair._1.compare(region) == 0,
assert(ReferenceRegionOrdering.compare(pair._1, region) == 0,
"Merging fragments returned a different region than requested.")

pair._2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import com.esotericsoftware.kryo.io.{ Input, Output }
import com.esotericsoftware.kryo.{ Kryo, Serializer }
import htsjdk.samtools.CigarOperator
import org.apache.spark.Logging
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.models.{ ReferenceRegionOrdering, ReferenceRegion }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rich.RichAlignmentRecord
import org.bdgenomics.formats.avro.AlignmentRecord
Expand Down Expand Up @@ -50,7 +50,8 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] {
* @param b Indel realignment target to compare.
* @return Comparison done by starting position.
*/
def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int = a.readRange compare b.readRange
def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int =
ReferenceRegionOrdering.compare(a.readRange, b.readRange)

/**
* Check to see if an indel realignment target contains the given read.
Expand All @@ -75,7 +76,7 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] {
def lt(target: IndelRealignmentTarget, read: RichAlignmentRecord): Boolean = {
val region = read.readRegion

region.forall(r => target.readRange.compare(r) < 0)
region.forall(r => ReferenceRegionOrdering.compare(target.readRange, r) < 0)
}

/**
Expand Down

0 comments on commit e2a92a5

Please sign in to comment.