From 4a28c964e52001e9574a5a545406fc82afabbac2 Mon Sep 17 00:00:00 2001 From: Timothy Danford Date: Fri, 28 Nov 2014 15:14:20 -0500 Subject: [PATCH] [ADAM-511] Provide Ordering for ReferenceRegion As covered in issue #511, ReferenceRegion shouldn't extend Ordered, but rather should be ordered by an Ordering value which can be imported implicitly. This will fix some "diverging implicit expansion" errors, and bring us more in line with the Scala documentation on Ordered (which is intended to be used in classes that have "a single, natural ordering" -- a description that doesn't apply to ReferenceRegion). This commit makes these changes, with the new Ordering provided implicitly in ReferenceRegionContext. It also includes corresponding changes to the ReferenceRegionWithOrientation handling, and (finally) includes changes to downstream classes that use(d) the Region.compareTo method. Also bundled in to this commit are two additional changes to ReferenceRegion: * Adding explanations to ReferenceRegion asserts. * Adding 'extend' to ReferenceRegionWithOrientation, allowing a strand-specific extension of the region in a single direction. --- .../bdgenomics/adam/cli/CalculateDepth.scala | 4 +- .../org/bdgenomics/adam/models/Gene.scala | 1 + .../adam/models/ReferenceRegion.scala | 75 ++++++++++++++----- .../org/bdgenomics/adam/rdd/Coverage.scala | 5 +- ...NucleotideContigFragmentRDDFunctions.scala | 5 +- .../realignment/IndelRealignmentTarget.scala | 7 +- 6 files changed, 70 insertions(+), 27 deletions(-) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala index 450a6cf688..204850b2a1 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/CalculateDepth.scala @@ -23,7 +23,7 @@ import org.apache.hadoop.mapreduce.Job import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD -import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } +import org.bdgenomics.adam.models.{ ReferenceRegionContext, SequenceDictionary, ReferenceRegion } import org.bdgenomics.adam.projections.Projection import org.bdgenomics.adam.projections.AlignmentRecordField._ import org.bdgenomics.adam.rdd.ADAMContext._ @@ -32,6 +32,8 @@ import org.bdgenomics.adam.rich.ReferenceMappingContext._ import org.bdgenomics.formats.avro.AlignmentRecord import scala.io._ +import ReferenceRegionContext._ + /** * CalculateDepth (accessible as the command 'depth' through the CLI) takes two arguments, * an Read file and a VCF (or equivalent) file, and calculates the number of reads diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala index b1491f7261..34bdd52d7c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/Gene.scala @@ -22,6 +22,7 @@ import org.apache.spark.rdd.RDD import org.bdgenomics.adam.rdd.features.GeneFeatureRDD._ import org.bdgenomics.adam.util.SequenceUtils import org.bdgenomics.formats.avro.{ Strand, Feature } +import ReferenceRegionContext._ /** * A 'gene model' is a small, hierarchical collection of objects: Genes, Transcripts, and Exons. diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala index 86bd7d036e..1aa7bdd61c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/models/ReferenceRegion.scala @@ -52,7 +52,7 @@ object ReferenceRegionWithOrientation { * reverse strand of the reference region. */ case class ReferenceRegionWithOrientation(region: ReferenceRegion, - negativeStrand: Boolean) extends Ordered[ReferenceRegionWithOrientation] { + negativeStrand: Boolean) { def width: Long = region.width def contains(other: ReferencePositionWithOrientation): Boolean = { @@ -67,13 +67,30 @@ case class ReferenceRegionWithOrientation(region: ReferenceRegion, region.overlaps(other.region) && negativeStrand == other.negativeStrand } - def compare(that: ReferenceRegionWithOrientation): Int = { - val regionCompare = region.compare(that.region) - if (regionCompare != 0) { - regionCompare - } else { - negativeStrand.compare(that.negativeStrand) - } + /** + * Creates a new ReferenceRegionWithOrientation, with either the start or + * end field modified so that the overall region has a width equal to the + * given target length. + * + * If the region is on the negative strand, then the start value is modified, + * otherwise, if the region is on the positive strand, then the end value is + * incremented. + * + * @param targetLength the desired width of the new region + * @return The new region whose width==targetLength + */ + def extend(targetLength: Long): ReferenceRegionWithOrientation = { + require(targetLength >= 0) + require(targetLength > width) + + val diff = targetLength - width + + if (negativeStrand) + ReferenceRegionWithOrientation( + ReferenceRegion(region.referenceName, region.start - diff, region.end), negativeStrand) + else + ReferenceRegionWithOrientation( + ReferenceRegion(region.referenceName, region.start, region.end + diff), negativeStrand) } def toReferenceRegion: ReferenceRegion = region @@ -85,6 +102,23 @@ case class ReferenceRegionWithOrientation(region: ReferenceRegion, def end: Long = region.end } +object ReferenceRegionContext { + implicit val referenceRegionOrdering = ReferenceRegionOrdering + implicit val referenceRegionWithOrientationOrdering = ReferenceRegionWithOrientationOrdering +} + +object ReferenceRegionWithOrientationOrdering extends Ordering[ReferenceRegionWithOrientation] { + + def compare(this_ : ReferenceRegionWithOrientation, that: ReferenceRegionWithOrientation): Int = { + val regionCompare = ReferenceRegionOrdering.compare(this_.region, that.region) + if (regionCompare != 0) { + regionCompare + } else { + this_.negativeStrand.compare(that.negativeStrand) + } + } +} + object ReferenceRegion { /** @@ -135,6 +169,17 @@ object ReferenceRegion { } } +object ReferenceRegionOrdering extends Ordering[ReferenceRegion] { + + override def compare(this_ : ReferenceRegion, that: ReferenceRegion): Int = + if (this_.referenceName != that.referenceName) + this_.referenceName.compareTo(that.referenceName) + else if (this_.start != that.start) + this_.start.compareTo(that.start) + else + this_.end.compareTo(that.end) +} + /** * Represents a contiguous region of the reference genome. * @@ -144,10 +189,10 @@ object ReferenceRegion { * which is not in the region -- i.e. [start, end) define a 0-based * half-open interval. */ -case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Ordered[ReferenceRegion] with Interval { +case class ReferenceRegion(referenceName: String, start: Long, end: Long) extends Interval { - assert(start >= 0) - assert(end >= start) + assert(start >= 0, "ReferenceRegion doesn't allow a negative start coordinate (here provided %d)".format(start)) + assert(end >= start, "ReferenceRegion requires end >= start (here provided %d,%d)".format(start, end)) def width: Long = end - start @@ -255,14 +300,6 @@ case class ReferenceRegion(referenceName: String, start: Long, end: Long) extend def overlaps(other: ReferenceRegion): Boolean = referenceName == other.referenceName && end > other.start && start < other.end - def compare(that: ReferenceRegion): Int = - if (referenceName != that.referenceName) - referenceName.compareTo(that.referenceName) - else if (start != that.start) - start.compareTo(that.start) - else - end.compareTo(that.end) - def length(): Long = { end - start } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala index 5686f4ee27..10542aa0fd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/Coverage.scala @@ -21,8 +21,9 @@ import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import scala.math._ -import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } +import org.bdgenomics.adam.models.{ ReferenceRegionContext, ReferenceRegionOrdering, SequenceDictionary, ReferenceRegion } import PairingRDD._ +import ReferenceRegionContext._ /** * A base is 'covered' by a region set if any region in the set contains the base itself. @@ -97,7 +98,7 @@ class Coverage(val window: Long) extends Serializable { case (None, None) => 0 case (None, Some(r2)) => -1 case (Some(r1), None) => 1 - case (Some(r1), Some(r2)) => r1.compareTo(r2) + case (Some(r1), Some(r2)) => ReferenceRegionOrdering.compare(r1, r2) } case class OrientedPoint(chrom: String, pos: Long, polarity: Boolean) extends Ordered[OrientedPoint] with Serializable { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala index d1c38d2884..847c6cd70c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDDFunctions.scala @@ -36,6 +36,7 @@ import parquet.hadoop.metadata.CompressionCodecName import parquet.hadoop.util.ContextUtil import scala.math.max import scala.Some +import ReferenceRegionContext._ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) extends ADAMSequenceDictionaryRDDAggregator[NucleotideContigFragment](rdd) { @@ -67,7 +68,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e assert(kv1._1.isAdjacent(kv2._1), "Regions being joined must be adjacent. For: " + kv1 + ", " + kv2) - (kv1._1.merge(kv2._1), if (kv1._1.compare(kv2._1) <= 0) { + (kv1._1.merge(kv2._1), if (ReferenceRegionOrdering.compare(kv1._1, kv2._1) <= 0) { kv1._2 + kv2._2 } else { kv2._2 + kv1._2 @@ -83,7 +84,7 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e .map(kv => getString(kv)) .reduce(reducePairs) - assert(pair._1.compare(region) == 0, + assert(ReferenceRegionOrdering.compare(pair._1, region) == 0, "Merging fragments returned a different region than requested.") pair._2 diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala index 1960e41bdb..c878335fa8 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala @@ -21,7 +21,7 @@ import com.esotericsoftware.kryo.io.{ Input, Output } import com.esotericsoftware.kryo.{ Kryo, Serializer } import htsjdk.samtools.CigarOperator import org.apache.spark.Logging -import org.bdgenomics.adam.models.ReferenceRegion +import org.bdgenomics.adam.models.{ ReferenceRegionOrdering, ReferenceRegion } import org.bdgenomics.adam.rdd.ADAMContext._ import org.bdgenomics.adam.rich.RichAlignmentRecord import org.bdgenomics.formats.avro.AlignmentRecord @@ -50,7 +50,8 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] { * @param b Indel realignment target to compare. * @return Comparison done by starting position. */ - def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int = a.readRange compare b.readRange + def compare(a: IndelRealignmentTarget, b: IndelRealignmentTarget): Int = + ReferenceRegionOrdering.compare(a.readRange, b.readRange) /** * Check to see if an indel realignment target contains the given read. @@ -75,7 +76,7 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] { def lt(target: IndelRealignmentTarget, read: RichAlignmentRecord): Boolean = { val region = read.readRegion - region.forall(r => target.readRange.compare(r) < 0) + region.forall(r => ReferenceRegionOrdering.compare(target.readRange, r) < 0) } /**