Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extended TwoBitFile and NucleotideContigFragmentRDDFunctions to behave more similarly #1079

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,16 @@
package org.bdgenomics.adam.rdd.contig

import com.google.common.base.Splitter
import java.util.logging.Level
import org.apache.avro.specific.SpecificRecord
import org.bdgenomics.utils.misc.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.converters.FragmentConverter
import org.bdgenomics.adam.models._
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMSequenceDictionaryRDDAggregator
import org.bdgenomics.adam.util.ParquetLogger
import org.bdgenomics.adam.util.ReferenceFile
import org.bdgenomics.formats.avro._
import org.bdgenomics.utils.misc.HadoopUtil
import org.apache.parquet.avro.AvroParquetOutputFormat
import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.apache.parquet.hadoop.util.ContextUtil
import scala.collection.JavaConversions._
import scala.math.max
import scala.Some

class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) extends ADAMSequenceDictionaryRDDAggregator[NucleotideContigFragment](rdd) {
class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) extends ADAMSequenceDictionaryRDDAggregator[NucleotideContigFragment](rdd) with ReferenceFile {

/**
* Converts an RDD of nucleotide contig fragments into reads. Adjacent contig fragments are
Expand Down Expand Up @@ -103,6 +92,16 @@ class NucleotideContigFragmentRDDFunctions(rdd: RDD[NucleotideContigFragment]) e
.values
}

/**
* Returns the reference bases covering the given region.
*
* This method exists so the class can be used as a ReferenceFile; it simply
* forwards the request to getReferenceString.
*
* @see ReferenceFile.scala
*
* @param region The ReferenceRegion whose sequence should be returned.
* @return The reference sequence at the requested locus, as a string.
*/
def extract(region: ReferenceRegion): String = {
getReferenceString(region)
}

/**
* From a set of contigs, returns the base sequence that corresponds to a region of the reference.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import java.nio.{ ByteOrder, ByteBuffer }
import com.esotericsoftware.kryo.io.{ Output, Input }
import com.esotericsoftware.kryo.{ Kryo, Serializer }
import org.bdgenomics.utils.io.{ ByteArrayByteAccess, ByteAccess }
import org.bdgenomics.adam.models.{ NonoverlappingRegions, ReferencePosition, ReferenceRegion }
import org.bdgenomics.adam.models._

object TwoBitFile {
val MAGIC_NUMBER: Int = 0x1A412743
Expand Down Expand Up @@ -92,6 +92,16 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
name -> contigOffset
}

/**
* Builds a sequence dictionary describing the contigs held in seqRecords.
*
* One SequenceRecord is created per entry of seqRecords, using the entry's
* name and the dnaSize of its associated record.
*
* @param performLexSort If true, the records are re-sorted with
* SequenceOrderingByName after each record's referenceIndex has been reset
* to None. If false (the default), the dictionary is returned as built.
* @return The sequence dictionary for this file.
*/
def getSequenceDictionary(performLexSort: Boolean = false): SequenceDictionary = {
val sd = new SequenceDictionary(seqRecords.toVector.map(r => SequenceRecord(r._1, r._2.dnaSize)))

if (performLexSort) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why's this necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need the sequence dictionary from 2 bit file. Is there a better way to get it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, yeah, that makes sense. My comment was with regards to the lexicographic sort.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, we've added lex sort functions to the SequenceDictionary, so I'd rather not dupe the code.

// The implicit ordering is what `.sorted` below picks up.
// NOTE(review): referenceIndex is cleared before sorting, presumably because
// the old indices would no longer match the new order — confirm.
implicit val ordering = SequenceOrderingByName
SequenceDictionary(sd.records.map(_.copy(referenceIndex = None)).sorted: _*)
} else
sd
}

/**
* Extract reference sequence from the .2bit data.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,13 @@ class TwoBitSuite extends ADAMFunSuite {
val twoBitFile = new TwoBitFile(byteAccess)
assert(twoBitFile.extract(ReferenceRegion("1", 9990, 10010), true) == "NNNNNNNNNNTAACCCTAAC")
}

// Loads the single-contig chrM .2bit fixture and checks that the derived
// sequence dictionary has exactly one record of the expected length.
test("correctly calculates sequence dictionary") {
val chrMBytes = new LocalFileByteAccess(new File(resourcePath("hg19.chrM.2bit")))
val reference = new TwoBitFile(chrMBytes)
// Request the lexicographically sorted variant of the dictionary.
val records = reference.getSequenceDictionary(performLexSort = true).records
assert(records.length == 1)
assert(records.head.length == 16571)
}
}